From a83479b6efcd2be6928fb9e8ee8109813cc6df6c Mon Sep 17 00:00:00 2001 From: zhangyue19921010 Date: Wed, 10 Jun 2026 11:09:28 +0800 Subject: [PATCH 1/3] feat(index): consolidate bitmap segments and unindexed data on optimize --- rust/lance-index/src/scalar.rs | 9 + rust/lance-index/src/scalar/bitmap.rs | 444 ++------------------------ rust/lance-select/src/mask.rs | 5 + rust/lance/src/index/append.rs | 207 +++++++++++- rust/lance/src/index/create.rs | 217 +++++++------ rust/lance/src/index/scalar.rs | 47 +-- rust/lance/src/index/scalar/bitmap.rs | 82 +++-- rust/lance/src/index/scalar/btree.rs | 28 +- 8 files changed, 415 insertions(+), 624 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 772dfaf4089..2aef324de83 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -928,6 +928,15 @@ impl OldIndexDataFilter { .collect(), } } + + /// Filter a posting list of row addresses in place, retaining only the rows + /// selected by this filter. + pub fn retain_row_addrs(&self, addrs: &mut RowAddrTreeMap) { + match self { + Self::Fragments { to_keep, .. 
} => addrs.retain_fragments_in(to_keep), + Self::RowIds(valid_row_ids) => *addrs &= valid_row_ids, + } + } } impl UpdateCriteria { diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 10254e699c5..7b88e7c5d29 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -3,8 +3,7 @@ use std::{ any::Any, - cmp::Reverse, - collections::{BTreeMap, BinaryHeap, HashMap}, + collections::{BTreeMap, HashMap}, fmt::Debug, ops::Bound, sync::Arc, @@ -29,14 +28,12 @@ use lance_core::{ error::LanceOptionExt, utils::tokio::get_num_compute_intensive_cpus, }; -use lance_io::object_store::ObjectStore; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; -use object_store::path::Path; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use tracing::{instrument, warn}; -use super::{AnyQuery, IndexStore, ScalarIndex}; +use super::{AnyQuery, IndexStore, OldIndexDataFilter, ScalarIndex}; use super::{ BuiltinIndexType, SargableQuery, ScalarIndexParams, SearchResult, btree::OrderableScalarValue, }; @@ -58,18 +55,10 @@ use crate::{scalar::IndexReader, scalar::expression::ScalarQueryParser}; pub const BITMAP_LOOKUP_NAME: &str = "bitmap_page_lookup.lance"; pub const INDEX_STATS_METADATA_KEY: &str = "lance:index_stats"; -const BITMAP_PART_LOOKUP_PREFIX: &str = "part_"; -const BITMAP_PART_LOOKUP_SUFFIX: &str = "_bitmap_page_lookup.lance"; -const EXPLICIT_SHARD_ID_TAG: u64 = 0; -const IMPLICIT_FRAGMENT_ID_TAG: u64 = 1; const MAX_BITMAP_ARRAY_LENGTH: usize = i32::MAX as usize - 1024 * 1024; // leave headroom const MAX_ROWS_PER_CHUNK: usize = 2 * 1024; -// Smaller than MAX_ROWS_PER_CHUNK to bound the per-cursor in-memory batch -// footprint during a k-way merge (N cursors × chunk), while still amortising -// I/O over a reasonable number of rows per read. 
-const MERGE_ROWS_PER_CHUNK: usize = 512; const BITMAP_INDEX_VERSION: u32 = 0; @@ -883,64 +872,6 @@ impl BitmapBatchWriter { } } -fn bitmap_shard_file_name(partition_id: u64) -> String { - format!("{BITMAP_PART_LOOKUP_PREFIX}{partition_id}{BITMAP_PART_LOOKUP_SUFFIX}") -} - -fn tagged_bitmap_partition_id(id: u32, tag: u64) -> u64 { - ((id as u64) << 32) | tag -} - -fn bitmap_shard_partition_id(fragment_ids: &[u32], shard_id: Option) -> Result { - if fragment_ids.is_empty() { - return Err(Error::invalid_input( - "Bitmap shard build requires at least one fragment id".to_string(), - )); - } - - if let Some(shard_id) = shard_id { - return Ok(tagged_bitmap_partition_id(shard_id, EXPLICIT_SHARD_ID_TAG)); - } - - let [fragment_id] = fragment_ids else { - return Err(Error::invalid_input(format!( - "Bitmap distributed build over multiple fragments requires an explicit shard_id. \ - Received {} fragment ids: {:?}. Please assign mutually exclusive shard_id values \ - to disjoint fragment groups.", - fragment_ids.len(), - fragment_ids - ))); - }; - - Ok(tagged_bitmap_partition_id( - *fragment_id, - IMPLICIT_FRAGMENT_ID_TAG, - )) -} - -fn extract_bitmap_shard_id(filename: &str) -> Result { - let partition_id = filename - .strip_prefix(BITMAP_PART_LOOKUP_PREFIX) - .and_then(|name| name.strip_suffix(BITMAP_PART_LOOKUP_SUFFIX)) - .ok_or_else(|| { - Error::internal(format!("Invalid bitmap shard file name format: {filename}")) - })?; - partition_id.parse::().map_err(|_| { - Error::internal(format!( - "Failed to parse bitmap partition id from file name: {filename}" - )) - }) -} - -fn deserialize_bitmap(bitmap_bytes: &[u8], file_name: &str) -> Result { - RowAddrTreeMap::deserialize_from(bitmap_bytes).map_err(|error| { - Error::corrupt_file( - Path::from(file_name), - format!("Failed to deserialize bitmap bytes: {error}"), - ) - }) -} - async fn new_bitmap_batch_writer( index_store: &dyn IndexStore, file_name: &str, @@ -954,218 +885,6 @@ async fn new_bitmap_batch_writer( 
Ok(BitmapBatchWriter::new(index_file)) } -#[derive(Clone, Debug, Eq, PartialEq)] -struct BitmapHeapItem { - key: OrderableScalarValue, - shard_idx: usize, -} - -impl Ord for BitmapHeapItem { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.key - .cmp(&other.key) - .then_with(|| self.shard_idx.cmp(&other.shard_idx)) - } -} - -impl PartialOrd for BitmapHeapItem { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -struct BitmapShardCursor { - file_name: String, - reader: Arc, - total_rows: usize, - next_row_offset: usize, - batch: Option, - batch_row_idx: usize, -} - -impl BitmapShardCursor { - async fn try_new(file_name: String, reader: Arc) -> Result> { - let total_rows = reader.num_rows(); - if total_rows == 0 { - return Ok(None); - } - - let mut cursor = Self { - file_name, - reader, - total_rows, - next_row_offset: 0, - batch: None, - batch_row_idx: 0, - }; - if cursor.advance().await? { - Ok(Some(cursor)) - } else { - Ok(None) - } - } - - fn peek_key(&self) -> Result { - let batch = self.batch.as_ref().ok_or_else(|| { - Error::internal(format!( - "Bitmap shard {} has no active batch", - self.file_name - )) - })?; - let key = ScalarValue::try_from_array(batch.column(0), self.batch_row_idx)?; - Ok(OrderableScalarValue(key)) - } - - fn take_current(&mut self) -> Result<(ScalarValue, RowAddrTreeMap)> { - let batch = self.batch.as_ref().ok_or_else(|| { - Error::internal(format!( - "Bitmap shard {} has no active batch", - self.file_name - )) - })?; - let keys = batch.column(0); - let binary_bitmaps = batch - .column(1) - .as_any() - .downcast_ref::() - .ok_or_else(|| { - Error::corrupt_file( - Path::from(self.file_name.as_str()), - "Bitmap shard batch has non-binary bitmap column".to_string(), - ) - })?; - let key = ScalarValue::try_from_array(keys, self.batch_row_idx)?; - let bitmap = deserialize_bitmap(binary_bitmaps.value(self.batch_row_idx), &self.file_name)?; - self.batch_row_idx += 1; - Ok((key, bitmap)) - } - - async 
fn advance(&mut self) -> Result { - loop { - if let Some(batch) = &self.batch - && self.batch_row_idx < batch.num_rows() - { - return Ok(true); - } - - if self.next_row_offset >= self.total_rows { - self.batch = None; - return Ok(false); - } - - let end_row = (self.next_row_offset + MERGE_ROWS_PER_CHUNK).min(self.total_rows); - let batch = self - .reader - .read_range(self.next_row_offset..end_row, None) - .await?; - self.next_row_offset = end_row; - self.batch = Some(batch); - self.batch_row_idx = 0; - } - } -} - -async fn advance_cursor_and_push( - cursors: &mut [BitmapShardCursor], - heap: &mut BinaryHeap>, - shard_idx: usize, -) -> Result<()> { - if cursors[shard_idx].advance().await? { - heap.push(Reverse(BitmapHeapItem { - key: cursors[shard_idx].peek_key()?, - shard_idx, - })); - } - Ok(()) -} - -async fn drain_same_key_bitmaps( - cursors: &mut [BitmapShardCursor], - heap: &mut BinaryHeap>, - item: BitmapHeapItem, -) -> Result<(ScalarValue, RowAddrTreeMap)> { - let (key, mut merged_bitmap) = cursors[item.shard_idx].take_current()?; - let merged_key = OrderableScalarValue(key); - advance_cursor_and_push(cursors, heap, item.shard_idx).await?; - - loop { - let Some(Reverse(next_item)) = heap.peek() else { - break; - }; - if next_item.key != merged_key { - break; - } - - let shard_idx = next_item.shard_idx; - let _ = heap.pop(); - let (_, bitmap) = cursors[shard_idx].take_current()?; - merged_bitmap |= &bitmap; - advance_cursor_and_push(cursors, heap, shard_idx).await?; - } - - Ok((merged_key.0, merged_bitmap)) -} - -async fn list_bitmap_shard_files( - object_store: &ObjectStore, - index_dir: &Path, - progress: &dyn IndexBuildProgress, -) -> Result> { - let mut shard_files = Vec::new(); - let mut list_stream = object_store.list(Some(index_dir.clone())); - while let Some(item) = list_stream.next().await { - match item { - Ok(meta) => { - let file_name = meta.location.filename().unwrap_or_default(); - if file_name.starts_with(BITMAP_PART_LOOKUP_PREFIX) - && 
file_name.ends_with(BITMAP_PART_LOOKUP_SUFFIX) - { - shard_files.push(file_name.to_string()); - progress - .stage_progress("scan_bitmap_shards", shard_files.len() as u64) - .await?; - } - } - Err(err) => { - return Err(Error::io(format!( - "Failed to list bitmap shard files in {}: {err}", - index_dir - ))); - } - } - } - let mut shard_files = shard_files - .into_iter() - .map(|file_name| extract_bitmap_shard_id(&file_name).map(|shard_id| (shard_id, file_name))) - .collect::>>()?; - shard_files.sort_unstable_by_key(|(shard_id, _)| *shard_id); - let shard_files = shard_files - .into_iter() - .map(|(_, file_name)| file_name) - .collect::>(); - if shard_files.is_empty() { - return Err(Error::invalid_input(format!( - "No bitmap shard files found in index directory: {}; \ - call build_index for each fragment before calling merge_index_metadata", - index_dir - ))); - } - Ok(shard_files) -} - -async fn cleanup_bitmap_shard_files(store: &dyn IndexStore, shard_files: &[String]) { - for file_name in shard_files { - if let Err(error) = store.delete_index_file(file_name).await { - warn!( - "Failed to delete bitmap shard file '{}': {}. 
\ - This does not affect the merged bitmap index, but the shard file \ - may need manual cleanup.", - file_name, error - ); - } - } -} - #[derive(Debug, Default)] pub struct BitmapIndexPlugin; @@ -1305,23 +1024,6 @@ impl BitmapIndexPlugin { Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME).await } - async fn train_bitmap_shard( - data: SendableRecordBatchStream, - index_store: &dyn IndexStore, - fragment_ids: &[u32], - shard_id: Option, - progress: Arc, - ) -> Result<()> { - let partition_id = bitmap_shard_partition_id(fragment_ids, shard_id)?; - let file_name = bitmap_shard_file_name(partition_id); - progress - .stage_start("build_bitmap_shard", None, "rows") - .await?; - Self::streaming_build_and_write(data, None, index_store, &file_name).await?; - progress.stage_complete("build_bitmap_shard").await?; - Ok(()) - } - /// Builds and writes a bitmap index in a streaming fashion from value-sorted /// input. Only one value's bitmap is in memory at a time, reducing peak memory /// from O(unique_values * avg_bitmap) to O(largest_single_bitmap). @@ -1499,104 +1201,21 @@ impl BitmapIndexPlugin { }) .collect() } - - /// Merge per-shard bitmap lookup files into a single bitmap index file. - /// - /// Each shard file is already sorted by key and can contain many distinct keys. - /// This method does not materialize an entire shard in memory. Instead, it keeps - /// one cursor per shard, where each cursor tracks the shard's current row within - /// a small in-memory batch. A min-heap stores the current key for each shard. 
- /// - /// The merge then proceeds as a streaming K-way merge: - /// - pop the smallest current key across all shards - /// - union the bitmap for that key with any other shards currently positioned on - /// the same key - /// - advance only those shards that participated in the union and push their next - /// keys back into the heap - /// - /// This keeps memory usage proportional to the number of shards plus the bitmaps - /// currently being merged, instead of the total number of keys across all shards. - async fn merge_shards( - store: &dyn IndexStore, - shard_files: &[String], - progress: Arc, - ) -> Result<()> { - progress - .stage_start("merge_bitmap_shards", None, "bitmaps") - .await?; - - let mut cursors = Vec::with_capacity(shard_files.len()); - let mut heap = BinaryHeap::with_capacity(shard_files.len()); - let mut value_type: Option = None; - - for file_name in shard_files { - let reader = store.open_index_file(file_name).await?; - let shard_value_type = reader.schema().fields[0].data_type().clone(); - if let Some(existing_type) = &value_type { - if existing_type != &shard_value_type { - return Err(Error::invalid_input(format!( - "Bitmap shard {} has value type {:?}, expected {:?}", - file_name, shard_value_type, existing_type - ))); - } - } else { - value_type = Some(shard_value_type); - } - if let Some(cursor) = BitmapShardCursor::try_new(file_name.clone(), reader).await? 
{ - let key = cursor.peek_key()?; - let shard_idx = cursors.len(); - cursors.push(cursor); - heap.push(Reverse(BitmapHeapItem { key, shard_idx })); - } - } - - let value_type = value_type.ok_or_else(|| { - Error::invalid_input("Bitmap shard merge requires at least one shard file".to_string()) - })?; - let mut writer = new_bitmap_batch_writer(store, BITMAP_LOOKUP_NAME, &value_type).await?; - let mut merged_keys = 0u64; - - while let Some(Reverse(item)) = heap.pop() { - let (key, merged_bitmap) = - drain_same_key_bitmaps(&mut cursors, &mut heap, item).await?; - writer.emit(key, &merged_bitmap).await?; - merged_keys += 1; - progress - .stage_progress("merge_bitmap_shards", merged_keys) - .await?; - } - - progress.stage_complete("merge_bitmap_shards").await?; - progress - .stage_start("write_bitmap_index", Some(1), "files") - .await?; - writer.finish().await?; - progress.stage_progress("write_bitmap_index", 1).await?; - progress.stage_complete("write_bitmap_index").await?; - Ok(()) - } -} - -pub async fn merge_index_files( - object_store: &ObjectStore, - index_dir: &Path, - store: Arc, - progress: Arc, -) -> Result<()> { - progress - .stage_start("scan_bitmap_shards", None, "files") - .await?; - let shard_files = list_bitmap_shard_files(object_store, index_dir, progress.as_ref()).await?; - progress.stage_complete("scan_bitmap_shards").await?; - - BitmapIndexPlugin::merge_shards(store.as_ref(), &shard_files, progress).await?; - cleanup_bitmap_shard_files(store.as_ref(), &shard_files).await; - Ok(()) } +/// Consolidate the materialized state of several bitmap segments (and, +/// optionally, a stream of not-yet-indexed `new_data`) into a single canonical +/// bitmap written to `dest_store`. +/// +/// `old_data_filter` is applied only to the rows coming from `source_indices`, +/// dropping row addresses whose fragments compaction/deletion has retired; rows +/// from `new_data` are inserted unfiltered. 
The whole merged state is held in +/// memory, as bitmap segment consolidation has always done. pub async fn merge_bitmap_indices( source_indices: &[Arc], + new_data: Option, dest_store: &dyn IndexStore, + old_data_filter: Option, progress: Arc, ) -> Result { if source_indices.is_empty() { @@ -1636,6 +1255,18 @@ pub async fn merge_bitmap_indices( .await?; } progress.stage_complete("merge_bitmap_segments").await?; + if let Some(old_data_filter) = old_data_filter { + merged_state.retain(|_, postings| { + old_data_filter.retain_row_addrs(postings); + !postings.is_empty() + }); + } + + // Fold the not-yet-indexed rows into the same in-memory state. + if let Some(new_data) = new_data { + (merged_state, _) = + BitmapIndexPlugin::build_bitmap_index_state(new_data, merged_state).await?; + } progress .stage_start("write_bitmap_index", Some(1), "files") @@ -1700,8 +1331,8 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { data: SendableRecordBatchStream, index_store: &dyn IndexStore, request: Box, - fragment_ids: Option>, - progress: Arc, + _fragment_ids: Option>, + _progress: Arc, ) -> Result { let request = request .as_any() @@ -1712,23 +1343,14 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { .to_string(), ) })?; - if let Some(fragment_ids) = fragment_ids.as_ref() { - Self::train_bitmap_shard( - data, - index_store, - fragment_ids, - request.parameters.shard_id, - progress, - ) - .await?; - } else if request.parameters.shard_id.is_some() { - return Err(Error::invalid_input( - "Bitmap shard_id requires fragment_ids and is only supported for distributed shard builds" - .to_string(), - )); - } else { - Self::train_bitmap_index(data, index_store).await?; + if request.parameters.shard_id.is_some() { + warn!( + "Bitmap `shard_id` is deprecated and now ignored; each build now produces one \ + canonical segment. Use the segmented-index APIs instead. The `shard_id` field \ + will be removed in a future release." 
+ ); } + Self::train_bitmap_index(data, index_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index a10ad9a6f50..c44b77fe770 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -572,6 +572,11 @@ impl RowAddrTreeMap { .retain(|frag_id, _| frag_id_set.contains(frag_id)); } + /// Retain only the rows whose fragment id is contained in `keep`. + pub fn retain_fragments_in(&mut self, keep: &RoaringBitmap) { + self.inner.retain(|frag_id, _| keep.contains(*frag_id)); + } + /// Compute the serialized size of the set. pub fn serialized_size(&self) -> usize { // Starts at 4 because of the u32 num_entries diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index a89b64df276..f6e5ce54219 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -94,6 +94,45 @@ pub async fn build_old_data_filter( } } +/// Split the stored fragment coverage of `segments` into fragments still live +/// in `dataset` (`effective`) and fragments that compaction or deletion has +/// already retired (`deleted`). +pub fn split_segment_coverage<'a>( + dataset: &Dataset, + segments: impl IntoIterator, +) -> (RoaringBitmap, RoaringBitmap) { + let mut effective = RoaringBitmap::new(); + let mut deleted = RoaringBitmap::new(); + for segment in segments { + if let Some(eff) = segment.effective_fragment_bitmap(&dataset.fragment_bitmap) { + effective |= eff; + } + if let Some(del) = segment.deleted_fragment_bitmap(&dataset.fragment_bitmap) { + deleted |= del; + } + } + (effective, deleted) +} + +/// Validate that every segment carries fragment coverage, split that coverage +/// into still-live and retired fragments, and build the matching [`OldIndexDataFilter`]. 
+pub async fn effective_coverage_and_filter( + dataset: &Dataset, + segments: &[IndexMetadata], +) -> Result<(RoaringBitmap, Option)> { + for segment in segments { + if segment.fragment_bitmap.is_none() { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + ))); + } + } + let (effective, deleted) = split_segment_coverage(dataset, segments); + let old_data_filter = build_old_data_filter(dataset, &effective, &deleted).await?; + Ok((effective, old_data_filter)) +} + async fn load_unindexed_training_data( dataset: &Dataset, field_path: &str, @@ -194,16 +233,8 @@ async fn merge_scalar_indices<'a>( .await?; // Effective = bitmap ∩ live fragments; deleted = bitmap \ live fragments. - let mut effective_old_frags = RoaringBitmap::new(); - let mut deleted_old_frags = RoaringBitmap::new(); - for idx in selected_old_indices { - if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) { - effective_old_frags |= effective; - } - if let Some(deleted) = idx.deleted_fragment_bitmap(&dataset.fragment_bitmap) { - deleted_old_frags |= deleted; - } - } + let (effective_old_frags, deleted_old_frags) = + split_segment_coverage(dataset.as_ref(), selected_old_indices.iter().copied()); let mut frag_bitmap = base_unindexed_bitmap.clone(); frag_bitmap |= &effective_old_frags; @@ -211,7 +242,7 @@ async fn merge_scalar_indices<'a>( // Scalar Index that expos an N:1 segment-merge primitive reachable without // rescanning the dataset - let has_segment_merge_primitive = matches!(index_type, IndexType::BTree); + let has_segment_merge_primitive = matches!(index_type, IndexType::BTree | IndexType::Bitmap); // Merge new data into the existing segment(s) instead of rebuilding from // scratch, when both hold: @@ -256,6 +287,25 @@ async fn merge_scalar_indices<'a>( ) .await? 
} + IndexType::Bitmap => { + if selected_old_indices.len() == 1 { + // Memory optimization: a single segment can absorb the new data + // via `BitmapIndex::update` without loading all into memory at once. + reference_index + .update(new_data_stream, &new_store, None) + .await? + } else { + crate::index::scalar::bitmap::open_and_merge_segments( + dataset.as_ref(), + field_path, + selected_old_indices, + new_data_stream, + &new_store, + old_data_filter, + ) + .await? + } + } _ => { reference_index .update(new_data_stream, &new_store, old_data_filter) @@ -1710,6 +1760,141 @@ mod tests { assert_eq!(rows, 2, "value 'd' lives in appended fragment"); } + #[tokio::test] + async fn test_optimize_bitmap_multi_segment_consolidation() { + async fn query_count(dataset: &Dataset, value: &str) -> usize { + dataset + .scan() + .filter(&format!("category = '{}'", value)) + .unwrap() + .project(&["category"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "category", + DataType::Utf8, + false, + )])); + let make_batch = |labels: &[&str]| { + let arr = StringArray::from_iter_values(labels.iter().copied()); + RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap() + }; + + // Three fragments, each committed as its own Bitmap segment so optimize + // sees a multi-segment logical index. + // frag0={a,b}, frag1={a,c}, frag2={b,c}. 
+ let reader = RecordBatchIterator::new( + vec![ + Ok(make_batch(&["a", "b"])), + Ok(make_batch(&["a", "c"])), + Ok(make_batch(&["b", "c"])), + ], + schema.clone(), + ); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 2, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap); + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 3); + let frag0_id = fragments[0].id() as u32; + let mut staged_segments = Vec::new(); + for fragment in &fragments { + staged_segments.push( + crate::index::create::CreateIndexBuilder::new( + &mut dataset, + &["category"], + IndexType::Bitmap, + ¶ms, + ) + .name("cat_idx".into()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("cat_idx", "category", staged_segments) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("cat_idx").await.unwrap().len(), + 3 + ); + + dataset.delete("category IN ('a', 'b')").await.unwrap(); + let live_frag_ids: Vec = dataset + .get_fragments() + .iter() + .map(|f| f.id() as u32) + .collect(); + assert!( + !live_frag_ids.contains(&frag0_id), + "frag0 should be retired after deleting all its rows" + ); + assert_eq!(live_frag_ids.len(), 2); + + // Append a fourth fragment, leave it unindexed. + let appended = RecordBatchIterator::new(vec![Ok(make_batch(&["a", "d"]))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 2, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + + // merge(3) selects all three old segments (one now backed only by the + // retired frag0) and consolidates them, together with the unindexed + // fragment, into a single segment. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(3)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // Live rows after the delete + append: frag1={c}, frag2={c}, frag3={a,d}. + // The retired frag0's 'a'/'b' rows must not resurface. + assert_eq!(query_count(&dataset, "a").await, 1); + assert_eq!(query_count(&dataset, "b").await, 0); + assert_eq!(query_count(&dataset, "c").await, 2); + assert_eq!(query_count(&dataset, "d").await, 1); + + // The segments collapsed into a single one covering only the still-live + // fragments (frag1, frag2, frag3); the retired frag0 was filtered out of + // the consolidated coverage. + let segments_after = dataset.load_indices_by_name("cat_idx").await.unwrap(); + assert_eq!(segments_after.len(), 1); + let coverage = segments_after[0].fragment_bitmap.as_ref().unwrap(); + assert_eq!(coverage.len(), 3); + assert!( + !coverage.contains(frag0_id), + "retired frag0 must not appear in the consolidated coverage" + ); + } + #[tokio::test] async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { async fn query_id_count(dataset: &Dataset, id: &str) -> usize { diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index ce8e65d8356..2dd1fa3d2e5 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -10,7 +10,7 @@ use crate::{ index::{ DatasetIndexExt, DatasetIndexInternalExt, IntoIndexSegment, build_index_metadata_from_segments, - scalar::{build_bitmap_index_segment, build_scalar_index}, + scalar::build_scalar_index, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, @@ -259,44 +259,17 @@ impl<'a> CreateIndexBuilder<'a> { .preprocessed_data .take() .map(|reader| lance_datafusion::utils::reader_to_stream(Box::new(reader))); - if self.index_type == IndexType::Bitmap && self.fragments.is_some() { - if !train { - return 
Err(Error::invalid_input( - "canonical bitmap segment build requires train=true".to_string(), - )); - } - if preprocesssed_data.is_some() { - return Err(Error::invalid_input( - "canonical bitmap segment build does not accept preprocessed data" - .to_string(), - )); - } - let fragments = self.fragments.clone().ok_or_else(|| { - Error::invalid_input( - "canonical bitmap segment build requires fragment ids".to_string(), - ) - })?; - build_bitmap_index_segment( - self.dataset, - column, - &index_id.to_string(), - fragments, - self.progress.clone(), - ) - .await? - } else { - build_scalar_index( - self.dataset, - column, - &index_id.to_string(), - ¶ms, - train, - self.fragments.clone(), - preprocesssed_data, - self.progress.clone(), - ) - .await? - } + build_scalar_index( + self.dataset, + column, + &index_id.to_string(), + ¶ms, + train, + self.fragments.clone(), + preprocesssed_data, + self.progress.clone(), + ) + .await? } (IndexType::Scalar, LANCE_SCALAR_INDEX) => { // Guess the index type @@ -569,6 +542,13 @@ fn is_btree_scalar_params(params: &dyn IndexParams) -> bool { .is_some_and(|p| p.index_type.eq_ignore_ascii_case("btree")) } +fn is_bitmap_scalar_params(params: &dyn IndexParams) -> bool { + params + .as_any() + .downcast_ref::() + .is_some_and(|p| p.index_type.eq_ignore_ascii_case("bitmap")) +} + /// Validate that a user-supplied `index_uuid` is permitted for this build. 
fn ensure_index_uuid_allowed( index_type: IndexType, @@ -576,17 +556,16 @@ fn ensure_index_uuid_allowed( fragments: Option<&Vec>, index_uuid: Option<&str>, ) -> Result<()> { - let is_btree = index_type == IndexType::BTree - || params - .as_any() - .downcast_ref::() - .map(|params| params.index_type.eq_ignore_ascii_case("btree")) - .unwrap_or(false); - - if index_uuid.is_some() && fragments.is_some_and(|fragments| !fragments.is_empty()) && is_btree + let is_segmented_scalar = matches!(index_type, IndexType::BTree | IndexType::Bitmap) + || is_btree_scalar_params(params) + || is_bitmap_scalar_params(params); + + if index_uuid.is_some() + && fragments.is_some_and(|fragments| !fragments.is_empty()) + && is_segmented_scalar { return Err(Error::invalid_input( - "index_uuid is no longer accepted for BTree distributed index builds; segment UUIDs \ + "index_uuid is no longer accepted for distributed scalar index builds; segment UUIDs \ are generated by Lance and returned in the index metadata." .to_string(), )); @@ -617,8 +596,9 @@ fn uses_segment_commit_path(index_type: IndexType, params: &dyn IndexParams) -> if params_family == LANCE_SCALAR_INDEX { match index_type { - IndexType::BTree => return true, + IndexType::BTree | IndexType::Bitmap => return true, IndexType::Scalar if is_btree_scalar_params(params) => return true, + IndexType::Scalar if is_bitmap_scalar_params(params) => return true, _ => {} } } @@ -1165,7 +1145,7 @@ mod tests { } #[tokio::test] - async fn test_merge_index_metadata_btree_soft_break() { + async fn test_merge_index_metadata_soft_break() { let tmpdir = TempStrDir::default(); let dataset_uri = format!("file://{}", tmpdir.as_str()); let reader = gen_batch() @@ -1176,20 +1156,24 @@ mod tests { ); let dataset = Dataset::write(reader, &dataset_uri, None).await.unwrap(); - let err = dataset - .merge_index_metadata( - &Uuid::new_v4().to_string(), - IndexType::BTree, - None, - Arc::new(NoopIndexBuildProgress), - ) - .await - .unwrap_err(); - assert!( - 
err.to_string() - .contains("no longer supports merge_index_metadata"), - "expected BTree merge_index_metadata soft-break error, got: {err}" - ); + // Both segmented scalar families have left the legacy distributed-merge + // entry point and must report the soft-break. + for index_type in [IndexType::BTree, IndexType::Bitmap] { + let err = dataset + .merge_index_metadata( + &Uuid::new_v4().to_string(), + index_type, + None, + Arc::new(NoopIndexBuildProgress), + ) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("no longer supports merge_index_metadata"), + "expected {index_type} merge_index_metadata soft-break error, got: {err}" + ); + } } /// Assert a committed segment directory holds exactly one canonical BTree @@ -1310,7 +1294,7 @@ mod tests { } #[tokio::test] - async fn test_btree_distributed_index_uuid_rejected() { + async fn test_distributed_index_uuid_rejected() { let test_dir = TempStrDir::default(); let dataset = gen_batch() .col("value", lance_datagen::array::step::()) @@ -1324,25 +1308,39 @@ mod tests { let mut dataset = dataset; let fragment_id = dataset.get_fragments()[0].id() as u32; - let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); - for index_type in [IndexType::BTree, IndexType::Scalar] { - let err = CreateIndexBuilder::new(&mut dataset, &["value"], index_type, ¶ms) - .name("value_btree_segments".to_string()) - .fragments(vec![fragment_id]) - .index_uuid(Uuid::new_v4().to_string()) - .execute_uncommitted() - .await - .unwrap_err(); - assert!( - matches!(err, Error::InvalidInput { .. }), - "expected invalid input error, got: {err}" - ); - assert!( - err.to_string().contains( - "index_uuid is no longer accepted for BTree distributed index builds" - ), - "unexpected error: {err}" - ); + // Each segmented scalar family rejects a user-supplied UUID for a + // fragment-scoped build, whether requested via its own IndexType or the + // generic Scalar wrapper. 
+ for (builtin, native_type) in [ + ( + lance_index::scalar::BuiltinIndexType::BTree, + IndexType::BTree, + ), + ( + lance_index::scalar::BuiltinIndexType::Bitmap, + IndexType::Bitmap, + ), + ] { + let params = ScalarIndexParams::for_builtin(builtin); + for index_type in [native_type, IndexType::Scalar] { + let err = CreateIndexBuilder::new(&mut dataset, &["value"], index_type, ¶ms) + .name("value_segments".to_string()) + .fragments(vec![fragment_id]) + .index_uuid(Uuid::new_v4().to_string()) + .execute_uncommitted() + .await + .unwrap_err(); + assert!( + matches!(err, Error::InvalidInput { .. }), + "expected invalid input error for {index_type}, got: {err}" + ); + assert!( + err.to_string().contains( + "index_uuid is no longer accepted for distributed scalar index builds" + ), + "unexpected error for {index_type}: {err}" + ); + } } } @@ -1474,30 +1472,37 @@ mod tests { let fragments = dataset.get_fragments(); let fragment_ids: Vec = fragments.iter().map(|f| f.id() as u32).collect(); let selected_fragments = fragment_ids[..2].to_vec(); - let index = - CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, &base_params) - .name("bitmap_segment".to_string()) - .fragments(selected_fragments.clone()) - .execute_uncommitted() - .await - .unwrap(); - assert_eq!( - index - .fragment_bitmap - .as_ref() - .unwrap() - .iter() - .collect::>(), - selected_fragments - ); + for index_type in [IndexType::Bitmap, IndexType::Scalar] { + let index = + CreateIndexBuilder::new(&mut dataset, &["category"], index_type, &base_params) + .name(format!("bitmap_segment_{index_type}")) + .fragments(selected_fragments.clone()) + .execute_uncommitted() + .await + .unwrap(); - let files = index.files.as_ref().unwrap(); - assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); - assert!( - files.iter().all(|file| !file.path.starts_with("part_")), - "staged bitmap segment should only reference canonical files" - ); + assert_eq!( + index + .fragment_bitmap + .as_ref() + 
.unwrap() + .iter() + .collect::>(), + selected_fragments, + "{index_type}: unexpected fragment coverage" + ); + + let files = index.files.as_ref().unwrap(); + assert!( + files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME), + "{index_type}: staged segment is missing canonical {BITMAP_LOOKUP_NAME}" + ); + assert!( + files.iter().all(|file| !file.path.starts_with("part_")), + "{index_type}: staged bitmap segment should only reference canonical files" + ); + } } #[tokio::test] diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 92b06f0a1a5..c9618dbff27 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -42,7 +42,7 @@ use lance_index::scalar::label_list::{ use lance_index::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, VALUE_COLUMN_NAME, }; -use lance_index::scalar::{BuiltinIndexType, CreatedIndex, InvertedIndexParams}; +use lance_index::scalar::{CreatedIndex, InvertedIndexParams}; use lance_index::scalar::{ ScalarIndex, ScalarIndexParams, bitmap::BITMAP_LOOKUP_NAME, inverted::INVERT_LIST_FILE, lance_format::LanceIndexStore, @@ -323,51 +323,6 @@ pub(super) async fn build_scalar_index( Ok(created_index) } -/// Build a canonical bitmap index segment over a caller-selected fragment set. -/// -/// This is intentionally separate from `build_scalar_index(..., fragment_ids=Some(...))`. -/// The latter is the legacy distributed scalar-index shard path. Here fragment ids only -/// restrict the scanned rows; the bitmap plugin receives no shard id and writes the -/// canonical bitmap layout for the staged segment root. 
-#[instrument(level = "debug", skip_all)] -pub(super) async fn build_bitmap_index_segment( - dataset: &Dataset, - column: &str, - uuid: &str, - fragment_ids: Vec, - progress: Arc, -) -> Result { - let field = dataset - .schema() - .field(column) - .ok_or(Error::invalid_input_source( - format!("No column with name {}", column).into(), - ))?; - let field: arrow_schema::Field = field.into(); - - let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); - let plugin = SCALAR_INDEX_PLUGIN_REGISTRY.get_plugin_by_name(¶ms.index_type)?; - let training_request = - plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?; - let criteria = training_request.criteria(); - - progress.stage_start("load_data", None, "rows").await?; - let training_data = - load_training_data(dataset, column, criteria, None, true, Some(fragment_ids)).await?; - progress.stage_complete("load_data").await?; - - let index_store = LanceIndexStore::from_dataset_for_new(dataset, uuid)?; - plugin - .train_index( - training_data, - &index_store, - training_request, - None, - progress, - ) - .await -} - /// Fetches the scalar index plugin for a given index metadata /// /// The fast path, on newer datasets, is just a plugin lookup by the type URL of the index details. 
diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs index 11214a9bfdc..7411e41876e 100644 --- a/rust/lance/src/index/scalar/bitmap.rs +++ b/rust/lance/src/index/scalar/bitmap.rs @@ -1,16 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use datafusion::physical_plan::SendableRecordBatchStream; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::bitmap::BitmapIndex; use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::{CreatedIndex, OldIndexDataFilter}; use lance_table::format::IndexMetadata; -use roaring::RoaringBitmap; use std::sync::Arc; use uuid::Uuid; use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; +/// Open the given bitmap `segments` and downcast them to [`BitmapIndex`]. +async fn open_bitmap_segments( + dataset: &Dataset, + field_path: &str, + segments: &[&IndexMetadata], +) -> Result>> { + let mut source_indices = Vec::with_capacity(segments.len()); + for &segment in segments { + let scalar_index = + super::open_scalar_index(dataset, field_path, segment, &NoOpMetricsCollector).await?; + let bitmap_index = scalar_index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index(format!( + "Bitmap merge: expected bitmap segment {}, got {:?}", + segment.uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(Arc::new(bitmap_index.clone())); + } + Ok(source_indices) +} + /// Merge one caller-defined group of source bitmap segments into a single segment. 
pub(in crate::index) async fn merge_segments( dataset: &Dataset, @@ -28,35 +54,22 @@ pub(in crate::index) async fn merge_segments( })?; let field_path = dataset.schema().field_path(field_id)?; - let mut source_indices = Vec::with_capacity(segments.len()); - let mut fragment_bitmap = RoaringBitmap::new(); - for segment in &segments { - fragment_bitmap |= segment.fragment_bitmap.as_ref().cloned().ok_or_else(|| { - Error::invalid_input(format!( - "CreateIndex: segment {} is missing fragment coverage", - segment.uuid - )) - })?; - let scalar_index = - super::open_scalar_index(dataset, &field_path, segment, &NoOpMetricsCollector).await?; - let bitmap_index = scalar_index - .as_any() - .downcast_ref::() - .ok_or_else(|| { - Error::index(format!( - "merge_existing_index_segments: expected bitmap segment {}, got {:?}", - segment.uuid, - scalar_index.index_type() - )) - })?; - source_indices.push(Arc::new(bitmap_index.clone())); - } + // Intersect each segment's stored coverage with the dataset's current + // fragments so we don't claim coverage on row addresses that compaction or + // pruning has already retired. + let (fragment_bitmap, old_data_filter) = + crate::index::append::effective_coverage_and_filter(dataset, &segments).await?; + + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); + let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?; let new_uuid = Uuid::new_v4(); let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid.to_string())?; let created_index = lance_index::scalar::bitmap::merge_bitmap_indices( &source_indices, + None, &new_store, + old_data_filter, lance_index::progress::noop_progress(), ) .await?; @@ -74,3 +87,24 @@ pub(in crate::index) async fn merge_segments( ..segments[0].clone() }) } + +/// Open the given bitmap `segments` and merge their materialized state, together +/// with `new_data`, into a single canonical bitmap written to `new_store`. 
+pub(in crate::index) async fn open_and_merge_segments( + dataset: &Dataset, + field_path: &str, + segments: &[&IndexMetadata], + new_data: SendableRecordBatchStream, + new_store: &LanceIndexStore, + old_data_filter: Option, +) -> Result { + let source_indices = open_bitmap_segments(dataset, field_path, segments).await?; + lance_index::scalar::bitmap::merge_bitmap_indices( + &source_indices, + Some(new_data), + new_store, + old_data_filter, + lance_index::progress::noop_progress(), + ) + .await +} diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs index 34534f6811b..d945ba621bd 100644 --- a/rust/lance/src/index/scalar/btree.rs +++ b/rust/lance/src/index/scalar/btree.rs @@ -17,7 +17,6 @@ use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::registry::VALUE_COLUMN_NAME; use lance_index::scalar::{CreatedIndex, OldIndexDataFilter}; use lance_table::format::IndexMetadata; -use roaring::RoaringBitmap; use uuid::Uuid; use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; @@ -121,31 +120,8 @@ pub(crate) async fn merge_segments( // Intersect each segment's stored bitmap with the dataset's current // fragments so we don't claim coverage on IDs that compaction or pruning // has already retired. 
- let dataset_fragments = dataset.fragment_bitmap.as_ref(); - let mut effective_old_frags = RoaringBitmap::new(); - let mut deleted_old_frags = RoaringBitmap::new(); - for segment in &segments { - if segment.fragment_bitmap.is_none() { - return Err(Error::invalid_input(format!( - "CreateIndex: segment {} is missing fragment coverage", - segment.uuid - ))); - } - if let Some(effective) = segment.effective_fragment_bitmap(dataset_fragments) { - effective_old_frags |= effective; - } - if let Some(deleted) = segment.deleted_fragment_bitmap(dataset_fragments) { - deleted_old_frags |= deleted; - } - } - - let fragment_bitmap = effective_old_frags.clone(); - let old_data_filter = crate::index::append::build_old_data_filter( - dataset, - &effective_old_frags, - &deleted_old_frags, - ) - .await?; + let (fragment_bitmap, old_data_filter) = + crate::index::append::effective_coverage_and_filter(dataset, &segments).await?; let output_uuid = Uuid::new_v4(); let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid.to_string())?; From f10201829eba37fb0c59e1a89326ea041c502dbd Mon Sep 17 00:00:00 2001 From: zhangyue19921010 Date: Fri, 12 Jun 2026 13:02:40 +0800 Subject: [PATCH 2/3] feat(index): consolidate bitmap segments and unindexed data on optimize --- rust/lance-index/src/scalar/bitmap.rs | 30 +-- rust/lance-index/src/scalar/btree.rs | 15 +- rust/lance/src/index/append.rs | 267 ++++++++++++++++++++++++-- rust/lance/src/index/scalar/bitmap.rs | 10 +- rust/lance/src/index/scalar/btree.rs | 10 +- 5 files changed, 294 insertions(+), 38 deletions(-) diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index a765cd94dd2..4a212713e1f 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -1204,15 +1204,12 @@ impl BitmapIndexPlugin { /// optionally, a stream of not-yet-indexed `new_data`) into a single canonical /// bitmap written to `dest_store`. 
/// -/// `old_data_filter` is applied only to the rows coming from `source_indices`, -/// dropping row addresses whose fragments compaction/deletion has retired; rows -/// from `new_data` are inserted unfiltered. The whole merged state is held in -/// memory, as bitmap segment consolidation has always done. +/// `old_data_filters` carries one optional filter per source segment pub async fn merge_bitmap_indices( source_indices: &[Arc], new_data: Option, dest_store: &dyn IndexStore, - old_data_filter: Option, + old_data_filters: &[Option], progress: Arc, ) -> Result { if source_indices.is_empty() { @@ -1221,6 +1218,15 @@ pub async fn merge_bitmap_indices( )); } + if old_data_filters.len() != source_indices.len() { + return Err(Error::invalid_input(format!( + "Bitmap merge: expected one old-data filter per source segment \ + ({} segments) but got {}", + source_indices.len(), + old_data_filters.len() + ))); + } + let value_type = source_indices[0].value_type().clone(); let mut merged_state = HashMap::::new(); @@ -1240,7 +1246,13 @@ pub async fn merge_bitmap_indices( ))); } - let state = source_index.load_bitmap_index_state().await?; + let mut state = source_index.load_bitmap_index_state().await?; + if let Some(old_data_filter) = &old_data_filters[idx] { + state.retain(|_, postings| { + old_data_filter.retain_row_addrs(postings); + !postings.is_empty() + }); + } for (key, bitmap) in state { merged_state .entry(key) @@ -1252,12 +1264,6 @@ pub async fn merge_bitmap_indices( .await?; } progress.stage_complete("merge_bitmap_segments").await?; - if let Some(old_data_filter) = old_data_filter { - merged_state.retain(|_, postings| { - old_data_filter.retain_row_addrs(postings); - !postings.is_empty() - }); - } // Fold the not-yet-indexed rows into the same in-memory state. 
if let Some(new_data) = new_data { diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 6128248308e..e8e5c42a248 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1798,7 +1798,7 @@ impl BTreeIndex { segments: &[Arc], new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, - old_data_filter: Option, + old_data_filters: &[Option], ) -> Result { let Some(first) = segments.first() else { return Err(Error::invalid_input( @@ -1806,6 +1806,15 @@ impl BTreeIndex { )); }; + if old_data_filters.len() != segments.len() { + return Err(Error::invalid_input(format!( + "BTree merge: expected one old-data filter per source segment \ + ({} segments) but got {}", + segments.len(), + old_data_filters.len() + ))); + } + for segment in segments.iter().skip(1) { if segment.data_type != first.data_type { return Err(Error::index(format!( @@ -1827,7 +1836,7 @@ impl BTreeIndex { } let mut inputs: Vec> = Vec::with_capacity(segments.len() + 1); - for segment in segments { + for (segment, old_data_filter) in segments.iter().zip(old_data_filters) { let stream = segment.data_stream().await?; let stream = match old_data_filter.clone() { Some(filter) => filter_row_ids(stream, filter), @@ -2235,7 +2244,7 @@ impl ScalarIndex for BTreeIndex { &[Arc::new(self.clone())], new_data, dest_store, - old_data_filter, + &[old_data_filter], ) .await } diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 212e9fe9609..388f3170251 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -114,12 +114,32 @@ pub fn split_segment_coverage<'a>( (effective, deleted) } -/// Validate that every segment carries fragment coverage, split that coverage -/// into still-live and retired fragments, and build the matching [`OldIndexDataFilter`]. 
-pub async fn effective_coverage_and_filter( +/// Build one [`OldIndexDataFilter`] per segment, each derived from that +/// segment's *own* effective (still-live) and retired fragment coverage. +pub async fn build_per_segment_filters( + dataset: &Dataset, + segments: &[&IndexMetadata], +) -> Result>> { + let mut filters = Vec::with_capacity(segments.len()); + for segment in segments { + let effective = segment + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + let deleted = segment + .deleted_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + filters.push(build_old_data_filter(dataset, &effective, &deleted).await?); + } + Ok(filters) +} + +/// Validate that every segment carries fragment coverage, then return the +/// combined still-live coverage (for the merged segment's fragment bitmap) +/// together with one [`OldIndexDataFilter`] per segment. +pub async fn effective_coverage_and_filters( dataset: &Dataset, segments: &[IndexMetadata], -) -> Result<(RoaringBitmap, Option)> { +) -> Result<(RoaringBitmap, Vec>)> { for segment in segments { if segment.fragment_bitmap.is_none() { return Err(Error::invalid_input(format!( @@ -128,9 +148,10 @@ pub async fn effective_coverage_and_filter( ))); } } - let (effective, deleted) = split_segment_coverage(dataset, segments); - let old_data_filter = build_old_data_filter(dataset, &effective, &deleted).await?; - Ok((effective, old_data_filter)) + let (effective, _deleted) = split_segment_coverage(dataset, segments); + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); + let filters = build_per_segment_filters(dataset, &segment_refs).await?; + Ok((effective, filters)) } async fn load_unindexed_training_data( @@ -271,9 +292,8 @@ async fn merge_scalar_indices<'a>( load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed) .await?; let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?; - let old_data_filter = - 
build_old_data_filter(dataset.as_ref(), &effective_old_frags, &deleted_old_frags) - .await?; + let old_data_filters = + build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; match index_type { IndexType::BTree => { @@ -283,7 +303,7 @@ async fn merge_scalar_indices<'a>( selected_old_indices, new_data_stream, &new_store, - old_data_filter, + &old_data_filters, ) .await? } @@ -301,12 +321,22 @@ async fn merge_scalar_indices<'a>( selected_old_indices, new_data_stream, &new_store, - old_data_filter, + &old_data_filters, ) .await? } } _ => { + // Non-segmented scalar types only reach this branch with a single + // selected segment, so the union filter equals that segment's + // filter. Built lazily here so the segmented BTree/Bitmap paths + // above don't pay an extra row-id-sequence load they never use. + let old_data_filter = build_old_data_filter( + dataset.as_ref(), + &effective_old_frags, + &deleted_old_frags, + ) + .await?; reference_index .update(new_data_stream, &new_store, old_data_filter) .await? 
@@ -790,7 +820,7 @@ mod tests { use arrow::datatypes::{Float32Type, UInt32Type}; use arrow_array::cast::AsArray; use arrow_array::{ - FixedSizeListArray, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, + FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, }; use arrow_schema::{DataType, Field, Schema}; use futures::TryStreamExt; @@ -1984,6 +2014,217 @@ mod tests { ); } + #[tokio::test] + async fn test_optimize_btree_no_duplicate_row_addr() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("payload", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![10])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + + // Reordered source columns (payload, id) force the partial-schema + // RewriteColumns path instead of a row rewrite. 
+ let source_schema = Arc::new(Schema::new(vec![ + Field::new("payload", DataType::Int32, false), + Field::new("id", DataType::Int32, false), + ])); + let source_batch = RecordBatch::try_new( + source_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![100])), + Arc::new(Int32Array::from(vec![1])), + ], + ) + .unwrap(); + let merge_job = + MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .try_build() + .unwrap(); + let source_reader = Box::new(RecordBatchIterator::new( + [Ok(source_batch)], + source_schema.clone(), + )); + merge_job + .execute(reader_to_stream(source_reader)) + .await + .unwrap(); + + // Build a delta BTree segment over the now-unindexed fragment. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 2, + "append must create a delta segment over the rewritten fragment" + ); + + // Force the old segment + delta segment to merge. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let rows = dataset + .scan() + .filter("id = 1") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 1, "id = 1 must return exactly one row after merge"); + } + + #[tokio::test] + async fn test_optimize_bitmap_no_stale_postings() { + async fn query_count(dataset: &Dataset, value: &str) -> usize { + dataset + .scan() + .filter(&format!("cat = '{}'", value)) + .unwrap() + .project(&["cat"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("key", DataType::Int32, false), + Field::new("cat", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + + // A scalar index on the join key forces merge_insert down the in-place + // RewriteColumns path, keeping the fragment live. + dataset + .create_index( + &["key"], + IndexType::BTree, + Some("key_idx".into()), + &ScalarIndexParams::for_builtin(BuiltinIndexType::BTree), + true, + ) + .await + .unwrap(); + dataset + .create_index( + &["cat"], + IndexType::Bitmap, + Some("cat_idx".into()), + &ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap), + true, + ) + .await + .unwrap(); + + // Reordered source columns (cat, key) force the in-place RewriteColumns + // path; the indexed `cat` value changes 'a' -> 'b' on the same row, + // pruning the cat index's coverage of the still-live fragment. 
+ let source_schema = Arc::new(Schema::new(vec![ + Field::new("cat", DataType::Utf8, false), + Field::new("key", DataType::Int32, false), + ])); + let source_batch = RecordBatch::try_new( + source_schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["b"])), + Arc::new(Int32Array::from(vec![1])), + ], + ) + .unwrap(); + let merge_job = + MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["key".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .try_build() + .unwrap(); + let source_reader = Box::new(RecordBatchIterator::new( + [Ok(source_batch)], + source_schema.clone(), + )); + merge_job + .execute(reader_to_stream(source_reader)) + .await + .unwrap(); + + let cat_only = || OptimizeOptions::append().index_names(vec!["cat_idx".to_string()]); + + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset.optimize_indices(&cat_only()).await.unwrap(); + assert_eq!( + dataset.load_indices_by_name("cat_idx").await.unwrap().len(), + 2, + "append must create a delta segment over the rewritten fragment" + ); + dataset + .optimize_indices(&OptimizeOptions::merge(2).index_names(vec!["cat_idx".to_string()])) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + assert_eq!( + query_count(&dataset, "a").await, + 0, + "stale 'a' posting must be filtered out of the consolidated segment" + ); + assert_eq!( + query_count(&dataset, "b").await, + 1, + "the updated 'b' row must remain queryable" + ); + assert_eq!( + dataset.load_indices_by_name("cat_idx").await.unwrap().len(), + 1, + "the segments must collapse into a single consolidated segment" + ); + } + #[tokio::test] async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { async fn query_id_count(dataset: &Dataset, id: &str) -> usize { diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs index 06218118467..d5bbdcf2961 100644 --- a/rust/lance/src/index/scalar/bitmap.rs 
+++ b/rust/lance/src/index/scalar/bitmap.rs @@ -57,8 +57,8 @@ pub(in crate::index) async fn merge_segments( // Intersect each segment's stored coverage with the dataset's current // fragments so we don't claim coverage on row addresses that compaction or // pruning has already retired. - let (fragment_bitmap, old_data_filter) = - crate::index::append::effective_coverage_and_filter(dataset, &segments).await?; + let (fragment_bitmap, old_data_filters) = + crate::index::append::effective_coverage_and_filters(dataset, &segments).await?; let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?; @@ -69,7 +69,7 @@ pub(in crate::index) async fn merge_segments( &source_indices, None, &new_store, - old_data_filter, + &old_data_filters, lance_index::progress::noop_progress(), ) .await?; @@ -96,14 +96,14 @@ pub(in crate::index) async fn open_and_merge_segments( segments: &[&IndexMetadata], new_data: SendableRecordBatchStream, new_store: &LanceIndexStore, - old_data_filter: Option, + old_data_filters: &[Option], ) -> Result { let source_indices = open_bitmap_segments(dataset, field_path, segments).await?; lance_index::scalar::bitmap::merge_bitmap_indices( &source_indices, Some(new_data), new_store, - old_data_filter, + old_data_filters, lance_index::progress::noop_progress(), ) .await diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs index 081957ecdad..268048da4dd 100644 --- a/rust/lance/src/index/scalar/btree.rs +++ b/rust/lance/src/index/scalar/btree.rs @@ -63,7 +63,7 @@ pub(crate) async fn open_and_merge_segments( segments: &[&IndexMetadata], new_data: SendableRecordBatchStream, new_store: &LanceIndexStore, - old_data_filter: Option, + old_data_filters: &[Option], ) -> Result { let mut source_indices = Vec::with_capacity(segments.len()); for &segment in segments { @@ -81,7 +81,7 @@ pub(crate) async fn open_and_merge_segments( })?; 
source_indices.push(Arc::new(btree.clone())); } - BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filter).await + BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filters).await } /// Merge one caller-defined group of source BTree segments into a single @@ -120,8 +120,8 @@ pub(crate) async fn merge_segments( // Intersect each segment's stored bitmap with the dataset's current // fragments so we don't claim coverage on IDs that compaction or pruning // has already retired. - let (fragment_bitmap, old_data_filter) = - crate::index::append::effective_coverage_and_filter(dataset, &segments).await?; + let (fragment_bitmap, old_data_filters) = + crate::index::append::effective_coverage_and_filters(dataset, &segments).await?; let output_uuid = Uuid::new_v4(); let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?; @@ -135,7 +135,7 @@ pub(crate) async fn merge_segments( &segment_refs, empty_new_data, &new_store, - old_data_filter, + &old_data_filters, ) .await?; From cfc95cb81ab8ffdbd117aed410df319e7fb24487 Mon Sep 17 00:00:00 2001 From: zhangyue19921010 Date: Mon, 22 Jun 2026 20:08:16 +0800 Subject: [PATCH 3/3] merge main --- .bumpversion.toml | 2 +- .../workflows/build_linux_wheel/action.yml | 2 +- .github/workflows/build_mac_wheel/action.yml | 2 +- .../workflows/build_windows_wheel/action.yml | 2 +- .github/workflows/java-publish.yml | 139 +- .github/workflows/pypi-publish.yml | 6 +- .github/workflows/python.yml | 2 +- CONTRIBUTING.md | 2 +- Cargo.lock | 551 +- Cargo.toml | 51 +- ci/create_release_branch.sh | 12 +- ci/publish_beta.sh | 12 +- ci/release_common.sh | 6 +- docs/src/format/file/encoding.md | 7 +- docs/src/format/index/scalar/ngram.md | 8 +- docs/src/guide/blob.md | 10 + docs/src/guide/object_store.md | 205 + java/lance-jni/Cargo.lock | 463 +- java/lance-jni/Cargo.toml | 2 +- java/lance-jni/src/index.rs | 2 + java/lance-jni/src/mem_wal.rs | 37 + java/pom.xml | 2 +- 
.../java/org/lance/OpenDatasetBuilder.java | 4 +- .../main/java/org/lance/index/IndexType.java | 1 + .../lance/index/scalar/ScalarIndexParams.java | 6 +- .../java/org/lance/index/ScalarIndexTest.java | 78 + .../java/org/lance/memwal/MemWalTest.java | 29 +- .../namespace/DirectoryNamespaceTest.java | 48 + memtest/pyproject.toml | 3 +- python/Cargo.lock | 554 +- python/Cargo.toml | 10 +- python/pyproject.toml | 13 +- python/python/benchmarks/test_search.py | 12 +- python/python/lance/__init__.py | 4 +- python/python/lance/blob.py | 67 +- python/python/lance/dataset.py | 9 +- python/python/lance/indices/builder.py | 2 +- python/python/lance/lance/__init__.pyi | 23 + python/python/lance/lance/optimize.pyi | 6 +- python/python/lance/namespace.py | 48 + python/python/lance/optimize.py | 8 + python/python/lance/vector.py | 147 + python/python/tests/test_blob.py | 367 ++ python/python/tests/test_dataset.py | 49 +- python/python/tests/test_indices.py | 6 +- python/python/tests/test_mem_wal.py | 7 + python/python/tests/test_namespace_dir.py | 170 + .../tests/test_namespace_integration.py | 96 + python/python/tests/test_optimize.py | 41 + python/python/tests/test_s3_ddb.py | 52 + python/python/tests/test_scalar_index.py | 50 +- python/python/tests/test_vector.py | 37 +- python/python/tests/test_vector_index.py | 2 + python/src/dataset.rs | 182 + python/src/dataset/optimize.rs | 17 +- python/src/lib.rs | 1 + python/src/mem_wal.rs | 25 + python/src/namespace.rs | 102 +- python/uv.lock | 875 +-- rust/examples/Cargo.toml | 2 +- rust/lance-arrow/src/ipc.rs | 216 +- rust/lance-arrow/src/lib.rs | 2 + rust/lance-core/src/cache/backend.rs | 12 + rust/lance-core/src/cache/codec.rs | 517 +- rust/lance-core/src/cache/entry_io.rs | 202 + rust/lance-core/src/cache/mod.rs | 150 +- rust/lance-core/src/cache/moka.rs | 9 +- rust/lance-core/src/datatypes.rs | 1 + rust/lance-core/src/datatypes/field.rs | 60 + rust/lance-core/src/datatypes/schema.rs | 90 +- rust/lance-core/src/utils.rs | 1 + 
rust/lance-core/src/utils/io_stats.rs | 30 + rust/lance-datafusion/src/expr.rs | 108 + rust/lance-datafusion/src/logical_expr.rs | 54 + rust/lance-datagen/Cargo.toml | 1 - rust/lance-datagen/src/generator.rs | 226 +- rust/lance-encoding/src/decoder.rs | 52 +- .../src/encodings/logical/primitive.rs | 81 +- .../encodings/logical/primitive/miniblock.rs | 26 +- rust/lance-encoding/src/lib.rs | 16 + rust/lance-file/src/io.rs | 10 + rust/lance-file/src/reader.rs | 17 + rust/lance-file/src/writer.rs | 13 +- rust/lance-index/Cargo.toml | 1 + rust/lance-index/benches/rq.rs | 529 +- rust/lance-index/build.rs | 11 +- rust/lance-index/protos-cache/cache.proto | 194 + rust/lance-index/src/lib.rs | 13 + rust/lance-index/src/metrics.rs | 13 + rust/lance-index/src/scalar.rs | 49 +- rust/lance-index/src/scalar/bitmap.rs | 125 +- rust/lance-index/src/scalar/btree.rs | 470 +- rust/lance-index/src/scalar/btree/flat.rs | 139 +- rust/lance-index/src/scalar/expression.rs | 355 +- rust/lance-index/src/scalar/fmindex.rs | 245 +- .../src/scalar/inverted/builder.rs | 955 +++- .../src/scalar/inverted/cache_codec.rs | 715 ++- rust/lance-index/src/scalar/inverted/index.rs | 107 +- .../src/scalar/inverted/tokenizer.rs | 72 +- rust/lance-index/src/scalar/inverted/wand.rs | 143 +- rust/lance-index/src/scalar/label_list.rs | 118 +- rust/lance-index/src/scalar/lance_format.rs | 47 +- rust/lance-index/src/scalar/ngram.rs | 151 +- .../src/scalar/ngram/ngram_regex.rs | 673 +++ rust/lance-index/src/scalar/zonemap.rs | 46 +- rust/lance-index/src/vector.rs | 8 + rust/lance-index/src/vector/bq.rs | 3 + rust/lance-index/src/vector/bq/builder.rs | 34 +- .../src/vector/bq/dist_table_quant.rs | 935 +++ rust/lance-index/src/vector/bq/ex_dot.rs | 1078 ++++ rust/lance-index/src/vector/bq/prune.rs | 527 ++ rust/lance-index/src/vector/bq/storage.rs | 1609 ++++-- rust/lance-index/src/vector/bq/transform.rs | 21 +- .../src/vector/distributed/index_merger.rs | 28 +- rust/lance-index/src/vector/pq/storage.rs | 7 +- 
rust/lance-index/src/vector/storage.rs | 27 +- rust/lance-io/src/scheduler.rs | 153 +- rust/lance-linalg/Cargo.toml | 5 +- rust/lance-linalg/benches/hamming.rs | 52 - rust/lance-linalg/src/distance.rs | 6 +- rust/lance-linalg/src/distance/hamming.rs | 1323 ++++- rust/lance-namespace-datafusion/tests/sql.rs | 2 + rust/lance-namespace-impls/BENCHMARK.md | 73 + rust/lance-namespace-impls/Cargo.toml | 14 + .../benches/manifest_commit_sweep.sh | 146 + .../examples/manifest_bench.rs | 714 +++ rust/lance-namespace-impls/src/dir.rs | 605 +- .../lance-namespace-impls/src/dir/manifest.rs | 5023 +++++++++++------ .../src/dir/manifest_feature_flags.rs | 194 + .../lance-namespace-impls/src/rest_adapter.rs | 7 +- rust/lance-select/src/mask.rs | 15 +- rust/lance-table/src/format/index.rs | 27 +- rust/lance-table/src/io/commit.rs | 20 + .../src/io/commit/external_manifest.rs | 25 + rust/lance-tokenizer/Cargo.toml | 1 + rust/lance-tokenizer/src/stop_word_filter.rs | 80 +- .../src/stop_word_filter/stopwords.rs | 6 + rust/lance/Cargo.toml | 8 + rust/lance/benches/hamming.rs | 228 + .../benches/mem_wal/write/mem_wal_write.rs | 2 + rust/lance/benches/regex_ngram.rs | 134 + rust/lance/src/blob.rs | 99 +- rust/lance/src/dataset.rs | 54 +- rust/lance/src/dataset/blob.rs | 1285 ++++- rust/lance/src/dataset/branch_location.rs | 59 +- rust/lance/src/dataset/cleanup.rs | 773 ++- rust/lance/src/dataset/fragment.rs | 2 +- rust/lance/src/dataset/index/frag_reuse.rs | 194 + rust/lance/src/dataset/mem_wal/api.rs | 12 +- rust/lance/src/dataset/mem_wal/index.rs | 294 +- .../lance/src/dataset/mem_wal/index/pk_key.rs | 204 + .../dataset/mem_wal/memtable/batch_store.rs | 47 + .../src/dataset/mem_wal/memtable/flush.rs | 387 +- .../mem_wal/memtable/scanner/builder.rs | 8 + rust/lance/src/dataset/mem_wal/scanner.rs | 7 +- .../src/dataset/mem_wal/scanner/block_list.rs | 934 ++- .../src/dataset/mem_wal/scanner/builder.rs | 218 +- .../src/dataset/mem_wal/scanner/collector.rs | 68 + 
.../dataset/mem_wal/scanner/data_source.rs | 23 + .../lance/src/dataset/mem_wal/scanner/exec.rs | 12 +- .../mem_wal/scanner/exec/newest_pk_filter.rs | 393 ++ .../src/dataset/mem_wal/scanner/exec/pk.rs | 2 +- .../mem_wal/scanner/exec/pk_block_filter.rs | 373 ++ .../mem_wal/scanner/exec/pk_hash_filter.rs | 350 -- .../scanner/exec/within_source_dedup.rs | 432 -- .../dataset/mem_wal/scanner/flushed_cache.rs | 189 +- .../src/dataset/mem_wal/scanner/fts_search.rs | 214 +- .../src/dataset/mem_wal/scanner/planner.rs | 194 +- .../dataset/mem_wal/scanner/point_lookup.rs | 197 +- .../dataset/mem_wal/scanner/vector_search.rs | 328 +- rust/lance/src/dataset/mem_wal/util.rs | 10 + rust/lance/src/dataset/mem_wal/write.rs | 288 +- rust/lance/src/dataset/optimize.rs | 869 +++ rust/lance/src/dataset/optimize/remapping.rs | 188 +- rust/lance/src/dataset/scanner.rs | 357 +- rust/lance/src/dataset/schema_evolution.rs | 1400 ++++- rust/lance/src/dataset/tests/dataset_index.rs | 78 +- .../src/dataset/tests/dataset_versioning.rs | 71 + rust/lance/src/dataset/updater.rs | 40 +- rust/lance/src/dataset/write.rs | 147 +- rust/lance/src/dataset/write/insert.rs | 37 +- rust/lance/src/dataset/write/merge_insert.rs | 160 +- rust/lance/src/index.rs | 39 +- rust/lance/src/index/append.rs | 451 +- rust/lance/src/index/create.rs | 30 + rust/lance/src/index/scalar.rs | 2 + rust/lance/src/index/scalar/bitmap.rs | 7 +- rust/lance/src/index/scalar/btree.rs | 7 +- rust/lance/src/index/scalar_logical.rs | 21 +- rust/lance/src/index/vector.rs | 1 + rust/lance/src/index/vector/builder.rs | 2 +- rust/lance/src/index/vector/hamming.rs | 938 +++ rust/lance/src/index/vector/ivf.rs | 2 +- .../src/index/vector/ivf/partition_serde.rs | 628 ++- rust/lance/src/index/vector/ivf/v2.rs | 239 +- rust/lance/src/io/commit/external_manifest.rs | 26 + .../lance/src/io/commit/namespace_manifest.rs | 116 +- rust/lance/src/io/exec/knn.rs | 128 + rust/lance/src/io/exec/take.rs | 104 +- rust/lance/src/io/exec/utils.rs | 36 +- 
rust/lance/src/lib.rs | 2 +- rust/lance/src/session.rs | 92 +- 202 files changed, 30260 insertions(+), 8491 deletions(-) create mode 100644 rust/lance-core/src/cache/entry_io.rs create mode 100644 rust/lance-core/src/utils/io_stats.rs create mode 100644 rust/lance-index/protos-cache/cache.proto create mode 100644 rust/lance-index/src/scalar/ngram/ngram_regex.rs create mode 100644 rust/lance-index/src/vector/bq/dist_table_quant.rs create mode 100644 rust/lance-index/src/vector/bq/ex_dot.rs create mode 100644 rust/lance-index/src/vector/bq/prune.rs delete mode 100644 rust/lance-linalg/benches/hamming.rs create mode 100644 rust/lance-namespace-impls/BENCHMARK.md create mode 100644 rust/lance-namespace-impls/benches/manifest_commit_sweep.sh create mode 100644 rust/lance-namespace-impls/examples/manifest_bench.rs create mode 100644 rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs create mode 100644 rust/lance/benches/hamming.rs create mode 100644 rust/lance/benches/regex_ngram.rs create mode 100644 rust/lance/src/dataset/mem_wal/index/pk_key.rs create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs create mode 100644 rust/lance/src/index/vector/hamming.rs diff --git a/.bumpversion.toml b/.bumpversion.toml index 7d766a80aff..80668862afb 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.11" +current_version = "8.1.0-beta.0" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/.github/workflows/build_linux_wheel/action.yml b/.github/workflows/build_linux_wheel/action.yml index 9016ae67b1a..d6e6e0f1ada 100644 --- a/.github/workflows/build_linux_wheel/action.yml +++ b/.github/workflows/build_linux_wheel/action.yml @@ -3,7 +3,7 @@ name: build-linux-wheel description: "Build a manylinux wheel for lance" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/build_mac_wheel/action.yml b/.github/workflows/build_mac_wheel/action.yml index 9d45bde42aa..0cac76c49cf 100644 --- a/.github/workflows/build_mac_wheel/action.yml +++ b/.github/workflows/build_mac_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/build_windows_wheel/action.yml b/.github/workflows/build_windows_wheel/action.yml index 03b601db019..94475059c75 100644 --- a/.github/workflows/build_windows_wheel/action.yml +++ b/.github/workflows/build_windows_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml index a51cf969a87..2b22b60dc92 100644 --- a/.github/workflows/java-publish.yml +++ b/.github/workflows/java-publish.yml @@ -28,10 +28,24 @@ permissions: contents: read jobs: - linux-arm64: - name: Build on Linux Arm64 - runs-on: ubuntu-24.04-arm64-8x + build-linux: + name: Build on Linux ${{ matrix.arch }} + runs-on: ${{ matrix.runner }} timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - arch: x86-64 + runner: 
ubuntu-24.04 + docker_platform: linux/amd64 + protoc_arch: x86_64 + artifact: liblance_jni_linux_x86_64.zip + - arch: arm64 + runner: ubuntu-24.04-arm64-8x + docker_platform: linux/arm64 + protoc_arch: aarch_64 + artifact: liblance_jni_linux_arm_64.zip steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -41,9 +55,9 @@ jobs: uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - name: Check glibc version outside docker run: ldd --version - - name: Build and run in Debian 10 Arm64 container + - name: Build and run in Debian 10 container run: | - docker run --platform linux/arm64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " + docker run --platform ${{ matrix.docker_platform }} -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " set -ex # Update sources.list to use archive repositories for Debian 10 (EOL) @@ -81,7 +95,7 @@ jobs: unzip # https://github.com/databendlabs/databend/issues/8035 - PROTOC_ZIP=protoc-3.15.0-linux-aarch_64.zip + PROTOC_ZIP=protoc-3.15.0-linux-${{ matrix.protoc_arch }}.zip curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP unzip -o \$PROTOC_ZIP -d /usr/local rm -f \$PROTOC_ZIP @@ -102,101 +116,44 @@ jobs: " - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_arm_64.zip + name: ${{ matrix.artifact }} path: java/lance-jni/target/release/liblance_jni.so retention-days: 1 if-no-files-found: error - linux-x86: - name: Build on Linux x86-64 - runs-on: ubuntu-24.04 + build-macos: + name: Build on MacOS Arm64 + runs-on: warp-macos-14-arm64-6x timeout-minutes: 60 steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - - name: 
Check glibc version outside docker - run: ldd --version - - name: Build and run in Debian 10 X86-64 container - run: | - docker run --platform linux/amd64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " - - set -ex - # Update sources.list to use archive repositories for Debian 10 (EOL) - echo 'deb http://archive.debian.org/debian/ buster main' > /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian-security buster/updates main' >> /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian/ buster-updates main' >> /etc/apt/sources.list - apt-get update - - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --assume-yes \ - apt-transport-https \ - ca-certificates \ - curl \ - gpg \ - bash \ - less \ - openssl \ - libssl-dev \ - pkg-config \ - libsqlite3-dev \ - libsqlite3-0 \ - libreadline-dev \ - git \ - cmake \ - dh-autoreconf \ - clang \ - g++ \ - libc++-dev \ - libc++abi-dev \ - libprotobuf-dev \ - libncurses5-dev \ - libncursesw5-dev \ - libudev-dev \ - libhidapi-dev \ - zip \ - unzip - - # https://github.com/databendlabs/databend/issues/8035 - PROTOC_ZIP=protoc-3.15.0-linux-x86_64.zip - curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP - unzip -o \$PROTOC_ZIP -d /usr/local - rm -f \$PROTOC_ZIP - protoc --version - - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable - source \$HOME/.cargo/env - cargo --version - - cd java/lance-jni - - # https://github.com/rustls/rustls/issues/1967 - export CC=clang - export CXX=clang++ - ldd --version - - cargo build --release - " + - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 + - name: Install dependencies + run: brew install protobuf + - name: Build native lib + working-directory: java/lance-jni + run: cargo build --release - uses: 
actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_x86_64.zip - path: java/lance-jni/target/release/liblance_jni.so + name: liblance_jni_darwin_aarch64.zip + path: java/lance-jni/target/release/liblance_jni.dylib retention-days: 1 if-no-files-found: error - macos-arm64: - name: Build on MacOS Arm64 and release - runs-on: warp-macos-14-arm64-6x - timeout-minutes: 60 + publish: + name: Publish Java packages + runs-on: ubuntu-latest + timeout-minutes: 30 needs: - - linux-arm64 - - linux-x86 + - build-linux + - build-macos steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 - name: Set up Java 11 uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4 with: @@ -208,18 +165,16 @@ jobs: server-password: SONATYPE_TOKEN gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }} - - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 - - name: Install dependencies - run: | - brew install protobuf - brew install gpg - - name: Download artifact + - name: Download artifacts uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 - name: Copy native libs run: | - mkdir -p ./java/target/classes/nativelib/linux-x86-64 ./java/target/classes/nativelib/linux-aarch64 + mkdir -p ./java/target/classes/nativelib/linux-x86-64 \ + ./java/target/classes/nativelib/linux-aarch64 \ + ./java/target/classes/nativelib/darwin-aarch64 cp ./liblance_jni_linux_x86_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-x86-64/liblance_jni.so cp ./liblance_jni_linux_arm_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-aarch64/liblance_jni.so + cp ./liblance_jni_darwin_aarch64.zip/liblance_jni.dylib 
./java/target/classes/nativelib/darwin-aarch64/liblance_jni.dylib - name: Set github run: | git config --global user.email "Lance Github Runner" @@ -230,7 +185,7 @@ jobs: inputs.mode == 'dry_run' working-directory: java run: | - mvn --batch-mode -DskipTests -Drust.release.build=true package + mvn --batch-mode -DskipTests -Dskip.build.jni=true package - name: Publish with Java 11 if: | github.event_name == 'release' || @@ -240,14 +195,14 @@ jobs: echo "use-agent" >> ~/.gnupg/gpg.conf echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf export GPG_TTY=$(tty) - mvn --batch-mode -DskipTests -Drust.release.build=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh -P shade-jar + mvn --batch-mode -DskipTests -Dskip.build.jni=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh env: SONATYPE_USER: ${{ secrets.SONATYPE_USER }} SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }} report-failure: name: Report Workflow Failure runs-on: ubuntu-latest - needs: [linux-arm64, linux-x86, macos-arm64] + needs: [build-linux, build-macos, publish] if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch') permissions: contents: read diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index b2bfe284fb5..77c76d6fc69 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -35,7 +35,7 @@ jobs: name: Python Linux 3.${{ matrix.python-minor-version }} ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - platform: x86_64 manylinux: "2_17" @@ -101,7 +101,7 @@ jobs: runs-on: ${{ matrix.config.runner }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - target: aarch64-apple-darwin runner: warp-macos-14-arm64-6x @@ -152,7 +152,7 @@ jobs: runs-on: windows-latest-4x 
strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index f9bb3132b38..cce465807e3 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -97,7 +97,7 @@ jobs: timeout-minutes: 45 strategy: matrix: - python-minor-version: ["9", "13"] + python-minor-version: ["10", "13"] name: "Python Linux 3.${{ matrix.python-minor-version }} x86_64" runs-on: "ubuntu-24.04-4x" defaults: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cf332215e49..8f3ec285f31 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ If you have any questions, please join our [Discord](https://discord.gg/zMM32dvN Currently Lance is implemented in Rust and comes with a Python wrapper. So you'll want to make sure you setup both. 1. Install Rust: https://www.rust-lang.org/tools/install -2. Install Python 3.9+: https://www.python.org/downloads/ +2. Install Python 3.10+: https://www.python.org/downloads/ 3. Install protoctol buffers: https://grpc.io/docs/protoc-installation/ (make sure you have version 3.20 or higher) 4. Install commit hooks: a. 
Install pre-commit: https://pre-commit.com/#install diff --git a/Cargo.lock b/Cargo.lock index 866eb9b4b0e..11a5fb65a7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,21 +66,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514ce16346f9fc96702fd52f2ae7e383b185516ee6f556efd7c3176be8fe7bea" -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "alloca" version = "0.4.0" @@ -475,7 +460,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -486,7 +471,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1134,9 +1119,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -1178,9 +1163,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1194,27 +1179,6 @@ dependencies = [ 
"generic-array", ] -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -1261,7 +1225,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1284,9 +1248,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1315,9 +1279,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1455,7 +1419,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1944,7 +1908,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1957,7 +1921,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1979,7 +1943,7 @@ 
checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1990,7 +1954,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2026,7 +1990,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2048,7 +2011,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -2123,7 +2085,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -2241,36 +2202,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -2479,7 +2410,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2762,7 +2693,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 
2.0.118", ] [[package]] @@ -2782,7 +2713,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core 0.20.2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2809,7 +2740,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2844,7 +2775,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2969,7 +2900,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3078,7 +3009,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -3146,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3225,7 +3155,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3448,17 +3378,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", 
"wasm-bindgen", ] @@ -3485,7 +3413,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3532,9 +3460,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -3835,7 +3763,7 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] @@ -4095,12 +4023,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -4322,7 +4244,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4367,7 +4289,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4386,7 +4308,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4401,9 +4323,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = 
"03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -4458,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "all_asserts", "approx", @@ -4561,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4609,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrayref", "paste", @@ -4618,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4658,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4691,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4706,21 +4628,20 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4765,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "all_asserts", "arrow", @@ -4791,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4830,7 +4751,7 @@ dependencies = [ [[package]] name = 
"lance-geo" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -4844,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "approx", "arc-swap", @@ -4906,6 +4827,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "rstest", "serde", @@ -4920,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4968,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "approx", "arrow-array", @@ -4983,11 +4905,12 @@ dependencies = [ "num-traits", "proptest", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4999,7 +4922,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-schema", @@ -5015,7 +4938,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -5028,6 +4951,8 @@ dependencies = [ "base64 0.22.1", "bytes", "chrono", + "datafusion-common", + "datafusion-physical-plan", "futures", "hmac 0.12.1", "lance", @@ -5045,24 +4970,27 @@ dependencies = [ "rand 0.9.4", "reqwest 0.12.28", "ring", + "roaring", "rstest", "rustls-pki-types", "serde", "serde_json", "sha2 0.10.9", "tempfile", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", "wiremock", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = 
"ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -5074,7 +5002,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -5092,7 +5020,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -5138,16 +5066,16 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-testing" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-schema", @@ -5160,19 +5088,20 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "icu_segmenter", "jieba-rs", "lindera", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] [[package]] name = "lance-tools" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "clap", "lance-core", @@ -5192,12 +5121,6 @@ dependencies = [ "spin 0.9.8", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -5520,9 +5443,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -5605,7 +5528,7 @@ dependencies = [ "cfg-if 1.0.4", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5647,7 +5570,7 @@ 
checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5679,7 +5602,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5872,7 +5795,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6292,9 +6215,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.80" +version = "0.10.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +checksum = "77823a27f0babb03091cb9ed9ef80af3b39dbc82f97e8fa530374b7dafd87a45" dependencies = [ "bitflags 2.13.0", "cfg-if 1.0.4", @@ -6312,7 +6235,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6323,9 +6246,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "openssl-sys" -version = "0.9.116" +version = "0.9.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +checksum = "b47e7e6bb2c38cd930d25a23b40fa52e068c10e85f3e03a7f5ba5aaca5713695" dependencies = [ "cc", "libc", @@ -6435,26 +6358,19 @@ dependencies = [ "arrow-schema", "arrow-select", "base64 0.22.1", - "brotli", "bytes", "chrono", - "flate2", "futures", "half", "hashbrown 0.17.1", - "lz4_flex", "num-bigint", "num-integer", "num-traits", - "object_store", "paste", "seq-macro", - "simdutf8", - "snap", "thrift", "tokio", "twox-hash", - "zstd", ] [[package]] @@ -6639,7 +6555,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + 
"syn 2.0.118", ] [[package]] @@ -6832,7 +6748,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6897,7 +6813,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] @@ -6911,7 +6827,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6949,7 +6865,7 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7116,7 +7032,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -7192,19 +7108,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -7294,7 +7197,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7545,7 +7448,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] @@ -7652,7 +7555,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7728,7 +7631,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 
2.0.117", + "syn 2.0.118", "unicode-ident", ] @@ -7965,7 +7868,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8057,7 +7960,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8068,7 +7971,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8103,7 +8006,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8115,7 +8018,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8159,7 +8062,7 @@ dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8210,7 +8113,7 @@ checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8349,9 +8252,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -8371,15 +8274,9 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = "0.6.4" @@ -8468,7 +8365,7 @@ checksum = 
"a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8505,6 +8402,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "str_stack" version = "0.1.1" @@ -8551,7 +8457,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8563,7 +8469,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8586,7 +8492,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -8639,9 +8545,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -8665,7 +8571,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8722,7 +8628,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -8753,7 +8659,7 @@ checksum = "c26ef8b00e4d382e59f6a8ddb3cd790b3a5bb29f21a358a9a69ea2f29f13f27b" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.117", + "syn 2.0.118", ] [[package]] @@ -8762,7 +8668,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "944ad38adcbb71eaa682c56bceeb079e4ca82b4b3edc2a0fde5cb297b77dac8d" dependencies = [ - "syn 2.0.117", + "syn 2.0.118", "test-log-core", ] @@ -8792,7 +8698,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8803,7 +8709,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8969,7 +8875,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9201,7 +9107,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9339,7 +9245,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -9357,7 +9263,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 2.0.118", "typify-impl", ] @@ -9415,12 +9321,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unicode_categories" version = "0.1.1" @@ -9501,7 +9401,7 @@ version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + 
"getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -9576,20 +9476,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -9603,9 +9494,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -9616,9 +9507,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -9626,9 +9517,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -9636,48 +9527,26 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -9704,23 +9573,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.13.0", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" 
dependencies = [ "js-sys", "wasm-bindgen", @@ -9738,9 +9595,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] @@ -9751,14 +9608,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -9860,7 +9717,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9871,7 +9728,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -10125,100 +9982,12 @@ dependencies = [ "url", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.13.0", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -10313,7 +10082,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -10443,7 +10212,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] @@ -10464,7 +10233,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -10484,15 +10253,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -10526,15 +10295,9 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 1996e2a2d57..f902f10496b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.11", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.11", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.11", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.11", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.11", path = 
"./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.11", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.11", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.11", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.11", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.11", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.11", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.11", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.1.0-beta.0", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.1.0-beta.0", path = "./rust/lance-arrow" } +lance-core = { version = "=8.1.0-beta.0", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.1.0-beta.0", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.1.0-beta.0", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.1.0-beta.0", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.1.0-beta.0", path = "./rust/lance-encoding" } +lance-file = { version = "=8.1.0-beta.0", path = "./rust/lance-file" } +lance-geo = { version = "=8.1.0-beta.0", path = "./rust/lance-geo" } +lance-index = { version = "=8.1.0-beta.0", path = "./rust/lance-index" } +lance-io = { version = "=8.1.0-beta.0", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.1.0-beta.0", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } 
-lance-namespace-reqwest-client = "0.8.4" -lance-select = { version = "=8.0.0-beta.11", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.11", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.11", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.11", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.11", path = "./rust/lance-testing" } +lance-namespace-reqwest-client = "0.8.6" +lance-select = { version = "=8.1.0-beta.0", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.1.0-beta.0", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.1.0-beta.0", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.1.0-beta.0", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.1.0-beta.0", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.11", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.1.0-beta.0", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -133,17 +133,17 @@ datafusion = { version = "53.0.0", default-features = false, features = [ "unicode_expressions", ] } datafusion-common = "53.0.0" -datafusion-functions = { version = "53.0.0", features = ["regex_expressions"] } +datafusion-functions = { version = "53.0.0", default-features = false, features = ["regex_expressions"] } datafusion-sql = "53.0.0" datafusion-expr = "53.0.0" datafusion-ffi = "53.0.0" datafusion-physical-expr = "53.0.0" datafusion-physical-plan = "53.0.0" -datafusion-substrait = "53.0.0" +datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { 
version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.11", path = "./rust/compression/fsst" } +fsst = { version = "=8.1.0-beta.0", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" @@ -180,6 +180,7 @@ rand_distr = { version = "0.5.1" } rand_xoshiro = "0.7.0" rangemap = { version = "1.0" } rayon = "1.10" +regex-syntax = "0.8.10" roaring = "0.11.4" rstest = "0.26.1" serde = { version = "^1" } diff --git a/ci/create_release_branch.sh b/ci/create_release_branch.sh index 9c7d9d3e58a..db88f5b6b24 100755 --- a/ci/create_release_branch.sh +++ b/ci/create_release_branch.sh @@ -229,9 +229,9 @@ else bump-my-version bump -vv --new-version "${RC_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) # Commit the RC version git add -A @@ -259,9 +259,9 @@ else bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "chore: bump main to ${NEXT_VERSION} diff --git a/ci/publish_beta.sh b/ci/publish_beta.sh index f50798a52e0..06fa5c16a91 100644 --- a/ci/publish_beta.sh +++ b/ci/publish_beta.sh @@ -93,9 +93,9 @@ if [[ "${BRANCH}" == "main" ]] && [[ "${CURRENT_VERSION}" =~ -beta\.[0-9]+$ ]]; bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A 
git commit -m "chore: bump to ${NEXT_VERSION} based on breaking change detection" @@ -133,9 +133,9 @@ echo "Bumping beta version" bump-my-version bump -vv prerelease_num # Update Cargo.lock files after version bump -cargo update -(cd python && cargo update) -(cd java/lance-jni && cargo update) +cargo update --workspace +(cd python && cargo update --workspace) +(cd java/lance-jni && cargo update --workspace) # Get new version NEW_VERSION=$(grep '^version = ' Cargo.toml | head -n1 | cut -d'"' -f2) diff --git a/ci/release_common.sh b/ci/release_common.sh index cd653212aae..573202d1689 100644 --- a/ci/release_common.sh +++ b/ci/release_common.sh @@ -29,9 +29,9 @@ bump_and_commit_version() { bump-my-version bump -vv --new-version "${NEW_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "${COMMIT_MESSAGE}" diff --git a/docs/src/format/file/encoding.md b/docs/src/format/file/encoding.md index a3d99ef39cb..4ca053d4fa6 100644 --- a/docs/src/format/file/encoding.md +++ b/docs/src/format/file/encoding.md @@ -683,9 +683,10 @@ the default mini-block size is negligible. You should only consider changing thi confirmed — through profiling — that mini-block read amplification is saturating your available bandwidth (for example, accessing a remote object store over a constrained network link). -The maximum number of values per mini-block can be lowered via an environment variable: +The maximum number of values per mini-block can be tuned via an environment variable: -- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`): upper bound on the number of values in a single mini-block chunk. +- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`, maximum `32768`): upper bound on the number of values in a single mini-block chunk. 
Reducing this value produces smaller mini-blocks, which reduces the amount of data fetched per read at the -cost of more mini-blocks and slightly more metadata overhead. +cost of more mini-blocks and slightly more metadata overhead. Increasing it can reduce metadata overhead and +improve throughput for highly compressible data, but it may increase random-read amplification. diff --git a/docs/src/format/index/scalar/ngram.md b/docs/src/format/index/scalar/ngram.md index bdf78474d50..d437363d264 100644 --- a/docs/src/format/index/scalar/ngram.md +++ b/docs/src/format/index/scalar/ngram.md @@ -29,4 +29,10 @@ The N-gram index provides inexact results for the following query types: | Query Type | Description | Operation | Result Type | |----------------|--------------------------|-------------------------------------------------------|-------------| -| **contains** | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost | \ No newline at end of file +| **contains** | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost | +| **regexp_like** / **regexp_match** | Regular-expression match | Derives a necessary trigram condition from the pattern (AND of intersections, OR of unions), then rechecks the true regex | AtMost | +| **LIKE** (infix) | Wildcard match such as `%foo%bar%` | Uses the literal segments of the pattern as a trigram condition, then rechecks the LIKE | AtMost | + +Patterns from which no trigram can be derived - for example `a.b`, `.*`, +case-insensitive matches, or literal runs shorter than three characters - fall +back to rechecking every row. This is always correct, just not accelerated. diff --git a/docs/src/guide/blob.md b/docs/src/guide/blob.md index b1f956a19e7..dd13fcaab34 100644 --- a/docs/src/guide/blob.md +++ b/docs/src/guide/blob.md @@ -95,6 +95,16 @@ Note: - By default, external blob URIs must map to a registered non-dataset-root base path. 
- If you need to reference external objects outside those bases, set `allow_external_blob_outside_bases=True` when writing. +- Blob v2 storage layout thresholds can be configured per column with + `blob_field(..., inline_size_threshold=..., dedicated_size_threshold=...)`. + The inline threshold controls when values move from the data file to packed + `.blob` sidecar storage. The dedicated threshold controls when values move + from packed sidecar storage to a dedicated `.blob` file. The dedicated + threshold is checked first. For existing columns, these thresholds are stored + in the dataset schema; appends that explicitly provide different threshold + metadata for the same column are rejected. +- `blob_pack_file_size_threshold` is a write option for rolling packed `.blob` + sidecar files. It does not control inline-vs-packed placement. ### Example: packed external blobs (single container file) diff --git a/docs/src/guide/object_store.md b/docs/src/guide/object_store.md index 182b93c0574..f901d2c2411 100644 --- a/docs/src/guide/object_store.md +++ b/docs/src/guide/object_store.md @@ -248,3 +248,208 @@ ds = lance.dataset( | `tos_access_key_id` | Access key ID used for TOS authentication. Optional if credentials are provided by environment. | | `tos_secret_access_key` | Secret access key used for TOS authentication. Optional if credentials are provided by environment. | | `tos_security_token` | Security token for temporary credentials. Optional. | + +## Tencent Cloud COS Configuration + +[COS (Cloud Object Storage)](https://cloud.tencent.com/product/cos) credentials can be set in environment variables prefixed +with `COS_` or `TENCENTCLOUD_` (for example, `COS_ENDPOINT`, `COS_SECRET_ID`, +`COS_SECRET_KEY`, `TENCENTCLOUD_REGION`, `TENCENTCLOUD_SECURITY_TOKEN`). 
+Alternatively, credentials can be passed as parameters to the `storage_options` +parameter; explicit `storage_options` override environment variables: + +=== "Python" + + ```python + import lance + ds = lance.dataset( + "cos://bucket/path", + storage_options={ + "cos_endpoint": "https://cos.ap-guangzhou.myqcloud.com", + "cos_secret_id": "my-secret-id", + "cos_secret_key": "my-secret-key", + } + ) + ``` + +=== "Rust" + + In this Lance distribution, `tencent` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `tencent` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["tencent", ...] }`. + - You depend on `lance-io` directly (without `lance`); `tencent` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["tencent"] }`. + +| Key | Description | +|-----|-------------| +| `cos_endpoint` | COS endpoint. Required (for example, `https://cos.ap-guangzhou.myqcloud.com`). Can also be set via the `COS_ENDPOINT` environment variable. | +| `cos_secret_id` | Secret ID used for COS authentication. Optional if credentials are provided by environment. | +| `cos_secret_key` | Secret key used for COS authentication. Optional if credentials are provided by environment. | +| `cos_enable_versioning` | Whether to enable object versioning on the bucket. Optional. | + +!!! note + + The OpenDAL `CosConfig` currently exposes a limited set of options. Additional + settings such as the security token (`TENCENTCLOUD_SECURITY_TOKEN`) and region + (`TENCENTCLOUD_REGION`) must be configured via environment variables. + +## GooseFS Configuration + +[GooseFS](https://cloud.tencent.com/product/goosefs) is a distributed caching +filesystem. Lance accesses GooseFS through its Master gRPC service. 
The URL format +is `goosefs://host:port/path`, where `host:port` is the GooseFS Master address +(default port: `9200`, may be omitted, e.g. `goosefs://10.0.0.1/path`) and +`/path` is the filesystem path within GooseFS. + +!!! note "About the dataset path" + + `/path` is just an arbitrary directory inside GooseFS — Lance does **not** + require the path to end with a `.lance` suffix. Any valid GooseFS directory + works, for example: + + - `goosefs://10.0.0.1:9200/data/my-dataset` + - `goosefs://10.0.0.1:9200/data/my-dataset.lance` + - `goosefs://10.0.0.1:9200/lance-test/lance-io` + + The `.lance` suffix used in the examples below is only a naming convention + that makes it easy to recognize a Lance dataset directory at a glance; it + has no special meaning to Lance itself. The only requirement is that the + same path is used consistently for reads and writes of a given dataset. + +=== "Python" + + ```python + import lance + + ds = lance.dataset( + "goosefs://10.0.0.1:9200/data/my-dataset.lance", + storage_options={ + "goosefs_auth_type": "simple", + "goosefs_auth_username": "lance", + }, + ) + ``` + +=== "Rust" + + In this Lance distribution, `goosefs` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `goosefs` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["goosefs", ...] }`. + - You depend on `lance-io` directly (without `lance`); `goosefs` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["goosefs"] }`. 
+ + Open the underlying `lance_io::object_store::ObjectStore` directly (mirrors + the integration test in `rust/lance-io/tests/goosefs_integration.rs`): + + ```rust + use lance_io::object_store::ObjectStore; + + let uri = "goosefs://10.0.0.1:9200/lance-test/lance-io"; + let (store, path) = ObjectStore::from_uri(uri).await?; + + // Read / write through the underlying `object_store::ObjectStore` API + store.inner.put(&path, (&b"hello"[..]).into()).await?; + let result = store.inner.get(&path).await?; + let bytes = result.bytes().await?; + ``` + + Open a Lance dataset with custom storage options: + + ```rust + use std::collections::HashMap; + use lance::dataset::DatasetBuilder; + + let mut storage_options = HashMap::new(); + storage_options.insert("goosefs_master_addr".to_string(), "10.0.0.1:9200".to_string()); + storage_options.insert("goosefs_auth_type".to_string(), "simple".to_string()); + storage_options.insert("goosefs_auth_username".to_string(), "lance".to_string()); + + let dataset = DatasetBuilder::from_uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .with_storage_options(storage_options) + .load() + .await?; + ``` + +=== "Java" + + Pass the GooseFS configuration through `ReadOptions.setStorageOptions` + when opening the dataset: + + ```java + import org.lance.Dataset; + import org.lance.ReadOptions; + + import java.util.HashMap; + import java.util.Map; + + Map<String, String> storageOptions = new HashMap<>(); + storageOptions.put("goosefs_master_addr", "10.0.0.1:9200"); + storageOptions.put("goosefs_auth_type", "simple"); + storageOptions.put("goosefs_auth_username", "lance"); + + ReadOptions options = new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .build(); + + try (Dataset dataset = Dataset.open() + .uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .readOptions(options) + .build()) { + // ... use the dataset + } + ``` + + For writes, the same `storageOptions(...)` setter is available on + `WriteDatasetBuilder` and `WriteFragmentBuilder`.
+ +The Master address can be resolved from (in priority order): + +1. The `goosefs_master_addr` storage option (supports HA: `"addr1:port,addr2:port"`). +2. The `GOOSEFS_MASTER_ADDR` environment variable. +3. The host and port from the URL authority. + +The following keys can be used either as environment variables or as keys in the +`storage_options` parameter: + +| Key | Description | +|-----|-------------| +| `goosefs_master_addr` / `GOOSEFS_MASTER_ADDR` | GooseFS Master address. Supports a single address (`host:port`) or comma-separated HA addresses (`addr1:port,addr2:port`). Optional if the address is provided in the URL. | +| `goosefs_write_type` / `GOOSEFS_WRITE_TYPE` | Write type, e.g. `MUST_CACHE`, `CACHE_THROUGH`, `THROUGH`, `ASYNC_THROUGH`. Optional. | +| `goosefs_block_size` / `GOOSEFS_BLOCK_SIZE` | GooseFS block size in bytes (this is the GooseFS-side block size, not Lance's I/O block size). Optional. | +| `goosefs_chunk_size` / `GOOSEFS_CHUNK_SIZE` | Chunk size in bytes used when reading or writing files. Optional. | +| `goosefs_auth_type` / `GOOSEFS_AUTH_TYPE` | Authentication type. Either `nosasl` or `simple` (case-insensitive; the value is passed through to OpenDAL). Optional. | +| `goosefs_auth_username` / `GOOSEFS_AUTH_USERNAME` | Username used in `simple` authentication mode. Optional. | + +!!! note "Running the GooseFS integration tests" + + The Rust integration tests for GooseFS live at + `rust/lance-io/tests/goosefs_integration.rs` and are gated behind feature + flags.
They require a reachable GooseFS cluster (configured via the + `GOOSEFS_MASTER_ADDR` and `GOOSEFS_AUTH_TYPE` environment variables) and + can be run with: + + ```bash + cargo test -p lance-io --features "goosefs goosefs-test" \ + --test goosefs_integration -- --ignored --nocapture --test-threads=1 + ``` diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index f4cfc21ec9c..ee52544ba57 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -42,21 +42,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "allocator-api2" version = "0.2.21" @@ -944,9 +929,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -988,9 +973,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1004,27 +989,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -1065,9 +1029,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1090,9 +1054,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1610,7 +1574,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1632,7 +1595,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1707,7 +1669,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -1825,36 +1786,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -2298,7 +2229,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2505,7 +2436,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -2549,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2843,17 +2773,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -2921,9 +2849,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -3444,12 +3372,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -3510,12 +3432,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - [[package]] name = "io-uring" version = "0.7.12" @@ -3701,9 +3617,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -3723,7 +3639,7 @@ dependencies = [ "jiff", "nom", "num-traits", - "ordered-float 5.3.0", + "ordered-float", "rand 0.9.4", "serde", "serde_json", @@ -3749,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3864,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = 
[ "arrayref", "paste", @@ -3873,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -3956,12 +3872,11 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4035,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4104,6 +4019,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", @@ -4116,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" 
+version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4124,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4136,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-ipc", @@ -4228,6 +4144,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ -4240,19 +4158,22 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4264,7 +4185,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4279,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4316,11 +4237,12 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "icu_segmenter", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -4333,12 +4255,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = 
"1.0.6" @@ -4571,9 +4487,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "mime" @@ -5173,15 +5089,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - [[package]] name = "ordered-float" version = "5.3.0" @@ -5245,42 +5152,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.17.1", - "lz4_flex", - "num-bigint", - "num-integer", - "num-traits", - "object_store", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - [[package]] name = "paste" version = "1.0.15" @@ -5733,7 +5604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -5800,19 +5671,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -6792,9 +6650,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -6817,12 +6675,6 @@ dependencies = [ "syn", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = "0.6.4" @@ -6916,6 +6768,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" @@ -6983,9 +6844,9 @@ checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -7066,7 +6927,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" 
dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -7130,17 +6991,6 @@ dependencies = [ "cfg-if 1.0.4", ] -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float 2.10.1", -] - [[package]] name = "time" version = "0.3.47" @@ -7616,12 +7466,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -7676,7 +7520,7 @@ version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -7736,20 +7580,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -7763,9 +7598,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = 
"0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -7776,9 +7611,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -7786,9 +7621,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7796,9 +7631,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", @@ -7809,35 +7644,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -7864,23 +7677,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7898,18 +7699,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -8310,100 +8111,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - 
"indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -8498,7 +8211,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -8669,9 +8382,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -8708,12 +8421,6 @@ dependencies = [ "syn", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index f1144423c0d..6210c5daf1d 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/lance-jni/src/index.rs b/java/lance-jni/src/index.rs index 1e533eed9fc..6cb64a05a81 100644 --- a/java/lance-jni/src/index.rs +++ b/java/lance-jni/src/index.rs @@ -173,6 +173,8 @@ fn determine_index_type<'local>( Some("ZONEMAP") } else if lower.contains("bloomfilter") { Some("BLOOM_FILTER") + } else if lower.contains("rtree") { + Some("RTREE") } else if lower.contains("ivfhnsw") { if lower.contains("sq") { Some("IVF_HNSW_SQ") diff --git a/java/lance-jni/src/mem_wal.rs b/java/lance-jni/src/mem_wal.rs index 9ba3fdd7440..20404b6a88b 100644 --- a/java/lance-jni/src/mem_wal.rs +++ b/java/lance-jni/src/mem_wal.rs @@ -27,6 +27,7 @@ use jni::sys::{jdouble, jint, jlong}; 
use lance::dataset::Dataset as LanceDataset; use lance::dataset::mem_wal::scanner::{ FlushedGeneration, LsmDataSourceCollector, LsmPointLookupPlanner, LsmVectorSearchPlanner, + write_pk_sidecar, }; use lance::dataset::mem_wal::write::{MemTableStats, WriteStatsSnapshot}; use lance::dataset::mem_wal::{ @@ -180,6 +181,42 @@ fn inner_put(env: &mut JNIEnv, this: JObject, stream_addr: jlong) -> Result<()> Ok(()) } +/// Test-support: write a primary-key dedup sidecar (`_pk_index/`) for a +/// flushed-generation dataset already staged at `gen_path`, mirroring what +/// production flush emits. Lets Java tests stage a *faithful* flushed +/// generation (dataset + sidecar); production always writes the sidecar during +/// flush, so a dataset-without-sidecar is not a state the system produces. +/// Mirrors the Python `_write_pk_sidecar` binding. +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_memwal_MemWalTest_nativeWritePkSidecar( + mut env: JNIEnv, + _class: JClass, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) { + ok_or_throw_without_return!( + env, + inner_write_pk_sidecar(&mut env, gen_path, stream_addr, pk_columns) + ); +} + +fn inner_write_pk_sidecar( + env: &mut JNIEnv, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) -> Result<()> { + let gen_path: String = env.get_string(&gen_path)?.into(); + let pk_columns = env.get_strings(&pk_columns)?; + let stream_ptr = stream_addr as *mut FFI_ArrowArrayStream; + let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; + let batches: Vec = reader.collect::>()?; + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + RT.block_on(write_pk_sidecar(&gen_path, &batches, &pk_refs))?; + Ok(()) +} + #[unsafe(no_mangle)] pub extern "system" fn Java_org_lance_memwal_ShardWriter_nativeStats<'local>( mut env: JNIEnv<'local>, diff --git a/java/pom.xml b/java/pom.xml index e5791f8155d..6306ecc63f9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 
+7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.11 + 8.1.0-beta.0 jar Lance Format Java API diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index baece0767a1..32fd5ca7635 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -216,8 +216,8 @@ private Dataset buildFromNamespaceClient() { // Call describe_table to get location and storage options DescribeTableRequest request = new DescribeTableRequest(); request.setId(tableId); - // Only set version if present - options.getVersion().ifPresent(v -> request.setVersion(Long.valueOf(v))); + // Do not set the dataset version here. Some namespace implementations only support describing + // the latest table metadata; the requested version is applied when opening the dataset below. DescribeTableResponse response = namespaceClient.describeTable(request); diff --git a/java/src/main/java/org/lance/index/IndexType.java b/java/src/main/java/org/lance/index/IndexType.java index 3a03934effd..1fff86fc7e0 100644 --- a/java/src/main/java/org/lance/index/IndexType.java +++ b/java/src/main/java/org/lance/index/IndexType.java @@ -24,6 +24,7 @@ public enum IndexType { MEM_WAL(7), ZONEMAP(8), BLOOM_FILTER(9), + RTREE(10), VECTOR(100), IVF_FLAT(101), IVF_SQ(102), diff --git a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java index 345a55f20b2..b3408e2d68d 100644 --- a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java +++ b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java @@ -31,7 +31,7 @@ private ScalarIndexParams(Builder builder) { * Create a new ScalarIndexParams with the given index type and no parameters. 
* * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @return ScalarIndexParams */ public static ScalarIndexParams create(String indexType) { @@ -42,7 +42,7 @@ public static ScalarIndexParams create(String indexType) { * Create a new ScalarIndexParams with the given index type and JSON parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @param jsonParams JSON string containing index-specific parameters * @return ScalarIndexParams */ @@ -58,7 +58,7 @@ public static class Builder { * Create a new builder for scalar index parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") */ public Builder(String indexType) { this.indexType = indexType; diff --git a/java/src/test/java/org/lance/index/ScalarIndexTest.java b/java/src/test/java/org/lance/index/ScalarIndexTest.java index b993a7e8a5f..cb090e7c955 100644 --- a/java/src/test/java/org/lance/index/ScalarIndexTest.java +++ b/java/src/test/java/org/lance/index/ScalarIndexTest.java @@ -25,14 +25,18 @@ import org.apache.arrow.c.Data; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.ipc.ArrowStreamReader; import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import 
org.apache.arrow.vector.types.pojo.Schema; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -318,4 +322,78 @@ public void testCreateZonemapIndex(@TempDir Path tempDir) throws Exception { } } } + + @Test + public void testCreateRTreeIndex(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("rtree_test").toString(); + ArrowType f64 = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + Field geometryField = + new Field( + "geometry", + new FieldType( + true, + new ArrowType.Struct(), + null, + Collections.singletonMap("ARROW:extension:name", "geoarrow.point")), + Arrays.asList(Field.notNullable("x", f64), Field.notNullable("y", f64))); + Schema schema = new Schema(Collections.singletonList(geometryField), null); + + int rowCount = 3; + try (RootAllocator allocator = new RootAllocator(); + VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + root.allocateNew(); + StructVector geometry = (StructVector) root.getVector("geometry"); + Float8Vector x = (Float8Vector) geometry.getChild("x"); + Float8Vector y = (Float8Vector) geometry.getChild("y"); + for (int i = 0; i < rowCount; i++) { + geometry.setIndexDefined(i); + x.setSafe(i, (double) i); + y.setSafe(i, i * 2.0); + } + geometry.setValueCount(rowCount); + root.setRowCount(rowCount); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator); + Dataset dataset = + Dataset.write() + .reader(reader) + .uri(datasetPath) + .allocator(allocator) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + // The point data round-trips through Lance. 
+ assertEquals(rowCount, dataset.countRows()); + try (ArrowReader scan = dataset.newScan(new ScanOptions.Builder().build()).scanBatches()) { + assertTrue(scan.loadNextBatch()); + StructVector readGeometry = + (StructVector) scan.getVectorSchemaRoot().getVector("geometry"); + assertEquals(2.0, ((Float8Vector) readGeometry.getChild("x")).get(2)); + assertEquals(4.0, ((Float8Vector) readGeometry.getChild("y")).get(2)); + } + + // Creating and listing an RTree index via the typed IndexType works end-to-end. + Index index = + dataset.createIndex( + Collections.singletonList("geometry"), + IndexType.RTREE, + Optional.of("rtree_geometry_index"), + IndexParams.builder() + .setScalarIndexParams(ScalarIndexParams.create("rtree")) + .build(), + true); + assertEquals(IndexType.RTREE, index.indexType()); + assertTrue( + dataset.listIndexes().contains("rtree_geometry_index"), + "Expected 'rtree_geometry_index' in: " + dataset.listIndexes()); + } + } + } } diff --git a/java/src/test/java/org/lance/memwal/MemWalTest.java b/java/src/test/java/org/lance/memwal/MemWalTest.java index ee26932dd59..5af3bd3f474 100644 --- a/java/src/test/java/org/lance/memwal/MemWalTest.java +++ b/java/src/test/java/org/lance/memwal/MemWalTest.java @@ -50,6 +50,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.UUID; @@ -142,6 +143,30 @@ private static Dataset writeAppendOnlyDataset( } } + /** + * Stage a faithful flushed generation at {@code genPath}: the Lance dataset plus its + * primary-key dedup sidecar ({@code _pk_index/}), mirroring what production flush emits. The LSM + * scanner's cross-generation block-list opens the sidecar, so a dataset alone (no sidecar) is not + * a state production produces. Mirrors the Python {@code _write_flushed_gen} test helper. 
+ */ + private static void writeFlushedGen( + BufferAllocator allocator, String genPath, long[] ids, String prefix) throws Exception { + writeLookupDataset(allocator, genPath, ids, prefix).close(); + try (VectorSchemaRoot root = lookupRoot(allocator, ids, prefix); + ArrowReader reader = toReader(allocator, root); + ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, stream); + nativeWritePkSidecar(genPath, stream.memoryAddress(), Collections.singletonList("id")); + } + } + + /** + * Test-support native: write the primary-key dedup sidecar for a flushed-generation dataset + * already staged at {@code genPath}. See {@link #writeFlushedGen}. + */ + private static native void nativeWritePkSidecar( + String genPath, long streamAddress, List pkColumns); + /** Read an LSM scanner fully into an {@code id -> name} map. */ private static Map readByName(ArrowReader reader) throws Exception { Map byId = new HashMap<>(); @@ -367,7 +392,7 @@ void testLsmScannerFromSnapshots(@TempDir Path tempDir) throws Exception { // Flushed generation overwrites id=2. 
String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); @@ -393,7 +418,7 @@ void testPointLookup(@TempDir Path tempDir) throws Exception { dataset.initializeMemWal(new InitializeMemWalParams()); String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); diff --git a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java index f425ddcc4f9..c622bac9fcd 100644 --- a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java +++ b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java @@ -189,6 +189,33 @@ void testNamespaceId() { "namespaceId should contain 'DirectoryNamespace', got: " + namespaceId); } + @Test + void testOpenSpecificVersionDoesNotPassVersionToDescribeTable() throws Exception { + VersionRejectingNamespace versionRejectingNamespace = + new VersionRejectingNamespace(innerNamespaceClient); + namespaceClient = versionRejectingNamespace; + List tableId = Arrays.asList("test_table"); + + namespaceClient.createTable(new CreateTableRequest().id(tableId), createTestTableData()); + namespaceClient.insertIntoTable( + new InsertIntoTableRequest().id(tableId).mode("append"), createTestTableData()); + + try (Dataset versionOne = + Dataset.open() + .allocator(allocator) + .namespaceClient(namespaceClient) + .tableId(tableId) + .readOptions(new ReadOptions.Builder().setVersion(1L).build()) + .build()) { + assertEquals(1, versionOne.version()); + assertEquals(3, 
versionOne.countRows()); + } + + assertTrue( + versionRejectingNamespace.getDescribeTableCallCount() > 0, + "Expected describeTable to be called when opening through namespace"); + } + @Test void testCreateAndListNamespaces() { // Create a namespace @@ -1439,4 +1466,25 @@ private byte[] createVectorTableData(int numRows, int dim) throws Exception { return out.toByteArray(); } } + + private static class VersionRejectingNamespace extends CustomNamespace { + private final AtomicInteger describeTableCallCount = new AtomicInteger(); + + VersionRejectingNamespace(DirectoryNamespace inner) { + super(inner); + } + + @Override + public DescribeTableResponse describeTable(DescribeTableRequest request) { + describeTableCallCount.incrementAndGet(); + assertNull( + request.getVersion(), + "Dataset version should be passed to dataset open, not describeTable"); + return super.describeTable(request); + } + + int getDescribeTableCallCount() { + return describeTableCallCount.get(); + } + } } diff --git a/memtest/pyproject.toml b/memtest/pyproject.toml index 396d7c442e0..4418d0e19c8 100644 --- a/memtest/pyproject.toml +++ b/memtest/pyproject.toml @@ -7,7 +7,7 @@ name = "lance-memtest" version = "0.1.0" description = "Memory allocation testing utilities for Python test suites" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache-2.0" } authors = [ { name = "Lance Developers" } @@ -17,7 +17,6 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", diff --git a/python/Cargo.lock b/python/Cargo.lock index 879195811cf..126714795cc 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -185,15 +185,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ar_archive_writer" 
-version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" -dependencies = [ - "object", -] - [[package]] name = "arc-swap" version = "1.9.1" @@ -517,7 +508,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -528,7 +519,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1068,9 +1059,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -1112,9 +1103,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1130,9 +1121,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.3" +version = "8.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" +checksum = "5cc91aac060a7a1e25823bdccbfb6af1875b88f17c6daac97894eed8207166b3" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1141,9 +1132,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.1" +version = "5.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" 
+checksum = "3a32acac15fe1967bc3986b2a6347dffc965602354ea6f450ad07e8bfd253583" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1195,7 +1186,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1218,9 +1209,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1232,15 +1223,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - [[package]] name = "cbc" version = "0.1.2" @@ -1252,9 +1234,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1365,7 +1347,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1430,13 +1412,9 @@ version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" dependencies = [ - "bzip2", "compression-core", "flate2", - "liblzma", "memchr", - "zstd", - "zstd-safe", ] [[package]] @@ -1764,7 +1742,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ 
-1777,7 +1755,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1788,7 +1766,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1799,7 +1777,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1826,7 +1804,6 @@ dependencies = [ "arrow-schema", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1836,7 +1813,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1853,14 +1829,11 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "datafusion-sql", - "flate2", "futures", "itertools 0.14.0", - "liblzma", "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1868,7 +1841,6 @@ dependencies = [ "tokio", "url", "uuid", - "zstd", ] [[package]] @@ -1938,7 +1910,6 @@ dependencies = [ "object_store", "parquet", "paste", - "recursive", "sqlparser", "tokio", "web-time", @@ -1962,10 +1933,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", - "async-compression", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1976,18 +1945,14 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", - "flate2", "futures", "glob", "itertools 0.14.0", - "liblzma", "log", "object_store", "rand 0.9.4", "tokio", - "tokio-util", "url", - "zstd", ] [[package]] @@ -2138,7 +2103,6 @@ dependencies = [ 
"indexmap 2.14.0", "itertools 0.14.0", "paste", - "recursive", "serde_json", "sqlparser", ] @@ -2330,7 +2294,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2348,7 +2312,6 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "log", - "recursive", "regex", "regex-syntax", ] @@ -2373,7 +2336,6 @@ dependencies = [ "parking_lot", "paste", "petgraph", - "recursive", "tokio", ] @@ -2425,7 +2387,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools 0.14.0", - "recursive", ] [[package]] @@ -2544,7 +2505,6 @@ dependencies = [ "datafusion-functions-nested", "indexmap 2.14.0", "log", - "recursive", "regex", "sqlparser", ] @@ -2608,7 +2568,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2618,7 +2578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2639,7 +2599,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2674,7 +2634,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2899,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2976,7 +2936,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies 
= [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3202,17 +3162,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3233,7 +3191,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3280,9 +3238,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -3803,12 +3761,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -3979,7 +3931,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4024,7 +3976,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4043,7 +3995,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4058,9 +4010,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -4115,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4189,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4231,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrayref", "paste", @@ -4240,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4278,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4310,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4323,21 +4275,20 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ 
-4372,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4402,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -4416,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4472,6 +4423,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", @@ -4484,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4525,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4536,11 +4488,12 @@ dependencies = [ "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4552,7 +4505,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-ipc", @@ -4560,6 +4513,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ -4572,19 +4527,22 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = 
"ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4596,7 +4554,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4611,7 +4569,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4650,13 +4608,14 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ "icu_segmenter", "jieba-rs", "lindera", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -4669,12 +4628,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -4732,12 +4685,6 @@ dependencies = [ "lexical-util", ] -[[package]] -name = "libbz2-rs-sys" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" - [[package]] name = "libc" version = "0.2.186" @@ -4754,26 +4701,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "liblzma" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" -dependencies = [ - "liblzma-sys", -] - -[[package]] -name = "liblzma-sys" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "libm" version = "0.2.16" @@ -4997,9 +4924,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version 
= "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -5096,7 +5023,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5241,7 +5168,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5272,15 +5199,6 @@ dependencies = [ "objc2-core-foundation", ] -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - [[package]] name = "object_store" version = "0.13.2" @@ -5927,7 +5845,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6034,7 +5952,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6080,7 +5998,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] @@ -6094,7 +6012,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6106,16 +6024,6 @@ dependencies = [ "prost", ] -[[package]] -name = "psm" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" -dependencies = [ - "ar_archive_writer", - "cc", -] - [[package]] name = "ptr_meta" version = "0.3.1" @@ -6133,13 +6041,14 @@ checksum = 
"7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "pylance" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" dependencies = [ + "alloc-stdlib", "arrow", "arrow-array", "arrow-cast", @@ -6228,7 +6137,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6241,7 +6150,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6393,7 +6302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -6460,19 +6369,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -6505,26 +6401,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.117", -] - [[package]] name = "redb" version = "3.1.3" @@ -6571,7 +6447,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.117", + "syn 2.0.118", ] [[package]] @@ -6920,7 +6796,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7185,7 +7061,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7277,7 +7153,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7288,7 +7164,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7323,7 +7199,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7335,7 +7211,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7379,7 +7255,7 @@ dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7533,9 +7409,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -7555,7 +7431,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7609,7 +7485,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", - "recursive", "sqlparser_derive", ] @@ -7621,7 +7496,7 @@ checksum = 
"a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7630,19 +7505,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "stacker" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" -dependencies = [ - "cc", - "cfg-if 1.0.4", - "libc", - "psm", - "windows-sys 0.61.2", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -7671,6 +7533,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" @@ -7705,7 +7576,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7717,7 +7588,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7740,7 +7611,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -7770,9 +7641,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -7796,7 +7667,7 @@ checksum = 
"728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7859,7 +7730,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -7891,7 +7762,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7902,7 +7773,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8025,7 +7896,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8247,7 +8118,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8386,7 +8257,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -8404,7 +8275,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 2.0.118", "typify-impl", ] @@ -8447,12 +8318,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -8507,7 +8372,7 @@ version = "1.23.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -8567,20 +8432,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -8594,9 +8450,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -8607,9 +8463,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -8617,9 +8473,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = 
"4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -8627,48 +8483,26 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -8695,23 +8529,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.13.0", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -8729,18 +8551,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -8842,7 +8664,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8853,7 +8675,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9075,100 +8897,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.13.0", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -9263,7 +8997,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -9387,7 +9121,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", 
"synstructure", ] @@ -9408,7 +9142,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9428,15 +9162,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -9470,7 +9204,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] diff --git a/python/Cargo.toml b/python/Cargo.toml index f7d6280644a..240c046e5ff 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.11" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -19,9 +19,13 @@ arrow-cast = "58.0.0" arrow-data = "58.0.0" arrow-schema = "58.0.0" object_store = "0.13.2" -datafusion = "53.0.0" +datafusion = { version = "53.0.0", default-features = false } datafusion-ffi = "53.0.0" datafusion-common = "53.0.0" +# Keep the Python FFI build on the working Brotli allocator resolution until +# datafusion-ffi no longer enables datafusion-proto/default. +# See https://github.com/lance-format/lance/issues/7271. 
+alloc-stdlib = "=0.2.2" async-trait = "0.1" chrono = "0.4.42" env_logger = "0.11.7" @@ -56,7 +60,7 @@ prost = "0.14.1" prost-types = "0.14.1" pyo3 = { version = "0.28", features = [ "extension-module", - "abi3-py39", + "abi3-py310", "py-clone", "chrono", ] } diff --git a/python/pyproject.toml b/python/pyproject.toml index a1e69855a0f..d863fe38517 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "pylance" dynamic = ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.0,<0.9"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.5,<0.9"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } repository = "https://github.com/lancedb/lance" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" keywords = [ "data-format", "data-science", @@ -30,7 +30,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -61,7 +60,7 @@ tests = [ # Only test tensorflow on linux for now. We will deprecate tensorflow soon. 
"tensorflow; sys_platform == 'linux'", "tqdm", - "datafusion>=53,<54; python_version >= '3.10'", + "datafusion>=53,<54", ] dev = ["ruff==0.11.2", "pyright"] benchmarks = ["pytest-benchmark"] @@ -74,7 +73,7 @@ geo = [ [dependency-groups] tests = [ "boto3==1.40.43", - "datasets==4.1.1; python_version >= '3.10'", + "datasets==4.1.1", "duckdb==1.4.0", "ml_dtypes==0.5.3", "pillow==11.3.0", @@ -82,9 +81,9 @@ tests = [ "polars[pyarrow,pandas]==1.34.0", "psutil==7.1.0", "pytest==8.4.2", - "tensorflow==2.20.0; sys_platform == 'linux' and python_version >= '3.10'", + "tensorflow==2.20.0; sys_platform == 'linux'", "tqdm==4.67.1", - "datafusion==53.0.0; python_version >= '3.10'", + "datafusion==53.0.0", ] dev = [ "maturin==1.13.3", diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py index 61076e61687..b4e33338cb1 100644 --- a/python/python/benchmarks/test_search.py +++ b/python/python/benchmarks/test_search.py @@ -78,10 +78,12 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: rows_remaining -= next_batch_length table = create_table(next_batch_length, offset) if offset == 0: - dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False) + dataset = lance.write_dataset( + table, tmp_path, data_storage_version="stable" + ) else: dataset = lance.write_dataset( - table, tmp_path, mode="append", use_legacy_format=False + table, tmp_path, mode="append", data_storage_version="stable" ) offset += next_batch_length @@ -98,7 +100,7 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: dataset.create_scalar_index("category", "BITMAP") dataset.create_scalar_index("genres", "LABEL_LIST") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_delete_dataset(data_dir): @@ -113,7 +115,7 @@ def create_delete_dataset(data_dir): dataset = lance.dataset(tmp_path) dataset.delete("filterable % 2 != 0") - return 
lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_new_rows_dataset(data_dir): @@ -129,7 +131,7 @@ def create_new_rows_dataset(data_dir): table = create_table(NEW_ROWS, offset=NUM_ROWS) dataset = lance.write_dataset(table, tmp_path, mode="append") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) class Datasets(NamedTuple): diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index f58b169a47a..be99eb05cc5 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -230,7 +230,9 @@ def dataset( "Both 'namespace_client' and 'table_id' must be provided together." ) - request = DescribeTableRequest(id=table_id, version=version) + # Resolve the latest table metadata here. The requested dataset version is + # applied by the lower-level dataset open path after namespace resolution. 
+ request = DescribeTableRequest(id=table_id, version=None) response = namespace_client.describe_table(request) uri = response.location diff --git a/python/python/lance/blob.py b/python/python/lance/blob.py index 46faf760cdd..a87c9302736 100644 --- a/python/python/lance/blob.py +++ b/python/python/lance/blob.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import ctypes import io from dataclasses import dataclass from typing import IO, Any, Iterator, Optional, Union @@ -9,6 +10,12 @@ from .lance import LanceBlobFile +_BLOB_INLINE_SIZE_THRESHOLD_META_KEY = b"lance-encoding:blob-inline-size-threshold" +_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY = ( + b"lance-encoding:blob-dedicated-size-threshold" +) +_MAX_RUST_USIZE = ctypes.c_size_t(-1).value + @dataclass(frozen=True) class Blob: @@ -190,9 +197,63 @@ def blob_array(values: list[Any]) -> BlobArray: return BlobArray.from_pylist(values) -def blob_field(name: str, *, nullable: bool = True) -> pa.Field: - """Construct an Arrow field for a Lance blob column.""" - return pa.field(name, BlobType(), nullable=nullable) +def _validate_threshold(name: str, value: Optional[int], *, allow_zero: bool) -> None: + if value is None: + return + if isinstance(value, bool) or not isinstance(value, int): + raise TypeError(f"{name} must be an int, got {type(value).__name__}") + if allow_zero: + if value < 0: + raise ValueError(f"{name} must be non-negative") + elif value <= 0: + raise ValueError(f"{name} must be positive") + if value > _MAX_RUST_USIZE: + raise OverflowError(f"{name} must fit in a Rust usize") + + +def blob_field( + name: str, + *, + nullable: bool = True, + inline_size_threshold: Optional[int] = None, + dedicated_size_threshold: Optional[int] = None, +) -> pa.Field: + """ + Construct an Arrow field for a Lance blob column. + + Parameters + ---------- + name : str + Field name. + nullable : bool, default True + Whether the blob column accepts null values. 
+ inline_size_threshold : optional, int + Maximum payload size in bytes to keep inline in the data file before + using packed blob storage. + dedicated_size_threshold : optional, int + Maximum payload size in bytes to store in packed blob storage before + using dedicated blob storage. This threshold is checked before + ``inline_size_threshold``. + """ + _validate_threshold("inline_size_threshold", inline_size_threshold, allow_zero=True) + _validate_threshold( + "dedicated_size_threshold", dedicated_size_threshold, allow_zero=False + ) + + field = pa.field(name, BlobType(), nullable=nullable) + if inline_size_threshold is None and dedicated_size_threshold is None: + return field + + metadata = dict(field.metadata or {}) + if inline_size_threshold is not None: + metadata[_BLOB_INLINE_SIZE_THRESHOLD_META_KEY] = str( + inline_size_threshold + ).encode() + if dedicated_size_threshold is not None: + metadata[_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY] = str( + dedicated_size_threshold + ).encode() + return field.with_metadata(metadata) class BlobIterator: diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index e96d9305ce5..45dc1b253d3 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -950,6 +950,9 @@ def create_branch( ds._base_store_params = self._base_store_params ds._namespace_client = self._namespace_client ds._table_id = self._table_id + ds._namespace_client_managed_versioning = ( + self._namespace_client_managed_versioning + ) ds._default_scan_options = self._default_scan_options ds._read_params = self._read_params return ds @@ -1350,7 +1353,10 @@ def data_storage_version(self) -> str: @property def has_stable_row_ids(self) -> bool: """ - Whether this dataset has stable row IDs enabled + Whether this dataset has stable row IDs enabled. + + This is based on the dataset manifest feature flag and does not depend on + whether the current version has any fragments. 
""" return self._ds.has_stable_row_ids @@ -4579,6 +4585,7 @@ def commit_batch( ds._base_store_params = base_store_params ds._namespace_client = None ds._table_id = None + ds._namespace_client_managed_versioning = False ds._default_scan_options = None ds._read_params = None return BulkCommitResult( diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index d3d61c5f8ff..6059166d6ba 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -150,7 +150,7 @@ def train_ivf( max_iters=max_iters, ) num_dims = ivf_centroids.shape[1] - ivf_centroids.shape = -1 + ivf_centroids = ivf_centroids.reshape(-1) flat_centroids_array = pa.array(ivf_centroids) centroids_array = pa.FixedSizeListArray.from_arrays( flat_centroids_array, num_dims diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 38d82738063..26ad75a27b7 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -226,6 +226,8 @@ class _Dataset: def replace_field_metadata(self, field_name: str, metadata: Dict[str, str]): ... @property def data_storage_version(self) -> str: ... + @property + def has_stable_row_ids(self) -> bool: ... def index_statistics(self, index_name: str) -> str: ... def serialized_manifest(self) -> bytes: ... def describe_indices(self) -> List[IndexDescription]: ... @@ -461,6 +463,27 @@ class _Dataset: def get_transactions( self, recent_transactions=10 ) -> List[Optional[Transaction]]: ... + def hamming_clustering_for_ivf_partition( + self, + index_name: str, + partition_id: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... + def get_ivf_partition_info(self, index_name: str) -> List[dict]: ... + def hamming_clustering_for_sample( + self, + column: str, + sample_size: Optional[int], + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... 
+ def hamming_clustering_for_range( + self, + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... class _MergeInsertBuilder: def __init__(self, dataset: _Dataset, on: str | Iterable[str]): ... diff --git a/python/python/lance/lance/optimize.pyi b/python/python/lance/lance/optimize.pyi index 9a26d23c003..c4b6b6546e6 100644 --- a/python/python/lance/lance/optimize.pyi +++ b/python/python/lance/lance/optimize.pyi @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +from typing import List, Optional from lance import LanceDataset from lance.fragment import FragmentMetadata @@ -51,5 +51,7 @@ class Compaction: def plan(dataset: "LanceDataset", options: CompactionOptions) -> CompactionPlan: ... @staticmethod def commit( - dataset: "LanceDataset", rewrites: List[RewriteResult] + dataset: "LanceDataset", + rewrites: List[RewriteResult], + options: Optional[CompactionOptions] = None, ) -> CompactionMetrics: ... 
diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index f448e5c3368..fec3a1cfb1e 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -32,6 +32,8 @@ CreateMaterializedViewResponse, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -42,6 +44,8 @@ DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, @@ -70,6 +74,8 @@ LanceNamespace, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -850,6 +856,27 @@ def update_table_tag( response_dict = self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: @@ -1420,6 +1447,27 @@ def update_table_tag( response_dict = 
self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: diff --git a/python/python/lance/optimize.py b/python/python/lance/optimize.py index 8b98308d442..3ac7547960b 100644 --- a/python/python/lance/optimize.py +++ b/python/python/lance/optimize.py @@ -57,6 +57,14 @@ class CompactionOptions(TypedDict): The batch size to use when scanning input fragments. You may want to reduce this if you are running out of memory during compaction. + The default will use the same default from ``scanner``. + """ + io_buffer_size: Optional[int] + """ + The number of bytes to allow to queue up in the I/O buffer when scanning + input fragments. Increasing this can avoid a deadlock that occurs when a + single batch of data is larger than the I/O buffer size. + The default will use the same default from ``scanner``. 
""" compaction_mode: Optional[ diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py index 34a6154a321..5ce5e8b61e5 100644 --- a/python/python/lance/vector.py +++ b/python/python/lance/vector.py @@ -749,3 +749,150 @@ def _partition_and_pq_codes_assignment() -> Iterable[pa.RecordBatch]: data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() ] return dst_dataset_uri, shuffle_buffers + + +# ============================================================================= +# Hamming Distance Clustering +# ============================================================================= + + +def hamming_clustering_for_ivf_partition( + dataset: "LanceDataset", + index_name: str, + partition_id: int, + hamming_threshold: int, +) -> pa.RecordBatchReader: + """ + Perform hamming clustering on a partition of an IVF_FLAT index. + + Loads a partition from an IVF_FLAT index on a hash column, computes + pairwise hamming distances between all hashes in the partition, + filters by threshold, and clusters the results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. + index_name : str + Name of the IVF_FLAT index on the hash column + partition_id : int + The partition ID within the IVF_FLAT index + hamming_threshold : int + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_ivf_partition( + index_name, partition_id, hamming_threshold + ) + + +def get_ivf_partition_info( + dataset: "LanceDataset", + index_name: str, +) -> List[dict]: + """ + Get partition information for an IVF_FLAT index. 
+ + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. + index_name : str + Name of the IVF_FLAT index + + Returns + ------- + list[dict] + List of partition info dicts with 'partition_id' and 'size' + """ + return dataset._ds.get_ivf_partition_info(index_name) + + +def hamming_clustering_for_sample( + dataset: "LanceDataset", + column: str, + sample_size: Optional[int] = None, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a sample of the dataset. + + Randomly samples rows from the dataset, computes pairwise hamming distances + between all hashes in the sample, filters by threshold, and clusters the + results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. + column : str + Name of the hash column (must be FixedSizeList) + sample_size : int, optional + Number of rows to sample. If None, uses all rows. + hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_sample( + column, sample_size, hamming_threshold + ) + + +def hamming_clustering_for_range( + dataset: "LanceDataset", + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a contiguous range of rows. + + Reads a contiguous range of rows from a specific fragment, computes pairwise + hamming distances between all hashes in the range, filters by threshold, + and clusters the results using union-find. 
+ + Unlike sampling, this reads sequential rows which is useful for distributed + processing where each worker handles a specific range of a fragment. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. + column : str + Name of the hash column (must be FixedSizeList) + fragment_id : int + The fragment ID to read from + start_row : int + The starting row offset within the fragment + num_rows : int + Number of rows to read from the start position + hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_range( + column, fragment_id, start_row, num_rows, hamming_threshold + ) diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py index 5a896d21c5d..fc879c9cbaa 100644 --- a/python/python/tests/test_blob.py +++ b/python/python/tests/test_blob.py @@ -45,6 +45,56 @@ def _external_blob_table(blob_path, payload=b"hello"): return pa.table({"blob": lance.blob_array([blob_path.as_uri()])}) +def _add_columns_blob_v2_values(tmp_path): + external_base = tmp_path / "external_base" + external_blob = external_base / "external_blob.bin" + external_blob.parent.mkdir(parents=True, exist_ok=True) + external_blob.write_bytes(b"external") + + payloads = [ + b"inline", + b"p" * (64 * 1024 + 1024), + b"d" * (4 * 1024 * 1024 + 1024), + b"external", + ] + values = [payloads[0], payloads[1], payloads[2], external_blob.as_uri()] + initial_bases = [DatasetBasePath(external_base.as_uri(), name="external", id=1)] + return values, payloads, initial_bases + + +def _assert_blob_v2_add_columns_result(dataset, column, payloads): + desc = dataset.to_table(columns=[column]).column(column).chunk(0) + + assert 
desc.field("kind").to_pylist() == [0, 1, 2, 3] + assert desc.field("blob_id").to_pylist()[3] == 1 + assert desc.field("blob_uri").to_pylist()[3] == "external_blob.bin" + + blobs = dataset.take_blobs(column, indices=range(len(payloads))) + assert [blob.readall() for blob in blobs] == payloads + + +def _dataset_file_set(dataset_path): + return { + path.relative_to(dataset_path) + for path in dataset_path.rglob("*") + if path.is_file() + } + + +def _write_two_fragment_blob_v2_seed_dataset(tmp_path, name): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + dataset_path = tmp_path / name + ds = lance.write_dataset( + pa.table({"id": range(8)}), + dataset_path, + data_storage_version="2.2", + initial_bases=initial_bases, + max_rows_per_file=4, + max_rows_per_group=4, + ) + return ds, dataset_path, values, payloads + + def _out_of_order_blob_selection(dataset_with_blobs, selection_kind): addresses = _blob_row_addresses(dataset_with_blobs) expected = [(addresses[4], b"quux"), (addresses[0], b"foo")] @@ -533,6 +583,160 @@ def test_blob_extension_write_inline(tmp_path): assert f.read() == b"foo" +def test_blob_field_threshold_metadata(): + field = lance.blob_field( + "blob", + inline_size_threshold=16 * 1024, + dedicated_size_threshold=2 * 1024 * 1024, + ) + + assert field.metadata[b"lance-encoding:blob-inline-size-threshold"] == b"16384" + assert field.metadata[b"lance-encoding:blob-dedicated-size-threshold"] == b"2097152" + + +@pytest.mark.parametrize( + ("kwargs", "error", "message"), + [ + pytest.param( + {"inline_size_threshold": -1}, + ValueError, + "inline_size_threshold must be non-negative", + id="negative_inline", + ), + pytest.param( + {"dedicated_size_threshold": 0}, + ValueError, + "dedicated_size_threshold must be positive", + id="zero_dedicated", + ), + pytest.param( + {"dedicated_size_threshold": -1}, + ValueError, + "dedicated_size_threshold must be positive", + id="negative_dedicated", + ), + pytest.param( + 
{"inline_size_threshold": True}, + TypeError, + "inline_size_threshold must be an int", + id="bool_inline", + ), + pytest.param( + {"dedicated_size_threshold": True}, + TypeError, + "dedicated_size_threshold must be an int", + id="bool_dedicated", + ), + pytest.param( + {"inline_size_threshold": 1.5}, + TypeError, + "inline_size_threshold must be an int", + id="float_inline", + ), + pytest.param( + {"inline_size_threshold": 2**100}, + OverflowError, + "inline_size_threshold must fit in a Rust usize", + id="overflow_inline", + ), + pytest.param( + {"dedicated_size_threshold": 2**100}, + OverflowError, + "dedicated_size_threshold must fit in a Rust usize", + id="overflow_dedicated", + ), + ], +) +def test_blob_field_rejects_invalid_thresholds(kwargs, error, message): + with pytest.raises(error, match=message): + lance.blob_field("blob", **kwargs) + + +def test_blob_extension_inline_threshold_per_column(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field("inline_blob", inline_size_threshold=4096), + lance.blob_field("packed_blob", inline_size_threshold=1024), + ] + ) + table = pa.table( + { + "inline_blob": lance.blob_array([payload]), + "packed_blob": lance.blob_array([payload]), + }, + schema=schema, + ) + ds = lance.write_dataset( + table, + tmp_path / "test_ds_v2_inline_threshold_per_column", + data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["inline_blob", "packed_blob"]) + assert desc.column("inline_blob").chunk(0).field("kind").to_pylist() == [0] + assert desc.column("packed_blob").chunk(0).field("kind").to_pylist() == [1] + + +def test_blob_extension_threshold_metadata_persists_after_reopen(tmp_path): + dataset_path = tmp_path / "test_ds_v2_threshold_metadata_persists" + schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)]) + table = pa.table({"blob": lance.blob_array([b"x"])}, schema=schema) + + lance.write_dataset(table, dataset_path, data_storage_version="2.2") + reopened = 
lance.dataset(dataset_path) + + assert ( + reopened.schema.field("blob").metadata[ + b"lance-encoding:blob-inline-size-threshold" + ] + == b"1024" + ) + + +def test_blob_extension_append_rejects_explicit_threshold_mismatch(tmp_path): + dataset_path = tmp_path / "test_ds_v2_append_threshold_mismatch" + initial_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=4096)]) + initial = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=initial_schema, + ) + lance.write_dataset(initial, dataset_path, data_storage_version="2.2") + + append_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)]) + append = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=append_schema, + ) + + with pytest.raises( + OSError, match="Cannot append data with blob threshold metadata" + ): + lance.write_dataset(append, dataset_path, mode="append") + + +def test_blob_extension_dedicated_threshold_precedes_inline_threshold(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field( + "blob", + inline_size_threshold=4096, + dedicated_size_threshold=1024, + ) + ] + ) + table = pa.table({"blob": lance.blob_array([payload])}, schema=schema) + ds = lance.write_dataset( + table, + tmp_path / "test_ds_v2_dedicated_precedes_inline", + data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["blob"]).column("blob").chunk(0) + assert desc.field("kind").to_pylist() == [2] + + def test_blob_extension_write_external(tmp_path): blob_path = tmp_path / "external_blob.bin" blob_path.write_bytes(b"hello") @@ -608,6 +812,137 @@ def test_blob_extension_write_external_ingest_rejects_reference_only_options(tmp ) +def test_blob_extension_add_columns_record_batch_reader_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_reader_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + 
+ ds.add_columns(pa.table({"blob": lance.blob_array(values)}).to_reader()) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +@pytest.mark.parametrize( + "failure_mode", + [ + pytest.param("raises_after_first_fragment", id="reader_raises_mid_stream"), + pytest.param("wrong_schema", id="reader_yields_wrong_schema"), + pytest.param("too_many_rows", id="reader_produces_too_many_rows"), + ], +) +def test_blob_extension_add_columns_record_batch_reader_failure_cleans_files( + tmp_path, + failure_mode, +): + ds, dataset_path, values, payloads = _write_two_fragment_blob_v2_seed_dataset( + tmp_path, + f"test_add_columns_reader_blob_v2_fail_cleanup_{failure_mode}", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + + schema = pa.schema([lance.blob_field("blob")]) + first_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + second_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + + if failure_mode == "raises_after_first_fragment": + match = "reader failed after first fragment" + + def failing_reader(): + yield first_fragment_batch + raise RuntimeError("reader failed after first fragment") + + elif failure_mode == "wrong_schema": + match = "field names" + + def failing_reader(): + yield first_fragment_batch + yield pa.record_batch([pa.array(range(4))], ["not_blob"]) + + else: + match = "Stream produced more values than expected for dataset" + + def failing_reader(): + yield first_fragment_batch + yield second_fragment_batch + yield pa.record_batch([lance.blob_array([payloads[0]])], schema=schema) + + with pytest.raises(OSError, match=match): + ds.add_columns(failing_reader(), reader_schema=schema) + + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_failure_cleans_files(tmp_path): + ds, dataset_path, values, _ = 
_write_two_fragment_blob_v2_seed_dataset( + tmp_path, + "test_add_columns_udf_blob_v2_fail_cleanup", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + call_count = 0 + + @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def fail_on_second_fragment(batch): + nonlocal call_count + call_count += 1 + if call_count == 2: + raise RuntimeError("udf failed after first fragment") + blob_values = [values[row.as_py() % len(values)] for row in batch["id"]] + return pa.record_batch( + [lance.blob_array(blob_values)], + ["blob"], + ) + + with pytest.raises(OSError, match="udf failed after first fragment"): + ds.add_columns(fail_on_second_fragment, read_columns=["id"], batch_size=4) + + assert call_count == 2 + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_udf_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + + @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def make_blob_column(batch): + return pa.record_batch( + [lance.blob_array([values[row.as_py()] for row in batch["id"]])], + ["blob"], + ) + + ds.add_columns(make_blob_column, read_columns=["id"]) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +def test_blob_extension_add_columns_all_nulls_blob_v2(tmp_path): + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_all_nulls_blob_v2", + data_storage_version="2.2", + ) + + ds.add_columns(lance.blob_field("blob")) + + assert ds.to_table(columns=["blob"]).column("blob").to_pylist() == [None] * 4 + assert ds.take_blobs("blob", indices=range(4)) == [] + + def 
test_blob_extension_write_fragments_external_denied_by_default(tmp_path): blob_path = tmp_path / "external_blob.bin" @@ -1125,6 +1460,38 @@ def test_read_blobs_resolves_nested_field_path(dataset_with_nested_blobs): assert [data for _, data in results] == [b"foo", b"baz"] +def test_write_nested_blob_v2_and_take_by_field_path(tmp_path): + packed = b"x" * (70 * 1024) + blob_field = lance.blob_field("blob") + info_fields = [pa.field("name", pa.string()), blob_field] + info_type = pa.struct(info_fields) + info_array = pa.StructArray.from_arrays( + [pa.array(["a", "b", "c"]), lance.blob_array([b"foo", packed, None])], + fields=info_fields, + ) + table = pa.table( + [info_array], + schema=pa.schema([pa.field("info", info_type)]), + ) + + dataset = lance.write_dataset( + table, + tmp_path / "nested_blob_v2", + data_storage_version="2.2", + ) + + desc = dataset.to_table(columns=["info.blob"]).column("info.blob").chunk(0) + assert desc.field("kind").to_pylist()[:2] == [0, 1] + + blobs = dataset.take_blobs("info.blob", indices=[0, 1]) + with blobs[0] as f: + assert f.read() == b"foo" + with blobs[1] as f: + assert f.read() == packed + + assert dataset.take_blobs("info.blob", indices=[2]) == [] + + def test_to_pandas_returns_blob_files_for_projected_nested_fields( dataset_with_nested_blobs, ): diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 4af363868e1..45866f3c4da 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path): assert dataset.to_table() == table +@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"]) +def test_write_zero_dimension_fixed_size_list( + tmp_path: Path, data_storage_version: str +): + # Zero-dimension fixed-size lists must be rejected with a clean error + # instead of a divide-by-zero panic (#5102) + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("vec", pa.list_(pa.float32(), 
0)), + ] + ) + table = pa.table({"id": [1], "vec": [[]]}, schema=schema) + with pytest.raises(OSError, match="dimension must be a positive integer"): + lance.write_dataset( + table, tmp_path / "ds.lance", data_storage_version=data_storage_version + ) + + def test_dataset_overwrite(tmp_path: Path): table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}]) base_dir = tmp_path / "test" @@ -424,16 +443,27 @@ def test_enable_stable_row_ids(tmp_path: Path): assert table_after["_rowaddr"][3].as_py() == (2 << 32) + 3 -def test_has_stable_row_ids_property(tmp_path: Path): - table = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) +@pytest.mark.parametrize("enable_stable_row_ids", [True, False]) +@pytest.mark.parametrize( + "rows", + [[{"a": 1}, {"a": 2}], []], + ids=["non_empty", "empty"], +) +def test_has_stable_row_ids_property(tmp_path: Path, enable_stable_row_ids: bool, rows): + schema = pa.schema([pa.field("a", pa.int64())]) + table = pa.Table.from_pylist(rows, schema=schema) - stable_path = tmp_path / "stable" - lance.write_dataset(table, stable_path, enable_stable_row_ids=True) - assert lance.dataset(stable_path).has_stable_row_ids is True + path = tmp_path / f"stable_row_ids_{enable_stable_row_ids}_{len(rows)}" + lance.write_dataset( + table, + path, + enable_stable_row_ids=enable_stable_row_ids, + ) + ds = lance.dataset(path) - non_stable_path = tmp_path / "non_stable" - lance.write_dataset(table, non_stable_path, enable_stable_row_ids=False) - assert lance.dataset(non_stable_path).has_stable_row_ids is False + assert ds.count_rows() == len(rows) + assert len(ds.get_fragments()) == (0 if len(rows) == 0 else 1) + assert ds.has_stable_row_ids is enable_stable_row_ids def _list_manifests(versions_dir): @@ -1742,6 +1772,7 @@ def test_commit_batch_append(): result = lance.LanceDataset.commit_batch(dataset, [txn2, txn3]) dataset = result["dataset"] assert dataset.version == 2 + assert dataset.checkout_version(1).version == 1 assert len(dataset.get_fragments()) == 3 
assert dataset.to_table() == pa.concat_tables([data1, data2, data3]) merged_txn = result["merged"] @@ -5538,6 +5569,8 @@ def test_branches(tmp_path: Path): branch1 = ds_main.create_branch("branch1") ds_main.branches.replace_metadata("branch1", {"description": "branch one"}) assert branch1.version == 1 + # The dataset returned by create_branch must be fully constructed + assert branch1.checkout_version(("main", None)).version == 1 branch1_append = pa.Table.from_pydict({"a": [7, 8], "b": [9, 10]}) branch1 = lance.write_dataset(branch1_append, branch1, mode="append") assert branch1.version == 2 diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 7f6595f2ecc..02cf64541d6 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -25,7 +25,7 @@ def make_ds(num_rows: int, rows_per_frag: int, tmpdir: pathlib.Path, dtype: str): vectors = np.random.randn(num_rows, DIMENSION).astype(dtype) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) table = pa.Table.from_arrays([vectors], names=["vectors"]) uri = str(tmpdir / "dataset") @@ -53,7 +53,7 @@ def small_rand_dataset(tmpdir, request): @pytest.fixture def mostly_null_dataset(tmpdir, request): vectors = np.random.randn(NUM_ROWS, DIMENSION).astype(np.float32) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) vectors = vectors.to_pylist() vectors = [vec if i % 10 == 0 else None for i, vec in enumerate(vectors)] @@ -219,7 +219,7 @@ def test_ivf_centroids_fragment_ids(tmpdir): ], axis=0, ) - vectors.shape = -1 + vectors = vectors.reshape(-1) table = pa.Table.from_arrays( [pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)], names=["vectors"] ) diff --git a/python/python/tests/test_mem_wal.py b/python/python/tests/test_mem_wal.py index b8c859cb637..c21e88b2416 100644 --- a/python/python/tests/test_mem_wal.py +++ 
b/python/python/tests/test_mem_wal.py @@ -60,9 +60,16 @@ def _write_flushed_gen(base_path: str, shard_id: str, gen_folder: str, data: pa. The collector resolves flushed generation paths as: {base_dataset_path}/_mem_wal/{shard_id}/{gen_folder} + + Production flush also writes a primary-key dedup sidecar (`_pk_index/`) that + the LSM scanner opens to dedup across generations; stage it here too so the + flushed generation faithfully matches what flush produces. """ + from lance.lance import _write_pk_sidecar + gen_path = os.path.join(base_path, "_mem_wal", shard_id, gen_folder) lance.write_dataset(data, gen_path, schema=_LOOKUP_SCHEMA) + _write_pk_sidecar(gen_path, data, ["id"]) def test_point_lookup_with_memtables(tmp_path): diff --git a/python/python/tests/test_namespace_dir.py b/python/python/tests/test_namespace_dir.py index 1991b82946e..fa1bc93b422 100644 --- a/python/python/tests/test_namespace_dir.py +++ b/python/python/tests/test_namespace_dir.py @@ -29,6 +29,8 @@ CountTableRowsRequest, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -37,6 +39,8 @@ CreateTableVersionResponse, DeclareTableRequest, DeclareTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, @@ -54,6 +58,8 @@ InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -71,6 +77,8 @@ InvalidInputError, NamespaceNotEmptyError, NamespaceNotFoundError, + TableBranchAlreadyExistsError, + TableBranchNotFoundError, TableNotFoundError, ) @@ -151,6 +159,21 @@ def create_table_version( ) -> CreateTableVersionResponse: return self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> 
CreateTableBranchResponse: + return self._inner.create_table_branch(request) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + return self._inner.list_table_branches(request) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + return self._inner.delete_table_branch(request) + def create_table_index( self, request: CreateTableIndexRequest ) -> CreateTableIndexResponse: @@ -564,6 +587,110 @@ def test_register_table_rejects_path_traversal(self, temp_ns_client): assert "Path traversal is not allowed" in str(exc_info.value) +class TestTableBranchOperations: + """Branch CRUD through the python bindings - mirrors the Rust branch + CRUD tests.""" + + def test_branch_crud_round_trip(self, temp_ns_client): + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "branched_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" in listed.branches + assert listed.branches["dev"].parent_version == 1 + + # Duplicate creation and deleting a missing branch surface the typed + # branch errors (codes 23 and 22), not InternalError. 
+ temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + with pytest.raises(TableBranchAlreadyExistsError): + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" not in listed.branches + with pytest.raises(TableBranchNotFoundError): + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + + def test_create_branch_from_other_branch(self, temp_ns_client): + """Forking from a non-main source branch records the right parent.""" + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "fork_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + ) + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="child", from_branch="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert listed.branches["child"].parent_branch == "dev" + + +class _ForeignCodeError(Exception): + """Not a LanceNamespaceError, but carries the same integer code as + TABLE_NOT_FOUND.""" + + code = 4 + + +class _RaisingNamespace(LanceNamespace): + """A namespace whose describe_table raises the configured exception.""" + + def __init__(self, exc: Exception): + self._exc = exc + + def namespace_id(self) -> str: + return "raising" + + def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse: + raise self._exc + + +class TestPythonNamespaceErrorMapping: + """The Rust adapter must trust the `code` attribute only on the + lance_namespace 
exception hierarchy.""" + + def test_namespace_error_identity_preserved(self): + ns = _RaisingNamespace(TableNotFoundError("no such table")) + with pytest.raises(TableNotFoundError, match="no such table"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + # Branch error codes (22/23) survive the round trip too. + ns = _RaisingNamespace(TableBranchNotFoundError("no such branch")) + with pytest.raises(TableBranchNotFoundError, match="no such branch"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + def test_foreign_code_attribute_not_trusted(self): + # The foreign exception must surface as itself, not be reinterpreted + # as a namespace error via its `code` attribute. + ns = _RaisingNamespace(_ForeignCodeError("boom")) + with pytest.raises(_ForeignCodeError, match="boom"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + class TestChildNamespaceOperations: """Tests for operations in child namespaces - mirrors Rust tests.""" @@ -979,6 +1106,49 @@ def test_external_manifest_store_invokes_namespace_apis(use_custom): ), "describe_table_version should be called once when opening version 1" +def test_dataset_namespace_open_does_not_pass_version_to_describe_table(): + """Dataset versions are applied to dataset open, not namespace describe_table.""" + + class VersionRejectingNamespace(CustomNamespace): + def __init__(self, inner: lance.namespace.DirectoryNamespace): + super().__init__(inner) + self.describe_versions = [] + + def describe_table( + self, request: DescribeTableRequest + ) -> DescribeTableResponse: + self.describe_versions.append(request.version) + assert request.version is None + return super().describe_table(request) + + with tempfile.TemporaryDirectory() as tmpdir: + inner_ns_client = lance.namespace.DirectoryNamespace(root=tmpdir) + ns_client = VersionRejectingNamespace(inner_ns_client) + table_id = ["test_table"] + + table1 = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) + ds = lance.write_dataset( + table1, namespace_client=ns_client, 
table_id=table_id, mode="create" + ) + assert ds.count_rows() == 2 + assert ds.version == 1 + + table2 = pa.Table.from_pylist([{"a": 3}]) + ds = lance.write_dataset( + table2, namespace_client=ns_client, table_id=table_id, mode="append" + ) + assert ds.count_rows() == 3 + assert ds.version == 2 + + version_one = lance.dataset( + namespace_client=ns_client, table_id=table_id, version=1 + ) + assert version_one.count_rows() == 2 + assert version_one.version == 1 + assert ns_client.describe_versions + assert all(version is None for version in ns_client.describe_versions) + + @pytest.mark.skipif( sys.platform == "win32", reason="Windows file locking prevents reliable concurrent filesystem operations", diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 4605b755816..fc08370d247 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -31,6 +31,8 @@ from lance_namespace import ( CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableRequest, CreateTableResponse, CreateTableVersionRequest, @@ -136,6 +138,11 @@ def create_table_version( ) -> CreateTableVersionResponse: return self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + return self._inner.create_table_branch(request) + def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]: return self._inner.retrieve_ops_metrics() @@ -199,6 +206,7 @@ def create_tracking_namespace( storage_options: dict, credential_expires_in_seconds: int = 60, use_custom: bool = False, + managed_versioning: bool = False, ): """Create a DirectoryNamespace with ops metrics and credential vending enabled. @@ -212,6 +220,9 @@ def create_tracking_namespace( storage_options: Storage options to pass through (credentials, endpoint, etc.) 
credential_expires_in_seconds: Interval in seconds for credential expiration use_custom: If True, wrap in CustomNamespace for testing custom implementations + managed_versioning: If True, enable the manifest catalog so table versions + are tracked by the namespace and commits route through + create_table_version Returns: Tuple of (namespace_client, inner_namespace_client) where inner is always @@ -238,6 +249,10 @@ def create_tracking_namespace( dir_props["vend_input_storage_options_refresh_interval_millis"] = str( credential_expires_in_seconds * 1000 ) + if managed_versioning: + dir_props["manifest_enabled"] = "true" + dir_props["table_version_tracking_enabled"] = "true" + dir_props["table_version_storage_enabled"] = "true" inner_ns_client = DirectoryNamespace(**dir_props) ns_client = _wrap_if_custom(inner_ns_client, use_custom) @@ -558,6 +573,87 @@ def test_namespace_write_overwrite_mode(s3_bucket: str, use_custom: bool): assert get_describe_call_count(inner_ns_client) == call_count_before_reads +@pytest.mark.integration +@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) +def test_namespace_managed_branches(s3_bucket: str, use_custom: bool): + """Branches on a managed-versioning table over S3. + + Branch commits must route through the catalog (create_table_version) and + leave main's chain untouched. A cross-branch checkout at an overlapping + version number must resolve the requested chain: branch version numbers + continue from the fork point, so the same number exists on both chains + with different data. 
+ """ + storage_options = copy.deepcopy(CONFIG) + + ns_client, inner_ns_client = create_tracking_namespace( + bucket_name=s3_bucket, + storage_options=storage_options, + credential_expires_in_seconds=3600, + use_custom=use_custom, + managed_versioning=True, + ) + + table_name = uuid.uuid4().hex + table_id = ["test_ns", table_name] + + def commit_count() -> int: + return inner_ns_client.retrieve_ops_metrics().get("create_table_version", 0) + + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), + namespace_client=ns_client, + table_id=table_id, + mode="create", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert commit_count() >= 2 + + ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev", from_version=2) + ) + + dev = ds.checkout_version(("dev", None)) + commits_before_branch_append = commit_count() + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + assert commit_count() == commits_before_branch_append + 1 + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + + # Diverge main to the same version number as dev's tip. 
+ ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + fresh = lance.dataset( + namespace_client=ns_client, + table_id=table_id, + storage_options=storage_options, + ) + assert sorted(fresh.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration @pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) def test_namespace_distributed_write(s3_bucket: str, use_custom: bool): diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index ccd889db116..049ce2cc3a5 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -324,6 +324,47 @@ def test_defer_index_remap(tmp_path: Path): assert any(idx.name == "__lance_frag_reuse" for idx in indices) +@pytest.mark.parametrize("use_commit_options", [True, False]) +def test_defer_index_remap_via_commit_options(tmp_path: Path, use_commit_options: bool): + """Compaction.commit respects defer_index_remap passed in options. + + When options={"defer_index_remap": True} is supplied to Compaction.commit + the __lance_frag_reuse system index must appear in describe_indices(). + When the option is omitted (default) no such system index is written. 
+ """ + base_dir = tmp_path / f"dataset_commit_opts_{use_commit_options}" + data = pa.table({"i": range(6_000), "val": range(6_000)}) + dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000) + dataset.create_scalar_index("i", "BTREE") + dataset.delete("i < 500") + + plan = Compaction.plan( + dataset, + options=dict(target_rows_per_fragment=2_000, num_threads=1), + ) + rewrites = [task.execute(dataset) for task in plan.tasks] + + if use_commit_options: + Compaction.commit(dataset, rewrites, options={"defer_index_remap": True}) + else: + Compaction.commit(dataset, rewrites) + + dataset = lance.dataset(base_dir) + indices = dataset.describe_indices() + has_frag_reuse = any(idx.name == "__lance_frag_reuse" for idx in indices) + + if use_commit_options: + assert has_frag_reuse, ( + "expected __lance_frag_reuse system index when defer_index_remap=True " + "is passed to Compaction.commit" + ) + else: + assert not has_frag_reuse, ( + "did not expect __lance_frag_reuse system index when options is omitted " + "from Compaction.commit" + ) + + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path): """describe_indices() and list_indices() must agree on the index_type diff --git a/python/python/tests/test_s3_ddb.py b/python/python/tests/test_s3_ddb.py index b9c9e4be6c0..dc9744115e2 100644 --- a/python/python/tests/test_s3_ddb.py +++ b/python/python/tests/test_s3_ddb.py @@ -212,6 +212,58 @@ def writh_dataset_with_start_barrier(): assert lance.dataset(table_dir).count_rows() == expected_version * 2 +@pytest.mark.integration +def test_s3_ddb_branches(s3_bucket: str, ddb_table: str): + """Branches on a table committed through the DynamoDB external manifest + store. + + The DDB store keys version chains by base uri, so each branch chain must + get its own entries via its branch-qualified path. 
Both chains are given + the same version number with diverged data so a wrong-chain resolution + cannot pass silently. + """ + storage_options = copy.deepcopy(CONFIG) + table_name = uuid.uuid4().hex + table_dir = f"s3+ddb://{s3_bucket}/{table_name}?ddbTableName={ddb_table}" + + # main: v1 (a=1), v2 (a=2) + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), table_dir, storage_options=storage_options + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + # Fork "dev" at v2 and commit on it, then diverge main to the same + # version number. + dev = ds.create_branch("dev", 2) + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + # Cross-branch checkout at the overlapping version number resolves each + # chain's own data. 
+ on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration def test_s3_unsafe(s3_bucket: str): storage_options = copy.deepcopy(CONFIG) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 7ddfbbc0dc8..b6e882633f5 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -648,7 +648,10 @@ def make_fts_search(ds): assert "ScalarIndexQuery" in plan assert "MaterializeIndex" not in plan assert "FlatMatchQuery" in plan - assert "LanceScan" in plan + # Flat FTS now reads via FilteredReadExec (prints as `LanceRead`) so the + # BTree on `id` pushes into the unindexed-fragment scan too. + assert "LanceRead" in plan + assert "LanceScan" not in plan assert make_fts_search(ds).to_table().num_rows == 12 # Update vector index but NOT scalar index @@ -868,6 +871,51 @@ def test_fts_custom_stop_words(tmp_path): assert len(results["_rowid"].to_pylist()) == 1 +def test_fts_stop_words_respect_language_for_simple_tokenizer(tmp_path): + data = pa.table({"text": ["the lance data", "的 lance data"]}) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="simple", + stem=False, + ) + + results = ds.to_table(full_text_query="the", with_row_id=True) + assert results.num_rows == 0 + + results = ds.to_table(full_text_query="的", with_row_id=True) + assert results["text"].to_pylist() == ["的 lance data"] + + +def test_fts_icu_stop_words_are_all_or_none(tmp_path): + data = pa.table({"text": ["the 的 lance data", "useful data"]}) + ds = lance.write_dataset(data, tmp_path / "enabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=True, + ) + + assert 
ds.to_table(full_text_query="the", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="lance", with_row_id=True).num_rows == 1 + + ds = lance.write_dataset(data, tmp_path / "disabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=False, + ) + + assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 1 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 1 + + def test_rowid_order(dataset): dataset.create_scalar_index("doc", index_type="INVERTED", with_position=False) results = dataset.scanner( diff --git a/python/python/tests/test_vector.py b/python/python/tests/test_vector.py index c02c8312f88..4ea4e7d425e 100644 --- a/python/python/tests/test_vector.py +++ b/python/python/tests/test_vector.py @@ -5,7 +5,7 @@ import numpy as np import pyarrow as pa import pytest -from lance.vector import vec_to_table +from lance.vector import hamming_clustering_for_sample, vec_to_table def test_dict(): @@ -147,3 +147,38 @@ def test_binary_vectors_invalid_metric(tmp_path): "metric": "l2", } ).to_table() + + +def _hash_table(hashes): + """Build a table with a ``hash`` column of FixedSizeList. + + ``hashes`` is a list of 8-byte sequences, one per row. + """ + flat = [byte for row in hashes for byte in row] + values = pa.FixedSizeListArray.from_arrays( + pa.array(flat, type=pa.uint8()), list_size=8 + ) + return pa.Table.from_arrays([values], names=["hash"]) + + +def test_hamming_clustering_for_sample(tmp_path): + hash_a = [0, 0, 0, 0, 0, 0, 0, 0] + hash_b = [255, 0, 0, 0, 0, 0, 0, 0] # 8 bits from hash_a + hash_c = [1, 2, 3, 4, 5, 6, 7, 8] # far from both + # Rows 0,1,2 share hash_a; rows 3,4 share hash_b; row 5 is unique. 
+ table = _hash_table([hash_a, hash_a, hash_a, hash_b, hash_b, hash_c]) + dataset = lance.write_dataset(table, tmp_path / "hashes") + + # threshold 0 => only exact-match hashes cluster together. Full scan + # (sample_size=None) yields deterministic row ids 0..5. + result = hamming_clustering_for_sample(dataset, "hash", None, 0).read_all() + + clusters = { + rep: sorted(dups) + for rep, dups in zip( + result["representative"].to_pylist(), + result["duplicates"].to_pylist(), + ) + } + # Singleton row 5 is not emitted as a cluster. + assert clusters == {0: [1, 2], 3: [4]} diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 292b8079706..4e3addfedb8 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1772,6 +1772,8 @@ def test_index_cast_centroids(tmp_path): values = pa.array([x for arr in centroids for x in arr], pa.float32()) centroids = pa.FixedSizeListArray.from_arrays(values, 128) + # Cast invalidates the attached index; drop it first per the new contract. + dataset.drop_index(index_name) dataset.alter_columns(dict(path="vector", data_type=pa.list_(pa.float16(), 128))) # centroids are f32, but the column is now f16 diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 8bfa81aeae4..31eaa96a654 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3428,6 +3428,188 @@ impl Dataset { self.ds.clone(), )) } + + /// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. + /// + /// This function loads a specific partition from an IVF_FLAT index on a hash column, + /// computes pairwise hamming distances between all hashes in the partition, + /// filters by threshold, and clusters the results using union-find. 
+ /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index on the hash column + /// partition_id : int + /// The partition ID within the IVF_FLAT index + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (index_name, partition_id, hamming_threshold))] + fn hamming_clustering_for_ivf_partition( + &self, + py: Python<'_>, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_ivf_partition; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_ivf_partition( + ds, + index_name, + partition_id, + hamming_threshold, + ), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Get partition information for an IVF_FLAT index. + /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index + /// + /// Returns + /// ------- + /// List[dict] + /// List of partition info dicts with 'partition_id' and 'size' + #[pyo3(signature = (index_name))] + fn get_ivf_partition_info( + &self, + py: Python<'_>, + index_name: &str, + ) -> PyResult>> { + use lance::index::vector::hamming::get_ivf_partition_info; + + let ds = self.ds.as_ref(); + let result = rt() + .block_on(Some(py), get_ivf_partition_info(ds, index_name))? 
+ .map_err(|err| PyValueError::new_err(err.to_string()))?; + + let partitions: PyResult> = result + .iter() + .map(|p| { + let dict = PyDict::new(py); + dict.set_item("partition_id", p.partition_id)?; + dict.set_item("size", p.size)?; + Ok(dict.into()) + }) + .collect(); + + partitions + } + + /// Perform pairwise hamming distance clustering on sampled rows from a dataset. + /// + /// This function samples N rows randomly from the dataset, extracts hashes, + /// computes pairwise hamming distances, and clusters the results. + /// It's useful for benchmarking and testing without requiring an IVF index. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// sample_size : int, optional + /// Number of rows to sample (if None or >= total rows, uses all rows) + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, sample_size, hamming_threshold))] + fn hamming_clustering_for_sample( + &self, + py: Python<'_>, + column: &str, + sample_size: Option, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_sample; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_sample(ds, column, sample_size, hamming_threshold), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. + /// + /// This function reads a contiguous range of rows from a specific fragment, + /// extracts hashes, computes pairwise hamming distances, and clusters the results. 
+ /// Unlike sampling, this reads sequential rows which is useful for distributed + /// processing where each worker handles a specific range of a fragment. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// fragment_id : int + /// The fragment ID to read from + /// start_row : int + /// The starting row offset within the fragment + /// num_rows : int + /// Number of rows to read from the start position + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, fragment_id, start_row, num_rows, hamming_threshold))] + fn hamming_clustering_for_range( + &self, + py: Python<'_>, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_range; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_range( + ds, + column, + fragment_id, + start_row, + num_rows, + hamming_threshold, + ), + )? 
+ .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } } #[pyclass(name = "SqlQuery", module = "_lib", subclass, skip_from_py_object)] diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs index 321d7157b86..4bb29246f45 100644 --- a/python/src/dataset/optimize.rs +++ b/python/src/dataset/optimize.rs @@ -58,6 +58,9 @@ fn parse_compaction_options( "batch_size" => { opts.batch_size = value.extract()?; } + "io_buffer_size" => { + opts.io_buffer_size = value.extract()?; + } "compaction_mode" => { let mode_str: Option = value.extract()?; if let Some(mode_str) = mode_str { @@ -551,26 +554,34 @@ impl PyCompaction { /// new version once committed. /// rewrites : List[RewriteResult] /// The results of the compaction tasks to include in the commit. + /// options : dict, optional + /// Compaction options to apply at commit time. + /// When absent or ``None``, defaults to ``CompactionOptions::default()``. /// /// Returns /// ------- /// CompactionMetrics #[staticmethod] + #[pyo3(signature = (dataset, rewrites, options = None))] pub fn commit( dataset: Bound, rewrites: Vec, + options: Option>, ) -> PyResult { let dataset_ref = unwrap_dataset(dataset)?; let dataset = dataset_ref.borrow().clone(); + let config = dataset.ds.manifest.config.clone(); + let opts = match options { + Some(ref dict) => parse_compaction_options(dict, &config)?, + None => CompactionOptions::default(), + }; let rewrites: Vec = rewrites.into_iter().map(|r| r.0).collect(); let mut new_ds = dataset.ds.as_ref().clone(); - // TODO: pass compaction option from plan and execute time - let options: CompactionOptions = CompactionOptions::default(); let fut = commit_compaction( &mut new_ds, rewrites, Arc::new(DatasetIndexRemapperOptions::default()), - &options, + &opts, ); let metrics = rt() .block_on(None, fut)? 
diff --git a/python/src/lib.rs b/python/src/lib.rs index cf29b26c46a..3bf4eab221e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -293,6 +293,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_wrapped(wrap_pyfunction!(mem_wal::py_evaluate_sharding_spec))?; + m.add_wrapped(wrap_pyfunction!(mem_wal::py_write_pk_sidecar))?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/mem_wal.rs b/python/src/mem_wal.rs index 25127c95ea4..dc9718c0dce 100644 --- a/python/src/mem_wal.rs +++ b/python/src/mem_wal.rs @@ -51,6 +51,31 @@ pub fn py_evaluate_sharding_spec<'py>( result.to_pyarrow(py) } +/// Write a primary-key dedup sidecar (`_pk_index/`) for a flushed-generation +/// dataset already written at `gen_path`, mirroring what production flush emits. +/// +/// Test-support only: lets Python tests stage a *faithful* flushed generation +/// (dataset + sidecar). Production always writes the sidecar during flush, so a +/// dataset-without-sidecar is not a state the system otherwise produces. +#[pyfunction(name = "_write_pk_sidecar", signature = (gen_path, data, pk_columns))] +pub fn py_write_pk_sidecar( + py: Python<'_>, + gen_path: String, + data: &Bound<'_, PyAny>, + pk_columns: Vec, +) -> PyResult<()> { + let reader = ArrowArrayStreamReader::from_pyarrow_bound(data) + .map_err(|e| PyValueError::new_err(format!("Cannot read data as Arrow: {}", e)))?; + let batches: Vec = reader + .collect::>() + .map_err(|e| PyIOError::new_err(format!("Failed to read batches: {}", e)))?; + rt().block_on(Some(py), async move { + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + lance::dataset::mem_wal::scanner::write_pk_sidecar(&gen_path, &batches, &pk_refs).await + })? 
+ .map_err(|e: lance::Error| PyIOError::new_err(e.to_string())) +} + fn sharding_spec_from_py(spec: &Bound<'_, PyAny>) -> PyResult { let spec_id = get_py_value(spec, "spec_id")?.extract::()?; let fields_obj = get_py_value(spec, "fields")?; diff --git a/python/src/namespace.rs b/python/src/namespace.rs index cf5f7c41b0f..e88ff40de2c 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -392,6 +392,44 @@ impl PyDirectoryNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? 
+ .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1054,6 +1092,44 @@ impl PyRestNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1472,6 +1548,30 @@ fn get_dict_with_model_dump_class(py: Python<'_>) -> PyResult> Ok(class) } +/// Convert a Python namespace exception into a lance error, preserving the +/// namespace error identity when the exception is a `lance_namespace` +/// `LanceNamespaceError` carrying an error `code`, so callers can react to +/// e.g. TableNotFound the same way they do for native clients. 
Foreign +/// exceptions that happen to carry an integer `code` (e.g. SystemExit) must +/// not be reinterpreted, so the extraction is gated on the exception type. +fn namespace_error_from_py(method_name: &'static str, e: PyErr) -> lance_core::Error { + Python::attach(|py| { + let value = e.value(py); + let is_namespace_error = py + .import("lance_namespace.errors") + .and_then(|module| module.getattr("LanceNamespaceError")) + .and_then(|class| value.is_instance(&class)) + .unwrap_or(false); + if is_namespace_error + && let Ok(code) = value.getattr("code").and_then(|code| code.extract::()) + { + return lance_namespace::error::NamespaceError::from_code(code, value.to_string()) + .into(); + } + lance_core::Error::io(format!("Python error in {}: {}", method_name, e)) + }) +} + /// Helper to call a Python namespace method with JSON serialization. /// For methods that take a request and return a response. /// Uses DictWithModelDump to pass a dict that also has model_dump() method, @@ -1519,7 +1619,7 @@ where }) .await .map_err(|e| lance_core::Error::io(format!("Task join error for {}: {}", method_name, e)))? 
- .map_err(|e: PyErr| lance_core::Error::io(format!("Python error in {}: {}", method_name, e)))?; + .map_err(|e: PyErr| namespace_error_from_py(method_name, e))?; serde_json::from_str(&response_json).map_err(|e| { lance_core::Error::io(format!( diff --git a/python/uv.lock b/python/uv.lock index 314417f5aa1..5f1fa45d755 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,13 +1,12 @@ version = 1 revision = 3 -requires-python = ">=3.9" +requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14'", "python_full_version == '3.13.*'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] [[package]] @@ -33,15 +32,15 @@ name = "aiohttp" version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "aiohappyeyeballs", marker = "python_full_version >= '3.10'" }, - { name = "aiosignal", marker = "python_full_version >= '3.10'" }, - { name = "async-timeout", marker = "python_full_version == '3.10.*'" }, - { name = "attrs", marker = "python_full_version >= '3.10'" }, - { name = "frozenlist", marker = "python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, - { name = "yarl", marker = "python_full_version >= '3.10'" }, + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "yarl" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" } wheels = [ @@ -170,8 +169,8 @@ name = "aiosignal" version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "frozenlist", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } wheels = [ @@ -235,19 +234,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/8a/79c76ad88b16f2fac25684f7313593738f353355eb1af2307e43efd7b1ca/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:de74a2512e2e2366d4b064c498c38672bf6ddea38acec8b1999b4e66182dd001", size = 3104663, upload-time = "2025-10-13T23:11:00.582Z" }, { url = "https://files.pythonhosted.org/packages/20/66/9152feaa87f851a37c1a2bd74fb89d7e82e4c76447ee590bf8e6fff5e9d8/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:806ca8e20507675b2de68b3d009f76e898cc3c3e441c834ea5220866f68aac50", size = 2956440, upload-time = "2025-10-13T23:11:03.769Z" }, { url = "https://files.pythonhosted.org/packages/ad/66/f4179ef64d5c18fe76ec93cfbff42c0f401438ef771c6766b880044d7e13/arro3_core-0.6.5-cp313-cp313t-win_amd64.whl", hash = "sha256:8f6f0cc78877ade7ad6e678a4671b191406547e7b407bc9637436869c017ed47", size = 2845345, upload-time = "2025-10-13T23:11:07.447Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/c2/407d6bc19813fb74cc2b087ad3e959e102b29ff81e35dcc0ad0dfb5b946c/arro3_core-0.6.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:dfac7fac3c6a302399d94644d48682a19488a5b67bd1ccbdf6c560a7ffabde6d", size = 2680237, upload-time = "2025-10-13T23:11:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/d3/73/c67156794d7e9734f4cc03d2eca7e44a1cc014686e6b7663f5110f58581d/arro3_core-0.6.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fc70042e558d1cd5fbe917b58e8ef52701441e38ff30b1912858050f796a62c", size = 2386228, upload-time = "2025-10-13T23:11:14.02Z" }, - { url = "https://files.pythonhosted.org/packages/79/e8/817ee1abb0cfa7e266ef00749b144553d2bb9c4679ca932ecbca9dc7dea9/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1583b29b2ba83927a33e5435e5d9d134114c45a6360a8bb4db4beda13dab4fd8", size = 2886476, upload-time = "2025-10-13T23:11:17.579Z" }, - { url = "https://files.pythonhosted.org/packages/8e/d6/1b9beceab797c4510abfc25ef6e657e4c940d06a9615927ce506463691dd/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a170fe53f18dda4a4647fd3b8b4a9373fc11ac42c41a4b65f55d79ad531a33e", size = 2911941, upload-time = "2025-10-13T23:11:21.131Z" }, - { url = "https://files.pythonhosted.org/packages/dc/ed/4fe1fb9a24698fe6189111836d22c9582cbc92fa159b24b8664e924738dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83047b4e6e18835c91c8d12c5494e6ababc7c185c5a772d3429e8f9b0c185894", size = 3150419, upload-time = "2025-10-13T23:11:24.503Z" }, - { url = "https://files.pythonhosted.org/packages/a1/91/d6215b782fa91493f504ae13623db889beeaf0519037c28fc6744464439a/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3d4393d281d1ef18927915a11187da27287d279f99d5325bc9afb417f76084f", size = 2777891, upload-time = "2025-10-13T23:11:28.11Z" }, - { url = 
"https://files.pythonhosted.org/packages/d4/de/0aa3504e6cbf406086de49b59cb0dcb3ab11f64acbb38602143e479831dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:f0c88d8babcf51affdd69390882e2f0ecb1890a1b8a5abfc087d003e7181eb6e", size = 2519673, upload-time = "2025-10-13T23:11:31.426Z" }, - { url = "https://files.pythonhosted.org/packages/05/69/47bf9c9ab66bafc7056a41f6db9d2149639eea6417299e3fe6c01ef99b6c/arro3_core-0.6.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36424e1d62212466a5cacdc27d414e99bf0fdab1544cc2b7e5b81e41437e5970", size = 3026254, upload-time = "2025-10-13T23:11:36.199Z" }, - { url = "https://files.pythonhosted.org/packages/b1/e8/638582437ab41ba52d3c7f2a1b0a98e4a05a51e3f660985e594b4f6c18d5/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4eb4d96f7db618f100758a8b7ec1b221c8737d543073701b7ffee74bc5019d46", size = 2704582, upload-time = "2025-10-13T23:11:39.408Z" }, - { url = "https://files.pythonhosted.org/packages/aa/0a/7bc46ee799459cce72a2e15b0eb184170f26cac37eace0b813e855fbc4d8/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2cfe9b4b1dd663d256754f1aa7aae783a1cddd3eb5698892b9caf381431f0af7", size = 3155815, upload-time = "2025-10-13T23:11:43.304Z" }, - { url = "https://files.pythonhosted.org/packages/99/8a/f20eff8f4ff5bd7db9b37b70ea058b37375a930a10e03d584a7597b6b740/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a3b2621505f97eb5ce80f1c6fa8c77d18d757ab48d1f11d33a805e9ccbcd6fb6", size = 3107791, upload-time = "2025-10-13T23:11:46.735Z" }, - { url = "https://files.pythonhosted.org/packages/79/da/60c66f0cc4a6af7f54e57973190540f77b84da1218fad2a9917e17bd897b/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c1becbb96ceba0b20f3d4318dd35f3417ee9a49065813d99f52b0fa285fc569", size = 2957730, upload-time = "2025-10-13T23:11:49.875Z" }, - { url = 
"https://files.pythonhosted.org/packages/dd/8d/6e3235894196e1fd2be34e01ac2d4280dd24e6c9019e3b12603858651e91/arro3_core-0.6.5-cp39-cp39-win_amd64.whl", hash = "sha256:5459e7bd39bb9dd8c57aa06856d2bebc5c1ca782cbccab0e186c6c89530e4ca9", size = 2839298, upload-time = "2025-10-13T23:11:53.566Z" }, { url = "https://files.pythonhosted.org/packages/10/ca/b2139dbb25f9fefb9b1cdce8a73785615de6763af6a16bf6ff96a3b630f2/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:26d5b50139f1a96727fa1760b4d70393acf5ee0fba45346ad2d4f69824d3bdc2", size = 2676788, upload-time = "2025-10-13T23:11:56.965Z" }, { url = "https://files.pythonhosted.org/packages/34/a1/c68dde2944f493c8ccfcb91bf6da6d27a27c3674316dd09c9560f9e6ab1a/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b65b3d8d7f65f2f3c36002dc467380d7a31ea771132986dddc6341c5a9dc726f", size = 2382809, upload-time = "2025-10-13T23:12:00.175Z" }, { url = "https://files.pythonhosted.org/packages/c6/fc/2fb81d42a3cecd632deace97dc23ac74083d60d158106440c783bae4ff01/arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c3442a79a757ed3fbd7793de180019ae3201f04237537c2e2e3f1e3dd99b31c", size = 2882818, upload-time = "2025-10-13T23:12:03.721Z" }, @@ -314,8 +300,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/d0/3888673417202262ddd7e6361cab8e01ee2705e39643af8445e2eb276eab/botocore-1.40.43.tar.gz", hash = "sha256:d87412dc1ea785df156f412627d3417c9f9eb45601fd0846d8fe96fe3c78b630", size = 14389164, upload-time = 
"2025-10-01T19:38:16.06Z" } wheels = [ @@ -392,17 +377,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" }, { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" }, { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" }, - { url = "https://files.pythonhosted.org/packages/c2/ca/9a0983dd5c8e9733565cf3db4df2b0a2e9a82659fd8aa2a868ac6e4a991f/charset_normalizer-3.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:70bfc5f2c318afece2f5838ea5e4c3febada0be750fcf4775641052bbba14d05", size = 207520, upload-time = "2025-08-09T07:57:11.026Z" }, - { url = "https://files.pythonhosted.org/packages/39/c6/99271dc37243a4f925b09090493fb96c9333d7992c6187f5cfe5312008d2/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23b6b24d74478dc833444cbd927c338349d6ae852ba53a0d02a2de1fce45b96e", size = 147307, upload-time = "2025-08-09T07:57:12.4Z" }, - { url = "https://files.pythonhosted.org/packages/e4/69/132eab043356bba06eb333cc2cc60c6340857d0a2e4ca6dc2b51312886b3/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:34a7f768e3f985abdb42841e20e17b330ad3aaf4bb7e7aeeb73db2e70f077b99", size = 160448, upload-time = "2025-08-09T07:57:13.712Z" }, - { url = "https://files.pythonhosted.org/packages/04/9a/914d294daa4809c57667b77470533e65def9c0be1ef8b4c1183a99170e9d/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb731e5deb0c7ef82d698b0f4c5bb724633ee2a489401594c5c88b02e6cb15f7", size = 157758, upload-time = "2025-08-09T07:57:14.979Z" }, - { url = "https://files.pythonhosted.org/packages/b0/a8/6f5bcf1bcf63cb45625f7c5cadca026121ff8a6c8a3256d8d8cd59302663/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:257f26fed7d7ff59921b78244f3cd93ed2af1800ff048c33f624c87475819dd7", size = 152487, upload-time = "2025-08-09T07:57:16.332Z" }, - { url = "https://files.pythonhosted.org/packages/c4/72/d3d0e9592f4e504f9dea08b8db270821c909558c353dc3b457ed2509f2fb/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1ef99f0456d3d46a50945c98de1774da86f8e992ab5c77865ea8b8195341fc19", size = 150054, upload-time = "2025-08-09T07:57:17.576Z" }, - { url = "https://files.pythonhosted.org/packages/20/30/5f64fe3981677fe63fa987b80e6c01042eb5ff653ff7cec1b7bd9268e54e/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2c322db9c8c89009a990ef07c3bcc9f011a3269bc06782f916cd3d9eed7c9312", size = 161703, upload-time = "2025-08-09T07:57:20.012Z" }, - { url = "https://files.pythonhosted.org/packages/e1/ef/dd08b2cac9284fd59e70f7d97382c33a3d0a926e45b15fc21b3308324ffd/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:511729f456829ef86ac41ca78c63a5cb55240ed23b4b737faca0eb1abb1c41bc", size = 159096, upload-time = "2025-08-09T07:57:21.329Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/8c/dcef87cfc2b3f002a6478f38906f9040302c68aebe21468090e39cde1445/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:88ab34806dea0671532d3f82d82b85e8fc23d7b2dd12fa837978dad9bb392a34", size = 153852, upload-time = "2025-08-09T07:57:22.608Z" }, - { url = "https://files.pythonhosted.org/packages/63/86/9cbd533bd37883d467fcd1bd491b3547a3532d0fbb46de2b99feeebf185e/charset_normalizer-3.4.3-cp39-cp39-win32.whl", hash = "sha256:16a8770207946ac75703458e2c743631c79c59c5890c80011d536248f8eaa432", size = 99840, upload-time = "2025-08-09T07:57:23.883Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/7e805c8e5c46ff9729c49950acc4ee0aeb55efb8b3a56687658ad10c3216/charset_normalizer-3.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:d22dbedd33326a4a5190dd4fe9e9e693ef12160c77382d9e87919bce54f3d4ca", size = 107438, upload-time = "2025-08-09T07:57:25.287Z" }, { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" }, ] @@ -420,9 +394,9 @@ name = "datafusion" version = "53.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" } wheels = [ @@ -433,42 +407,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" }, ] -[[package]] -name = "datasets" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/cd/fe/4d2874473a753d59c83335691bd9532704f2605418a0d288a1d70fa003fc/datasets-0.0.9.zip", hash = "sha256:86d54441bab87aebb2aa3bf0853aa7fb7abed8c708f9bb08a88e86a498972010", size = 4013, upload-time = "2015-08-18T00:07:40.556Z" } - [[package]] name = "datasets" version = "4.1.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.10'" }, - { name = "huggingface-hub", marker = "python_full_version >= '3.10'" }, - { name = "multiprocess", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = 
"huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pandas", marker = "python_full_version >= '3.10'" }, - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "pyyaml", marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "xxhash", marker = "python_full_version >= '3.10'" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, ] sdist = { url = "https://files.pythonhosted.org/packages/91/a4/73f8e6ef52c535e1d20d5b2ca83bfe6de399d8b8b8a61ccc8d63d60735aa/datasets-4.1.1.tar.gz", hash = "sha256:7d8d5ba8b12861d2c44bfff9c83484ebfafff1ff553371e5901a8d3aab5450e2", size = 579324, upload-time = "2025-09-18T13:14:27.108Z" } wheels = [ @@ -514,12 +472,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/23/32/57866cf8881288b3dfb9212720221fb890daaa534dbdc6fe3fff3979ecd1/duckdb-1.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2de258a93435c977a0ec3a74ec8f60c2f215ddc73d427ee49adc4119558facd3", size = 18421289, upload-time = "2025-09-16T10:22:21.564Z" }, { url = 
"https://files.pythonhosted.org/packages/a0/83/7438fb43be451a7d4a04650aaaf662b2ff2d95895bbffe3e0e28cbe030c9/duckdb-1.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6d3659641d517dd9ed1ab66f110cdbdaa6900106f116effaf2dbedd83c38de3", size = 20426547, upload-time = "2025-09-16T10:22:23.759Z" }, { url = "https://files.pythonhosted.org/packages/21/b2/98fb89ae81611855f35984e96f648d871f3967bb3f524b51d1372d052f0c/duckdb-1.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:07fcc612ea5f0fe6032b92bcc93693034eb00e7a23eb9146576911d5326af4f7", size = 12290467, upload-time = "2025-09-16T10:22:25.923Z" }, - { url = "https://files.pythonhosted.org/packages/8d/42/0f355319b3e8ee1703d0e17378dd829db391434306621f85c110134f2763/duckdb-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c97ee61c582002b654331f7fd967d6b1e83bf7fdb0772f409dfd4b6af3a70f4", size = 31292373, upload-time = "2025-09-16T10:22:28.118Z" }, - { url = "https://files.pythonhosted.org/packages/fd/52/091dbef5eb2ac4e60a9c6d38fcc7c7530a75fafa0f37658450e8731a265b/duckdb-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:74e3d6295355160df5d3588b880e8bcae23fdd6f573f538793a8a1abf4c2c29d", size = 17288145, upload-time = "2025-09-16T10:22:30.346Z" }, - { url = "https://files.pythonhosted.org/packages/c9/6c/879317d9c3ac7a2a1f0618ca536a48ebfa4b9fe202f9783e07070e168192/duckdb-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0c76425e4ffe98069dd4fc4752ab919a4125dc0d176bb676b3065fdea152c42", size = 14816258, upload-time = "2025-09-16T10:22:32.442Z" }, - { url = "https://files.pythonhosted.org/packages/95/87/83ac8e67c0530b69fe39f91bbb7f3bd0a49b0c24216cffa9c5561fb2845c/duckdb-1.4.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c122bd7d80ab5057f53024ee3922d7612a5cdc99583fae730990964aebc3fd4", size = 18391043, upload-time = "2025-09-16T10:22:34.616Z" }, - { url = 
"https://files.pythonhosted.org/packages/d6/01/1d70bd6c594ef915c004edc0f1119d1602173dc5ce91c1eed7368f6aab34/duckdb-1.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:30689c1436bca723526be6102fe1f4f82ea6d4780fb9ca196bda7ed5ec227950", size = 20385348, upload-time = "2025-09-16T10:22:36.982Z" }, - { url = "https://files.pythonhosted.org/packages/b6/04/0650128cdcdc5208c4f51341a0a3f8db436ecaba51032c6065e20ea0baae/duckdb-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c55a367c1296617cff89c5e1c7153f1dc3c3b556ef70711a45b0236515f80c2", size = 12283322, upload-time = "2025-09-16T10:22:39.388Z" }, ] [[package]] @@ -543,29 +495,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] -[[package]] -name = "flatbuffers" -version = "2.0.7" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/d1/90/0532e737a11e1dc50e9e352c3ccc97338cb75991f83279c2edbc9234e022/flatbuffers-2.0.7.tar.gz", hash = "sha256:0ae7d69c5b82bf41962ca5fde9cc43033bc9501311d975fd5a25e8a7d29c1245", size = 22686, upload-time = "2022-08-23T22:50:07.903Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/0d/b5bfb553a6ac66d6ec2b6d7f1e814a908fba7188356ac94bb36ae3d905c3/flatbuffers-2.0.7-py2.py3-none-any.whl", hash = "sha256:71e135d533be527192819aaab757c5e3d109cb10fbb01e687f6bdb7a61ad39d1", size = 26562, upload-time = "2022-08-23T22:50:56.342Z" }, -] - [[package]] name = "flatbuffers" version = "25.9.23" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == 
'3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" }, @@ -662,23 +595,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, - { url = "https://files.pythonhosted.org/packages/dd/b1/ee59496f51cd244039330015d60f13ce5a54a0f2bd8d79e4a4a375ab7469/frozenlist-1.7.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cea3dbd15aea1341ea2de490574a4a37ca080b2ae24e4b4f4b51b9057b4c3630", size = 82434, upload-time = "2025-06-09T23:02:05.195Z" }, - { url = 
"https://files.pythonhosted.org/packages/75/e1/d518391ce36a6279b3fa5bc14327dde80bcb646bb50d059c6ca0756b8d05/frozenlist-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d536ee086b23fecc36c2073c371572374ff50ef4db515e4e503925361c24f71", size = 48232, upload-time = "2025-06-09T23:02:07.728Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8d/a0d04f28b6e821a9685c22e67b5fb798a5a7b68752f104bfbc2dccf080c4/frozenlist-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dfcebf56f703cb2e346315431699f00db126d158455e513bd14089d992101e44", size = 47186, upload-time = "2025-06-09T23:02:09.243Z" }, - { url = "https://files.pythonhosted.org/packages/93/3a/a5334c0535c8b7c78eeabda1579179e44fe3d644e07118e59a2276dedaf1/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974c5336e61d6e7eb1ea5b929cb645e882aadab0095c5a6974a111e6479f8878", size = 226617, upload-time = "2025-06-09T23:02:10.949Z" }, - { url = "https://files.pythonhosted.org/packages/0a/67/8258d971f519dc3f278c55069a775096cda6610a267b53f6248152b72b2f/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c70db4a0ab5ab20878432c40563573229a7ed9241506181bba12f6b7d0dc41cb", size = 224179, upload-time = "2025-06-09T23:02:12.603Z" }, - { url = "https://files.pythonhosted.org/packages/fc/89/8225905bf889b97c6d935dd3aeb45668461e59d415cb019619383a8a7c3b/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1137b78384eebaf70560a36b7b229f752fb64d463d38d1304939984d5cb887b6", size = 235783, upload-time = "2025-06-09T23:02:14.678Z" }, - { url = "https://files.pythonhosted.org/packages/54/6e/ef52375aa93d4bc510d061df06205fa6dcfd94cd631dd22956b09128f0d4/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e793a9f01b3e8b5c0bc646fb59140ce0efcc580d22a3468d70766091beb81b35", size = 229210, upload-time = "2025-06-09T23:02:16.313Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/55/62c87d1a6547bfbcd645df10432c129100c5bd0fd92a384de6e3378b07c1/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74739ba8e4e38221d2c5c03d90a7e542cb8ad681915f4ca8f68d04f810ee0a87", size = 215994, upload-time = "2025-06-09T23:02:17.9Z" }, - { url = "https://files.pythonhosted.org/packages/45/d2/263fea1f658b8ad648c7d94d18a87bca7e8c67bd6a1bbf5445b1bd5b158c/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e63344c4e929b1a01e29bc184bbb5fd82954869033765bfe8d65d09e336a677", size = 225122, upload-time = "2025-06-09T23:02:19.479Z" }, - { url = "https://files.pythonhosted.org/packages/7b/22/7145e35d12fb368d92124f679bea87309495e2e9ddf14c6533990cb69218/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ea2a7369eb76de2217a842f22087913cdf75f63cf1307b9024ab82dfb525938", size = 224019, upload-time = "2025-06-09T23:02:20.969Z" }, - { url = "https://files.pythonhosted.org/packages/44/1e/7dae8c54301beb87bcafc6144b9a103bfd2c8f38078c7902984c9a0c4e5b/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:836b42f472a0e006e02499cef9352ce8097f33df43baaba3e0a28a964c26c7d2", size = 239925, upload-time = "2025-06-09T23:02:22.466Z" }, - { url = "https://files.pythonhosted.org/packages/4b/1e/99c93e54aa382e949a98976a73b9b20c3aae6d9d893f31bbe4991f64e3a8/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e22b9a99741294b2571667c07d9f8cceec07cb92aae5ccda39ea1b6052ed4319", size = 220881, upload-time = "2025-06-09T23:02:24.521Z" }, - { url = "https://files.pythonhosted.org/packages/5e/9c/ca5105fa7fb5abdfa8837581be790447ae051da75d32f25c8f81082ffc45/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:9a19e85cc503d958abe5218953df722748d87172f71b73cf3c9257a91b999890", size = 234046, upload-time = "2025-06-09T23:02:26.206Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/4d/e99014756093b4ddbb67fb8f0df11fe7a415760d69ace98e2ac6d5d43402/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f22dac33bb3ee8fe3e013aa7b91dc12f60d61d05b7fe32191ffa84c3aafe77bd", size = 235756, upload-time = "2025-06-09T23:02:27.79Z" }, - { url = "https://files.pythonhosted.org/packages/8b/72/a19a40bcdaa28a51add2aaa3a1a294ec357f36f27bd836a012e070c5e8a5/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ccec739a99e4ccf664ea0775149f2749b8a6418eb5b8384b4dc0a7d15d304cb", size = 222894, upload-time = "2025-06-09T23:02:29.848Z" }, - { url = "https://files.pythonhosted.org/packages/08/49/0042469993e023a758af81db68c76907cd29e847d772334d4d201cbe9a42/frozenlist-1.7.0-cp39-cp39-win32.whl", hash = "sha256:b3950f11058310008a87757f3eee16a8e1ca97979833239439586857bc25482e", size = 39848, upload-time = "2025-06-09T23:02:31.413Z" }, - { url = "https://files.pythonhosted.org/packages/5a/45/827d86ee475c877f5f766fbc23fb6acb6fada9e52f1c9720e2ba3eae32da/frozenlist-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:43a82fce6769c70f2f5a06248b614a7d268080a9d20f7457ef10ecee5af82b63", size = 44102, upload-time = "2025-06-09T23:02:32.808Z" }, { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, ] @@ -693,32 +609,13 @@ wheels = [ [package.optional-dependencies] http = [ - { name = "aiohttp", marker = "python_full_version >= '3.10'" }, -] - -[[package]] -name = "gast" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/83/4a/07c7e59cef23fb147454663c3271c21da68ba2ab141427c20548ae5a8a4d/gast-0.4.0.tar.gz", hash = 
"sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1", size = 13804, upload-time = "2020-08-07T21:45:23.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/48/583c032b79ae5b3daa02225a675aeb673e58d2cb698e78510feceb11958c/gast-0.4.0-py3-none-any.whl", hash = "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4", size = 9824, upload-time = "2020-08-07T21:45:21.32Z" }, + { name = "aiohttp" }, ] [[package]] name = "gast" version = "0.6.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3c/14/c566f5ca00c115db7725263408ff952b8ae6d6a4e792ef9c84e77d9af7a1/gast-0.6.0.tar.gz", hash = "sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb", size = 27708, upload-time = "2024-06-27T20:31:49.527Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a3/61/8001b38461d751cd1a0c3a6ae84346796a5758123f3ed97a1b121dfbf4f3/gast-0.6.0-py3-none-any.whl", hash = "sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54", size = 21173, upload-time = "2024-07-09T13:15:15.615Z" }, @@ -726,69 +623,61 @@ wheels = [ [[package]] name = "geoarrow-rust-core" -version = "0.6.1" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { 
registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/77/2d/3e994dd76223fac0eb597a6f55647cca51bd5a4f446d09b668697f901724/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:84d972cc3dd45a797fd99588d7ee68f257e4083ebdcecad9ec773260067f71a6", size = 3570129, upload-time = "2025-12-03T18:51:07.148Z" }, - { url = "https://files.pythonhosted.org/packages/5f/2a/e19df203b4ffb225f39627e1bd1b89ce7b2220e39f1d6972692174820c57/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bc0f382d4ed41e85d2d89fc2c7c8c3d046681c9a5e19350ce79e0e930cf69821", size = 3333881, upload-time = "2025-11-21T01:49:28.959Z" }, - { url = "https://files.pythonhosted.org/packages/52/98/b749a2165dfc5d9c54a1c19eb3e6a75b6d005ecde42289b25c1c355346b7/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80e719edcaf6698ed2b1aa9525bd97cf79e23a500a39b1e83566cd9a16a294d3", size = 3806366, upload-time = "2025-11-21T01:48:03.525Z" }, - { url = "https://files.pythonhosted.org/packages/84/93/7c0e42ba7d46208fb0f851e06c05de071962170f3a3b2a2260d8a3f66e7a/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d0f3546a15503329880063aca31266b301b0b781f618f832585bcd1c9efcc876", size = 3981800, upload-time = "2025-11-21T01:48:17.789Z" }, - { url = "https://files.pythonhosted.org/packages/de/43/9c5736569dead60b33e46b7c485e24804d950693df70dee306e153547789/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6937f3cabebf673f8b726d60d8ca160b46401de8b08c8e257be22772c12c2001", size = 5068955, upload-time = "2025-11-21T01:48:32.569Z" }, - { url = "https://files.pythonhosted.org/packages/71/5e/f26f9bea2af96b0d070e980dcc2196d369a678e06141ed260de5ca72bcc2/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f29ba92053e8ad4bd60d72188518f033ca4abc1f34eecebeb41ee7b790612e00", size = 4104946, upload-time = "2025-11-21T01:48:45.801Z" }, - { url = "https://files.pythonhosted.org/packages/fa/08/473796b3e0c03b35292220de88c8efa3e74d6174e807b26a371f2523a4b0/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a5d05a312fbb76821566b1d144c64d0923fcbd790b2c7376ee11f62472b2fe", size = 3917533, upload-time = "2025-11-21T01:49:14.631Z" }, - { url = "https://files.pythonhosted.org/packages/b9/7a/7b62b839c3a9878a7d91b8395e0b7b04483e4bec687e073df0fbd4056583/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88fe8fd33b16a06e9b3b7638b51d24047f1d01af12cc2e3e2653140877bddef6", size = 4318837, upload-time = "2025-11-21T01:48:58.953Z" }, - { url = "https://files.pythonhosted.org/packages/ea/86/309c55a9c63f316e3a04949ade8847b8e5acbdd21645696911175f0e1814/geoarrow_rust_core-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:dbecc2487cc95526ac77797cd70c199e196811b0a9e877c1b61fcaca508575fa", size = 3320081, upload-time = "2025-11-21T01:49:58.861Z" }, - { url = "https://files.pythonhosted.org/packages/1a/ed/514cff089185d71242a62e774e2c59dda147baab65929851b66d72198d5d/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e26ca240d7a6a0fa1b4f56a9ebe07b2e14fc7c1c9507aa862bd31ef14e0521f0", size = 3572326, upload-time = "2025-12-03T18:51:08.477Z" }, - { url = "https://files.pythonhosted.org/packages/77/21/22f8233235bd020db22b4f2bf888f9aeed08813eda7b8b421a6963bdc7e4/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46876e3528685673e08b4cbc696dca7f22fb073a83318708b0eaf640107b923b", size = 3335166, upload-time = "2025-11-21T01:49:30.632Z" }, - { url = "https://files.pythonhosted.org/packages/bb/eb/0c2e40a6a1bd450347a8a9fc7648ca840710bc177ff6eed3fc5da6ef981a/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d5502bd12ede712d9b4725753df4db231a0aa6d3e131079bc4b6452c436e37b7", size = 3800540, upload-time = "2025-11-21T01:48:05.583Z" }, - { url = "https://files.pythonhosted.org/packages/4c/42/22d3b8441bb7041a6fcdb4cf0a1108e150513a52f8a407715188412bc71f/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f04dd7dd03449dba6d15f7d35c6c708637ac05f125638f56206e876756cd4c5", size = 3984840, upload-time = "2025-11-21T01:48:19.719Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/477b6b2389398dc983026a4ab7dbb7ec121284ad5fb864a1b7a4658c3881/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2afce33d0c3fa87d5d4d24d6617732e4297da3372b1746569b759f9b62aede1", size = 5067358, upload-time = "2025-11-21T01:48:34.373Z" }, - { url = "https://files.pythonhosted.org/packages/62/50/6995e9d11462635972b2fc09c8e1e510928563ca4fb0fd2c9145cf6ef771/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e63cdb661652a9836dc86cb5995ad269817d88b80f4cce6ed236a7f80f0aba", size = 4105773, upload-time = "2025-11-21T01:48:47.461Z" }, - { url = "https://files.pythonhosted.org/packages/a3/21/b369208495f213db0a0e7d563358307a706cc6af0cb9c897dacf28ae06a1/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adbaf97cb770aef69df8a16437c9faa67adb2b04856faf45bcb61d5b986101dc", size = 3914659, upload-time = "2025-11-21T01:49:16.35Z" }, - { url = "https://files.pythonhosted.org/packages/1d/49/fccb14c6ee9bb715451e4d5bbe3d571eb59a8a1abe21b2abe0d9d48a7fac/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:202f35b301caa5154d95fd74424a1ef6449306e4f6fbfb5140270e48e94188a5", size = 4315153, upload-time = "2025-11-21T01:49:01.075Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/1c/88b16510e24a4a3332284669085673701b9fe4d6a511b4466c90655a9daf/geoarrow_rust_core-0.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:491405dfcc821a2c599e381cc9923e04a758deb1cc84fdb5794b519446c2f8a8", size = 3320510, upload-time = "2025-11-21T01:50:15.545Z" }, - { url = "https://files.pythonhosted.org/packages/cb/5f/1dbdbc1dde2140937cff20188cb25034b6f39e1734c14ca6510cf464bf77/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a8145a562e94419402dd0882bb62429853804c53d47dbea944f2a24abc57abd2", size = 3568115, upload-time = "2025-12-03T18:51:09.743Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e1/b62676f89ef3b866676967989ee8dbbd3d16c77f69aa4287825703268c42/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:51040a5afcfa0cd3ab372d981375c7fe8eb652d155e3964d52ed51d14faa04e8", size = 3325336, upload-time = "2025-11-21T01:49:32.67Z" }, - { url = "https://files.pythonhosted.org/packages/1f/89/94e20f255712ff0eaccf9bfeac4bf51953ebcef0599cfc92f67037f8ab1a/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fbf8506848b0254b3c89b27c045be38bbef6372b21714cad45d76b0c8cb92ce", size = 3808535, upload-time = "2025-11-21T01:48:07.618Z" }, - { url = "https://files.pythonhosted.org/packages/e7/e4/37c7e2c9e251148be17292d0656d7d1ab35019678f6bd11090a41c270d18/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c1a0d9c14bf2f36676016c753517d9470381969c2a67859716cceae33735f3ee", size = 3978997, upload-time = "2025-11-21T01:48:21.551Z" }, - { url = "https://files.pythonhosted.org/packages/71/27/c4ba353d9b77889136bdfd1c0cd1a04d6eade9da6e0748b06719c458afb5/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df97301782ecbaf5f2f0252011a9ff309471cde25435bdf1e17b29c263ebc16", size = 5066492, upload-time = "2025-11-21T01:48:36.142Z" }, - { url = 
"https://files.pythonhosted.org/packages/a6/81/34107fc9aacc489e41afed420202645675b41d85b46dc70d5ba222312791/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1948cfdd0e1c7d03a0c2067821dd536ab34d1e726515202e51fbd6b0d9f775f", size = 4106130, upload-time = "2025-11-21T01:48:49.144Z" }, - { url = "https://files.pythonhosted.org/packages/92/5f/2e348b884738fb213fb3b4745955baeeaf047aecb37639e39a4dd8f12d99/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95b1611b66c386cc6c74e990df4f114bcf24956a35e18e51bf6331c079a36688", size = 3913166, upload-time = "2025-11-21T01:49:18.228Z" }, - { url = "https://files.pythonhosted.org/packages/bf/81/fdda8bb5f84df82bc9e000435a88be46d46dda41eb5149f624ed96b7031c/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1751357a1aaa26aeb5feb6f66873b6a2d369655039f7278dedcb692b512111cc", size = 4313573, upload-time = "2025-11-21T01:49:03.184Z" }, - { url = "https://files.pythonhosted.org/packages/a0/14/ca0bc7d3b158094e769ba2bbc43d203330e7e457ed67b50af97d3eac45df/geoarrow_rust_core-0.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:16fe159043a444579948864808ebec8c49ec167ec0df3cb772dfb88de268bc91", size = 3318746, upload-time = "2025-11-21T01:50:17.319Z" }, - { url = "https://files.pythonhosted.org/packages/85/b8/94e4f8fb32ef705cf65031a24c58cdc441042a68a794b74757a6561cbc60/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6c1b692f76b613757438bf23cfe3be4a8715f0268afd8ad3ca0063c257a3be4b", size = 3568328, upload-time = "2025-12-03T18:51:11.291Z" }, - { url = "https://files.pythonhosted.org/packages/7c/45/a96e64f9febc3436766c5055508c4e823cce56577529d7b76c4e4f584bc4/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a2b4f9a8cfe852a0ba9a667258307db9e354b470b7e0a03edffd0b7daf9b6f5", size = 3325879, upload-time = "2025-11-21T01:49:34.941Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/c0/c719ce3fb4e982e28c71f65a80cf697d07d733336e6b74d7d1b8a7daf9d0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8248330f5c3e7ec5852d0a23c23b31a08395300ef9544109e2991317beddfee3", size = 3809144, upload-time = "2025-11-21T01:48:09.562Z" }, - { url = "https://files.pythonhosted.org/packages/e2/8e/2ab3563b2ffd13f2dd69c050a901de0a4bb325879531a66f56d30bc7337e/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:775e9fe45c06d02be59b1497c60aa4f7a7c1d460387bf5f63142faf39b8ad4ff", size = 3978886, upload-time = "2025-11-21T01:48:23.335Z" }, - { url = "https://files.pythonhosted.org/packages/db/0a/31625caa0a32e8e9e7aaf2514a840dda0dadf8e2452710ebc10e5f469494/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94de8fb01da3f22332eab28b03570c43cc36492ce482c254fe87e851ae21285b", size = 5065429, upload-time = "2025-11-21T01:48:37.896Z" }, - { url = "https://files.pythonhosted.org/packages/11/8d/ee247bd4ccf3b0791b8669357d440e3960d4dbd5cca940a2e226e8910c31/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c70a63d1d36687a53dc6c2933446b1435c187e4c616cd84844d89b6ba13bc4f6", size = 4105436, upload-time = "2025-11-21T01:48:50.874Z" }, - { url = "https://files.pythonhosted.org/packages/a9/fb/c1e92716ee5aa00d48b650f0cb43220a1bf4088c8d572dfc21d400b16723/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e505312f2761393fe5158242f3f2d77e9daa5cca63badd8d66e6d1d69fc17bf", size = 3913672, upload-time = "2025-11-21T01:49:19.873Z" }, - { url = "https://files.pythonhosted.org/packages/f8/6f/ef47f6070c5d5cf0d061d5f5ba95aed7e895e4720a784b84c911c0209fc0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a732e58549108df8267ab72fa6cc7c54e5a9e30b818d8d869e301a9de9d3029e", 
size = 4313496, upload-time = "2025-11-21T01:49:04.953Z" }, - { url = "https://files.pythonhosted.org/packages/3c/ac/2696b979623ea02129e342f8820c89d03fa5a253a913ad00b588d6dd2948/geoarrow_rust_core-0.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:9e1d6492b1388b9d5ae898728838ada78dbf2340d2e9dd25ad3df6ccdd058813", size = 3318780, upload-time = "2025-11-21T01:50:18.928Z" }, - { url = "https://files.pythonhosted.org/packages/4e/42/0cb3af24b01d3897a9eee6af5cc0676bf6b80364e0d4638e45a5fc873d35/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3748cc8e8cb2bcedaede27cefed6749d4eea93e358b49a2f0b061d8974dd1b91", size = 3560313, upload-time = "2025-12-03T18:51:12.897Z" }, - { url = "https://files.pythonhosted.org/packages/51/bc/33f8c918e46188707ab358752b993bee9184fa62e580998c1ec4c37885c1/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b0e232fe4e239ca435d0bab638934eee87d758024c1727ee24a2b8bc4d8bc7b", size = 3321855, upload-time = "2025-12-03T18:51:00.056Z" }, - { url = "https://files.pythonhosted.org/packages/f4/d7/aeb2a3922670ad57f62cb591bd0309a8300ceeec6efc7f925a563c9da672/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:843444ada2c7f7670fd9df3bdebd93e5247b376d1dd20c4fb3828632847ab78e", size = 3799057, upload-time = "2025-12-03T18:50:28.982Z" }, - { url = "https://files.pythonhosted.org/packages/76/08/606e55fc2a0e85b02e0fde7dec2014eb8f1463e8a823496d72a3095de73d/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:880641183a09ebfbca3a6357071f137d1a4b0f1ba606fb9127a01cf58faaef56", size = 3968892, upload-time = "2025-12-03T18:50:34.661Z" }, - { url = "https://files.pythonhosted.org/packages/10/1f/e75fd5b59e9e582190c11ec73c91728d96e90608a22e0aed7365439d9534/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6bb69024257d2fd20da691d1e15bcced874d278884218b64690256982fa30cb1", 
size = 5049247, upload-time = "2025-12-03T18:50:40.542Z" }, - { url = "https://files.pythonhosted.org/packages/7e/95/2257b9b148c8c6557387e67828a5096ebc519b997a158ffb67a0987589e5/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85464a1bab81068789de5fb19684e43709d2ba6d64d5655aace7c50b35893d6d", size = 4099850, upload-time = "2025-12-03T18:50:45.341Z" }, - { url = "https://files.pythonhosted.org/packages/b9/07/8c8aaf8755ee7c137f0898823bd005ffb16edaa6accc0cc1a9a747d56ddc/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7eb773a101f1d9716d750bb326991885a7c4576e85d9a016a567a3b07380bf07", size = 3908308, upload-time = "2025-12-03T18:50:55.587Z" }, - { url = "https://files.pythonhosted.org/packages/dc/7e/b8f1933be03d9a3a6416edf29fc23d520e45f00fbde6bd8f0614ad6f8a69/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:920e6fed857acd2145a8fca7c6fad17094873f586ac5efed7049ce43a7af4ff6", size = 4307178, upload-time = "2025-12-03T18:50:50.429Z" }, - { url = "https://files.pythonhosted.org/packages/df/95/a8ba3d7e51ec02ec954d0247c6021b36de5935a9a3845c1cf6c1348cd6e3/geoarrow_rust_core-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:9887119cc31a763c34ed8676d06434b47971517e86f8e35c640b494d05e7d5ac", size = 3316511, upload-time = "2025-12-03T18:51:18.831Z" }, - { url = "https://files.pythonhosted.org/packages/ea/6d/4b2f51d0e4ac683217852d79c3acef719ca116f418d9ce8f4dcc6d717716/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:420a720217b5a7ec6f7977cfe7e7a729c73381ed5e63112fdef33bd805b9cf8a", size = 3572216, upload-time = "2025-12-03T18:51:14.544Z" }, - { url = "https://files.pythonhosted.org/packages/f0/55/85a2948b10ad9ea347597f90355d8992745f00fedae54916205c8c9b80fb/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fb9c8c6bba4e712edf475ce3c78bf13f7b10f750256f57deb29c3222eaef033", size = 3335928, 
upload-time = "2025-11-21T01:49:51.601Z" }, - { url = "https://files.pythonhosted.org/packages/4e/98/fdd6c34ff8acd878c31e9f5fe4792f49d437e0465e0b60c24d6cdc287ed7/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9547ead76eac906b7a583ee65fa137e6b8ed34c0f128c1745a290c451726f27", size = 3808249, upload-time = "2025-11-21T01:48:11.192Z" }, - { url = "https://files.pythonhosted.org/packages/8a/a1/fd6741b5c1d7d48b5f6ab58a994a91c86e29d19ee7bca2636590b8ac9a54/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eaa8e8f40ca8fcd367735cb4226c5aa5171a713d75bc2caab9a03bd9f59d7bf2", size = 3984081, upload-time = "2025-11-21T01:48:25.595Z" }, - { url = "https://files.pythonhosted.org/packages/91/1e/2b5a9b65bf19a79d212ea0fe60fa5632ec4c89bb64ee446272b47e5cd6ac/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08992719a2accbf993837a6aad615e3f2bf1954d2d9152e507dd79621c87e9d3", size = 5071749, upload-time = "2025-11-21T01:48:39.673Z" }, - { url = "https://files.pythonhosted.org/packages/08/7a/6b37f5e52300b60854b74f4cdc9fbe613c692a15c3ae42f1952f3849bc86/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:071c0e72c4c2047326ebec8d76ce2debcdd59e187207433c3a29ac2da861ca92", size = 4107621, upload-time = "2025-11-21T01:48:52.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/3e/f849642ef4e1f54bcc651903f19a219c3d2be68d27f4ceb282a07ebba7cd/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49d5a9e7b73c30dd1790a3e0faf30b7a4ee393c127c5a799d543653d1d80f0c", size = 3919352, upload-time = "2025-11-21T01:49:21.495Z" }, - { url = "https://files.pythonhosted.org/packages/84/c8/57318cb04d061788d5ba523984915c98523e9eb9b7ba4937ff3438e045ef/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:909152922ee42197b8ae846a8b6c5383c6f3ab39fe627ec8539765e3a634de68", size = 4320006, upload-time = "2025-11-21T01:49:06.588Z" }, - { url = "https://files.pythonhosted.org/packages/13/9f/be16e191fdedbac4d9c01096327917a948625619423c666ec3db2191b4ab/geoarrow_rust_core-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:796c84184fe5e65e30df9f9f45aa8c1680f07689ea71ed1960faa7324fb67e52", size = 3321071, upload-time = "2025-11-21T01:50:20.844Z" }, + { url = "https://files.pythonhosted.org/packages/70/a7/9de5cdcb86089ef4d9a24940838a72ef0655d5be11b46dc4ee807b0d7772/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e1dbbca927858c05ef4eaa5e13a3977a62183cfa3f17fe7b19dd2d88ecf24e91", size = 3855749, upload-time = "2026-06-11T19:24:32.965Z" }, + { url = "https://files.pythonhosted.org/packages/54/48/da86c2bd1db71849f003f5a8eb78ce54f7a33341d5b33ddcdb480b5aafb4/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce7e126d340f335bcc108327cbf7264539e856cb6a299f59757a6ee8329f6643", size = 3710538, upload-time = "2026-06-11T19:24:34.925Z" }, + { url = "https://files.pythonhosted.org/packages/f6/65/7f8ecc05447a85f14643170de8a29715e7c3e732fbb7132617772d39eac7/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88eb7982c1345fc4c4b18d9895602f0148c9495fe7ac00df03a92c20c8058149", size = 4198382, upload-time = "2026-06-11T19:24:37.02Z" }, + { url = "https://files.pythonhosted.org/packages/41/57/b11fbb277fab166d8a8940bc1151bbd1aeef537e70c55f495ff85178f827/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8368b91d4cab5cb5ad1b0f7369da4cec196d82bf73aa3823618a99c1bd4cf04", size = 4270350, upload-time = "2026-06-11T19:24:38.726Z" }, + { url = "https://files.pythonhosted.org/packages/6d/16/0c35e5aff4aca77d818b28d79f9ce20fe1c282ef26d6a2fcc764f3a55f26/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:2955d82d0204197c8e96adbfb70f252fa5987821dd8f202e712a84bfb5b876d3", size = 5602389, upload-time = "2026-06-11T19:24:40.198Z" }, + { url = "https://files.pythonhosted.org/packages/e0/06/58e4d0c94f7d8897ca5e2469fe5db0dd937bfc3cd676dea43c6ce488effe/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacbc2231b03c674975d5a25ff549c367dd8c07147c41edb5461c8ebda693739", size = 4414385, upload-time = "2026-06-11T19:24:41.779Z" }, + { url = "https://files.pythonhosted.org/packages/09/65/902e986d01d4978e752c1d0d5b15873de712321ce3f61c285f491e4149b9/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f5726fd638563d11dfefd7d17dd769e679ac1efb868178791573de19d16b41f", size = 4251263, upload-time = "2026-06-11T19:24:43.556Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f1/b1e0f93ea5288706f08ac7c01f332eb0feaa128251f3c2c9896e5f42cba5/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df7a0319cdec5d0e4ffc3f17a171e16787e7719f85f82c8cf0035d873ec31e62", size = 4747229, upload-time = "2026-06-11T19:24:45.281Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/77ebd20cb5cf5eb18c5bb0e32e07f76ec915a728ea123e075365f0b6c53c/geoarrow_rust_core-0.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:19ce5fb18025480461253d0a03f20cbb635163214b5f193b0700bc1a407dfe4d", size = 3601298, upload-time = "2026-06-11T19:24:46.721Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/d50e482a56d9543119be40000bc405b725242b6056809bbee3a75eff2411/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d91b5249d5e1da53a79268759601c107beb69a8944dd3b5b225e9515ab63d519", size = 3856056, upload-time = "2026-06-11T19:24:48.331Z" }, + { url = "https://files.pythonhosted.org/packages/04/e3/f4de7795959d95d88b32b85740d5d2d6b0a2e17233258f0331aee6cb7b13/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:14412f02c1e60c92d2f88bc9f92835cf6d80f1da37fe8ba462eafdb7bd570f3c", size = 3710092, upload-time = "2026-06-11T19:24:49.802Z" }, + { url = "https://files.pythonhosted.org/packages/b4/48/04888477c2a12fbe6a6f8898bd026facdc3a929b4e747d7b569e6d20dd58/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc5d6db2341568b1e44678ccc0ade1ca1e7660a2c186ebf8bf847acdb160f2cf", size = 4197891, upload-time = "2026-06-11T19:24:51.245Z" }, + { url = "https://files.pythonhosted.org/packages/fb/2d/c16b6eb6f9f2ab213dcd0cd2ac0dec2eae1e2ce5922b3fbeb7bb1ac2a865/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45f4193b9d6f6caae969d8448f3687a19f0998d757519a091df609c06ffa68a0", size = 4269771, upload-time = "2026-06-11T19:24:52.781Z" }, + { url = "https://files.pythonhosted.org/packages/47/fd/2ee73341c37d554ce8d0b67a95525700ec32194fa785261c17262afadfc8/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf9ca054562fb4610c8e5ea140fa1bf746ccc16de505d3a5684abd2fa11f9538", size = 5601846, upload-time = "2026-06-11T19:24:54.63Z" }, + { url = "https://files.pythonhosted.org/packages/67/05/229234ae7bf1d39306e41896f3055a2ae847707ce58f21bd0872b9a5764e/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ec9530fefb653f9a2e605cc26fc1c0d1ffa5c4923ec1037323ba9a16744f8ccc", size = 4413741, upload-time = "2026-06-11T19:24:56.015Z" }, + { url = "https://files.pythonhosted.org/packages/eb/5a/7875548a48231b02f909d3d8c7d74ba47867b2af3396e7aed59cd3b2b40d/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2813aceabb29567d96f29fd2d3099d6f8decd0f5f968ff81ed1a664751dc84a3", size = 4251434, upload-time = "2026-06-11T19:24:57.527Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/46/ed0370def1a950f185edda603a02276bb412a9c95ad5a052c9e919b2df78/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:49686767d1379ff3b165f9d35a73e96fc25daba786ce27cf3359c5feac880fd0", size = 4746598, upload-time = "2026-06-11T19:24:58.979Z" }, + { url = "https://files.pythonhosted.org/packages/44/bc/3a1720be855d7d0011416b7f0a7b7e33546b0fc7320faf59b05e401adff7/geoarrow_rust_core-0.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:fd9cc8c47af736dd087575306088e73b28a720f52e5c3342968851ddd2fb5778", size = 3601329, upload-time = "2026-06-11T19:25:00.459Z" }, + { url = "https://files.pythonhosted.org/packages/24/b2/65db3af5fcc7d64ac7ac86d7debc6a90803bb076c8f7d4599c167be79fd6/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:86aaa60e5b6d99be08f9adc9e58bd088135e1dcfebd290085228ed8a0e93e90f", size = 3848323, upload-time = "2026-06-11T19:25:02.079Z" }, + { url = "https://files.pythonhosted.org/packages/27/9a/37bdd36d7feb9d591b9ccdc1952c6171b04dc777b999e2082b810eb1dd45/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fec148cd63e616d9a7aa00c4ab08693eeec55aca7c9d700aa6451cd8001d0e08", size = 3707679, upload-time = "2026-06-11T19:25:03.594Z" }, + { url = "https://files.pythonhosted.org/packages/45/b7/8d2998284de21d0feb2a0935c41636f8ebf2b65723d8139026e7f9f3d5e8/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1944f3d548b6296e9fbd668602accae0ad68e49ee0f5b8df9e7ea4f474e4ae", size = 4190279, upload-time = "2026-06-11T19:25:05.21Z" }, + { url = "https://files.pythonhosted.org/packages/25/f3/140209f53a70f261ef1459b08eea25c4edef3ad9f6ec0924033b5285ee7e/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f5c04195cbedf5d1684a50203e862d979cda0d6218aac32f607d6e3f7cd65c8", size = 4264876, upload-time = "2026-06-11T19:25:06.654Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/32/0097bfb92816ef91b38f7e757f65fe8456e56152ca51cd7a05b1be8a2e40/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:671c6be9cbc68295a68598fc8c6ddd875de063a795d64b2cfd10d36abd1ee324", size = 5586563, upload-time = "2026-06-11T19:25:08.376Z" }, + { url = "https://files.pythonhosted.org/packages/fd/86/508fe299aa44afe95399d9fa73cdbc7a451841803b8f1431e8c3d0b26ec1/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f4726fbe09d545a507993f2f76c2be7812fef3c20c994ff33c32aaa96aaa212", size = 4402886, upload-time = "2026-06-11T19:25:10.302Z" }, + { url = "https://files.pythonhosted.org/packages/46/81/fc34afcce2b0f17424610405481f69f3c6e4d670c5c94170d71ed6719794/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0fa37a90312e7ca06921be56cee183c12c442b345fadd982480cd1f8ed2eede", size = 4247331, upload-time = "2026-06-11T19:25:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0d/af42431f80282a2f7e1f3e496c39483dd2362e11f8008c65033be9d2ba4c/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f41a8c0a9f3558d73537dcad83c88b29c2a169bcc7766dc677e8245a98a5e95", size = 4741954, upload-time = "2026-06-11T19:25:13.964Z" }, + { url = "https://files.pythonhosted.org/packages/cc/e5/be80aa4384f16be6a20828fd4cc67da18bd2266366f80c9bfefa481559f8/geoarrow_rust_core-0.6.3-cp312-cp312-win_amd64.whl", hash = "sha256:382f0914c75d84b87420aef7b6f11e8b5d4d58b5f5db7c8d199815e4dd282a42", size = 3599115, upload-time = "2026-06-11T19:25:15.357Z" }, + { url = "https://files.pythonhosted.org/packages/19/52/93bbf15979ce656d09821f02f82420957fdc99ee4cd37e5e2d8c99a324da/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c11190008ed6a571b8ca4ef769198e95434dbe7c3caefa9acd5f0ceba1ed868f", size = 3848682, upload-time = "2026-06-11T19:25:16.914Z" 
}, + { url = "https://files.pythonhosted.org/packages/a8/1e/1665171a3756b1977b7240a8f518bbbdfa778dcc156e0f90d659723468fb/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1308ad09912fb67a43ff7dd7dbc685ca8a8fbd8028d3876eb187b6b082a98a7b", size = 3707868, upload-time = "2026-06-11T19:25:22.483Z" }, + { url = "https://files.pythonhosted.org/packages/ec/38/e344ccb72473b8756c8f2dae3a8a9339e1821884a2a50befbad45150d178/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1247b961c61656596631ca3380d405f8d0a2f60f045f8b8a3a335b1a849dc55", size = 4189835, upload-time = "2026-06-11T19:25:24.116Z" }, + { url = "https://files.pythonhosted.org/packages/22/10/bc92b9fcdc628fa1ff7e234219701cd575b0a78da5fdf3a6c8884e5ca445/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c2cb90116255c3f74d5aee563405f3a440bd4eb75471adac13cd0c80a2564dc", size = 4265584, upload-time = "2026-06-11T19:25:25.628Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ed/67edd70967851bef3ef9e35d8ccef242923ed69104ecb885ad3adf4de9a2/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a993d3a0964b8cf55a51bd404225dc3037b51f34b01c6bb1312611ce61f9b2d", size = 5586300, upload-time = "2026-06-11T19:25:27.32Z" }, + { url = "https://files.pythonhosted.org/packages/76/a6/a20fba654caa314b4688ad9dceb5e99fa7956bbf92b3059baa36e06c59b3/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbd153a3348d166ecb57b2770b69b17c2df14cf303d41cd9168adba77532a31b", size = 4402375, upload-time = "2026-06-11T19:25:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/ed/5d/c8949bb5916ff80186c854792b9ddadc9f3069db09d31311f24d82ba7096/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb5aaf3a6f104145b4c5a3188b1be589849b2599626c0e40181a18fc2e79f68", size 
= 4246712, upload-time = "2026-06-11T19:25:31.015Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/c9b7afa2929b697a164ae18f35aba517bcab85efcf19cb48ffa5ac66642b/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3b33be8308a479f3a3a6d3a664861d6b5f8b1ad8822798f5a7e5d9af0b924eb", size = 4742863, upload-time = "2026-06-11T19:25:32.468Z" }, + { url = "https://files.pythonhosted.org/packages/57/5c/55a8d753bff924959837c39c9aa37c7813c5929570a2629ae4ece811505f/geoarrow_rust_core-0.6.3-cp313-cp313-pyemscripten_2025_0_wasm32.whl", hash = "sha256:a090191ae224e8490a95e68038db7a14df8f0326706f10c2e958621bf6c06ef5", size = 1979216, upload-time = "2026-06-11T19:25:33.905Z" }, + { url = "https://files.pythonhosted.org/packages/71/c7/a9f93af9306fd3743a96cc61bfdd7fc9194c38026f7904c067d4b4a99f0c/geoarrow_rust_core-0.6.3-cp313-cp313-win_amd64.whl", hash = "sha256:2606d6f5afacdb49145b39d3e024efadf33f847b596c19c9b6d3030d6beb2721", size = 3599237, upload-time = "2026-06-11T19:25:35.452Z" }, + { url = "https://files.pythonhosted.org/packages/8a/7a/6993bd89e12d0b227b611a53c657b38e63f906dfca773accae3a1f3815a4/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:370cd1ef46bf18fa598f3038fe6f417b016da211ffe060f2b60e47dd2f684a34", size = 3854961, upload-time = "2026-06-11T19:25:37.045Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c4/92cbcabd2a6add1b69a76a22a349fa219bdfed8026dfab4b8ec230bf9943/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4dbf733db0bc57859d1a34c4bc8c50805f19e60081496967588e43f1f606e885", size = 3708325, upload-time = "2026-06-11T19:25:38.638Z" }, + { url = "https://files.pythonhosted.org/packages/07/b3/8fc34c5efa95cd597328876b6295fbe280d4b71df615655aaa2cd1618881/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45ac6715e790b1ca9be38ceb8ee39cdfe29395d29c83541f7a1190812290d81d", size = 4196828, upload-time = 
"2026-06-11T19:25:40.329Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f2/bd2026862995ff96eb6b94d2fc56f7bf737d13f6bac9662481eaae23d079/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d14917d471dce8ee5a0976ec50b5da800bab0117bfd72bc56e23518a1dbbdb3a", size = 4265577, upload-time = "2026-06-11T19:25:41.91Z" }, + { url = "https://files.pythonhosted.org/packages/3e/01/73d69c5205a34e043026a73048d210f448a986ebb577deee7ceb1923fb5a/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43a371299305388663131321f0d623fc70ca4a3840f973598946b5183e5ba4e4", size = 5592303, upload-time = "2026-06-11T19:25:43.503Z" }, + { url = "https://files.pythonhosted.org/packages/98/20/fe35466e526a5d363ebd9c9dd16985dbad7fd677b90e1f123a8180bceb44/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23eddb8dd65dfefb397762cc3c3f6bfaffb4271641bd9dc8043a9ab3aa4cd72a", size = 4409972, upload-time = "2026-06-11T19:25:45.114Z" }, + { url = "https://files.pythonhosted.org/packages/e5/c8/dc588827ad6e8dad75413bc1d35b5189c8a011a2be4827499a4ab9402253/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43ce7b3aaeb0e8c8ad7c37c84ceed49e10d0929a5a92042c3f6ec5ef33271de4", size = 4250885, upload-time = "2026-06-11T19:25:46.649Z" }, + { url = "https://files.pythonhosted.org/packages/e6/e2/a9923e4c5848ace6e3e6f09a40d3860955f7d836675affe35bc79bc27033/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c67201bd662e4732a822f91651111bc024329b3e71eba9f4eed19e58c9cf789b", size = 4742518, upload-time = "2026-06-11T19:25:48.098Z" }, + { url = "https://files.pythonhosted.org/packages/e6/c7/3112def9e93e88341210dd22b4d04c598fb4d0726adef2114b68157354d5/geoarrow_rust_core-0.6.3-cp314-cp314-pyemscripten_2026_0_wasm32.whl", hash = 
"sha256:8461e6d07a7b39ab099c9885a68d5e7983d4e83a82a42dd5b331c543683c9d6e", size = 1959191, upload-time = "2026-06-11T19:25:49.668Z" }, + { url = "https://files.pythonhosted.org/packages/ed/0f/de74ce2171c408e4b4a7660f69f6dfaa294797a18a209fa85b1ea79be141/geoarrow_rust_core-0.6.3-cp314-cp314-win_amd64.whl", hash = "sha256:5d2fd45d09bf700e0ca4d30b51ebcd59fb8d1a9eb4a4d7b4fc5f53a6cca59475", size = 3603948, upload-time = "2026-06-11T19:25:51.078Z" }, ] [[package]] @@ -797,8 +686,7 @@ version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ @@ -842,14 +730,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/14/1ec1ba4df851b477d802285e8b770f65e6774f0d6272e4e8548c8758892c/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a10e67d95a134dbb5f657fe3436ea645c6760a4ffef44df211f7d9b8fb687e6", size = 10499137, upload-time = "2025-12-03T19:02:24.514Z" }, { url = "https://files.pythonhosted.org/packages/a5/66/7ad618415790671664e76596c000e812e0bd39e8f347f4eb7b8e3f519a55/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:61ccbb528bbe4834849c501e5990a4a6f4b87976ca6a22df7859f16760c79590", size = 10394123, upload-time = "2025-12-03T19:02:01.248Z" }, { url = 
"https://files.pythonhosted.org/packages/43/4b/4520af8c694ca0932f995c91d604837741522bd02b66414fdff4521abc98/geoarrow_rust_io-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:aa46f6beda6c267f420ea390f071fadd0161094c1db8d71ad54002c006fe7f21", size = 8989484, upload-time = "2025-12-03T19:02:40.081Z" }, - { url = "https://files.pythonhosted.org/packages/69/87/efadbf1bb9d359f55791f7198cf9aa87f0272be6a2d373f5844f5e59cd1e/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:46e3e41b726b250b44a829ab41489e5008280acb8af8e68001230babf04bafd8", size = 9780411, upload-time = "2025-11-21T02:11:30.128Z" }, - { url = "https://files.pythonhosted.org/packages/95/73/5e108b286b219d3a46042cfa0830e0f075f4addd01f83f7c851a933919ae/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb95364b726c34c23fb93ebc9c08b8fa1d52062a4a9c1ac614ff8761a339ba7a", size = 9316307, upload-time = "2025-11-21T02:11:21.195Z" }, - { url = "https://files.pythonhosted.org/packages/06/76/89c387d6d4d303feef328fc9c63df76cea52963e2046f2c092b434fb04a9/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:691a67ef3a5214fb704d1a19d33a9ddf173483c3943056fb965101c19b0edd28", size = 10309182, upload-time = "2025-11-21T02:10:34.063Z" }, - { url = "https://files.pythonhosted.org/packages/ff/08/34ed2d76ebfb34ed6bf3312defad16b2b5246e40d59e46443a6fe19e85dd/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91c82e9cbae6759798a8e4a87adb13ea617090a5498f384fc56c44775653d7f0", size = 11291230, upload-time = "2025-11-21T02:10:57.771Z" }, - { url = "https://files.pythonhosted.org/packages/e9/f5/9c25512c1f31101125555367e55ff28f72f449c8f56ff06c5be9e3feb9e5/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9c2b609addc7a810eab5cd573243710d95afe8486f829edd05b311d51bbb5af", size = 13300664, upload-time = "2025-11-21T02:10:46.082Z" }, - { url = 
"https://files.pythonhosted.org/packages/f5/aa/14be165b439d3a3ffc6ced96f971b02df255e86b82c7e1f9f340d35689c3/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6abdc80e130f472f55598543a4bb9ba522d6502a5d80017a952027a9e9c1d1ce", size = 10486589, upload-time = "2025-11-21T02:11:09.681Z" }, - { url = "https://files.pythonhosted.org/packages/5a/df/1c36bae723561785ce47e463f6366a3c52994795a168d7c4ed5e457e9a37/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c4638a89d61629110dde474b3d410ee2e71c89d2035ab2f2557857e7eee4ea30", size = 10395106, upload-time = "2025-11-21T02:10:20.832Z" }, - { url = "https://files.pythonhosted.org/packages/47/d4/4e9cffad7647c07a5cd1cce68c97102dd011652168e3e09a2dedc1253a5e/geoarrow_rust_io-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:4811e96b1777fcf12ac2416872407b1e4717f9a59fe5b80ce02b1e9a087d1b5e", size = 8988735, upload-time = "2025-11-21T02:11:39.164Z" }, { url = "https://files.pythonhosted.org/packages/e6/9f/32059400bb853eafe5d37d8c4ae9e48cd9c43820287e435cc1566f42208e/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef94f84ba4efb42d63588241733e1b62bbdb4edeac5513baeb7bfb07db4f204a", size = 10303111, upload-time = "2025-11-21T02:10:36.067Z" }, { url = "https://files.pythonhosted.org/packages/6c/a2/7db0a685eafa41e9565a3c4e441f41d2630c084f616d2669c5fe8f5805ef/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:872dd92c52b2df342d34ac42d1b710c91c58e9dd93f5c88098816f9cd9dc8a84", size = 11299498, upload-time = "2025-11-21T02:11:00.19Z" }, { url = "https://files.pythonhosted.org/packages/13/b4/1bfbfbe828ca51b4f314d9f70514c2ff19923714aa7d51ef1b0ec8600aed/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:235a7ea94faa95a4699f6577765a5e5a88bee079828c3d9015d9d5c6c240459c", size = 13299230, upload-time = 
"2025-11-21T02:10:48.12Z" }, @@ -913,13 +793,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" }, { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" }, { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e2/33efd823a879dc7b60c10192df1900ee5c200f8e782663a41a3b2aecd143/grpcio-1.75.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:c09fba33327c3ac11b5c33dbdd8218eef8990d78f83b1656d628831812a8c0fb", size = 5706679, upload-time = "2025-09-26T09:03:10.218Z" }, - { url = "https://files.pythonhosted.org/packages/77/90/b80e75f8cce758425b2772742eed4e9db765a965d902ba4b7f239b2513de/grpcio-1.75.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c12121e509b9f8b0914d10054d24120237d19e870b1cd82acbb8a9b9ddd198a3", size = 6291926, upload-time = "2025-09-26T09:03:16.282Z" }, - { url = "https://files.pythonhosted.org/packages/40/5f/e6033d8f99063350e20873a46225468b73045b9ef2c8cba73d66a87c3fd5/grpcio-1.75.1-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:73577a93e692b3474b1bfe84285d098de36705dbd838bb4d6a056d326e4dc880", size = 6950040, upload-time = "2025-09-26T09:03:18.874Z" }, - { url = 
"https://files.pythonhosted.org/packages/01/12/34076c079b45af5aed40f037fffe388d7fbe90dd539ed01e4744c926d227/grpcio-1.75.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e19e7dfa0d7ca7dea22be464339e18ac608fd75d88c56770c646cdabe54bc724", size = 6465780, upload-time = "2025-09-26T09:03:21.219Z" }, - { url = "https://files.pythonhosted.org/packages/e4/c5/ee6fd69a9f6e7288d04da010ad7480a0566d2aac81097ff4dafbc5ffa9b6/grpcio-1.75.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e1c28f51c1cf67eccdfc1065e8e866c9ed622f09773ca60947089c117f848a1", size = 7098308, upload-time = "2025-09-26T09:03:23.875Z" }, - { url = "https://files.pythonhosted.org/packages/78/32/f2be13f13035361768923159fe20470a7d22db2c7c692b952e21284f56e5/grpcio-1.75.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:030a6164bc2ca726052778c0cf8e3249617a34e368354f9e6107c27ad4af8c28", size = 8042268, upload-time = "2025-09-26T09:03:26.268Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2d/1bb0572f0a2eaab100b4635c6c2cd0d37e3cda5554037e3f90b1bc428d56/grpcio-1.75.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:67697efef5a98d46d5db7b1720fa4043536f8b8e5072a5d61cfca762f287e939", size = 7491470, upload-time = "2025-09-26T09:03:28.906Z" }, ] [[package]] @@ -927,8 +800,7 @@ name = "h5py" version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/5d/57/dfb3c5c3f1bf5f5ef2e59a22dec4ff1f3d7408b55bfcefcfb0ea69ef21c6/h5py-3.14.0.tar.gz", hash = "sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4", size = 424323, upload-time = "2025-06-06T14:06:15.01Z" } @@ -941,8 +813,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f9/f00de11c82c88bfc1ef22633557bfba9e271e0cb3189ad704183fc4a2644/h5py-3.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882", size = 4929422, upload-time = "2025-06-06T14:05:18.399Z" }, { url = "https://files.pythonhosted.org/packages/0d/ce/3a21d87896bc7e3e9255e0ad5583ae31ae9e6b4b00e0bcb2a67e2b6acdbc/h5py-3.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb", size = 4700675, upload-time = "2025-06-06T14:05:37.38Z" }, { url = "https://files.pythonhosted.org/packages/e7/ec/86f59025306dcc6deee5fda54d980d077075b8d9889aac80f158bd585f1b/h5py-3.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13", size = 4921632, upload-time = "2025-06-06T14:05:43.464Z" }, - { url = "https://files.pythonhosted.org/packages/66/40/b423b57696514e05aa7bb06150ef96667d0e0006cc6de7ab52c71734ab51/h5py-3.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:573c33ad056ac7c1ab6d567b6db9df3ffc401045e3f605736218f96c1e0490c6", size = 4326368, upload-time = "2025-06-06T14:06:00.782Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/e088f89f04fdbe57ddf9de377f857158d3daa38cf5d0fb20ef9bd489e313/h5py-3.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccbe17dc187c0c64178f1a10aa274ed3a57d055117588942b8a08793cc448216", size = 4559686, upload-time = "2025-06-06T14:06:07.416Z" }, ] [[package]] @@ -965,14 +835,14 @@ name = 
"huggingface-hub" version = "0.35.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", marker = "python_full_version >= '3.10'" }, - { name = "hf-xet", marker = "(python_full_version >= '3.10' and platform_machine == 'aarch64') or (python_full_version >= '3.10' and platform_machine == 'amd64') or (python_full_version >= '3.10' and platform_machine == 'arm64') or (python_full_version >= '3.10' and platform_machine == 'x86_64')" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pyyaml", marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } wheels = [ @@ -988,18 +858,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp", marker = 
"python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, -] - [[package]] name = "iniconfig" version = "2.1.0" @@ -1030,83 +888,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] -[[package]] -name = "keras" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/8b/065f94ba03282fa41b2d76942b87a180a9913312c4611ea7d6508fbbc114/keras-2.7.0-py2.py3-none-any.whl", hash = "sha256:0c33ae1f728064ca0d35dfba999e9c316f03623bf5688c82fb83cc74a80ea248", size = 1332171, upload-time = "2021-11-03T16:16:34.318Z" }, -] - [[package]] name = "keras" version = "3.11.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = 
"python_full_version >= '3.10'" }, - { name = "namex", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "h5py" }, + { name = "ml-dtypes" }, + { name = "namex" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "optree", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "rich", marker = "python_full_version >= '3.10'" }, + { name = "optree" }, + { name = "packaging" }, + { name = "rich" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6a/89/646425fe9a46f9053430e1271f817c36041c6f33469950a3caafc3d2591e/keras-3.11.3.tar.gz", hash = "sha256:efda616835c31b7d916d72303ef9adec1257320bc9fd4b2b0138840fc65fb5b7", size = 1065906, upload-time = "2025-08-21T22:08:57.643Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/94/5b/4c778cc921ce4b864b238f63f8e3ff6e954ab19b80c9fa680593ad8093d4/keras-3.11.3-py3-none-any.whl", hash = "sha256:f484f050e05ee400455b05ec8c36ed35edc34de94256b6073f56cfe68f65491f", size = 1408438, upload-time = "2025-08-21T22:08:55.858Z" }, ] -[[package]] -name = "keras-preprocessing" -version = "1.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5e/f1/b44337faca48874333769a29398fe4666686733c8880aa160b9fd5dfe600/Keras_Preprocessing-1.1.2.tar.gz", hash = 
"sha256:add82567c50c8bc648c14195bf544a5ce7c1f76761536956c3d2978970179ef3", size = 163598, upload-time = "2020-05-14T03:53:48.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/4c/7c3275a01e12ef9368a892926ab932b33bb13d55794881e3573482b378a7/Keras_Preprocessing-1.1.2-py2.py3-none-any.whl", hash = "sha256:7b82029b130ff61cc99b55f3bd27427df4838576838c5b2f65940e4fcec99a7b", size = 42581, upload-time = "2020-05-14T03:53:47.192Z" }, -] - [[package]] name = "lance-namespace" -version = "0.8.0" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/80/2b6eaa08c5e25915acaa6368a70211a25b5ba9d2d6006450e68a73936164/lance_namespace-0.8.0.tar.gz", hash = "sha256:c4a79ee221a3b2315c29863ad12d85fcf219a13158e26149d63e21dc4b4673a7", size = 10756, upload-time = "2026-06-01T08:47:10.183Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/12/f7ab93b29be3edbf5fc3610714bf2d06088e7f4524bfb38dfd6852458b08/lance_namespace-0.8.6.tar.gz", hash = "sha256:18232e721c8188145f4ec9389cc2dfbeeabf54a619d94885ea1b3375bee9f4af", size = 11529, upload-time = "2026-06-12T17:36:41.651Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/bd/7b40a08fb132fab39a6caebf832fdf6b9befc71be9413beb9be0a9d927d4/lance_namespace-0.8.0-py3-none-any.whl", hash = "sha256:782cf9e332f46bf06836722dd98b53ca8495ad98bb541501ff6876c89b67ec90", size = 12579, upload-time = "2026-06-01T08:47:10.91Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1b/5b1668ee2dc8910965f390640359112a31157092fcf8e000b89c79b58708/lance_namespace-0.8.6-py3-none-any.whl", hash = "sha256:571eae34f9aad70e5b05020416c2860889b9ec82993ccd0eb015e7b39c3ea309", size = 13383, upload-time = "2026-06-12T17:36:43.456Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.8.0" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies 
= [ { name = "pydantic" }, { name = "python-dateutil" }, { name = "typing-extensions" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/37/06fcd5a8969381e0ba953d51990af8d331bdccbc62458bf2eed30d064573/lance_namespace_urllib3_client-0.8.0.tar.gz", hash = "sha256:4f060f05ebf3c04aeaeb0d2022cbe77648a3df290f02cd2c305e5797d0fc1fdd", size = 203710, upload-time = "2026-06-01T08:47:13.404Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 228772, upload-time = "2026-06-12T17:36:42.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/43/e280727feee958f303bc58d5fa912b07734a0831f756d841654d500c2c34/lance_namespace_urllib3_client-0.8.0-py3-none-any.whl", hash = "sha256:6734e341b726e5cc96a0cd257cef27eb9d03013f2d151526ee426cef8e63e228", size = 336669, upload-time = "2026-06-01T08:47:11.88Z" }, + { url = "https://files.pythonhosted.org/packages/c5/90/1e27de15cd1b16785a1c7312beb0a59e75c8344a815f600f58173a565bd1/lance_namespace_urllib3_client-0.8.6-py3-none-any.whl", hash = "sha256:9d78249c3fb15aa3d15d668f78f04a275af3d08d800a7027492f37996ac4968b", size = 369950, upload-time = "2026-06-12T17:36:40.438Z" }, ] [[package]] @@ -1125,9 +951,6 @@ wheels = [ name = "markdown" version = "3.9" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, -] sdist = { url = 
"https://files.pythonhosted.org/packages/8d/37/02347f6d6d8279247a5837082ebc26fc0d5aaeaf75aa013fcbb433c777ab/markdown-3.9.tar.gz", hash = "sha256:d2900fe1782bd33bdbbd56859defef70c2e78fc46668f8eb9df3128138f2cb6a", size = 364585, upload-time = "2025-09-04T20:25:22.885Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/70/ae/44c4a6a4cbb496d93c6257954260fe3a6e91b7bed2240e5dad2a717f5111/markdown-3.9-py3-none-any.whl", hash = "sha256:9f4d91ed810864ea88a6f32c07ba8bee1346c0cc1f6b1f9f6c822f2a9667d280", size = 107441, upload-time = "2025-09-04T20:25:21.784Z" }, @@ -1138,7 +961,7 @@ name = "markdown-it-py" version = "4.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mdurl", marker = "python_full_version >= '3.10'" }, + { name = "mdurl" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } wheels = [ @@ -1228,17 +1051,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = 
"2025-09-27T18:37:28.327Z" }, - { url = "https://files.pythonhosted.org/packages/56/23/0d8c13a44bde9154821586520840643467aee574d8ce79a17da539ee7fed/markupsafe-3.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26", size = 11623, upload-time = "2025-09-27T18:37:29.296Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/07a2cb9a8045d5f3f0890a8c3bc0859d7a47bfd9a560b563899bec7b72ed/markupsafe-3.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc", size = 12049, upload-time = "2025-09-27T18:37:30.234Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e4/6be85eb81503f8e11b61c0b6369b6e077dcf0a74adbd9ebf6b349937b4e9/markupsafe-3.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c", size = 21923, upload-time = "2025-09-27T18:37:31.177Z" }, - { url = "https://files.pythonhosted.org/packages/6f/bc/4dc914ead3fe6ddaef035341fee0fc956949bbd27335b611829292b89ee2/markupsafe-3.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42", size = 20543, upload-time = "2025-09-27T18:37:32.168Z" }, - { url = "https://files.pythonhosted.org/packages/89/6e/5fe81fbcfba4aef4093d5f856e5c774ec2057946052d18d168219b7bd9f9/markupsafe-3.0.3-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b", size = 20585, upload-time = "2025-09-27T18:37:33.166Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f6/e0e5a3d3ae9c4020f696cd055f940ef86b64fe88de26f3a0308b9d3d048c/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758", size = 21387, upload-time = 
"2025-09-27T18:37:34.185Z" }, - { url = "https://files.pythonhosted.org/packages/c8/25/651753ef4dea08ea790f4fbb65146a9a44a014986996ca40102e237aa49a/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2", size = 20133, upload-time = "2025-09-27T18:37:35.138Z" }, - { url = "https://files.pythonhosted.org/packages/dc/0a/c3cf2b4fef5f0426e8a6d7fce3cb966a17817c568ce59d76b92a233fdbec/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d", size = 20588, upload-time = "2025-09-27T18:37:36.096Z" }, - { url = "https://files.pythonhosted.org/packages/cd/1b/a7782984844bd519ad4ffdbebbba2671ec5d0ebbeac34736c15fb86399e8/markupsafe-3.0.3-cp39-cp39-win32.whl", hash = "sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7", size = 14566, upload-time = "2025-09-27T18:37:37.09Z" }, - { url = "https://files.pythonhosted.org/packages/18/1f/8d9c20e1c9440e215a44be5ab64359e207fcb4f675543f1cf9a2a7f648d0/markupsafe-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e", size = 15053, upload-time = "2025-09-27T18:37:38.054Z" }, - { url = "https://files.pythonhosted.org/packages/4e/d3/fe08482b5cd995033556d45041a4f4e76e7f0521112a9c9991d40d39825f/markupsafe-3.0.3-cp39-cp39-win_arm64.whl", hash = "sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8", size = 13928, upload-time = "2025-09-27T18:37:39.037Z" }, ] [[package]] @@ -1279,8 +1091,7 @@ name = "ml-dtypes" version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = 
"2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" } @@ -1315,10 +1126,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" }, { url = "https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" }, { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" }, - { url = "https://files.pythonhosted.org/packages/19/2d/c61af51173083bbf2a3b0f1a1a01d50ef1830436880027433d1b75271083/ml_dtypes-0.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5ee72568d46b9533ad54f78b1e1f3067c0534c5065120ea8ecc6f210d22748b3", size = 663552, upload-time = "2025-07-29T18:39:13.102Z" }, - { url = 
"https://files.pythonhosted.org/packages/61/0e/a628f2aefd719745e8a13492375a55cedea77c0cfc917b1ce11bde435c68/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01de48de4537dc3c46e684b969a40ec36594e7eeb7c69e9a093e7239f030a28a", size = 4952704, upload-time = "2025-07-29T18:39:14.829Z" }, - { url = "https://files.pythonhosted.org/packages/f8/2e/5ba92f1f99d1f5f62bffec614a5b8161e55c3961257c902fa26dbe909baa/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b1a6e231b0770f2894910f1dce6d2f31d65884dbf7668f9b08d73623cdca909", size = 4923538, upload-time = "2025-07-29T18:39:16.581Z" }, - { url = "https://files.pythonhosted.org/packages/70/3b/f801c69027866ea6e387224551185fedef62ad8e2e71181ec0d9dda905f7/ml_dtypes-0.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:a4f39b9bf6555fab9bfb536cf5fdd1c1c727e8d22312078702e9ff005354b37f", size = 206567, upload-time = "2025-07-29T18:39:18.047Z" }, ] [[package]] @@ -1335,7 +1142,7 @@ name = "multidict" version = "6.6.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version == '3.10.*'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } wheels = [ @@ -1429,24 +1236,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, upload-time = "2025-08-11T12:08:12.439Z" }, { url = 
"https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" }, { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" }, - { url = "https://files.pythonhosted.org/packages/d4/d3/f04c5db316caee9b5b2cbba66270b358c922a959855995bedde87134287c/multidict-6.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:af7618b591bae552b40dbb6f93f5518328a949dac626ee75927bba1ecdeea9f4", size = 76977, upload-time = "2025-08-11T12:08:16.667Z" }, - { url = "https://files.pythonhosted.org/packages/70/39/a6200417d883e510728ab3caec02d3b66ff09e1c85e0aab2ba311abfdf06/multidict-6.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b6819f83aef06f560cb15482d619d0e623ce9bf155115150a85ab11b8342a665", size = 44878, upload-time = "2025-08-11T12:08:18.157Z" }, - { url = "https://files.pythonhosted.org/packages/6f/7e/815be31ed35571b137d65232816f61513fcd97b2717d6a9d7800b5a0c6e0/multidict-6.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4d09384e75788861e046330308e7af54dd306aaf20eb760eb1d0de26b2bea2cb", size = 44546, upload-time = "2025-08-11T12:08:19.694Z" }, - { url = "https://files.pythonhosted.org/packages/e2/f1/21b5bff6a8c3e2aff56956c241941ace6b8820e1abe6b12d3c52868a773d/multidict-6.6.4-cp39-cp39-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:a59c63061f1a07b861c004e53869eb1211ffd1a4acbca330e3322efa6dd02978", size = 223020, upload-time = "2025-08-11T12:08:21.554Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/59/37083f1dd3439979a0ffeb1906818d978d88b4cc7f4600a9f89b1cb6713c/multidict-6.6.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350f6b0fe1ced61e778037fdc7613f4051c8baf64b1ee19371b42a3acdb016a0", size = 240528, upload-time = "2025-08-11T12:08:23.45Z" }, - { url = "https://files.pythonhosted.org/packages/d1/f0/f054d123c87784307a27324c829eb55bcfd2e261eb785fcabbd832c8dc4a/multidict-6.6.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0c5cbac6b55ad69cb6aa17ee9343dfbba903118fd530348c330211dc7aa756d1", size = 219540, upload-time = "2025-08-11T12:08:24.965Z" }, - { url = "https://files.pythonhosted.org/packages/e8/26/8f78ce17b7118149c17f238f28fba2a850b660b860f9b024a34d0191030f/multidict-6.6.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:630f70c32b8066ddfd920350bc236225814ad94dfa493fe1910ee17fe4365cbb", size = 251182, upload-time = "2025-08-11T12:08:26.511Z" }, - { url = "https://files.pythonhosted.org/packages/00/c3/a21466322d69f6594fe22d9379200f99194d21c12a5bbf8c2a39a46b83b6/multidict-6.6.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8d4916a81697faec6cb724a273bd5457e4c6c43d82b29f9dc02c5542fd21fc9", size = 249371, upload-time = "2025-08-11T12:08:28.075Z" }, - { url = "https://files.pythonhosted.org/packages/c2/8e/2e673124eb05cf8dc82e9265eccde01a36bcbd3193e27799b8377123c976/multidict-6.6.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e42332cf8276bb7645d310cdecca93a16920256a5b01bebf747365f86a1675b", size = 239235, upload-time = "2025-08-11T12:08:29.937Z" }, - { url = "https://files.pythonhosted.org/packages/2b/2d/bdd9f05e7c89e30a4b0e4faf0681a30748f8d1310f68cfdc0e3571e75bd5/multidict-6.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:f3be27440f7644ab9a13a6fc86f09cdd90b347c3c5e30c6d6d860de822d7cb53", size = 237410, upload-time = "2025-08-11T12:08:31.872Z" }, - { url = "https://files.pythonhosted.org/packages/46/4c/3237b83f8ca9a2673bb08fc340c15da005a80f5cc49748b587c8ae83823b/multidict-6.6.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:21f216669109e02ef3e2415ede07f4f8987f00de8cdfa0cc0b3440d42534f9f0", size = 232979, upload-time = "2025-08-11T12:08:33.399Z" }, - { url = "https://files.pythonhosted.org/packages/55/a6/a765decff625ae9bc581aed303cd1837955177dafc558859a69f56f56ba8/multidict-6.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:d9890d68c45d1aeac5178ded1d1cccf3bc8d7accf1f976f79bf63099fb16e4bd", size = 240979, upload-time = "2025-08-11T12:08:35.02Z" }, - { url = "https://files.pythonhosted.org/packages/6b/2d/9c75975cb0c66ea33cae1443bb265b2b3cd689bffcbc68872565f401da23/multidict-6.6.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:edfdcae97cdc5d1a89477c436b61f472c4d40971774ac4729c613b4b133163cb", size = 246849, upload-time = "2025-08-11T12:08:37.038Z" }, - { url = "https://files.pythonhosted.org/packages/3e/71/d21ac0843c1d8751fb5dcf8a1f436625d39d4577bc27829799d09b419af7/multidict-6.6.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0b2e886624be5773e69cf32bcb8534aecdeb38943520b240fed3d5596a430f2f", size = 241798, upload-time = "2025-08-11T12:08:38.669Z" }, - { url = "https://files.pythonhosted.org/packages/94/3d/1d8911e53092837bd11b1c99d71de3e2a9a26f8911f864554677663242aa/multidict-6.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:be5bf4b3224948032a845d12ab0f69f208293742df96dc14c4ff9b09e508fc17", size = 235315, upload-time = "2025-08-11T12:08:40.266Z" }, - { url = "https://files.pythonhosted.org/packages/86/c5/4b758df96376f73e936b1942c6c2dfc17e37ed9d5ff3b01a811496966ca0/multidict-6.6.4-cp39-cp39-win32.whl", hash = "sha256:10a68a9191f284fe9d501fef4efe93226e74df92ce7a24e301371293bd4918ae", size = 41434, upload-time = "2025-08-11T12:08:41.965Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/16/f1dfa2a0f25f2717a5e9e5fe8fd30613f7fe95e3530cec8d11f5de0b709c/multidict-6.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee25f82f53262f9ac93bd7e58e47ea1bdcc3393cef815847e397cba17e284210", size = 46186, upload-time = "2025-08-11T12:08:43.367Z" }, - { url = "https://files.pythonhosted.org/packages/88/7d/a0568bac65438c494cb6950b29f394d875a796a237536ac724879cf710c9/multidict-6.6.4-cp39-cp39-win_arm64.whl", hash = "sha256:f9867e55590e0855bcec60d4f9a092b69476db64573c9fe17e92b0c50614c16a", size = 43115, upload-time = "2025-08-11T12:08:45.126Z" }, { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, ] @@ -1455,14 +1244,12 @@ name = "multiprocess" version = "0.70.16" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, + { name = "dill" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980, upload-time = "2024-01-28T18:52:15.731Z" }, { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", 
size = 134982, upload-time = "2024-01-28T18:52:17.783Z" }, - { url = "https://files.pythonhosted.org/packages/d8/94/8638a89f93c80df329116e6781a060506c7e91e1f4370dc831e9d17a041d/multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", size = 133497, upload-time = "2024-01-28T18:52:22.644Z" }, - { url = "https://files.pythonhosted.org/packages/89/21/222066f6bb8d8af287923ae3bd26cf4699a9ce020228ac273caca1de8250/multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", size = 133498, upload-time = "2024-01-28T18:52:24.576Z" }, { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, @@ -1479,24 +1266,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/bc/465daf1de06409cdd4532082806770ee0d8d7df434da79c76564d0f69741/namex-0.1.0-py3-none-any.whl", hash = "sha256:e2012a474502f1e2251267062aae3114611f07df4224b6e06334c57b0f2ce87c", size = 5905, upload-time = "2025-05-26T23:17:37.695Z" }, ] -[[package]] -name = "networkx" -version = "3.2.1" -source = { registry 
= "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928, upload-time = "2023-10-28T08:41:39.364Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2", size = 1647772, upload-time = "2023-10-28T08:41:36.945Z" }, -] - [[package]] name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } wheels = [ @@ -1527,67 +1302,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] -[[package]] -name = "numpy" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" }, - { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" }, - { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" }, - { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" }, - { url = 
"https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" }, - { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" }, - { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", 
hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = "2024-08-26T20:08:15.83Z" }, - { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" }, - { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" }, - { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" }, - { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" }, - { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, 
upload-time = "2024-08-26T20:10:19.732Z" }, - { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" }, - { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" }, - { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" }, - { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" }, - { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" }, - { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" }, - { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" }, - { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" }, - { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" }, - { url = 
"https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" }, - { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = "2024-08-26T20:17:36.72Z" }, - { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" }, - { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" }, - { url = 
"https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" }, -] - [[package]] name = "numpy" version = "2.2.6" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ @@ -1874,7 +1594,7 @@ name = "optree" version = "0.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/56/c7/0853e0c59b135dff770615d2713b547b6b3b5cde7c10995b4a5825244612/optree-0.17.0.tar.gz", hash = "sha256:5335a5ec44479920620d72324c66563bd705ab2a698605dd4b6ee67dbcad7ecd", size = 163111, upload-time = "2025-07-25T11:26:11.586Z" } wheels = [ @@ -1915,11 +1635,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/d3/8819a2d5105a240d6793d11a61d597db91756ce84da5cee08808c6b8f61f/optree-0.17.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:875c017890a4b5d566af5593cab67fe3c4845544942af57e6bb9dea17e060297", size = 439080, upload-time = "2025-07-25T11:25:42.605Z" }, { url = "https://files.pythonhosted.org/packages/c6/ef/9dbd34dfd1ad89feb239ca9925897a14ac94f190379a3bd991afdfd94186/optree-0.17.0-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ffa5686191139f763e13445a169765c83517164bc28e60dbedb19bed2b2655f1", size = 439422, upload-time = 
"2025-07-25T11:25:43.672Z" }, { url = "https://files.pythonhosted.org/packages/86/ca/a7a7549af2951925a692df508902ed2a6a94a51bc846806d2281b1029ef9/optree-0.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:575cf48cc2190acb565bd2b26b6f9b15c4e3b60183e86031215badc9d5441345", size = 426579, upload-time = "2025-07-25T11:25:44.765Z" }, - { url = "https://files.pythonhosted.org/packages/1d/29/3bb53de2de3b36a51e46b6d9ada7ee1a3a312ac461cd54292a023adc807c/optree-0.17.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:537498cf7bf7a4fe71f7ffd815e72b8672aea0fac82e1513f6b6e35e8569f5aa", size = 350302, upload-time = "2025-07-25T11:25:52.016Z" }, - { url = "https://files.pythonhosted.org/packages/2b/3b/d17a31447ed7ef6f10bd0caf40742b016fcdeaa3abb7568307b04a0f50cf/optree-0.17.0-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3b3bb2326b550ddb048e3454fad40183b7fed74dda4351b016d20362809180af", size = 405358, upload-time = "2025-07-25T11:25:53.085Z" }, - { url = "https://files.pythonhosted.org/packages/db/f3/b9f0a8c98fd0c7f53fa9d9a46d75bb1182aeecd7ecde6f353d3e69ec9618/optree-0.17.0-cp39-cp39-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d3d702044e5acbec2cf8349789f6b096057bd00dc8e1e1c97b990347279fda", size = 402694, upload-time = "2025-07-25T11:25:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/cb/dd/0d9d7426fd6b5d90ad40e4d93717a955d4257d06574dfe7a1da0d24cb06c/optree-0.17.0-cp39-cp39-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a9155e82717be1dda1f3c1244e9cb5b3733d5dd3ba47702730c7816be083a5cb", size = 398857, upload-time = "2025-07-25T11:25:55.921Z" }, - { url = "https://files.pythonhosted.org/packages/d8/57/dacec3f8c70f4685bb07fce19cf3361037fde2b596f6f7228e1a4b39677b/optree-0.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8e825501f55360e8381718623b094579dedc485e57010e01593d72a43b43e68", size = 387849, upload-time = 
"2025-07-25T11:25:57.046Z" }, { url = "https://files.pythonhosted.org/packages/ed/d7/3036d15c028c447b1bd65dcf8f66cfd775bfa4e52daa74b82fb1d3c88faf/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adde1427e0982cfc5f56939c26b4ebbd833091a176734c79fb95c78bdf833dff", size = 350952, upload-time = "2025-07-25T11:26:02.692Z" }, { url = "https://files.pythonhosted.org/packages/71/45/e710024ef77324e745de48efd64f6270d8c209f14107a48ffef4049ac57a/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a80b7e5de5dd09b9c8b62d501e29a3850b047565c336c9d004b07ee1c01f4ae1", size = 389568, upload-time = "2025-07-25T11:26:04.094Z" }, { url = "https://files.pythonhosted.org/packages/69/c4/94a187ed3ca71194b9da6a276790e1703c7544c8f695ac915214ae8ce934/optree-0.17.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f87f6f39015fc82d7adeee19900d246b89911319726e93cb2dbd4d1a809899bd", size = 363728, upload-time = "2025-07-25T11:26:07.959Z" }, @@ -1940,8 +1655,7 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "python-dateutil" }, { name = "pytz" }, @@ -1996,13 +1710,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" }, { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" }, { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, - { url = "https://files.pythonhosted.org/packages/56/b4/52eeb530a99e2a4c55ffcd352772b599ed4473a0f892d127f4147cf0f88e/pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2", size = 11567720, upload-time = "2025-09-29T23:33:06.209Z" }, - { url = "https://files.pythonhosted.org/packages/48/4a/2d8b67632a021bced649ba940455ed441ca854e57d6e7658a6024587b083/pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8", size = 10810302, upload-time = "2025-09-29T23:33:35.846Z" }, - { url = "https://files.pythonhosted.org/packages/13/e6/d2465010ee0569a245c975dc6967b801887068bc893e908239b1f4b6c1ac/pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff", size = 12154874, upload-time = "2025-09-29T23:33:49.939Z" }, - { url = "https://files.pythonhosted.org/packages/1f/18/aae8c0aa69a386a3255940e9317f793808ea79d0a525a97a903366bb2569/pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29", size = 
12790141, upload-time = "2025-09-29T23:34:05.655Z" }, - { url = "https://files.pythonhosted.org/packages/f7/26/617f98de789de00c2a444fbe6301bb19e66556ac78cff933d2c98f62f2b4/pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73", size = 13208697, upload-time = "2025-09-29T23:34:21.835Z" }, - { url = "https://files.pythonhosted.org/packages/b9/fb/25709afa4552042bd0e15717c75e9b4a2294c3dc4f7e6ea50f03c5136600/pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9", size = 13879233, upload-time = "2025-09-29T23:34:35.079Z" }, - { url = "https://files.pythonhosted.org/packages/98/af/7be05277859a7bc399da8ba68b88c96b27b48740b6cf49688899c6eb4176/pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa", size = 11359119, upload-time = "2025-09-29T23:34:46.339Z" }, ] [[package]] @@ -2091,17 +1798,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" }, { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" }, { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, - { url = 
"https://files.pythonhosted.org/packages/9e/8e/9c089f01677d1264ab8648352dcb7773f37da6ad002542760c80107da816/pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f", size = 5316478, upload-time = "2025-07-01T09:15:52.209Z" }, - { url = "https://files.pythonhosted.org/packages/b5/a9/5749930caf674695867eb56a581e78eb5f524b7583ff10b01b6e5048acb3/pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081", size = 4686522, upload-time = "2025-07-01T09:15:54.162Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/0b85b763eb292b691030795f9f6bb6fcaf8948c39413c81696a01c3577f7/pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4", size = 5853376, upload-time = "2025-07-03T13:11:01.066Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c6/1a230ec0067243cbd60bc2dad5dc3ab46a8a41e21c15f5c9b52b26873069/pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc", size = 7626020, upload-time = "2025-07-03T13:11:06.479Z" }, - { url = "https://files.pythonhosted.org/packages/63/dd/f296c27ffba447bfad76c6a0c44c1ea97a90cb9472b9304c94a732e8dbfb/pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06", size = 5956732, upload-time = "2025-07-01T09:15:56.111Z" }, - { url = "https://files.pythonhosted.org/packages/a5/a0/98a3630f0b57f77bae67716562513d3032ae70414fcaf02750279c389a9e/pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a", size = 6624404, upload-time = "2025-07-01T09:15:58.245Z" }, - { url = 
"https://files.pythonhosted.org/packages/de/e6/83dfba5646a290edd9a21964da07674409e410579c341fc5b8f7abd81620/pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978", size = 6067760, upload-time = "2025-07-01T09:16:00.003Z" }, - { url = "https://files.pythonhosted.org/packages/bc/41/15ab268fe6ee9a2bc7391e2bbb20a98d3974304ab1a406a992dcb297a370/pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d", size = 6700534, upload-time = "2025-07-01T09:16:02.29Z" }, - { url = "https://files.pythonhosted.org/packages/64/79/6d4f638b288300bed727ff29f2a3cb63db054b33518a95f27724915e3fbc/pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71", size = 6277091, upload-time = "2025-07-01T09:16:04.4Z" }, - { url = "https://files.pythonhosted.org/packages/46/05/4106422f45a05716fd34ed21763f8ec182e8ea00af6e9cb05b93a247361a/pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada", size = 6986091, upload-time = "2025-07-01T09:16:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/63/c6/287fd55c2c12761d0591549d48885187579b7c257bef0c6660755b0b59ae/pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb", size = 2422632, upload-time = "2025-07-01T09:16:08.142Z" }, { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" }, { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", 
hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" }, { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" }, @@ -2250,52 +1946,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, { url = "https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" }, { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, - { url = "https://files.pythonhosted.org/packages/6c/39/8ea9bcfaaff16fd0b0fc901ee522e24c9ec44b4ca0229cfffb8066a06959/propcache-0.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a7fad897f14d92086d6b03fdd2eb844777b0c4d7ec5e3bac0fbae2ab0602bbe5", size = 74678, upload-time = "2025-06-09T22:55:41.227Z" }, - { url = "https://files.pythonhosted.org/packages/d3/85/cab84c86966e1d354cf90cdc4ba52f32f99a5bca92a1529d666d957d7686/propcache-0.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1f43837d4ca000243fd7fd6301947d7cb93360d03cd08369969450cc6b2ce3b4", size = 
43829, upload-time = "2025-06-09T22:55:42.417Z" }, - { url = "https://files.pythonhosted.org/packages/23/f7/9cb719749152d8b26d63801b3220ce2d3931312b2744d2b3a088b0ee9947/propcache-0.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:261df2e9474a5949c46e962065d88eb9b96ce0f2bd30e9d3136bcde84befd8f2", size = 43729, upload-time = "2025-06-09T22:55:43.651Z" }, - { url = "https://files.pythonhosted.org/packages/a2/a2/0b2b5a210ff311260002a315f6f9531b65a36064dfb804655432b2f7d3e3/propcache-0.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e514326b79e51f0a177daab1052bc164d9d9e54133797a3a58d24c9c87a3fe6d", size = 204483, upload-time = "2025-06-09T22:55:45.327Z" }, - { url = "https://files.pythonhosted.org/packages/3f/e0/7aff5de0c535f783b0c8be5bdb750c305c1961d69fbb136939926e155d98/propcache-0.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4a996adb6904f85894570301939afeee65f072b4fd265ed7e569e8d9058e4ec", size = 217425, upload-time = "2025-06-09T22:55:46.729Z" }, - { url = "https://files.pythonhosted.org/packages/92/1d/65fa889eb3b2a7d6e4ed3c2b568a9cb8817547a1450b572de7bf24872800/propcache-0.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76cace5d6b2a54e55b137669b30f31aa15977eeed390c7cbfb1dafa8dfe9a701", size = 214723, upload-time = "2025-06-09T22:55:48.342Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e2/eecf6989870988dfd731de408a6fa366e853d361a06c2133b5878ce821ad/propcache-0.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31248e44b81d59d6addbb182c4720f90b44e1efdc19f58112a3c3a1615fb47ef", size = 200166, upload-time = "2025-06-09T22:55:49.775Z" }, - { url = "https://files.pythonhosted.org/packages/12/06/c32be4950967f18f77489268488c7cdc78cbfc65a8ba8101b15e526b83dc/propcache-0.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:abb7fa19dbf88d3857363e0493b999b8011eea856b846305d8c0512dfdf8fbb1", size = 194004, upload-time = "2025-06-09T22:55:51.335Z" }, - { url = "https://files.pythonhosted.org/packages/46/6c/17b521a6b3b7cbe277a4064ff0aa9129dd8c89f425a5a9b6b4dd51cc3ff4/propcache-0.3.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d81ac3ae39d38588ad0549e321e6f773a4e7cc68e7751524a22885d5bbadf886", size = 203075, upload-time = "2025-06-09T22:55:52.681Z" }, - { url = "https://files.pythonhosted.org/packages/62/cb/3bdba2b736b3e45bc0e40f4370f745b3e711d439ffbffe3ae416393eece9/propcache-0.3.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:cc2782eb0f7a16462285b6f8394bbbd0e1ee5f928034e941ffc444012224171b", size = 195407, upload-time = "2025-06-09T22:55:54.048Z" }, - { url = "https://files.pythonhosted.org/packages/29/bd/760c5c6a60a4a2c55a421bc34a25ba3919d49dee411ddb9d1493bb51d46e/propcache-0.3.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:db429c19a6c7e8a1c320e6a13c99799450f411b02251fb1b75e6217cf4a14fcb", size = 196045, upload-time = "2025-06-09T22:55:55.485Z" }, - { url = "https://files.pythonhosted.org/packages/76/58/ced2757a46f55b8c84358d6ab8de4faf57cba831c51e823654da7144b13a/propcache-0.3.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:21d8759141a9e00a681d35a1f160892a36fb6caa715ba0b832f7747da48fb6ea", size = 208432, upload-time = "2025-06-09T22:55:56.884Z" }, - { url = "https://files.pythonhosted.org/packages/bb/ec/d98ea8d5a4d8fe0e372033f5254eddf3254344c0c5dc6c49ab84349e4733/propcache-0.3.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2ca6d378f09adb13837614ad2754fa8afaee330254f404299611bce41a8438cb", size = 210100, upload-time = "2025-06-09T22:55:58.498Z" }, - { url = "https://files.pythonhosted.org/packages/56/84/b6d8a7ecf3f62d7dd09d9d10bbf89fad6837970ef868b35b5ffa0d24d9de/propcache-0.3.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:34a624af06c048946709f4278b4176470073deda88d91342665d95f7c6270fbe", size = 200712, upload-time = 
"2025-06-09T22:55:59.906Z" }, - { url = "https://files.pythonhosted.org/packages/bf/32/889f4903ddfe4a9dc61da71ee58b763758cf2d608fe1decede06e6467f8d/propcache-0.3.2-cp39-cp39-win32.whl", hash = "sha256:4ba3fef1c30f306b1c274ce0b8baaa2c3cdd91f645c48f06394068f37d3837a1", size = 38187, upload-time = "2025-06-09T22:56:01.212Z" }, - { url = "https://files.pythonhosted.org/packages/67/74/d666795fb9ba1dc139d30de64f3b6fd1ff9c9d3d96ccfdb992cd715ce5d2/propcache-0.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:7a2368eed65fc69a7a7a40b27f22e85e7627b74216f0846b04ba5c116e191ec9", size = 42025, upload-time = "2025-06-09T22:56:02.875Z" }, { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] -[[package]] -name = "protobuf" -version = "3.19.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/51/d1/79bfd1f481469b661a2eddab551255536401892722189433282bfb13cfb1/protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4", size = 218071, upload-time = "2022-09-29T22:07:23.03Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/3b/90f805b9e5ecacf8a216f2e5acabc2d3ad965b62803510be41804e6bfbfe/protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1", size = 913631, upload-time = "2022-09-29T21:17:39.095Z" }, - { url = "https://files.pythonhosted.org/packages/26/ef/bd6ba3b4ff9a35944bdd325e2c9ee56f71e855757f7d43938232499f0278/protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6", size = 1055327, 
upload-time = "2022-09-29T21:17:41.054Z" }, - { url = "https://files.pythonhosted.org/packages/bc/db/8b33c9558f1f27dd74e7f9ad730c6b32efab431419af556b1659e125b041/protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39", size = 913657, upload-time = "2022-09-29T21:18:18.359Z" }, - { url = "https://files.pythonhosted.org/packages/51/61/e80b7a04f4e1b4eecc86582335205fd876abca0abafee4a6c001f70a375e/protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0", size = 1055457, upload-time = "2022-09-29T21:18:20.212Z" }, - { url = "https://files.pythonhosted.org/packages/32/27/1141a8232723dcb10a595cc0ce4321dcbbd5215300bf4acfc142343205bf/protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4", size = 162648, upload-time = "2022-09-29T22:07:20.303Z" }, -] - [[package]] name = "protobuf" version = "6.32.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/fa/a4/cc17347aa2897568beece2e674674359f911d6fe21b0b8d6268cd42727ac/protobuf-6.32.1.tar.gz", hash = "sha256:ee2469e4a021474ab9baafea6cd070e5bf27c7d29433504ddea1a4ee5850f68d", size = 440635, upload-time = "2025-09-11T21:38:42.935Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3f/be/8dd0a927c559b37d7a6c8ab79034fd167dcc1f851595f2e641ad62be8643/protobuf-6.32.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:2f5b80a49e1eb7b86d85fcd23fe92df154b9730a725c3b38c4e43b9d77018bf4", size = 322874, upload-time = "2025-09-11T21:38:35.509Z" }, @@ -2333,8 +1990,7 @@ name = "pyarrow" version = "21.0.0" source = { registry = 
"https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } wheels = [ @@ -2373,13 +2029,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cc/ce4939f4b316457a083dc5718b3982801e8c33f921b3c98e7a93b7c7491f/pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3", size = 31211248, upload-time = "2025-07-18T00:56:59.7Z" }, - { url = "https://files.pythonhosted.org/packages/1f/c2/7a860931420d73985e2f340f06516b21740c15b28d24a0e99a900bb27d2b/pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1", size = 32676896, 
upload-time = "2025-07-18T00:57:03.884Z" }, - { url = "https://files.pythonhosted.org/packages/68/a8/197f989b9a75e59b4ca0db6a13c56f19a0ad8a298c68da9cc28145e0bb97/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d", size = 41067862, upload-time = "2025-07-18T00:57:07.587Z" }, - { url = "https://files.pythonhosted.org/packages/fa/82/6ecfa89487b35aa21accb014b64e0a6b814cc860d5e3170287bf5135c7d8/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e", size = 42747508, upload-time = "2025-07-18T00:57:13.917Z" }, - { url = "https://files.pythonhosted.org/packages/3b/b7/ba252f399bbf3addc731e8643c05532cf32e74cebb5e32f8f7409bc243cf/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4", size = 43345293, upload-time = "2025-07-18T00:57:19.828Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0a/a20819795bd702b9486f536a8eeb70a6aa64046fce32071c19ec8230dbaa/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7", size = 45060670, upload-time = "2025-07-18T00:57:24.477Z" }, - { url = "https://files.pythonhosted.org/packages/10/15/6b30e77872012bbfe8265d42a01d5b3c17ef0ac0f2fae531ad91b6a6c02e/pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f", size = 26227521, upload-time = "2025-07-18T00:57:29.119Z" }, ] [[package]] @@ -2552,19 +2201,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, { url = 
"https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, - { url = "https://files.pythonhosted.org/packages/54/db/160dffb57ed9a3705c4cbcbff0ac03bdae45f1ca7d58ab74645550df3fbd/pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf", size = 2107999, upload-time = "2025-11-04T13:42:03.885Z" }, - { url = "https://files.pythonhosted.org/packages/a3/7d/88e7de946f60d9263cc84819f32513520b85c0f8322f9b8f6e4afc938383/pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5", size = 1929745, upload-time = "2025-11-04T13:42:06.075Z" }, - { url = "https://files.pythonhosted.org/packages/d5/c2/aef51e5b283780e85e99ff19db0f05842d2d4a8a8cd15e63b0280029b08f/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d", size = 1920220, upload-time = "2025-11-04T13:42:08.457Z" }, - { url = "https://files.pythonhosted.org/packages/c7/97/492ab10f9ac8695cd76b2fdb24e9e61f394051df71594e9bcc891c9f586e/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60", size = 2067296, upload-time = "2025-11-04T13:42:10.817Z" }, - { url = 
"https://files.pythonhosted.org/packages/ec/23/984149650e5269c59a2a4c41d234a9570adc68ab29981825cfaf4cfad8f4/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82", size = 2231548, upload-time = "2025-11-04T13:42:13.843Z" }, - { url = "https://files.pythonhosted.org/packages/71/0c/85bcbb885b9732c28bec67a222dbed5ed2d77baee1f8bba2002e8cd00c5c/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5", size = 2362571, upload-time = "2025-11-04T13:42:16.208Z" }, - { url = "https://files.pythonhosted.org/packages/c0/4a/412d2048be12c334003e9b823a3fa3d038e46cc2d64dd8aab50b31b65499/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3", size = 2068175, upload-time = "2025-11-04T13:42:18.911Z" }, - { url = "https://files.pythonhosted.org/packages/73/f4/c58b6a776b502d0a5540ad02e232514285513572060f0d78f7832ca3c98b/pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425", size = 2177203, upload-time = "2025-11-04T13:42:22.578Z" }, - { url = "https://files.pythonhosted.org/packages/ed/ae/f06ea4c7e7a9eead3d165e7623cd2ea0cb788e277e4f935af63fc98fa4e6/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504", size = 2148191, upload-time = "2025-11-04T13:42:24.89Z" }, - { url = "https://files.pythonhosted.org/packages/c1/57/25a11dcdc656bf5f8b05902c3c2934ac3ea296257cc4a3f79a6319e61856/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5", size = 2343907, upload-time = "2025-11-04T13:42:27.683Z" }, - { url = 
"https://files.pythonhosted.org/packages/96/82/e33d5f4933d7a03327c0c43c65d575e5919d4974ffc026bc917a5f7b9f61/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3", size = 2322174, upload-time = "2025-11-04T13:42:30.776Z" }, - { url = "https://files.pythonhosted.org/packages/81/45/4091be67ce9f469e81656f880f3506f6a5624121ec5eb3eab37d7581897d/pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460", size = 1990353, upload-time = "2025-11-04T13:42:33.111Z" }, - { url = "https://files.pythonhosted.org/packages/44/8a/a98aede18db6e9cd5d66bcacd8a409fcf8134204cdede2e7de35c5a2c5ef/pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b", size = 2015698, upload-time = "2025-11-04T13:42:35.484Z" }, { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, @@ -2605,8 +2241,7 @@ name = "pylance" 
source = { editable = "." } dependencies = [ { name = "lance-namespace" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -2626,9 +2261,8 @@ geo = [ ] tests = [ { name = "boto3" }, - { name = "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2636,8 +2270,7 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.7.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform == 'linux'" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] torch = [ @@ -2655,8 +2288,8 @@ dev = [ ] tests = [ { 
name = "boto3" }, - { name = "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2664,19 +2297,19 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] [package.metadata] requires-dist = [ { name = "boto3", marker = "extra == 'tests'" }, - { name = "datafusion", marker = "python_full_version >= '3.10' and extra == 'tests'", specifier = ">=53,<54" }, + { name = "datafusion", marker = "extra == 'tests'", specifier = ">=53,<54" }, { name = "datasets", marker = "extra == 'tests'" }, { name = "duckdb", marker = "extra == 'tests'" }, { name = "geoarrow-rust-core", marker = "extra == 'geo'" }, { name = "geoarrow-rust-io", marker = "extra == 'geo'" }, - { name = "lance-namespace", specifier = ">=0.8.0,<0.9" }, + { name = "lance-namespace", specifier = ">=0.8.5,<0.9" }, { name = "ml-dtypes", marker = "extra == 'tests'" }, { name = "numpy", specifier = ">=1.22" }, { name = "pandas", marker = "extra == 'tests'" }, @@ -2703,8 +2336,8 @@ dev = [ ] tests = [ { name = "boto3", specifier = "==1.40.43" }, - { name = "datafusion", marker = "python_full_version >= '3.10'", specifier = "==53.0.0" }, - { name = "datasets", marker = "python_full_version >= '3.10'", specifier = "==4.1.1" }, + { name = "datafusion", specifier = "==53.0.0" }, + { name = "datasets", specifier = "==4.1.1" }, { name = "duckdb", specifier = "==1.4.0" }, { name = "ml-dtypes", specifier = "==0.5.3" }, { name = "pandas", specifier = "==2.3.3" }, @@ 
-2712,59 +2345,19 @@ tests = [ { name = "polars", extras = ["pyarrow", "pandas"], specifier = "==1.34.0" }, { name = "psutil", specifier = "==7.1.0" }, { name = "pytest", specifier = "==8.4.2" }, - { name = "tensorflow", marker = "python_full_version >= '3.10' and sys_platform == 'linux'", specifier = "==2.20.0" }, + { name = "tensorflow", marker = "sys_platform == 'linux'", specifier = "==2.20.0" }, { name = "tqdm", specifier = "==4.67.1" }, ] -[[package]] -name = "pyproj" -version = "3.6.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "certifi", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/84/2b39bbf888c753ea48b40d47511548c77aa03445465c35cc4c4e9649b643/pyproj-3.6.1.tar.gz", hash = "sha256:44aa7c704c2b7d8fb3d483bbf75af6cb2350d30a63b144279a09b75fead501bf", size = 225131, upload-time = "2023-09-21T02:07:51.593Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/32/63cf474f4a8d4804b3bdf7c16b8589f38142e8e2f8319dcea27e0bc21a87/pyproj-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab7aa4d9ff3c3acf60d4b285ccec134167a948df02347585fdd934ebad8811b4", size = 6142763, upload-time = "2023-09-21T02:07:12.844Z" }, - { url = "https://files.pythonhosted.org/packages/18/86/2e7cb9de40492f1bafbf11f4c9072edc394509a40b5e4c52f8139546f039/pyproj-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4bc0472302919e59114aa140fd7213c2370d848a7249d09704f10f5b062031fe", size = 4877123, upload-time = "2023-09-21T02:10:37.905Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c5/928d5a26995dbefbebd7507d982141cd9153bc7e4392b334fff722c4af12/pyproj-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5279586013b8d6582e22b6f9e30c49796966770389a9d5b85e25a4223286cd3f", size = 6190576, upload-time = "2023-09-21T02:17:08.637Z" }, - { url = 
"https://files.pythonhosted.org/packages/f6/2b/b60cf73b0720abca313bfffef34e34f7f7dae23852b2853cf0368d49426b/pyproj-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fafd1f3eb421694857f254a9bdbacd1eb22fc6c24ca74b136679f376f97d35", size = 8328075, upload-time = "2023-09-21T02:07:15.353Z" }, - { url = "https://files.pythonhosted.org/packages/d9/a8/7193f46032636be917bc775506ae987aad72c931b1f691b775ca812a2917/pyproj-3.6.1-cp310-cp310-win32.whl", hash = "sha256:c41e80ddee130450dcb8829af7118f1ab69eaf8169c4bf0ee8d52b72f098dc2f", size = 5635713, upload-time = "2023-09-21T02:07:17.548Z" }, - { url = "https://files.pythonhosted.org/packages/89/8f/27350c8fba71a37cd0d316f100fbd96bf139cc2b5ff1ab0dcbc7ac64010a/pyproj-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:db3aedd458e7f7f21d8176f0a1d924f1ae06d725228302b872885a1c34f3119e", size = 6087932, upload-time = "2023-09-21T02:07:19.793Z" }, - { url = "https://files.pythonhosted.org/packages/84/a6/a300c1b14b2112e966e9f90b18f9c13b586bdcf417207cee913ae9005da3/pyproj-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebfbdbd0936e178091309f6cd4fcb4decd9eab12aa513cdd9add89efa3ec2882", size = 6147442, upload-time = "2023-09-21T02:07:21.879Z" }, - { url = "https://files.pythonhosted.org/packages/30/bd/b9bd3761f08754e8dbb34c5a647db2099b348ab5da338e90980caf280e37/pyproj-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:447db19c7efad70ff161e5e46a54ab9cc2399acebb656b6ccf63e4bc4a04b97a", size = 4880331, upload-time = "2023-09-21T02:10:40.828Z" }, - { url = "https://files.pythonhosted.org/packages/f4/0a/d82aeeb605b5d6870bc72307c3b5e044e632eb7720df8885e144f51a8eac/pyproj-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7e13c40183884ec7f94eb8e0f622f08f1d5716150b8d7a134de48c6110fee85", size = 6192425, upload-time = "2023-09-21T02:17:09.049Z" }, - { url = 
"https://files.pythonhosted.org/packages/64/90/dfe5c00de1ca4dbb82606e79790659d4ed7f0ed8d372bccb3baca2a5abe0/pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65ad699e0c830e2b8565afe42bd58cc972b47d829b2e0e48ad9638386d994915", size = 8571478, upload-time = "2023-09-21T02:07:23.771Z" }, - { url = "https://files.pythonhosted.org/packages/14/6d/ae373629a1723f0db80d7b8c93598b00d9ecb930ed9ebf4f35826a33e97c/pyproj-3.6.1-cp311-cp311-win32.whl", hash = "sha256:8b8acc31fb8702c54625f4d5a2a6543557bec3c28a0ef638778b7ab1d1772132", size = 5634575, upload-time = "2023-09-21T02:07:26.535Z" }, - { url = "https://files.pythonhosted.org/packages/79/95/eb68113c5b5737c342bde1bab92705dabe69c16299c5a122616e50f1fbd6/pyproj-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:38a3361941eb72b82bd9a18f60c78b0df8408416f9340521df442cebfc4306e2", size = 6088494, upload-time = "2023-09-21T02:07:28.75Z" }, - { url = "https://files.pythonhosted.org/packages/0b/64/93232511a7906a492b1b7dfdfc17f4e95982d76a24ef4f86d18cfe7ae2c9/pyproj-3.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1e9fbaf920f0f9b4ee62aab832be3ae3968f33f24e2e3f7fbb8c6728ef1d9746", size = 6135280, upload-time = "2023-09-21T02:07:30.911Z" }, - { url = "https://files.pythonhosted.org/packages/10/f2/b550b1f65cc7e51c9116b220b50aade60c439103432a3fd5b12efbc77e15/pyproj-3.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d227a865356f225591b6732430b1d1781e946893789a609bb34f59d09b8b0f8", size = 4880030, upload-time = "2023-09-21T02:10:43.067Z" }, - { url = "https://files.pythonhosted.org/packages/fe/4b/2f8f6f94643b9fe2083338eff294feda84d916409b5840b7a402d2be93f8/pyproj-3.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83039e5ae04e5afc974f7d25ee0870a80a6bd6b7957c3aca5613ccbe0d3e72bf", size = 6184439, upload-time = "2023-09-21T02:17:43.499Z" }, - { url = 
"https://files.pythonhosted.org/packages/19/9b/c57569132174786aa3f72275ac306956859a639dad0ce8d95c8411ce8209/pyproj-3.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb059ba3bced6f6725961ba758649261d85ed6ce670d3e3b0a26e81cf1aa8d", size = 8660747, upload-time = "2023-09-21T02:07:32.586Z" }, - { url = "https://files.pythonhosted.org/packages/0e/ab/1c2159ec757677c5a6b8803f6be45c2b550dc42c84ec4a228dc219849bbb/pyproj-3.6.1-cp312-cp312-win32.whl", hash = "sha256:2d6ff73cc6dbbce3766b6c0bce70ce070193105d8de17aa2470009463682a8eb", size = 5626805, upload-time = "2023-09-21T02:07:35.28Z" }, - { url = "https://files.pythonhosted.org/packages/c7/f3/2f32fe143cd7ba1d4d68f1b6dce9ca402d909cbd5a5830e3a8fa3d1acbbf/pyproj-3.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:7a27151ddad8e1439ba70c9b4b2b617b290c39395fa9ddb7411ebb0eb86d6fb0", size = 6079779, upload-time = "2023-09-21T02:07:37.486Z" }, - { url = "https://files.pythonhosted.org/packages/d7/50/d369bbe62d7a0d1e2cb40bc211da86a3f6e0f3c99f872957a72c3d5492d6/pyproj-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ba1f9b03d04d8cab24d6375609070580a26ce76eaed54631f03bab00a9c737b", size = 6144755, upload-time = "2023-09-21T02:07:39.611Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c2/8d4f61065dfed965e53badd41201ad86a05af0c1bbc75dffb12ef0f5a7dd/pyproj-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18faa54a3ca475bfe6255156f2f2874e9a1c8917b0004eee9f664b86ccc513d3", size = 4879187, upload-time = "2023-09-21T02:10:45.519Z" }, - { url = "https://files.pythonhosted.org/packages/31/38/2cf8777cb2d5622a78195e690281b7029098795fde4751aec8128238b8bb/pyproj-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd43bd9a9b9239805f406fd82ba6b106bf4838d9ef37c167d3ed70383943ade1", size = 6192339, upload-time = "2023-09-21T02:17:09.942Z" }, - { url = 
"https://files.pythonhosted.org/packages/97/0a/b1525be9680369cc06dd288e12c59d24d5798b4afcdcf1b0915836e1caa6/pyproj-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50100b2726a3ca946906cbaa789dd0749f213abf0cbb877e6de72ca7aa50e1ae", size = 8332638, upload-time = "2023-09-21T02:07:41.777Z" }, - { url = "https://files.pythonhosted.org/packages/8d/e8/e826e0a962f36bd925a933829cf6ef218efe2055db5ea292be40974a929d/pyproj-3.6.1-cp39-cp39-win32.whl", hash = "sha256:9274880263256f6292ff644ca92c46d96aa7e57a75c6df3f11d636ce845a1877", size = 5638159, upload-time = "2023-09-21T02:07:43.49Z" }, - { url = "https://files.pythonhosted.org/packages/43/d0/cbe29a4dcf38ee7e72bf695d0d3f2bee21b4f22ee6cf579ad974de9edfc8/pyproj-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:36b64c2cb6ea1cc091f329c5bd34f9c01bb5da8c8e4492c709bda6a09f96808f", size = 6090565, upload-time = "2023-09-21T02:07:45.735Z" }, - { url = "https://files.pythonhosted.org/packages/43/28/e8d2ca71dd56c27cbe668e4226963d61956cded222a2e839e6fec1ab6d82/pyproj-3.6.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fd93c1a0c6c4aedc77c0fe275a9f2aba4d59b8acf88cebfc19fe3c430cfabf4f", size = 6034252, upload-time = "2023-09-21T02:07:47.906Z" }, - { url = "https://files.pythonhosted.org/packages/cb/39/1ce27cb86f51a1f5aed3a1617802a6131b59ea78492141d1fbe36722595e/pyproj-3.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6420ea8e7d2a88cb148b124429fba8cd2e0fae700a2d96eab7083c0928a85110", size = 6386263, upload-time = "2023-09-21T02:07:49.586Z" }, -] - [[package]] name = "pyproj" version = "3.7.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] dependencies = [ - { name = "certifi", marker = "python_full_version == '3.10.*'" }, + { name = "certifi", marker = "python_full_version < '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/67/10/a8480ea27ea4bbe896c168808854d00f2a9b49f95c0319ddcbba693c8a90/pyproj-3.7.1.tar.gz", hash = "sha256:60d72facd7b6b79853f19744779abcd3f804c4e0d4fa8815469db20c9f640a47", size = 226339, upload-time = "2025-02-16T04:28:46.621Z" } wheels = [ @@ -3000,15 +2593,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, - { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" }, - { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" }, - { url = 
"https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" }, - { url = "https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" }, - { url = "https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" }, - { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" }, - { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" }, - { url = 
"https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" }, ] [[package]] @@ -3016,10 +2600,10 @@ name = "requests" version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "certifi", marker = "python_full_version >= '3.10'" }, - { name = "charset-normalizer", marker = "python_full_version >= '3.10'" }, - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } wheels = [ @@ -3031,8 +2615,8 @@ name = "rich" version = "14.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, - { name = "pygments", marker = "python_full_version >= '3.10'" }, + { name = "markdown-it-py" }, + { name = "pygments" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } wheels = [ @@ -3114,13 +2698,11 @@ dependencies = [ { name = "absl-py" }, { name = "grpcio" }, { name = "markdown" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pillow" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "protobuf" }, { name = "setuptools" }, { name = "tensorboard-data-server" }, { name = "werkzeug" }, @@ -3138,74 +2720,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, ] -[[package]] -name = "tensorflow" -version = "2.7.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "absl-py", marker = "python_full_version < '3.10'" }, - { name = "astunparse", marker = "python_full_version < '3.10'" }, - { name = "flatbuffers", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "gast", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "google-pasta", marker = "python_full_version < '3.10'" }, - { name = "grpcio", marker = "python_full_version < '3.10'" }, - { name = "h5py", marker = "python_full_version < 
'3.10'" }, - { name = "keras", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "keras-preprocessing", marker = "python_full_version < '3.10'" }, - { name = "libclang", marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "opt-einsum", marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, - { name = "tensorboard", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-estimator", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-io-gcs-filesystem", marker = "python_full_version < '3.10'" }, - { name = "termcolor", marker = "python_full_version < '3.10'" }, - { name = "typing-extensions", marker = "python_full_version < '3.10'" }, - { name = "wheel", marker = "python_full_version < '3.10'" }, - { name = "wrapt", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/31/d49a3dff9c4ca6e6c09c2c5fea95f58cf59cc3cd4f0d557069c7dccd6f57/tensorflow-2.7.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c4597635dd71fc6809b7fffcb462524d73e2ade09da61844059e6a2fead71140", size = 496066688, upload-time = "2022-09-02T19:11:01.631Z" }, -] - [[package]] name = "tensorflow" version = "2.20.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "astunparse", marker = "python_full_version >= '3.10'" }, - { name = 
"flatbuffers", version = "25.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "gast", version = "0.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "google-pasta", marker = "python_full_version >= '3.10'" }, - { name = "grpcio", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "keras", version = "3.11.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "libclang", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "astunparse" }, + { name = "flatbuffers" }, + { name = "gast" }, + { name = "google-pasta" }, + { name = "grpcio" }, + { name = "h5py" }, + { name = "keras" }, + { name = "libclang" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "opt-einsum", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "setuptools", marker = "python_full_version >= '3.10'" }, - { name = "six", marker = "python_full_version >= '3.10'" }, - { name = "tensorboard", marker = "python_full_version >= '3.10'" }, - { name = "termcolor", marker = "python_full_version >= '3.10'" }, - { name 
= "typing-extensions", marker = "python_full_version >= '3.10'" }, - { name = "wrapt", marker = "python_full_version >= '3.10'" }, + { name = "opt-einsum" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "six" }, + { name = "tensorboard" }, + { name = "termcolor" }, + { name = "typing-extensions" }, + { name = "wrapt" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ff/07/ea91ac67a9fd36d3372099f5a3e69860ded544f877f5f2117802388f4212/tensorflow-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02a0293d94f5c8b7125b66abf622cc4854a33ae9d618a0d41309f95e091bbaea", size = 259307122, upload-time = "2025-08-13T16:50:47.909Z" }, @@ -3216,31 +2757,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547, upload-time = "2025-08-13T16:52:46.396Z" }, { url = "https://files.pythonhosted.org/packages/ea/4c/c1aa90c5cc92e9f7f9c78421e121ef25bae7d378f8d1d4cbad46c6308836/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c88e05a07f1ead4977b4894b3ecd4d8075c40191065afc4fd9355c9db3d926", size = 259663776, upload-time = "2025-08-13T16:53:24.507Z" }, { url = "https://files.pythonhosted.org/packages/43/fb/8be8547c128613d82a2b006004026d86ed0bd672e913029a98153af4ffab/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa3729b0126f75a99882b89fb7d536515721eda8014a63e259e780ba0a37372", size = 620815537, upload-time = "2025-08-13T16:53:42.577Z" }, - { url = "https://files.pythonhosted.org/packages/83/ff/a26d49895586207b2704403366ef976dcaa6ed07514699dae9a4fc3fa1a9/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash 
= "sha256:28bc33759249c98eabcee9debd24e74506bbe29ac139e050cf0c74aa9888ebdf", size = 259307564, upload-time = "2025-08-13T16:54:17.691Z" }, - { url = "https://files.pythonhosted.org/packages/5f/fe/f3d738dc7c93ed5f67f9ace8dd3ed66971dab7c5a47f2d1c504ef0d0cf1d/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0deb5c583dfc53b54fd158a194ce0087b406bb6518af400ca3809735e4548ec3", size = 620427169, upload-time = "2025-08-13T16:54:33.431Z" }, -] - -[[package]] -name = "tensorflow-estimator" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/de/3a71ad41b87f9dd424e3aec3b0794a60f169fa7e9a9a1e3dd44290b86dd6/tensorflow_estimator-2.7.0-py2.py3-none-any.whl", hash = "sha256:325b5a224864379242b7b76c6987ca544239be82579d33e68ec7c2bda57abc9d", size = 463110, upload-time = "2021-10-29T23:02:47.14Z" }, -] - -[[package]] -name = "tensorflow-io-gcs-filesystem" -version = "0.37.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/19/9095c69e22c879cb3896321e676c69273a549a3148c4f62aa4bc5ebdb20f/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70", size = 4842078, upload-time = "2024-07-01T23:44:18.977Z" }, - { url = "https://files.pythonhosted.org/packages/f3/48/47b7d25572961a48b1de3729b7a11e835b888e41e0203cca82df95d23b91/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27", size = 5085736, upload-time = "2024-07-01T23:44:21.034Z" }, - { url = "https://files.pythonhosted.org/packages/de/bf/ba597d3884c77d05a78050f3c178933d69e3f80200a261df6eaa920656cd/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda", size = 4842079, upload-time = "2024-07-01T23:44:26.825Z" }, - { url = "https://files.pythonhosted.org/packages/66/7f/e36ae148c2f03d61ca1bff24bc13a0fef6d6825c966abef73fc6f880a23b/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556", size = 5085736, upload-time = "2024-07-01T23:44:28.618Z" }, - { url = "https://files.pythonhosted.org/packages/d3/46/962f47af08bd39fc9feb280d3192825431a91a078c856d17a78ae4884eb1/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f", size = 4842077, upload-time = "2024-07-01T23:44:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/f0/9b/790d290c232bce9b691391cf16e95a96e469669c56abfb1d9d0f35fa437c/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c", size = 5085733, upload-time = "2024-07-01T23:44:36.663Z" }, - { url = "https://files.pythonhosted.org/packages/66/5f/334a011caa1eb97689274d1141df8e6b7a25e389f0390bdcd90235de9783/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed", size = 4842075, upload-time = "2024-07-01T23:44:42.094Z" }, - { url = "https://files.pythonhosted.org/packages/3d/cb/7dcee55fc5a7d7d8a862e12519322851cd5fe5b086f946fd71e4ae1ef281/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95", size = 5087496, upload-time = "2024-07-01T23:44:43.797Z" }, ] [[package]] @@ -3299,8 +2815,7 @@ dependencies = [ { name = "filelock" }, { 
name = "fsspec" }, { name = "jinja2" }, - { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -3342,10 +2857,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" }, { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" }, { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b0/a321f27270049baa12f5c3fb0d6ceea005634787e3af9a8d75dce8306b0a/torch-2.8.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:da6afa31c13b669d4ba49d8a2169f0db2c3ec6bec4af898aa714f401d4c38904", 
size = 102059214, upload-time = "2025-08-06T14:55:33.433Z" }, - { url = "https://files.pythonhosted.org/packages/fd/dd/1630cb51b10d3d2e97db95e5a84c32def81fc26b005bce6fc880b0e6db81/torch-2.8.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:06fcee8000e5c62a9f3e52a688b9c5abb7c6228d0e56e3452983416025c41381", size = 888024302, upload-time = "2025-08-06T14:57:28.23Z" }, - { url = "https://files.pythonhosted.org/packages/b9/dc/1f1f621afe15e3c496e1e8f94f8903f75f87e7d642d5a985e92210cc208d/torch-2.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:5128fe752a355d9308e56af1ad28b15266fe2da5948660fad44de9e3a9e36e8c", size = 241249338, upload-time = "2025-08-06T14:57:05.669Z" }, - { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" }, ] [[package]] @@ -3365,7 +2876,6 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "setuptools" }, ] wheels = [ @@ -3374,7 +2884,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" }, { url = "https://files.pythonhosted.org/packages/30/7b/0a685684ed5322d2af0bddefed7906674f67974aa88b0fae6e82e3b766f6/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00be2964616f4c619193cb0d1b29a99bd4b001d7dc333816073f92cf2a8ccdeb", size = 155569223, upload-time = "2025-07-30T19:58:44.017Z" }, { url = 
"https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780, upload-time = "2025-07-30T19:58:51.171Z" }, - { url = "https://files.pythonhosted.org/packages/12/34/1251beb5a3cb93f3950ebe68732752014646003ef6eb11eb5f1a37ca78cd/triton-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e5c1442eaeabae2e2452ae765801bd53cd4ce873cab0d1bdd59a32ab2d9397", size = 155430799, upload-time = "2025-07-30T19:58:57.664Z" }, ] [[package]] @@ -3407,29 +2916,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] -[[package]] -name = "urllib3" -version = "1.26.20" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" }, -] - [[package]] name = "urllib3" version = "2.5.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == 
'3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, @@ -3486,10 +2976,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/dd0791943613885f62619f18ee6107e6133237a6b6ed8a9ecfac339d0b4f/wrapt-1.17.3-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7e18f01b0c3e4a07fe6dfdb00e29049ba17eadbc5e7609a2a3a4af83ab7d710a", size = 81745, 
upload-time = "2025-08-12T05:52:49.62Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ec/bb2d19bd1a614cc4f438abac13ae26c57186197920432d2a915183b15a8b/wrapt-1.17.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f5f51a6466667a5a356e6381d362d259125b57f059103dd9fdc8c0cf1d14139", size = 82833, upload-time = "2025-08-12T05:52:27.738Z" }, - { url = "https://files.pythonhosted.org/packages/8d/eb/66579aea6ad36f07617fedca8e282e49c7c9bab64c63b446cfe4f7f47a49/wrapt-1.17.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:59923aa12d0157f6b82d686c3fd8e1166fa8cdfb3e17b42ce3b6147ff81528df", size = 81889, upload-time = "2025-08-12T05:52:29.023Z" }, - { url = "https://files.pythonhosted.org/packages/04/9c/a56b5ac0e2473bdc3fb11b22dd69ff423154d63861cf77911cdde5e38fd2/wrapt-1.17.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46acc57b331e0b3bcb3e1ca3b421d65637915cfcd65eb783cb2f78a511193f9b", size = 81344, upload-time = "2025-08-12T05:52:50.869Z" }, { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, ] @@ -3604,21 +3090,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" }, { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" }, { url = 
"https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" }, - { url = "https://files.pythonhosted.org/packages/03/ff/1b4bb3f397552116c1df6266c1b83a21aeeb26061ab1f462984b499a3870/xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a", size = 32844, upload-time = "2025-10-02T14:36:39.157Z" }, - { url = "https://files.pythonhosted.org/packages/c1/db/27146d0bee4346a9a31f7b498a81fc02747f6f1e6c52a2e7989504278051/xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7", size = 30806, upload-time = "2025-10-02T14:36:40.621Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2b/4896188df564908817a75de19bf7f2384b99a75af2d528f9c49326f76458/xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2", size = 193448, upload-time = "2025-10-02T14:36:41.797Z" }, - { url = "https://files.pythonhosted.org/packages/51/c5/be8953f62e772340319a826ce1e07489935600089756cf83b628cd36ebe3/xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d", size = 212547, upload-time = "2025-10-02T14:36:43.581Z" }, - { url = "https://files.pythonhosted.org/packages/51/1a/1e9f0b911d1cf00dd537c074ae3fae15b535a7f0d9e7edd42a9d2c4f78ce/xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec", size = 211309, upload-time = "2025-10-02T14:36:45.307Z" }, - { url = 
"https://files.pythonhosted.org/packages/63/88/b284c6a128d88dc47f201957f926e707db79fb7415a87072e15c0e490de0/xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222", size = 444480, upload-time = "2025-10-02T14:36:47.226Z" }, - { url = "https://files.pythonhosted.org/packages/87/e4/798293a2bf9e4fac5f6d53ce59cba4739930778dfc6c7c73f40044ab0e6e/xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919", size = 192957, upload-time = "2025-10-02T14:36:48.968Z" }, - { url = "https://files.pythonhosted.org/packages/78/55/bfd0d7db447a927897469048b953caececa3532e743b940dd1f5c1032d24/xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6", size = 209850, upload-time = "2025-10-02T14:36:50.258Z" }, - { url = "https://files.pythonhosted.org/packages/31/06/d08ef9a792bfebfd2fb2bcbf04a541ad283bef74749ead6f089a0809d288/xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360", size = 197342, upload-time = "2025-10-02T14:36:51.651Z" }, - { url = "https://files.pythonhosted.org/packages/7b/1a/aebf90797c94e9ca407c28e23f54d71f7149d91a93406a08a09e44d06994/xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4", size = 209757, upload-time = "2025-10-02T14:36:53.009Z" }, - { url = "https://files.pythonhosted.org/packages/3c/80/799eec3d0a144dc3edf8c19b4f139c27fb923c50b34352796089ca206429/xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86", size = 412773, upload-time = "2025-10-02T14:36:54.691Z" }, - { url = 
"https://files.pythonhosted.org/packages/6a/f9/09df7545699de09219a205123b8463ce9ea83f48acc7aeeba0269507f9d3/xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796", size = 190357, upload-time = "2025-10-02T14:36:56.363Z" }, - { url = "https://files.pythonhosted.org/packages/07/40/2f8327f94e64a3f34d6ce3347c55207c322abbc80ae486ea45df4c62e7b3/xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d", size = 30585, upload-time = "2025-10-02T14:36:57.664Z" }, - { url = "https://files.pythonhosted.org/packages/6a/c8/2ecbc6799be9c02e8bf7b5a66cd94832b6ac13d59808746f0d402481c6ad/xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802", size = 31512, upload-time = "2025-10-02T14:36:58.837Z" }, - { url = "https://files.pythonhosted.org/packages/19/94/1d5459a9c587c94d7b8bcc710bd08bbfa145cbd814ebde41b48494362a21/xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6", size = 27878, upload-time = "2025-10-02T14:37:00.201Z" }, { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" }, { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" }, { url = 
"https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" }, @@ -3631,9 +3102,9 @@ name = "yarl" version = "1.20.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } wheels = [ @@ -3722,31 +3193,5 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, - { url = 
"https://files.pythonhosted.org/packages/01/75/0d37402d208d025afa6b5b8eb80e466d267d3fd1927db8e317d29a94a4cb/yarl-1.20.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e42ba79e2efb6845ebab49c7bf20306c4edf74a0b20fc6b2ccdd1a219d12fad3", size = 134259, upload-time = "2025-06-10T00:45:29.882Z" }, - { url = "https://files.pythonhosted.org/packages/73/84/1fb6c85ae0cf9901046f07d0ac9eb162f7ce6d95db541130aa542ed377e6/yarl-1.20.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:41493b9b7c312ac448b7f0a42a089dffe1d6e6e981a2d76205801a023ed26a2b", size = 91269, upload-time = "2025-06-10T00:45:32.917Z" }, - { url = "https://files.pythonhosted.org/packages/f3/9c/eae746b24c4ea29a5accba9a06c197a70fa38a49c7df244e0d3951108861/yarl-1.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f5a5928ff5eb13408c62a968ac90d43f8322fd56d87008b8f9dabf3c0f6ee983", size = 89995, upload-time = "2025-06-10T00:45:35.066Z" }, - { url = "https://files.pythonhosted.org/packages/fb/30/693e71003ec4bc1daf2e4cf7c478c417d0985e0a8e8f00b2230d517876fc/yarl-1.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30c41ad5d717b3961b2dd785593b67d386b73feca30522048d37298fee981805", size = 325253, upload-time = "2025-06-10T00:45:37.052Z" }, - { url = "https://files.pythonhosted.org/packages/0f/a2/5264dbebf90763139aeb0b0b3154763239398400f754ae19a0518b654117/yarl-1.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:59febc3969b0781682b469d4aca1a5cab7505a4f7b85acf6db01fa500fa3f6ba", size = 320897, upload-time = "2025-06-10T00:45:39.962Z" }, - { url = "https://files.pythonhosted.org/packages/e7/17/77c7a89b3c05856489777e922f41db79ab4faf58621886df40d812c7facd/yarl-1.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2b6fb3622b7e5bf7a6e5b679a69326b4279e805ed1699d749739a61d242449e", size = 340696, upload-time = "2025-06-10T00:45:41.915Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/55/28409330b8ef5f2f681f5b478150496ec9cf3309b149dab7ec8ab5cfa3f0/yarl-1.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:749d73611db8d26a6281086f859ea7ec08f9c4c56cec864e52028c8b328db723", size = 335064, upload-time = "2025-06-10T00:45:43.893Z" }, - { url = "https://files.pythonhosted.org/packages/85/58/cb0257cbd4002828ff735f44d3c5b6966c4fd1fc8cc1cd3cd8a143fbc513/yarl-1.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9427925776096e664c39e131447aa20ec738bdd77c049c48ea5200db2237e000", size = 327256, upload-time = "2025-06-10T00:45:46.393Z" }, - { url = "https://files.pythonhosted.org/packages/53/f6/c77960370cfa46f6fb3d6a5a79a49d3abfdb9ef92556badc2dcd2748bc2a/yarl-1.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff70f32aa316393eaf8222d518ce9118148eddb8a53073c2403863b41033eed5", size = 316389, upload-time = "2025-06-10T00:45:48.358Z" }, - { url = "https://files.pythonhosted.org/packages/64/ab/be0b10b8e029553c10905b6b00c64ecad3ebc8ace44b02293a62579343f6/yarl-1.20.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c7ddf7a09f38667aea38801da8b8d6bfe81df767d9dfc8c88eb45827b195cd1c", size = 340481, upload-time = "2025-06-10T00:45:50.663Z" }, - { url = "https://files.pythonhosted.org/packages/c5/c3/3f327bd3905a4916029bf5feb7f86dcf864c7704f099715f62155fb386b2/yarl-1.20.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:57edc88517d7fc62b174fcfb2e939fbc486a68315d648d7e74d07fac42cec240", size = 336941, upload-time = "2025-06-10T00:45:52.554Z" }, - { url = "https://files.pythonhosted.org/packages/d1/42/040bdd5d3b3bb02b4a6ace4ed4075e02f85df964d6e6cb321795d2a6496a/yarl-1.20.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:dab096ce479d5894d62c26ff4f699ec9072269d514b4edd630a393223f45a0ee", size = 339936, upload-time = "2025-06-10T00:45:54.919Z" }, - { url = 
"https://files.pythonhosted.org/packages/0d/1c/911867b8e8c7463b84dfdc275e0d99b04b66ad5132b503f184fe76be8ea4/yarl-1.20.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14a85f3bd2d7bb255be7183e5d7d6e70add151a98edf56a770d6140f5d5f4010", size = 360163, upload-time = "2025-06-10T00:45:56.87Z" }, - { url = "https://files.pythonhosted.org/packages/e2/31/8c389f6c6ca0379b57b2da87f1f126c834777b4931c5ee8427dd65d0ff6b/yarl-1.20.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2c89b5c792685dd9cd3fa9761c1b9f46fc240c2a3265483acc1565769996a3f8", size = 359108, upload-time = "2025-06-10T00:45:58.869Z" }, - { url = "https://files.pythonhosted.org/packages/7f/09/ae4a649fb3964324c70a3e2b61f45e566d9ffc0affd2b974cbf628957673/yarl-1.20.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:69e9b141de5511021942a6866990aea6d111c9042235de90e08f94cf972ca03d", size = 351875, upload-time = "2025-06-10T00:46:01.45Z" }, - { url = "https://files.pythonhosted.org/packages/8d/43/bbb4ed4c34d5bb62b48bf957f68cd43f736f79059d4f85225ab1ef80f4b9/yarl-1.20.1-cp39-cp39-win32.whl", hash = "sha256:b5f307337819cdfdbb40193cad84978a029f847b0a357fbe49f712063cfc4f06", size = 82293, upload-time = "2025-06-10T00:46:03.763Z" }, - { url = "https://files.pythonhosted.org/packages/d7/cd/ce185848a7dba68ea69e932674b5c1a42a1852123584bccc5443120f857c/yarl-1.20.1-cp39-cp39-win_amd64.whl", hash = "sha256:eae7bfe2069f9c1c5b05fc7fe5d612e5bbc089a39309904ee8b829e322dcad00", size = 87385, upload-time = "2025-06-10T00:46:05.655Z" }, { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] diff --git a/rust/examples/Cargo.toml b/rust/examples/Cargo.toml index a4e760f8cbe..80eff457140 100644 --- a/rust/examples/Cargo.toml +++ b/rust/examples/Cargo.toml @@ -49,6 +49,6 @@ tokio = { workspace = true } all_asserts = "2.3.1" env_logger = "0.11.7" hf-hub = "0.4.2" -parquet = "58.0.0" +parquet = { version = "58.0.0", default-features = false, features = ["arrow", "async"] } tokenizers = "0.15.2" rand.workspace = true diff --git a/rust/lance-arrow/src/ipc.rs b/rust/lance-arrow/src/ipc.rs index 1c6364c4525..8b6e5cf41fe 100644 --- a/rust/lance-arrow/src/ipc.rs +++ b/rust/lance-arrow/src/ipc.rs @@ -270,7 +270,7 @@ pub fn read_ipc_stream_single_at( /// Modern IPC streams have an 8-byte prefix `[continuation: 4][size: 4]`. /// Legacy streams have a 4-byte prefix `[size: 4]`. Returns `(prefix_len, meta_size)`. 
fn parse_ipc_message_prefix(buf: &Buffer) -> Result<(usize, usize), ArrowError> { - let has_continuation = buf.len() >= 4 && buf[..4] == [0xff; 4]; + let has_continuation = buf.len() >= 4 && buf[..4] == IPC_CONTINUATION; if has_continuation { if buf.len() < 8 { return Err(ArrowError::ParseError( @@ -358,6 +358,134 @@ pub fn read_ipc_stream_single(data: &Bytes) -> Result { } } +// --------------------------------------------------------------------------- +// Aligned IPC sections +// --------------------------------------------------------------------------- + +/// Byte alignment that each IPC section's stream start is padded to. +/// +/// When several IPC streams are concatenated into one larger blob (e.g. a +/// cache entry), a section that starts at an arbitrary offset would leave its +/// array data misaligned. [`FileDecoder`] with `require_alignment = false` +/// then silently copies each buffer into a freshly aligned allocation on +/// every read, defeating zero-copy. Padding each section start to a 64-byte +/// boundary keeps the decoded buffers borrowed directly from the input. +pub const IPC_SECTION_ALIGNMENT: usize = 64; + +/// Number of zero-padding bytes needed to advance `pos` to the next +/// [`IPC_SECTION_ALIGNMENT`] boundary. +fn section_padding(pos: usize) -> usize { + (IPC_SECTION_ALIGNMENT - (pos % IPC_SECTION_ALIGNMENT)) % IPC_SECTION_ALIGNMENT +} + +/// A [`Write`] adapter that counts the bytes written through it. +struct CountingWriter<'a> { + inner: &'a mut dyn Write, + count: usize, +} + +impl Write for CountingWriter<'_> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let n = self.inner.write(buf)?; + self.count += n; + Ok(n) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +/// Write zero padding so the next byte lands on an [`IPC_SECTION_ALIGNMENT`] +/// boundary, advancing `pos` past it. 
+fn write_section_padding(writer: &mut dyn Write, pos: &mut usize) -> Result<(), ArrowError> { + let pad = section_padding(*pos); + if pad > 0 { + const ZEROS: [u8; IPC_SECTION_ALIGNMENT] = [0u8; IPC_SECTION_ALIGNMENT]; + writer + .write_all(&ZEROS[..pad]) + .map_err(|e| ArrowError::IoError(e.to_string(), e))?; + *pos += pad; + } + Ok(()) +} + +/// Write `batch` as a 64-byte-aligned single-batch Arrow IPC section. +/// +/// `pos` is the absolute byte offset of `writer` within the enclosing blob. +/// Zero padding is written first so the IPC stream begins on an +/// [`IPC_SECTION_ALIGNMENT`] boundary, then the stream itself. `pos` is +/// advanced past both the padding and the stream so the caller can write +/// further aligned sections. +/// +/// Paired with [`read_ipc_section_at`]. For the decoded buffers to be borrowed +/// zero-copy, the blob must ultimately be read back from a buffer whose base +/// address is at least 64-byte aligned. +pub fn write_ipc_section( + writer: &mut dyn Write, + pos: &mut usize, + batch: &RecordBatch, +) -> Result<(), ArrowError> { + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream(batch, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read a single [`RecordBatch`] from an aligned IPC section at `offset`. +/// +/// Skips the alignment padding written by [`write_ipc_section`], then reads +/// the stream, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section`]). +pub fn read_ipc_section_at(data: &Bytes, offset: &mut usize) -> Result { + *offset += section_padding(*offset); + read_ipc_stream_single_at(data, offset) +} + +/// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC section. 
+/// +/// Like [`write_ipc_section`] but emits every batch from `iter` into one IPC +/// stream (schema + N batches + EOS). `iter` must yield at least one batch. +/// Paired with [`read_ipc_section_batches_at`]. +pub fn write_ipc_section_batches( + writer: &mut dyn Write, + pos: &mut usize, + iter: I, +) -> Result<(), ArrowError> +where + I: IntoIterator, +{ + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream_batches(iter, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read all [`RecordBatch`]es from an aligned multi-batch IPC section at +/// `offset`, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section_batches`]). +pub fn read_ipc_section_batches_at( + data: &Bytes, + offset: &mut usize, +) -> Result, ArrowError> { + *offset += section_padding(*offset); + read_ipc_stream_at(data, offset) +} + #[cfg(test)] mod tests { use arrow_array::{ArrayRef, record_batch}; @@ -403,4 +531,90 @@ mod tests { assert_col_zero_copy(batch.column(1)); } } + + /// Allocate a [`Bytes`] whose base address is 64-byte aligned, modelling a + /// backend that reads cache entries into an aligned buffer. A plain + /// `Bytes::from(vec)` only guarantees the allocator's alignment for `u8`. + fn aligned_bytes(payload: &[u8]) -> Bytes { + let mut v = vec![0u8; payload.len() + IPC_SECTION_ALIGNMENT]; + let pad = section_padding(v.as_ptr() as usize); + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + #[test] + fn test_aligned_ipc_sections_are_zero_copy() { + // A LargeBinary column exercises the i64-offset buffer whose 8-byte + // alignment requirement triggers a realigning memcpy when misaligned. 
+ let blocks = arrow_array::LargeBinaryArray::from_vec(vec![&b"hello"[..], b"world"]); + let section_a = RecordBatch::try_from_iter([("a", Arc::new(blocks) as ArrayRef)]).unwrap(); + let section_b = record_batch!(("b", Int64, [10i64, 20, 30, 40, 50])).unwrap(); + + let mut buf = Vec::new(); + // Arbitrary, deliberately non-64-aligned preamble so the first section + // must be padded rather than landing at offset 0 by luck. + buf.extend_from_slice(&[0xABu8; 7]); + let mut pos = buf.len(); + // The first section's stream begins after padding the 7-byte preamble + // up to the next 64-byte boundary. + assert_eq!(7 + section_padding(7), IPC_SECTION_ALIGNMENT); + write_ipc_section(&mut buf, &mut pos, §ion_a).unwrap(); + write_ipc_section(&mut buf, &mut pos, §ion_b).unwrap(); + + let data = aligned_bytes(&buf); + assert_eq!( + section_padding(data.as_ptr() as usize), + 0, + "base not aligned" + ); + + let mut offset = 7; + let read_a = read_ipc_section_at(&data, &mut offset).unwrap(); + let read_b = read_ipc_section_at(&data, &mut offset).unwrap(); + assert_eq!(read_a, section_a); + assert_eq!(read_b, section_b); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for batch in [&read_a, &read_b] { + for buffer in batch.column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "section buffer at {ptr:#x} was realigned out of the input \ + [{data_base:#x}..{data_end:#x}) — misaligned section", + ); + } + } + } + + #[test] + fn test_aligned_multi_batch_section_roundtrip_zero_copy() { + // A multi-batch section (e.g. IVF SQ storage chunks) must round-trip + // every batch and decode the first batch's buffers zero-copy. 
+ let b1 = record_batch!(("v", Int64, [1i64, 2, 3])).unwrap(); + let b2 = record_batch!(("v", Int64, [4i64, 5])).unwrap(); + let b3 = record_batch!(("v", Int64, [6i64])).unwrap(); + + let mut buf = vec![0xCDu8; 5]; + let mut pos = buf.len(); + write_ipc_section_batches(&mut buf, &mut pos, [b1.clone(), b2.clone(), b3.clone()]) + .unwrap(); + + let data = aligned_bytes(&buf); + let mut offset = 5; + let read = read_ipc_section_batches_at(&data, &mut offset).unwrap(); + assert_eq!(read, vec![b1, b2, b3]); + assert_eq!(offset, buf.len(), "offset should land at section end"); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for buffer in read[0].column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "first batch buffer at {ptr:#x} was realigned out of the input", + ); + } + } } diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index b993cf00745..34a67600543 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -52,6 +52,8 @@ pub const BLOB_V2_EXT_NAME: &str = "lance.blob.v2"; /// Metadata key for overriding the dedicated blob size threshold (in bytes) pub const BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-dedicated-size-threshold"; +/// Metadata key for overriding the inline blob size threshold (in bytes) +pub const BLOB_INLINE_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-inline-size-threshold"; type Result = std::result::Result; diff --git a/rust/lance-core/src/cache/backend.rs b/rust/lance-core/src/cache/backend.rs index 237254c464f..9307868f399 100644 --- a/rust/lance-core/src/cache/backend.rs +++ b/rust/lance-core/src/cache/backend.rs @@ -22,6 +22,9 @@ use super::CacheCodec; /// A type-erased cache entry. pub type CacheEntry = Arc; +/// Iterator over cache keys currently known to a backend. 
+pub type CacheKeyIterator<'a> = Box + Send + 'a>; + /// Structured cache key passed to [`CacheBackend`] methods. /// /// CacheBackend impls receive these ready-made from [`LanceCache`](super::LanceCache) @@ -116,6 +119,15 @@ pub trait CacheBackend: Send + Sync + std::fmt::Debug { /// Remove all entries. async fn clear(&self); + /// Return an iterator over cache keys currently known to this backend. + /// + /// Backends that cannot enumerate keys cheaply or accurately should return + /// `None`. An empty iterator means key inventory is supported and the + /// cache currently has no entries. + async fn keys(&self) -> Option> { + None + } + /// Number of entries currently stored (may flush pending operations). async fn num_entries(&self) -> usize; diff --git a/rust/lance-core/src/cache/codec.rs b/rust/lance-core/src/cache/codec.rs index 34e5264bb28..bba54840829 100644 --- a/rust/lance-core/src/cache/codec.rs +++ b/rust/lance-core/src/cache/codec.rs @@ -5,12 +5,184 @@ //! //! Implement [`CacheCodecImpl`] on concrete types, then use //! [`CacheCodec::from_impl`] to produce a type-erased codec for the cache. +//! +//! # Wire format +//! +//! Every serialized entry begins with a small hand-framed **envelope** so the +//! reader can validate it before trusting the body: +//! +//! ```text +//! [magic: 4B = b"LCE1"] +//! [envelope_version: u8] +//! [type_id_len: u16 LE][type_id: utf8] # stable, author-assigned +//! [type_version: u32 LE] # per-type body schema version +//! +//! ``` +//! +//! The envelope is deliberately *not* protobuf: it is the most +//! stability-critical part, must parse robustly against arbitrary bytes +//! (including data written by older, pre-stabilization builds), and never +//! changes shape. Bodies use protobuf headers, where field-number evolution +//! pays off. +//! +//! # Decode outcome +//! +//! Deserialization never propagates a parse failure as a hard error into the +//! cache path. 
Anything the reader cannot confidently interpret — absent or +//! wrong magic, an unknown `envelope_version`, a `type_id` mismatch, an +//! unsupported `type_version`, or a body decode error — becomes +//! [`CacheDecode::Miss`]. A backend turns `Miss` into a normal cache miss and +//! recomputes the value. This is what lets data written by an older format +//! self-heal: it simply fails the magic check and is regenerated. +use std::io::Write; use std::sync::Arc; use bytes::Bytes; -use crate::Result; +use crate::{Error, Result}; + +use super::{CacheEntryReader, CacheEntryWriter}; + +// --------------------------------------------------------------------------- +// Envelope +// --------------------------------------------------------------------------- + +/// Magic bytes that prefix every stabilized cache entry. +/// +/// An ASCII tag (`0x4C 0x43 0x45 0x31`) chosen so it cannot collide with any +/// pre-stabilization blob: those began with either a small little-endian +/// length (tens of bytes) or a small tag byte, never these values. +/// +/// Exported so backends can cheaply identify Lance cache entries (e.g. when +/// scanning a persistent store at startup) without hardcoding the bytes — +/// prefer [`has_cache_envelope`] over comparing against this directly. +pub const MAGIC: [u8; 4] = *b"LCE1"; + +/// Returns `true` if `data` begins with the cache-entry [`MAGIC`]. +/// +/// A cheap prefix check for backends that need to recognize Lance cache +/// entries without fully [`deserialize`](CacheCodec::deserialize)-ing them. A +/// `true` result only means the framing looks like ours; the entry can still +/// decode to a [`Miss`](CacheDecode::Miss) (e.g. wrong `type_id`). +pub fn has_cache_envelope(data: &[u8]) -> bool { + data.get(..MAGIC.len()) == Some(&MAGIC[..]) +} + +/// Version of the envelope framing itself. Bumped only if the outer frame +/// (magic/version/type_id/type_version layout) ever changes — expected never. 
+const ENVELOPE_VERSION: u8 = 1; + +/// Parsed envelope borrowed from the input bytes. +struct ParsedEnvelope<'a> { + type_id: &'a str, + type_version: u32, + /// Offset of the first body byte within the input. + body_offset: usize, +} + +/// Parse and validate the envelope at the start of `data`. +/// +/// Returns `None` for anything that is not a well-formed envelope this build +/// understands (wrong/absent magic, unknown `envelope_version`, truncation, +/// non-utf8 `type_id`). Callers translate `None` into [`CacheDecode::Miss`]. +fn parse_envelope(data: &Bytes) -> Option> { + let bytes = data.as_ref(); + let mut off = 0usize; + + let magic = bytes.get(off..off + 4)?; + if magic != MAGIC { + return None; + } + off += 4; + + if *bytes.get(off)? != ENVELOPE_VERSION { + return None; + } + off += 1; + + let type_id_len = u16::from_le_bytes(bytes.get(off..off + 2)?.try_into().ok()?) as usize; + off += 2; + + let type_id = std::str::from_utf8(bytes.get(off..off + type_id_len)?).ok()?; + off += type_id_len; + + let type_version = u32::from_le_bytes(bytes.get(off..off + 4)?.try_into().ok()?); + off += 4; + + Some(ParsedEnvelope { + type_id, + type_version, + body_offset: off, + }) +} + +/// Write the envelope for `type_id`/`type_version`, returning the number of +/// bytes written (the body's starting offset). 
+fn write_envelope(writer: &mut dyn Write, type_id: &str, type_version: u32) -> Result { + let type_id_len = u16::try_from(type_id.len()).map_err(|_| { + Error::io(format!( + "cache codec type_id too long ({} bytes, max {})", + type_id.len(), + u16::MAX + )) + })?; + + writer.write_all(&MAGIC)?; + writer.write_all(&[ENVELOPE_VERSION])?; + writer.write_all(&type_id_len.to_le_bytes())?; + writer.write_all(type_id.as_bytes())?; + writer.write_all(&type_version.to_le_bytes())?; + + Ok(4 + 1 + 2 + type_id.len() + 4) +} + +// --------------------------------------------------------------------------- +// CacheDecode — first-class cache-miss outcome +// --------------------------------------------------------------------------- + +/// Why a cache entry could not be decoded into the expected type. +/// +/// Carried by [`CacheDecode::Miss`] so backends can emit targeted metrics +/// (e.g. distinguish "evicting due to a stale format" from "type collision") +/// without re-parsing. Every reason maps to the same behavior — recompute via +/// the loader — so callers that don't care can ignore it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CacheMissReason { + /// Absent or wrong magic, unknown `envelope_version`, truncated framing, or + /// a non-utf8 `type_id`. Typically an entry written by a pre-stabilization + /// or otherwise foreign build. + InvalidEnvelope, + /// Well-formed envelope, but its `type_id` names a different entry type than + /// the codec reading it. + TypeMismatch, + /// Written by a newer build whose `type_version` this build does not + /// understand and must not attempt to interpret. + VersionTooNew, + /// Envelope validated, but the body failed to decode (truncation, a + /// malformed protobuf header, an IPC error, etc.). + BodyError, +} + +/// Outcome of deserializing a cache entry. +/// +/// `Miss` means the bytes could not be confidently decoded into `T`; the +/// [`CacheMissReason`] says why. 
A backend treats any `Miss` exactly like a key +/// that was never present: recompute via the loader. +#[derive(Debug)] +pub enum CacheDecode { + Hit(T), + Miss(CacheMissReason), +} + +impl CacheDecode { + pub fn hit(self) -> Option { + match self { + Self::Hit(v) => Some(v), + Self::Miss(_) => None, + } + } +} // --------------------------------------------------------------------------- // CacheCodecImpl — trait for serializable cache entry types @@ -18,31 +190,40 @@ use crate::Result; /// Serialization trait for cache entries. /// -/// **Experimental**: the serialized format is not stable and may change -/// between releases without notice. +/// **Experimental**: the serialized format is not yet covered by a stability +/// guarantee and may change between releases. When it does stabilize, the +/// rules are: `TYPE_ID`, protobuf field numbers, and enum values are +/// append-only forever; format changes that protobuf cannot express +/// transparently bump [`CURRENT_VERSION`](Self::CURRENT_VERSION). /// -/// Implement this on concrete types that need to survive serialization -/// through a persistent cache backend. Then wire it into a [`CacheKey`](super::CacheKey) -/// via [`CacheCodec::from_impl`]: +/// Implement this on concrete types that need to survive serialization through +/// a persistent cache backend, then wire it into a +/// [`CacheKey`](super::CacheKey) via [`CacheCodec::from_impl`]. /// -/// ```ignore -/// impl CacheCodecImpl for MyData { -/// fn serialize(&self, w: &mut dyn Write) -> Result<()> { /* ... */ } -/// fn deserialize(data: &Bytes) -> Result { /* ... */ } -/// } -/// -/// impl CacheKey for MyDataKey { -/// type ValueType = MyData; -/// fn codec() -> Option { -/// Some(CacheCodec::from_impl::()) -/// } -/// // ... -/// } -/// ``` +/// The envelope (magic/version/type_id/type_version) is written and validated +/// by the [`CacheCodec`] wrapper. 
[`serialize`](Self::serialize) writes only +/// the body — a header followed by sections in a fixed, version-keyed order — +/// and [`deserialize`](Self::deserialize) reads them back in that same order. +/// The read sequence mirroring the write sequence for each `type_version` is +/// the invariant the implementor owns. pub trait CacheCodecImpl: Send + Sync { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()>; + /// Stable identity for this entry type. **Must not change once shipped.** + /// This is a deliberate author-assigned string, not `std::any::type_name` + /// (which is not stable across compiler versions). + const TYPE_ID: &'static str; + + /// Body schema version this build writes. Bump when the body layout + /// changes in a way protobuf field additions cannot express transparently + /// (adding/removing/reordering sections, a raw-blob encoding change, etc.). + const CURRENT_VERSION: u32; + + /// Write the body: a header, then sections in a fixed order. + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()>; - fn deserialize(data: &Bytes) -> Result + /// Reconstruct from the body. Branch on + /// [`reader.version()`](CacheEntryReader::version) for backward compat; + /// sections are read in write order. + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result where Self: Sized; } @@ -55,25 +236,31 @@ pub(crate) type ArcAny = Arc; /// Type-erased codec for serializing and deserializing cache entries. /// -/// `CacheCodec` is two plain function pointers — it is `Copy` and has no -/// heap allocation. Construct one via [`CacheCodec::from_impl`] for types -/// that implement [`CacheCodecImpl`], or [`CacheCodec::new`] for custom -/// cases (e.g. when the orphan rule prevents a direct impl). +/// `CacheCodec` carries the entry's stable `type_id`/`version` plus two plain +/// function pointers — it is `Copy` and has no heap allocation. 
Construct one +/// via [`CacheCodec::from_impl`] for types that implement [`CacheCodecImpl`], +/// or [`CacheCodec::new`] for custom cases (e.g. when the orphan rule prevents +/// a direct impl). #[derive(Copy, Clone)] pub struct CacheCodec { - pub(crate) serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - pub(crate) deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, } impl std::fmt::Debug for CacheCodec { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CacheCodec").finish_non_exhaustive() + f.debug_struct("CacheCodec") + .field("type_id", &self.type_id) + .field("version", &self.version) + .finish_non_exhaustive() } } fn serialize_via_impl( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> Result<()> { let val = any .downcast_ref::() @@ -81,44 +268,278 @@ fn serialize_via_impl( val.serialize(writer) } -fn deserialize_via_impl(data: &Bytes) -> Result { - let val = T::deserialize(data)?; +fn deserialize_via_impl( + reader: &mut CacheEntryReader<'_>, +) -> Result { + let val = T::deserialize(reader)?; Ok(Arc::new(val) as ArcAny) } impl CacheCodec { - /// Create a `CacheCodec` from plain function pointers. + /// Create a `CacheCodec` from explicit body function pointers. /// /// Prefer [`from_impl`](Self::from_impl) when the value type implements /// [`CacheCodecImpl`]. Use this for types where a direct impl isn't - /// possible (e.g. orphan rule prevents it). + /// possible (e.g. the orphan rule prevents it). `type_id` and `version` + /// play the same role as the corresponding [`CacheCodecImpl`] constants. 
pub fn new( - serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, ) -> Self { Self { - serialize, - deserialize, + type_id, + version, + serialize_body, + deserialize_body, } } /// Create a `CacheCodec` from a [`CacheCodecImpl`] implementation. - /// - /// For **sized** types stored directly in the cache. The codec - /// downcasts `&dyn Any` to `&T` for serialization and returns `Arc` - /// from deserialization. pub fn from_impl() -> Self { Self { - serialize: serialize_via_impl::, - deserialize: deserialize_via_impl::, + type_id: T::TYPE_ID, + version: T::CURRENT_VERSION, + serialize_body: serialize_via_impl::, + deserialize_body: deserialize_via_impl::, } } - pub fn serialize(&self, value: &ArcAny, writer: &mut dyn std::io::Write) -> Result<()> { - (self.serialize)(value, writer) + /// Serialize `value` into `writer`: envelope first, then the body. + pub fn serialize(&self, value: &ArcAny, writer: &mut dyn Write) -> Result<()> { + let body_offset = write_envelope(writer, self.type_id, self.version)?; + let mut entry_writer = CacheEntryWriter::with_pos(writer, body_offset); + (self.serialize_body)(value, &mut entry_writer) + } + + /// Deserialize an entry from `data`. + /// + /// Never fails: any non-fatal failure to interpret the bytes becomes a + /// [`CacheDecode::Miss`] with the reason why (see [`CacheMissReason`]). + /// Reading from an in-memory [`Bytes`] cannot do I/O, so there is no fault + /// channel — a miss is the only non-`Hit` outcome. 
+ pub fn deserialize(&self, data: &Bytes) -> CacheDecode { + let Some(envelope) = parse_envelope(data) else { + log::debug!("cache entry rejected: missing or invalid envelope"); + return CacheDecode::Miss(CacheMissReason::InvalidEnvelope); + }; + + if envelope.type_id != self.type_id { + log::debug!( + "cache entry type_id mismatch: got {:?}, expected {:?}", + envelope.type_id, + self.type_id + ); + return CacheDecode::Miss(CacheMissReason::TypeMismatch); + } + + // A version newer than this build writes was produced by a newer build + // whose body layout we cannot assume to understand. Older/equal versions + // are the impl's responsibility to handle (branching on reader.version()). + if envelope.type_version > self.version { + log::debug!( + "cache entry {:?} has unsupported type_version {} (this build writes {})", + self.type_id, + envelope.type_version, + self.version + ); + return CacheDecode::Miss(CacheMissReason::VersionTooNew); + } + + let mut reader = CacheEntryReader::new(data, envelope.body_offset, envelope.type_version); + match (self.deserialize_body)(&mut reader) { + Ok(value) => CacheDecode::Hit(value), + Err(e) => { + log::debug!( + "cache entry {:?} v{} failed to decode: {e}", + self.type_id, + envelope.type_version + ); + CacheDecode::Miss(CacheMissReason::BodyError) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A trivial codec used to exercise the envelope and miss semantics + /// without pulling in arrow-backed payloads. 
+ #[derive(Debug, PartialEq)] + struct Widget { + n: u32, + } + + impl CacheCodecImpl for Widget { + const TYPE_ID: &'static str = "test.Widget"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()> { + writer.write_raw(&self.n.to_le_bytes()) + } + + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result { + let bytes = reader.read_raw()?; + let n = u32::from_le_bytes( + bytes + .as_ref() + .try_into() + .map_err(|_| Error::io("bad widget".to_string()))?, + ); + Ok(Self { n }) + } + } + + fn serialize_widget(widget: &Widget) -> Bytes { + let codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(Widget { n: widget.n }); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + Bytes::from(buf) + } + + /// The miss reason, or `None` if the decode was a hit. + fn miss_reason(data: &Bytes) -> Option { + match deserialize_widget(data) { + CacheDecode::Hit(_) => None, + CacheDecode::Miss(reason) => Some(reason), + } } - pub fn deserialize(&self, data: &Bytes) -> Result { - (self.deserialize)(data) + fn deserialize_widget(data: &Bytes) -> CacheDecode { + let codec = CacheCodec::from_impl::(); + match codec.deserialize(data) { + CacheDecode::Hit(any) => { + CacheDecode::Hit(Arc::try_unwrap(any.downcast::().unwrap()).unwrap()) + } + CacheDecode::Miss(reason) => CacheDecode::Miss(reason), + } + } + + #[test] + fn envelope_roundtrip_hits() { + let bytes = serialize_widget(&Widget { n: 0xDEADBEEF }); + // Sanity: the entry starts with the magic. 
+ assert_eq!(&bytes[..4], b"LCE1"); + let decoded = deserialize_widget(&bytes).hit().unwrap(); + assert_eq!(decoded, Widget { n: 0xDEADBEEF }); + } + + #[test] + fn has_cache_envelope_detects_magic() { + let bytes = serialize_widget(&Widget { n: 1 }); + assert!(has_cache_envelope(&bytes)); + assert!(has_cache_envelope(&MAGIC)); // exactly the magic, nothing after + assert!(!has_cache_envelope(b"LCE")); // too short + assert!(!has_cache_envelope(b"JUNK and more")); + assert!(!has_cache_envelope(&[])); + } + + #[test] + fn wrong_magic_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[0] = b'X'; + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn pre_stabilization_blob_is_miss() { + // An old unstable blob led with a small u64 LE length prefix (a JSON + // header of tens of bytes) — no magic. It must self-heal to a miss. + let mut blob = Vec::new(); + blob.extend_from_slice(&(42u64).to_le_bytes()); + blob.extend_from_slice(&[0u8; 42]); + assert_eq!( + miss_reason(&Bytes::from(blob)), + Some(CacheMissReason::InvalidEnvelope) + ); + + // A different unstable shape led with a small u8 tag (0/1/2). + assert_eq!( + miss_reason(&Bytes::from(vec![0u8, 1, 2, 3])), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn unknown_envelope_version_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[4] = 0xFF; // envelope_version byte + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn type_id_mismatch_is_miss() { + // Hand-build an envelope with a foreign type_id but valid framing. 
+ let mut buf = Vec::new(); + write_envelope(&mut buf, "some.OtherType", 1).unwrap(); + buf.extend_from_slice(&(4u64).to_le_bytes()); + buf.extend_from_slice(&99u32.to_le_bytes()); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::TypeMismatch) + ); + } + + #[test] + fn unsupported_future_type_version_is_miss() { + // An entry written by a newer build (higher type_version) must miss + // rather than be misread by this build. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION + 1).unwrap(); + lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &9u32.to_le_bytes()).unwrap(); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::VersionTooNew) + ); + } + + #[test] + fn truncated_envelope_is_miss() { + let bytes = serialize_widget(&Widget { n: 7 }); + for cut in [0, 1, 4, 5, 7, 9] { + assert_eq!( + miss_reason(&bytes.slice(..cut.min(bytes.len()))), + Some(CacheMissReason::InvalidEnvelope), + "truncating to {cut} bytes should miss as InvalidEnvelope" + ); + } + } + + #[test] + fn body_decode_error_is_miss() { + // Valid envelope, but the body is too short for the widget. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION).unwrap(); + buf.extend_from_slice(&(1u64).to_le_bytes()); + buf.push(0u8); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::BodyError) + ); + } + + #[test] + fn reader_exposes_envelope_version() { + // type_version travels through the envelope to reader.version(). + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, 7).unwrap(); + let body_off = buf.len(); + // A widget body so the codec can decode it. 
+ lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &5u32.to_le_bytes()).unwrap(); + let data = Bytes::from(buf); + + let mut r = CacheEntryReader::new(&data, body_off, 7); + assert_eq!(r.version(), 7); + assert_eq!(r.read_raw().unwrap().as_ref(), 5u32.to_le_bytes()); } } diff --git a/rust/lance-core/src/cache/entry_io.rs b/rust/lance-core/src/cache/entry_io.rs new file mode 100644 index 00000000000..fe91b11ca7d --- /dev/null +++ b/rust/lance-core/src/cache/entry_io.rs @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Streaming readers/writers for cache entry bodies. +//! +//! [`CacheCodecImpl`](super::CacheCodecImpl) bodies are written and read +//! through these wrappers. They keep serialization streaming (no buffering of +//! the whole entry) and reads zero-copy (sections borrow from the input +//! [`Bytes`]), while tracking the byte position needed to keep Arrow IPC +//! sections 64-byte aligned (see [`lance_arrow::ipc`]). +//! +//! Body layout primitives: +//! +//! ```text +//! HEADER : [header_len: u32 LE][header proto bytes] +//! ARROW_IPC : [pad to 64B][self-delimiting IPC stream] +//! RAW_BLOB : [len: u64 LE][bytes] +//! ``` + +use std::io::Write; + +use arrow_array::RecordBatch; +use bytes::Bytes; +use prost::Message; + +use crate::{Error, Result}; + +/// Writes a cache entry body: a header followed by sections, streaming +/// directly to the underlying writer. +/// +/// The envelope is written by the [`CacheCodec`](super::CacheCodec) wrapper +/// before this writer is handed to +/// [`CacheCodecImpl::serialize`](super::CacheCodecImpl::serialize). +pub struct CacheEntryWriter<'a> { + writer: &'a mut dyn Write, + /// Absolute byte offset within the entry, used to align IPC sections. + pos: usize, +} + +impl<'a> CacheEntryWriter<'a> { + /// Create a writer positioned at the start of an entry (offset 0). + /// + /// Use this for nested serialization into a standalone buffer. 
The + /// envelope-aware entry point is [`CacheCodec::serialize`](super::CacheCodec::serialize). + pub fn new(writer: &'a mut dyn Write) -> Self { + Self { writer, pos: 0 } + } + + /// Create a writer whose section alignment accounts for `pos` bytes + /// already written ahead of the body (i.e. the envelope). + pub(crate) fn with_pos(writer: &'a mut dyn Write, pos: usize) -> Self { + Self { writer, pos } + } + + /// Write a single discriminant byte (e.g. a variant tag). + pub fn write_u8(&mut self, value: u8) -> Result<()> { + self.writer.write_all(&[value])?; + self.pos += 1; + Ok(()) + } + + /// Write a protobuf header as `[len: u32 LE][bytes]`. + pub fn write_header(&mut self, header: &P) -> Result<()> { + let bytes = header.encode_to_vec(); + let len = u32::try_from(bytes.len()) + .map_err(|_| Error::io(format!("cache header too large: {} bytes", bytes.len())))?; + self.writer.write_all(&len.to_le_bytes())?; + self.writer.write_all(&bytes)?; + self.pos += 4 + bytes.len(); + Ok(()) + } + + /// Write `batch` as a 64-byte-aligned Arrow IPC section. + pub fn write_ipc(&mut self, batch: &RecordBatch) -> Result<()> { + lance_arrow::ipc::write_ipc_section(self.writer, &mut self.pos, batch) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC + /// section. The iterator must yield at least one batch. + pub fn write_ipc_batches(&mut self, batches: I) -> Result<()> + where + I: IntoIterator, + { + lance_arrow::ipc::write_ipc_section_batches(self.writer, &mut self.pos, batches) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write a raw blob as `[len: u64 LE][bytes]`. + /// + /// Only for byte payloads that already have their own stable, portable + /// encoding (e.g. a roaring bitmap, a varint-packed stream). 
+ pub fn write_raw(&mut self, bytes: &[u8]) -> Result<()> { + lance_arrow::ipc::write_len_prefixed_bytes(self.writer, bytes) + .map_err(|e| Error::io(e.to_string()))?; + self.pos += 8 + bytes.len(); + Ok(()) + } + + /// The underlying writer, for a payload that carries its own framing. + /// + /// Use this only when the codec writes a self-delimiting or whole-body + /// payload — e.g. streaming a roaring bitmap as the entire body, where the + /// length prefix of [`write_raw`](Self::write_raw) would be redundant and + /// buffering to measure that length would force an extra copy. For + /// structured bodies prefer [`write_header`](Self::write_header) / + /// [`write_ipc`](Self::write_ipc) / [`write_raw`](Self::write_raw), which + /// give you versioning and 64-byte IPC alignment. + /// + /// Bytes written through this do **not** advance the section-alignment + /// position, so it must not be interleaved with [`write_ipc`](Self::write_ipc). + pub fn raw_writer(&mut self) -> &mut dyn Write { + self.writer + } +} + +/// Reads a cache entry body, tracking an offset into the input and exposing +/// the entry's `type_version` so implementors can branch for backward compat. +/// +/// All reads are zero-copy: returned [`Bytes`] and the buffers behind decoded +/// [`RecordBatch`]es borrow from the input allocation. +pub struct CacheEntryReader<'a> { + data: &'a Bytes, + offset: usize, + version: u32, +} + +impl<'a> CacheEntryReader<'a> { + /// Create a reader over `data`, starting at body byte `offset`, for an + /// entry written at `version`. + pub fn new(data: &'a Bytes, offset: usize, version: u32) -> Self { + Self { + data, + offset, + version, + } + } + + /// The `type_version` from the envelope. Branch on this for backward compat. + pub fn version(&self) -> u32 { + self.version + } + + /// Read a single discriminant byte written by [`CacheEntryWriter::write_u8`]. 
+ pub fn read_u8(&mut self) -> Result { + let bytes = self.data.as_ref(); + let v = *bytes + .get(self.offset) + .ok_or_else(|| Error::io("cache entry: truncated, missing tag byte".to_string()))?; + self.offset += 1; + Ok(v) + } + + /// Read a protobuf header written by [`CacheEntryWriter::write_header`]. + pub fn read_header(&mut self) -> Result

{ + let bytes = self.data.as_ref(); + let len_end = self + .offset + .checked_add(4) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated length prefix".to_string()))?; + let len = u32::from_le_bytes(bytes[self.offset..len_end].try_into().unwrap()) as usize; + let data_end = len_end + .checked_add(len) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated body".to_string()))?; + let msg = P::decode(&bytes[len_end..data_end]) + .map_err(|e| Error::io(format!("cache header decode failed: {e}")))?; + self.offset = data_end; + Ok(msg) + } + + /// Read one [`RecordBatch`] from a 64-byte-aligned IPC section. + pub fn read_ipc(&mut self) -> Result { + lance_arrow::ipc::read_ipc_section_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read all [`RecordBatch`]es from a 64-byte-aligned multi-batch IPC + /// section written by [`CacheEntryWriter::write_ipc_batches`]. + pub fn read_ipc_batches(&mut self) -> Result> { + lance_arrow::ipc::read_ipc_section_batches_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read a raw blob written by [`CacheEntryWriter::write_raw`], zero-copy. + pub fn read_raw(&mut self) -> Result { + lance_arrow::ipc::read_len_prefixed_bytes_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// The not-yet-consumed body bytes as a zero-copy slice. + /// + /// For a payload that carries its own framing and is parsed with the + /// codec's own cursor — the read counterpart of + /// [`CacheEntryWriter::raw_writer`]. For structured bodies prefer + /// [`read_header`](Self::read_header) / [`read_ipc`](Self::read_ipc) / + /// [`read_raw`](Self::read_raw). + pub fn body(&self) -> Bytes { + self.data.slice(self.offset..) 
+ } +} diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs index f62837fe3cc..07038c6e9d5 100644 --- a/rust/lance-core/src/cache/mod.rs +++ b/rust/lance-core/src/cache/mod.rs @@ -47,10 +47,14 @@ pub mod backend; pub mod codec; +mod entry_io; mod moka; -pub use backend::{CacheBackend, CacheEntry, InternalCacheKey}; -pub use codec::{CacheCodec, CacheCodecImpl}; +pub use backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; +pub use codec::{ + CacheCodec, CacheCodecImpl, CacheDecode, CacheMissReason, MAGIC, has_cache_envelope, +}; +pub use entry_io::{CacheEntryReader, CacheEntryWriter}; pub use moka::MokaCacheBackend; use std::borrow::Cow; @@ -245,6 +249,40 @@ impl LanceCache { self.cache.size_bytes().await } + /// Return an iterator over keys currently stored under this cache's prefix. + /// + /// Returns `None` when the backend does not support key inventory. The + /// iterator is intended for diagnostics and may be weakly consistent with + /// concurrent cache mutations. + /// + /// # Examples + /// + /// ``` + /// # use std::{borrow::Cow, sync::Arc}; + /// # use lance_core::cache::{CacheKey, LanceCache}; + /// # struct MyKey; + /// # impl CacheKey for MyKey { + /// # type ValueType = Vec; + /// # fn key(&self) -> Cow<'_, str> { Cow::Borrowed("my-key") } + /// # fn type_name() -> &'static str { "VecI32" } + /// # } + /// # async fn example() { + /// let cache = LanceCache::with_capacity(1024); + /// cache.insert_with_key(&MyKey, Arc::new(vec![1, 2, 3])).await; + /// + /// let mut keys = cache.keys().await.expect("Moka supports key inventory"); + /// assert_eq!(keys.next().unwrap().key(), "my-key"); + /// # } + /// ``` + pub async fn keys(&self) -> Option> { + Some(Box::new( + self.cache + .keys() + .await? 
+ .filter(|key| key.starts_with(&self.prefix)), + )) + } + // -- Sized insert/get (internal, shared by sized and unsized paths) -------- async fn insert_with_id( @@ -557,7 +595,7 @@ impl CacheStats { #[cfg(test)] mod tests { use super::*; - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use std::marker::PhantomData; struct TestKey { @@ -609,6 +647,18 @@ mod tests { } } + fn key_fields(keys: &[InternalCacheKey]) -> BTreeSet<(String, String, &'static str)> { + keys.iter() + .map(|key| { + ( + key.prefix().to_string(), + key.key().to_string(), + key.type_name(), + ) + }) + .collect() + } + #[tokio::test] async fn test_cache_bytes() { let item = Arc::new(vec![1, 2, 3]); @@ -718,6 +768,99 @@ mod tests { assert_eq!(base.stats().await.hits, 1); } + #[tokio::test] + async fn test_cache_keys_with_prefixes() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let nested = prefixed.with_key_prefix("index"); + let other = base.with_key_prefix("ns-other"); + + base.insert_with_key(&TestKey::new("root"), Arc::new(vec![0])) + .await; + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + nested + .insert_with_key(&TestKey::new("nested"), Arc::new(vec![2])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![3])) + .await; + + let base_keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&base_keys), + BTreeSet::from([ + ( + "".to_string(), + "root".to_string(), + TestKey::>::type_name() + ), + ( + "ns/".to_string(), + "child".to_string(), + TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ( + "ns-other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + ), + ]) + ); + + let prefixed_keys = prefixed.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&prefixed_keys), + BTreeSet::from([ + ( + "ns/".to_string(), + "child".to_string(), + 
TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ]) + ); + } + + #[tokio::test] + async fn test_cache_keys_reflect_invalidation_and_clear() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let other = base.with_key_prefix("other"); + + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![2])) + .await; + assert_eq!(base.keys().await.unwrap().count(), 2); + + prefixed.invalidate_prefix("").await; + let keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&keys), + BTreeSet::from([( + "other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + )]) + ); + + base.clear().await; + assert_eq!(base.keys().await.unwrap().count(), 0); + } + #[tokio::test] async fn test_cache_get_or_insert() { let cache = LanceCache::with_capacity(1000); @@ -833,6 +976,7 @@ mod tests { .await .is_none() ); + assert!(cache.keys().await.is_none()); } #[tokio::test] diff --git a/rust/lance-core/src/cache/moka.rs b/rust/lance-core/src/cache/moka.rs index 6be7760458a..a3956c1720c 100644 --- a/rust/lance-core/src/cache/moka.rs +++ b/rust/lance-core/src/cache/moka.rs @@ -11,7 +11,7 @@ use futures::Future; use crate::Result; use super::CacheCodec; -use super::backend::{CacheBackend, CacheEntry, InternalCacheKey}; +use super::backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; /// Internal record stored in the moka cache. 
#[derive(Clone, Debug)] @@ -123,6 +123,13 @@ impl CacheBackend for MokaCacheBackend { self.cache.run_pending_tasks().await; } + async fn keys(&self) -> Option> { + self.cache.run_pending_tasks().await; + Some(Box::new( + self.cache.iter().map(|(key, _)| key.as_ref().clone()), + )) + } + async fn num_entries(&self) -> usize { self.cache.run_pending_tasks().await; self.cache.entry_count() as usize diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 628f9cf9a90..8837037c308 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -25,6 +25,7 @@ pub use field::{ pub use schema::{ BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema, escape_field_path_for_project, format_field_path, parse_field_path, + validate_fixed_size_list_dimensions, }; pub static BLOB_DESC_FIELDS: LazyLock = LazyLock::new(|| { diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index 4c2665a3640..9f06d421949 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -575,6 +575,18 @@ impl Field { } } + /// Convert blob v2 fields in this field tree to their descriptor view. 
+ pub fn unload_blobs_recursive(&mut self) { + if self.is_blob_v2() { + self.unloaded_mut(); + return; + } + + for child in &mut self.children { + child.unload_blobs_recursive(); + } + } + pub fn project(&self, path_components: &[&str]) -> Result { let mut f = Self { name: self.name.clone(), @@ -1864,6 +1876,54 @@ mod tests { assert_eq!(field.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); } + #[test] + fn unload_blobs_recursive_only_unloads_blob_v2() { + let legacy_metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); + let blob_v2_metadata = + HashMap::from([(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]); + + let mut field: Field = ArrowField::new( + "parent", + DataType::Struct(Fields::from(vec![ + ArrowField::new("legacy_blob", DataType::LargeBinary, true) + .with_metadata(legacy_metadata), + ArrowField::new( + "blob_v2", + DataType::Struct( + vec![ + ArrowField::new("data", DataType::LargeBinary, true), + ArrowField::new("uri", DataType::Utf8, true), + ] + .into(), + ), + true, + ) + .with_metadata(blob_v2_metadata), + ])), + true, + ) + .try_into() + .unwrap(); + + field.unload_blobs_recursive(); + + let legacy_blob = field + .children + .iter() + .find(|f| f.name == "legacy_blob") + .unwrap(); + assert_eq!( + legacy_blob.logical_type, + LogicalType::try_from(&DataType::LargeBinary).unwrap() + ); + assert_eq!(legacy_blob.children.len(), 0); + assert!(legacy_blob.metadata.contains_key(BLOB_META_KEY)); + + let blob_v2 = field.children.iter().find(|f| f.name == "blob_v2").unwrap(); + assert_eq!(blob_v2.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); + assert_eq!(blob_v2.children.len(), 5); + } + #[test] fn project_by_field_accepts_blob_descriptor_projection() { let metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index f959c37672f..d13eb476359 100644 --- 
a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -11,7 +11,7 @@ use std::{ use crate::deepsize::DeepSizeOf; use arrow_array::RecordBatch; -use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_arrow::*; use super::field::{Field, OnTypeMismatch, SchemaCompareOptions}; @@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } } +/// Reject `FixedSizeList` types whose dimension is not a positive integer. +/// +/// The row count of a fixed-size list is derived by dividing the number of +/// child items by the dimension, so a zero dimension panics with a +/// divide-by-zero further down the write path (see issue #5102). A +/// `FixedSizeList` of a `FixedSizeList` over a primitive collapses into a +/// single leaf field, so the pre-order field walk never visits the inner list; +/// recurse through the nested list types here to catch an inner zero dimension. +/// +/// Shared by [`Schema::validate`] on the write path and the decoder's +/// field-scheduler builders on the read path. +pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> { + if let DataType::FixedSizeList(inner, dimension) = data_type { + if *dimension <= 0 { + return Err(Error::schema(format!( + "Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer" + ))); + } + validate_fixed_size_list_dimensions(field_name, inner.data_type())?; + } + Ok(()) +} + impl Schema { /// The unenforced primary key fields in the schema, ordered by position. /// @@ -346,6 +369,10 @@ impl Schema { field.id, self ))); } + // The row count of a fixed-size list is derived by dividing the + // number of items by the dimension, so a zero dimension would + // panic with a divide-by-zero further down the write path. 
+ validate_fixed_size_list_dimensions(&field.name, &field.data_type())?; } Ok(()) @@ -2825,6 +2852,67 @@ mod tests { assert!(paths.contains(&"name".to_string())); } + #[test] + fn test_validate_rejects_zero_dimension_fixed_size_list() { + // A zero dimension divides-by-zero further down the write path (#5102) + let fsl = |dimension: i32| { + ArrowDataType::FixedSizeList( + Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)), + dimension, + ) + }; + + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // Nested inside a struct is rejected too + let arrow_schema = ArrowSchema::new(vec![ArrowField::new( + "outer", + ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new( + "vec", + fsl(0), + true, + )])), + true, + )]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A zero-dimension FixedSizeList nested inside a positive-dimension + // FixedSizeList collapses into a single leaf field, so the inner + // dimension is not visited by the pre-order field walk and must still + // be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4). 
+ let nested = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A positive dimension still validates, including nested lists + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + let nested_ok = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + } + #[test] fn test_schema_unenforced_clustering_key() { use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION; diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs index 8f16744b158..c202329838c 100644 --- a/rust/lance-core/src/utils.rs +++ b/rust/lance-core/src/utils.rs @@ -12,6 +12,7 @@ pub mod cpu; pub mod deletion; pub mod futures; pub mod hash; +pub mod io_stats; pub mod parse; pub mod path; pub mod tempfile; diff --git a/rust/lance-core/src/utils/io_stats.rs b/rust/lance-core/src/utils/io_stats.rs new file mode 100644 index 00000000000..e2169d71ae3 --- /dev/null +++ b/rust/lance-core/src/utils/io_stats.rs @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::ops::Range; + +/// A sink that records I/O requests as they are submitted to storage. +/// +/// This lives in `lance-core` so that the encoding layer (`lance-encoding`) and +/// the I/O layer (`lance-io`) can both refer to it without depending on one +/// another. It lets a caller attach a lightweight counter to a file reader and +/// measure the exact bytes/IOPS performed for a bounded scope (e.g. 
a single +/// query); see `lance_io::scheduler::IoStats` for the concrete implementation. +/// +/// # When to use this +/// +/// Lance also exposes two *process-wide, cumulative* I/O accounting facilities: +/// the global scheduler counters (`lance_io::scheduler::iops_counter` / +/// `bytes_read_counter`) and the object-store `IOTracker` wrapper used in tests. +/// Both aggregate every read in the process and cannot attribute I/O to a single +/// bounded scope. Prefer an `IoStatsRecorder` when you need the *exact* I/O of +/// one operation (e.g. a single query): attach it to a reader with +/// `with_io_stats`, then read the snapshot when the scope ends. It re-uses the +/// reader's cached metadata, so measuring costs no extra file opens and does not +/// disturb the global counters. +pub trait IoStatsRecorder: std::fmt::Debug + Send + Sync { + /// Record one completed request, given the byte ranges as actually + /// submitted to storage (i.e. after any coalescing/splitting), so the + /// counts reflect physical I/O. + fn record_request(&self, ranges: &[Range]); +} diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs index 79650f6775e..a0da34ba2bb 100644 --- a/rust/lance-datafusion/src/expr.rs +++ b/rust/lance-datafusion/src/expr.rs @@ -17,6 +17,18 @@ const MS_PER_DAY: i64 = 86400000; // will always yield "x = 7_u64" regardless of the type of the column "x". As a result, we // need to do that literal coercion ourselves. pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option { + // A dictionary target coerces the value to the dictionary's value type and + // re-wraps it as a dictionary literal. Only an untyped `ScalarValue::Null` + // keeps its untyped form, matching the behavior for all other targets; a + // *typed* null (e.g. `Utf8(None)`) is coerced and wrapped like any other + // value so it produces a `Dictionary(..)` literal that matches the column. 
+ if let DataType::Dictionary(key_type, value_type) = ty { + if matches!(value, ScalarValue::Null) { + return Some(value.clone()); + } + let inner = safe_coerce_scalar(value, value_type)?; + return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner))); + } match value { ScalarValue::Int8(val) => match ty { DataType::Int8 => Some(value.clone()), @@ -436,6 +448,9 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option Some(value.clone()), _ => None, }, + // A dictionary-encoded literal (e.g. produced by DataFusion's dictionary + // cast in the scalar-index path) coerces by unwrapping its underlying value. + ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty), _ => None, } } @@ -775,4 +790,97 @@ mod tests { Some(ScalarValue::BinaryView(Some(vec![1, 2, 3]))) ); } + + #[test] + fn test_dictionary_coerce() { + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + + // A string literal coerces to a dictionary target by wrapping the + // coerced value in a dictionary scalar. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // The inner value is coerced through to the dictionary value type, so a + // LargeUtf8 literal lands as a Utf8 value inside the dictionary. + assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // A dictionary literal round-trips back to its value type. 
+ assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &DataType::Utf8, + ), + Some(ScalarValue::Utf8(Some("com".to_string()))) + ); + + // A dictionary literal coerces to a dictionary target, adopting the + // target's key type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &dict_ty, + ), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // An untyped null keeps its untyped form for a dictionary target, just + // like for every other target type. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Null, &dict_ty), + Some(ScalarValue::Null) + ); + + // A *typed* null (e.g. an API-built `Utf8(None)` literal, or an IN value + // already typed as Utf8) is still wrapped in the dictionary type so it + // matches the dictionary column. Returning a bare `Utf8(None)` here would + // leave `resolve_value` with a literal whose type does not line up with + // the column, breaking planning/evaluation the same way non-null strings + // used to break. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // The inner null is coerced through to the dictionary value type as well, + // so a LargeUtf8 typed null lands as a Utf8 null inside the dictionary. + assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // A value that cannot be coerced to the dictionary value type fails. 
+ assert_eq!( + safe_coerce_scalar( + &ScalarValue::Utf8(Some("com".to_string())), + &DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)), + ), + None + ); + } } diff --git a/rust/lance-datafusion/src/logical_expr.rs b/rust/lance-datafusion/src/logical_expr.rs index ab0936d31da..0eed438dae7 100644 --- a/rust/lance-datafusion/src/logical_expr.rs +++ b/rust/lance-datafusion/src/logical_expr.rs @@ -463,4 +463,58 @@ mod tests { _ => unreachable!("Expected BinaryExpr"), } } + + #[test] + fn test_resolve_typed_null_against_dictionary_column() { + // A dictionary-encoded string column, e.g. a categorical field. + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + let arrow_schema = ArrowSchema::new(vec![Field::new("etld", dict_ty, true)]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + // A typed null must be wrapped in the dictionary type, not left as a bare + // `Utf8(None)` literal sitting next to a `Dictionary(...)` column. + let expected_null = Expr::Literal( + ScalarValue::Dictionary(Box::new(DataType::Int16), Box::new(ScalarValue::Utf8(None))), + None, + ); + + // `etld = ` built directly via the API, as opposed to coming + // through SQL parsing. + let expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column("etld".to_string().into())), + op: Operator::Eq, + right: Box::new(Expr::Literal(ScalarValue::Utf8(None), None)), + }); + match resolve_expr(&expr, &schema).unwrap() { + Expr::BinaryExpr(be) => assert_eq!(be.right.as_ref(), &expected_null), + other => unreachable!("Expected BinaryExpr, got {other:?}"), + } + + // `etld IN ('a', )` — a typed value mixed with a typed null, + // both already typed as Utf8. Every list element is wrapped in the + // dictionary type. 
+ let expr = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None), + Expr::Literal(ScalarValue::Utf8(None), None), + ], + false, + ); + let expected = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal( + ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("a".to_string()))), + ), + None, + ), + expected_null, + ], + false, + ); + assert_eq!(resolve_expr(&expr, &schema).unwrap(), expected); + } } diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml index eae1e3086b6..83b5aba3689 100644 --- a/rust/lance-datagen/Cargo.toml +++ b/rust/lance-datagen/Cargo.toml @@ -21,7 +21,6 @@ hex = "0.4.3" rand = { workspace = true } rand_distr = { workspace = true } rand_xoshiro = { workspace = true } -random_word = { version = "0.5", features = ["en"] } [dev-dependencies] criterion = { workspace = true } diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 3756e354bea..39da4734619 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -21,7 +21,6 @@ use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, Sc use futures::{StreamExt, stream::BoxStream}; use rand::{Rng, RngCore, SeedableRng, distr::Uniform}; use rand_distr::Zipf; -use random_word; use self::array::rand_with_distribution; @@ -1172,24 +1171,223 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator { } } -// Common English stop words placed at the front to be sampled more frequently +// Common English stop words placed at the front to be sampled more frequently. 
const STOP_WORDS: &[&str] = &[ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", ]; +const ENGLISH_WORDS: &[&str] = &[ + "ability", + "able", + "about", + "above", + "accept", + "access", + "account", + "across", + "action", + "active", + "activity", + "actual", + "address", + "adjust", + "admin", + "advance", + "agent", + "align", + "allow", + "amount", + "analysis", + "answer", + "application", + "archive", + "array", + "asset", + "async", + "attribute", + "available", + "balance", + "batch", + "binary", + "bitmap", + "block", + "branch", + "buffer", + "build", + "cache", + "capacity", + "catalog", + "change", + "chunk", + "client", + "cluster", + "column", + "commit", + "common", + "compare", + "compile", + "compute", + "condition", + "config", + "connect", + "content", + "context", + "control", + "convert", + "copy", + "core", + "count", + "create", + "current", + "cursor", + "data", + "dataset", + "decode", + "default", + "delete", + "delta", + "depend", + "derive", + "design", + "detail", + "detect", + "device", + "direct", + "display", + "document", + "domain", + "drive", + "dynamic", + "encode", + "engine", + "error", + "event", + "example", + "execute", + "expand", + "expect", + "export", + "extend", + "feature", + "field", + "filter", + "final", + "finish", + "format", + "fragment", + "future", + "generate", + "global", + "group", + "handle", + "header", + "index", + "input", + "insert", + "inspect", + "instance", + "integer", + "internal", + "item", + "join", + "kernel", + "large", + "layer", + "layout", + "length", + "level", + "limit", + "linear", + "local", + "logical", + "lookup", + "manage", + "manifest", + "memory", + "merge", + "metric", + "model", + "module", + "namespace", + "native", + "node", + "normal", + "number", + "object", + "offset", + "option", + "output", + 
"package", + "page", + "parallel", + "parse", + "partition", + "pattern", + "physical", + "plan", + "policy", + "prefix", + "prepare", + "primary", + "process", + "profile", + "project", + "property", + "query", + "range", + "reader", + "record", + "region", + "registry", + "request", + "resolve", + "resource", + "result", + "return", + "row", + "runtime", + "scalar", + "scan", + "schema", + "search", + "segment", + "select", + "session", + "setting", + "source", + "stable", + "stage", + "state", + "static", + "storage", + "stream", + "string", + "struct", + "table", + "target", + "task", + "thread", + "token", + "trace", + "transform", + "type", + "update", + "upload", + "value", + "vector", + "version", + "view", + "write", + "writer", +]; + /// Word list with stop words at the front for Zipf sampling, computed once. static SENTENCE_WORDS: LazyLock> = LazyLock::new(|| { - let all_words = random_word::all(random_word::Lang::En); - let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len()); + let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len()); words.extend(STOP_WORDS.iter().copied()); - words.extend( - all_words - .iter() - .filter(|w| !STOP_WORDS.contains(w)) - .copied(), - ); + words.extend(ENGLISH_WORDS.iter().copied()); words }); @@ -1279,7 +1477,7 @@ struct RandomWordGenerator { impl RandomWordGenerator { pub fn new(is_large: bool) -> Self { - let words = random_word::all(random_word::Lang::En); + let words = ENGLISH_WORDS; Self { words, is_large } } } @@ -3190,9 +3388,9 @@ mod tests { assert_eq!( *genn.generate(RowCount::from(3), &mut rng).unwrap(), arrow_array::BinaryArray::from_iter_values([ - vec![174, 178], - vec![64, 122, 207, 248], - vec![124, 3, 58] + vec![111, 9, 80], + vec![86, 118, 13, 209], + vec![68, 33, 202] ]) ); } diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index 59886d337d1..a30d5ed93a9 100644 --- a/rust/lance-encoding/src/decoder.rs +++ 
b/rust/lance-encoding/src/decoder.rs @@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream}; use futures::{FutureExt, StreamExt}; use lance_arrow::DataTypeExt; use lance_core::cache::LanceCache; -use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema}; +use lance_core::datatypes::{ + BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions, +}; use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt}; use lance_core::utils::parse::parse_env_as_bool; use log::{debug, trace, warn}; @@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy { column_infos: &mut ColumnInfoIter, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_structural_primitive(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new( @@ -832,6 +835,7 @@ impl CoreFieldDecoderStrategy { buffers: FileBuffers, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_primitive_legacy(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?; @@ -2887,6 +2891,52 @@ pub async fn decode_batch( mod tests { use super::*; + #[test] + fn test_read_zero_dimension_fsl_errors_instead_of_panicking() { + // Simulates reading a column whose stored schema declares a + // zero-dimension FixedSizeList, as old writers (before #5102) could + // persist. The read plan is built by the field-scheduler factories, + // which run the dimension guard before touching any column data, so + // an empty column iterator is sufficient to reach the guard. The read + // must surface a clean error rather than a divide-by-zero panic. 
+ use arrow_schema::Field as ArrowField; + + let zero_dim = DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 0, + ); + let field = Field::try_from(&ArrowField::new("vec", zero_dim, true)).unwrap(); + let strategy = CoreFieldDecoderStrategy::default(); + + let mut structural_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_structural_field_scheduler(&field, &mut structural_columns) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + let mut legacy_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_legacy_field_scheduler( + &field, + &mut legacy_columns, + FileBuffers { + positions_and_sizes: &[], + }, + ) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + } + #[test] fn test_coalesce_indices_to_ranges_with_single_index() { let indices = vec![1]; diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 064e3b59745..9b506359e55 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -3701,12 +3701,7 @@ struct SerializedFullZip { // // If we directly record the size in bytes with 12 bits we would be limited to // 4KiB which is too small. Since we know each mini-block consists of 8 byte -// words we can store the # of words instead which gives us 32KiB. We want -// at least 24KiB so we can handle even the worst case of -// - 4Ki values compressed into an 8186 byte buffer -// - 4 bytes to describe rep & def lengths -// - 16KiB of rep & def buffer (this will almost never happen but life is easier if we -// plan for it) +// words we can store the # of words instead which gives us 32KiB. // // Second, each chunk in a mini-block is aligned to 8 bytes. 
This allows multi-byte // values like offsets to be stored in a mini-block and safely read back out. It also @@ -3906,9 +3901,9 @@ impl PrimitiveStructuralEncoder { // 0xA) All blocks except the last must have power-of-two number of values. // This not only makes metadata smaller but it makes decoding easier since // batch sizes are typically a power of 2. 4 bits would allow us to express - // up to 16Ki values but we restrict this further to 4Ki values. + // up to 32Ki values. // - // This means blocks can have 1 to 4Ki values and 8 - 32Ki bytes. + // This means blocks can have 1 to 32Ki values and 8 - 32Ki bytes. // // All metadata words are serialized (as little endian) into a single buffer // of metadata values. @@ -4007,7 +4002,13 @@ impl PrimitiveStructuralEncoder { } } else { for &buffer_size in &chunk.buffer_sizes { - data_buffer.extend_from_slice(&(buffer_size as u16).to_le_bytes()); + let buffer_size = u16::try_from(buffer_size).map_err(|_| { + Error::internal(format!( + "Mini-block buffer size ({} bytes) too large for 16-bit metadata", + buffer_size + )) + })?; + data_buffer.extend_from_slice(&buffer_size.to_le_bytes()); } } @@ -4041,15 +4042,28 @@ impl PrimitiveStructuralEncoder { let chunk_bytes = data_buffer.len() - start_pos; let max_chunk_size = if support_large_chunk { - 4 * 1024 * 1024 * 1024 // 4GB limit with u32 metadata + 1_u64 << 31 // 28 bits of 8-byte words in u32 metadata } else { 32 * 1024 // 32KiB limit with u16 metadata }; - assert!(chunk_bytes <= max_chunk_size); - assert!(chunk_bytes > 0); - assert_eq!(chunk_bytes % 8, 0); - // 4Ki values max - assert!(chunk.log_num_values <= 12); + if chunk_bytes == 0 || chunk_bytes as u64 > max_chunk_size { + return Err(Error::internal(format!( + "Mini-block chunk size {} bytes exceeds the {} byte metadata limit", + chunk_bytes, max_chunk_size + ))); + } + if chunk_bytes % MINIBLOCK_ALIGNMENT != 0 { + return Err(Error::internal(format!( + "Mini-block chunk size {} bytes is not aligned to {} bytes", + 
chunk_bytes, MINIBLOCK_ALIGNMENT + ))); + } + if chunk.log_num_values > 15 { + return Err(Error::internal(format!( + "Mini-block log_num_values {} exceeds the 4-bit metadata limit", + chunk.log_num_values + ))); + } // We subtract 1 here from chunk_bytes because we want to be able to express // a size of 32KiB and not (32Ki - 8)B which is what we'd get otherwise with // 0xFFF @@ -5768,8 +5782,9 @@ mod tests { use super::{ ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor, FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipReadSource, - FullZipRepIndexDetails, FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor, - PreambleAction, StructuralPageScheduler, VariableFullZipDecoder, + FullZipRepIndexDetails, FullZipScheduler, MiniBlockChunk, MiniBlockCompressed, + MiniBlockRepIndex, PerValueDecompressor, PreambleAction, StructuralPageScheduler, + VariableFullZipDecoder, }; use crate::buffer::LanceBuffer; use crate::compression::DefaultDecompressionStrategy; @@ -6967,7 +6982,7 @@ mod tests { #[tokio::test] async fn test_binary_large_minichunk_size_over_max_miniblock_values() { let mut string_data = Vec::new(); - // 128kb/chunk / 6 bytes (t_9999) = 21845 > max 4096 items per chunk + // 128kb/chunk / 6 bytes (t_9999) = 21845 items per chunk for i in 0..10000 { string_data.push(Some(format!("t_{}", i))); } @@ -7566,6 +7581,36 @@ mod tests { ); } + #[test] + fn test_v2_1_miniblock_serializes_log_num_values_15() { + let miniblocks = MiniBlockCompressed { + data: vec![LanceBuffer::from(vec![1_u8; 16])], + chunks: vec![ + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 15, + }, + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 0, + }, + ], + num_values: 32_769, + }; + + let serialized = + PrimitiveStructuralEncoder::serialize_miniblocks(miniblocks, None, None, false) + .unwrap(); + + let chunk_metadata = serialized.metadata.borrow_to_typed_slice::(); + assert_eq!(chunk_metadata.len(), 2); + 
assert_eq!( + chunk_metadata[0] & 0x0F, + 15, + "V2.1 metadata should use all 4 bits for log_num_values" + ); + } + async fn encode_first_page( field: arrow_schema::Field, array: ArrayRef, diff --git a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs index de3227b2a39..1cf3b9bf581 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs @@ -19,13 +19,14 @@ use lance_core::Result; pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6; const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096; +const MAX_CONFIGURABLE_MINIBLOCK_VALUES: u64 = 32768; fn parse_max_miniblock_values() -> u64 { let val = std::env::var("LANCE_MINIBLOCK_MAX_VALUES") .ok() .and_then(|v| v.parse().ok()) .unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES); - val.clamp(1, DEFAULT_MAX_MINIBLOCK_VALUES) + val.clamp(1, MAX_CONFIGURABLE_MINIBLOCK_VALUES) } pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock = @@ -58,9 +59,9 @@ pub struct MiniBlockCompressed { /// and contain a power-of-two number of values (except for the last chunk) /// /// By default we limit a chunk to 4Ki values and slightly less than -/// 8KiB of compressed data. This means that even in the extreme case -/// where we have 4 bytes of rep/def then we will have at most 24KiB of -/// data (values, repetition, and definition) per mini-block. +/// 8KiB of compressed value data. The byte budget remains the primary +/// constraint, so only encodings that compress many values into that +/// budget can use larger value counts when explicitly configured. /// /// The maximum number of values per chunk can be configured via the /// `LANCE_MINIBLOCK_MAX_VALUES` environment variable. 
This is only @@ -77,8 +78,8 @@ pub struct MiniBlockChunk { // then this should be 0 (the number of values will be calculated by subtracting the // size of all other chunks from the total size of the page) // - // For example, 1 would mean there are 2 values in the chunk and 12 would mean there - // are 4Ki values in the chunk. + // For example, 1 would mean there are 2 values in the chunk and 15 would mean there + // are 32Ki values in the chunk. // // This must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096) pub log_num_values: u8, @@ -135,6 +136,14 @@ mod tests { unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } + #[test] + #[serial] + fn test_parse_can_raise_to_32k() { + unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "32768") }; + assert_eq!(parse_max_miniblock_values(), 32768); + unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; + } + #[test] #[serial] fn test_parse_clamps_zero_to_one() { @@ -147,7 +156,10 @@ mod tests { #[serial] fn test_parse_clamps_above_max() { unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "99999") }; - assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES); + assert_eq!( + parse_max_miniblock_values(), + MAX_CONFIGURABLE_MINIBLOCK_VALUES + ); unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } diff --git a/rust/lance-encoding/src/lib.rs b/rust/lance-encoding/src/lib.rs index cb4062d3220..a58e0a14c59 100644 --- a/rust/lance-encoding/src/lib.rs +++ b/rust/lance-encoding/src/lib.rs @@ -86,6 +86,22 @@ pub trait EncodingsIo: std::fmt::Debug + Send + Sync { fn with_bypass_backpressure(&self) -> Option> { None } + + /// Returns a version of this I/O service that additionally records the I/O it + /// performs into `stats`, on top of any global accounting. This is the seam + /// used to measure exact per-scope (e.g. per-query) I/O without re-opening + /// files: wrap a reader's I/O service, perform the reads, then inspect the + /// recorder. 
+ /// + /// Returns `None` if this implementation does not support per-scope I/O + /// statistics (e.g. in-memory or test schedulers), in which case the caller + /// should fall back to using self (and no statistics are recorded). + fn with_io_stats( + &self, + _stats: Arc, + ) -> Option> { + None + } } /// An implementation of EncodingsIo that serves data from an in-memory buffer diff --git a/rust/lance-file/src/io.rs b/rust/lance-file/src/io.rs index c09e9d8d372..1a8edf92b08 100644 --- a/rust/lance-file/src/io.rs +++ b/rust/lance-file/src/io.rs @@ -38,6 +38,16 @@ impl EncodingsIo for LanceEncodingsIo { })) } + fn with_io_stats( + &self, + stats: Arc, + ) -> Option> { + Some(Arc::new(Self { + scheduler: self.scheduler.with_io_stats(stats), + read_chunk_size: self.read_chunk_size, + })) + } + fn submit_request( &self, ranges: Vec>, diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs index 9e4e4c449a4..c454f73819e 100644 --- a/rust/lance-file/src/reader.rs +++ b/rust/lance-file/src/reader.rs @@ -470,6 +470,23 @@ impl FileReader { } } + /// Returns a clone of this reader whose I/O is additionally recorded into + /// `stats`, on top of the scheduler's global accounting. + /// + /// All cached metadata is shared with `self`, so no file is re-opened and + /// only a few `Arc` clones are performed. If the underlying I/O service + /// does not support per-scope statistics (e.g. an in-memory scheduler), the + /// returned reader is an ordinary, uninstrumented clone. 
+ pub fn with_io_stats( + &self, + stats: Arc, + ) -> Self { + match self.scheduler.with_io_stats(stats) { + Some(scheduler) => self.with_scheduler(scheduler), + None => self.clone(), + } + } + pub fn num_rows(&self) -> u64 { self.num_rows } diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 14a4c82bde6..12bd50df6fe 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -633,14 +633,11 @@ impl FileWriter { async fn write_global_buffers(&mut self) -> Result> { let schema = self.schema.as_mut().ok_or(Error::invalid_input("No schema provided on writer open and no data provided. Schema is unknown and file cannot be created"))?; schema.metadata = std::mem::take(&mut self.schema_metadata); - // Use descriptor layout for blob v2 in the footer to avoid exposing logical child fields. - // - // TODO(xuanwo): this doesn't work on nested struct, need better solution like fields_per_order_mut? - schema.fields.iter_mut().for_each(|f| { - if f.is_blob_v2() { - f.unloaded_mut(); - } - }); + // Use descriptor layout for blob v2 fields in the footer to avoid exposing logical child fields. 
+ schema + .fields + .iter_mut() + .for_each(|f| f.unload_blobs_recursive()); let file_descriptor = Self::make_file_descriptor(schema, self.rows_written)?; let file_descriptor_bytes = file_descriptor.encode_to_vec(); diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index e3947b57856..85de43c0f9b 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -56,6 +56,7 @@ object_store.workspace = true prost.workspace = true prost-types.workspace = true rand.workspace = true +regex-syntax.workspace = true roaring.workspace = true rayon.workspace = true serde_json.workspace = true diff --git a/rust/lance-index/benches/rq.rs b/rust/lance-index/benches/rq.rs index 4a7364d1313..72e0c49820d 100644 --- a/rust/lance-index/benches/rq.rs +++ b/rust/lance-index/benches/rq.rs @@ -17,11 +17,16 @@ use lance_datagen::array::rand_type; use lance_datagen::{BatchGeneratorBuilder, RowCount}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{ + blocked_ex_code_bytes, ex_dot_kernel, pack_blocked_row, packed_ex_code_value, +}; use lance_index::vector::bq::storage::*; use lance_index::vector::bq::transform::{ADD_FACTORS_COLUMN, SCALE_FACTORS_COLUMN}; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use lance_index::vector::storage::{DistCalculator, VectorStore}; use lance_linalg::distance::DistanceType; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; const DIM: usize = 128; const TOTAL: usize = 16 * 1000; @@ -119,16 +124,526 @@ fn compute_distances(c: &mut Criterion) { } } -#[cfg(target_os = "linux")] -criterion_group!( - name=benches; - config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); +/// The table-gather ex distance used before the dedicated ex-dot kernels, +/// kept here as the baseline: per dim, extract the packed code and gather +/// `query[d] * code` 
from a `dim * 2^ex_bits` table. +fn gather_ex_distance(row_codes: &[u8], dim: usize, ex_bits: u8, ex_dist_table: &[f32]) -> f32 { + let entries_per_dim = 1usize << ex_bits; + (0..dim) + .map(|dim_idx| { + let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize; + ex_dist_table[dim_idx * entries_per_dim + code] + }) + .sum() +} + +fn ex_dot_kernels(c: &mut Criterion) { + for ex_dim in [1536usize, 2048] { + ex_dot_kernels_for_dim(c, ex_dim); + } +} + +fn ex_dot_kernels_for_dim(c: &mut Criterion, ex_dim: usize) { + const NUM_ROWS: usize = 1024; + + let mut rng = SmallRng::seed_from_u64(42); + let query = (0..ex_dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + + for ex_bits in 1..=8u8 { + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..NUM_ROWS * ex_dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + + // The gather baseline reads the legacy sequential layout it shipped + // with; the kernel reads the blocked layout. + let seq_code_len = (ex_dim * ex_bits as usize).div_ceil(8); + let mut seq_codes = vec![0u8; NUM_ROWS * seq_code_len]; + for (row, row_values) in seq_codes + .chunks_exact_mut(seq_code_len) + .zip(values.chunks_exact(ex_dim)) + { + for (dim, &value) in row_values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + row[bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + row[bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + } + + let kernel_code_len = blocked_ex_code_bytes(ex_dim, ex_bits); + let mut kernel_codes = vec![0u8; NUM_ROWS * kernel_code_len]; + for (row, row_values) in kernel_codes + .chunks_exact_mut(kernel_code_len) + .zip(values.chunks_exact(ex_dim)) + { + pack_blocked_row(row_values, ex_bits, row); + } + + // ex_dim is block-aligned here, so the kernels read the query as-is. 
+ let ex_query = &query; + let kernel = ex_dot_kernel(ex_bits); + c.bench_function( + format!("RQ ex_dot kernel: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}").as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in kernel_codes.chunks_exact(kernel_code_len) { + sum += kernel(ex_query, row); + } + black_box(sum) + }) + }, + ); + + let entries_per_dim = 1usize << ex_bits; + let mut ex_dist_table = vec![0.0f32; ex_dim * entries_per_dim]; + for (dim, table) in ex_dist_table.chunks_exact_mut(entries_per_dim).enumerate() { + for (code, value) in table.iter_mut().enumerate() { + *value = query[dim] * code as f32; + } + } + c.bench_function( + format!("RQ ex_dot table-gather: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}") + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in seq_codes.chunks_exact(seq_code_len) { + sum += gather_ex_distance(row, ex_dim, ex_bits, &ex_dist_table); + } + black_box(sum) + }) + }, + ); + } +} + +/// Storage load cost per format: blocked-format ex codes are aliased as-is, +/// legacy sequential ex codes are repacked row by row. 
+fn ex_code_storage_load(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::bq::ex_dot::repack_sequential_row; + use lance_index::vector::bq::rabit_ex_code_bytes; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use std::sync::Arc; + + const LOAD_DIM: usize = 1536; + const LOAD_ROWS: usize = 8192; + const NUM_BITS: u8 = 4; // ex_bits=3, a bit-plane width + + let ex_bits = NUM_BITS - 1; + let mut rng = SmallRng::seed_from_u64(7); + let metadata = RabitQuantizationMetadata { + rotate_mat: None, + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Fast, + code_dim: LOAD_DIM as u32, + num_bits: NUM_BITS, + packed: true, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let code_len = LOAD_DIM / 8; + let binary_codes = (0..LOAD_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let seq_code_len = rabit_ex_code_bytes(LOAD_DIM, ex_bits).unwrap(); + let seq_codes = (0..LOAD_ROWS * seq_code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let blocked_code_len = blocked_ex_code_bytes(LOAD_DIM, ex_bits); + let mut blocked_codes = vec![0u8; LOAD_ROWS * blocked_code_len]; + for (seq_row, blocked_row) in seq_codes + .chunks_exact(seq_code_len) + .zip(blocked_codes.chunks_exact_mut(blocked_code_len)) + { + repack_sequential_row(seq_row, LOAD_DIM, ex_bits, blocked_row); + } + + let make_batch = |ex_column: &str, ex_values: Vec, ex_code_len: usize| { + arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..LOAD_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes.clone()), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + 
Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + ex_column, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_values), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ]) + .unwrap() + }; + + let blocked_batch = make_batch( + RABIT_BLOCKED_EX_CODE_COLUMN, + blocked_codes, + blocked_code_len, + ); + c.bench_function( + format!("RQ storage load (blocked ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + blocked_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); + + let legacy_batch = make_batch(RABIT_EX_CODE_COLUMN, seq_codes, seq_code_len); + c.bench_function( + format!("RQ storage load (legacy ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + legacy_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); +} + +/// Bulk-scoring cost of the ex stage: the quantized ex-FastScan LUT path +/// (inside `distance_all`) vs the exact per-row ex-dot kernel. The +/// binary-only run isolates the shared binary stage so the ex cost is the +/// difference from the full run. 
+fn ex_bulk_paths(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::ex_dot::pad_query_into; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::sync::Arc; + + const BULK_DIM: usize = 1536; + const BULK_ROWS: usize = 16384; + + let mut rng = SmallRng::seed_from_u64(13); + for num_bits in [3u8, 5, 9] { + let ex_bits = num_bits - 1; + let max_code = ((1u16 << ex_bits) - 1) as u8; + + let rq = RabitQuantizer::new_with_rotation::( + num_bits, + BULK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = BULK_DIM / 8; + let binary_codes = (0..BULK_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(BULK_DIM, ex_bits); + let mut ex_codes = vec![0u8; BULK_ROWS * ex_code_len]; + let values = (0..BULK_DIM) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + for row in ex_codes.chunks_exact_mut(ex_code_len) { + pack_blocked_row(&values, ex_bits, row); + } + + // No error factors: `distance_all` takes the FastScan ex bulk branch. 
+ let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..BULK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes.clone()), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![1.0f32; BULK_ROWS])) as ArrayRef, + ), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + + let query: ArrayRef = Arc::new(Float32Array::from( + (0..BULK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("full distance_all (binary + ex LUT)", ApproxMode::Normal), + ("binary-only distance_all (fast mode)", ApproxMode::Fast), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!("RQ bulk {label}: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}") + .as_str(), + |b| { + b.iter(|| { + calc.distance_all_with_scratch( + 0, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(dists.len()) + }) + }, + ); + } + + let 
kernel = ex_dot_kernel(ex_bits); + let mut ex_query = vec![0.0f32; BULK_DIM]; + pad_query_into( + query + .as_any() + .downcast_ref::() + .unwrap() + .values(), + &mut ex_query, + ); + c.bench_function( + format!( + "RQ bulk ex kernel loop: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}" + ) + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in ex_codes.chunks_exact(ex_code_len) { + sum += kernel(&ex_query, row); + } + black_box(sum) + }) + }, + ); + } +} + +/// Top-k accumulation through the gated raw-query multi-bit path: binary +/// FastScan, the per-row lower-bound pruning scan, and the exact rerank of +/// the surviving rows. Error factors are present so the gating is enabled. +fn heap_topk(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::transform::{ + ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, + }; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::collections::BinaryHeap; + use std::sync::Arc; + + const TOPK_DIM: usize = 1536; + const TOPK_ROWS: usize = 4096; + const TOPK_K: usize = 10; + const NUM_BITS: u8 = 5; + let ex_bits = NUM_BITS - 1; + + let mut rng = SmallRng::seed_from_u64(99); + let rq = RabitQuantizer::new_with_rotation::( + NUM_BITS, + TOPK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = TOPK_DIM / 8; + let binary_codes = (0..TOPK_ROWS * code_len) + .map(|_| rng.random()) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(TOPK_DIM, ex_bits); + let ex_codes = (0..TOPK_ROWS * ex_code_len) + .map(|_| rng.random()) + .collect::>(); + // Factor magnitudes chosen so the lower bounds spread mostly with the add + // factors; once the heap is full the threshold prunes the vast majority + // of rows, like a production multi-partition scan. 
+ let mut rand_factors = |low: f32, high: f32| { + Arc::new(Float32Array::from( + (0..TOPK_ROWS) + .map(|_| rng.random_range(low..high)) + .collect::>(), + )) as ArrayRef + }; + let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..TOPK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (SCALE_FACTORS_COLUMN, rand_factors(0.0005, 0.0015)), + (ERROR_FACTORS_COLUMN, rand_factors(0.0, 0.01)), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (EX_ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (EX_SCALE_FACTORS_COLUMN, rand_factors(0.00003, 0.0001)), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None).unwrap(); + let query: ArrayRef = Arc::new(Float32Array::from( + (0..TOPK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("normal", ApproxMode::Normal), + ("accurate", ApproxMode::Accurate), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 1.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut heap = BinaryHeap::with_capacity(TOPK_K + 1); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!( + "RQ heap topk ({label}): num_bits={NUM_BITS}, DIM={TOPK_DIM}, rows={TOPK_ROWS}, k={TOPK_K}" + ) + .as_str(), + |b| { + b.iter(|| { + heap.clear(); + calc.accumulate_topk_with_scratch( + TOPK_K, + None, + None, + |id| id as u64, + &mut heap, + &mut dists, + 
&mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(heap.len()) + }) + }, + ); + } +} -#[cfg(not(target_os = "linux"))] criterion_group!( name=benches; config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); + targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths, heap_topk); criterion_main!(benches); diff --git a/rust/lance-index/build.rs b/rust/lance-index/build.rs index 0617de8c806..b47744f7b5a 100644 --- a/rust/lance-index/build.rs +++ b/rust/lance-index/build.rs @@ -6,6 +6,9 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + // Cache-entry protos are library-internal serialization, not part of the + // on-disk format spec, so they live here rather than in the shared `protos/`. + println!("cargo:rerun-if-changed=protos-cache"); #[cfg(feature = "protoc")] // Use vendored protobuf compiler if requested. @@ -17,8 +20,12 @@ fn main() -> Result<()> { prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.enable_type_names(); prost_build.compile_protos( - &["./protos/index.proto", "./protos/index_old.proto"], - &["./protos"], + &[ + "./protos/index.proto", + "./protos/index_old.proto", + "./protos-cache/cache.proto", + ], + &["./protos", "./protos-cache"], )?; let rust_toolchain = env::var("RUSTUP_TOOLCHAIN") diff --git a/rust/lance-index/protos-cache/cache.proto b/rust/lance-index/protos-cache/cache.proto new file mode 100644 index 00000000000..b24a27055d7 --- /dev/null +++ b/rust/lance-index/protos-cache/cache.proto @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// Protobuf headers for serialized index cache entries. 
+// +// These messages describe the *cache* serialization format, not the on-disk +// Lance format spec, so they live with the library (lance-index) rather than in +// the top-level `protos/` spec folder. +// +// Field numbers and enum values are append-only across all messages here: never +// renumber or reuse them. A change the proto cannot express transparently +// (adding/removing/reordering the IPC/raw sections that follow a header) must +// bump the relevant codec's `CURRENT_VERSION` instead. + +syntax = "proto3"; + +package lance.index.cache; + +// --------------------------------------------------------------------------- +// Full-text search (FTS) posting lists +// --------------------------------------------------------------------------- + +// Header for a serialized `CompressedPostingList` cache entry. +message CompressedPostingHeader { + float max_score = 1; + uint32 length = 2; + PostingTailCodec posting_tail_codec = 3; + PositionStorage position_storage = 4; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 5; +} + +// Header for a serialized `PlainPostingList` cache entry. Followed by an Arrow +// IPC section of (row_ids: UInt64, frequencies: Float32), then — when +// position_storage == POSITION_STORAGE_LEGACY — an IPC section of the per-doc +// position list. Plain postings never carry a shared position stream. +message PlainPostingHeader { + // Absent when the posting has no precomputed block-max score (the in-memory + // `max_score` is `None`); present otherwise. + optional float max_score = 1; + // POSITION_STORAGE_NONE or POSITION_STORAGE_LEGACY only. + PositionStorage position_storage = 2; +} + +// Header for a serialized standalone `Positions` cache entry. Followed by the +// position sections framed per `position_storage`, which is never +// POSITION_STORAGE_NONE for a standalone entry. 
+message PositionsHeader { + PositionStorage position_storage = 1; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 2; +} + +// Header for a serialized `PostingListGroup`: a member count followed by that +// many `PostingList` bodies written inline. Each member body is +// self-delimiting, so members need no length prefixes, and writing them inline +// keeps their Arrow IPC sections 64-byte aligned within the group entry. +message PostingListGroupHeader { + uint32 count = 1; +} + +// Tail-block encoding of a compressed posting list. +enum PostingTailCodec { + POSTING_TAIL_CODEC_FIXED32 = 0; + POSTING_TAIL_CODEC_VARINT_DELTA = 1; +} + +// Encoding of a shared position stream's byte buffer. +enum PositionStreamCodec { + POSITION_STREAM_CODEC_VARINT_DOC_DELTA = 0; + POSITION_STREAM_CODEC_PACKED_DELTA = 1; +} + +// Which (if any) positions accompany the posting list, and how they are framed +// in the sections after the header. +enum PositionStorage { + POSITION_STORAGE_NONE = 0; + // Legacy per-doc positions as a single Arrow IPC section. + POSITION_STORAGE_LEGACY = 1; + // Shared stream: an Arrow IPC section of block offsets, then a raw blob of + // the (codec-encoded) position bytes. + POSITION_STORAGE_SHARED = 2; +} + +// --------------------------------------------------------------------------- +// Scalar indices +// --------------------------------------------------------------------------- + +// Header for a serialized `BTreeIndexState` cache entry, followed by a single +// Arrow IPC section holding the page-lookup batch. +message BTreeIndexHeader { + uint64 batch_size = 1; + // Whether an explicit page-range -> file mapping is present. Distinguishes a + // non-range-partitioned index (false) from a range-partitioned one whose map + // happens to be empty (true with no entries). 
+ bool has_ranges_to_files = 2; + repeated RangeToFile ranges_to_files = 3; +} + +// One entry of a `BTreeIndexState` page-range -> file mapping. The range is +// inclusive on both ends (a `RangeInclusive`). +message RangeToFile { + uint32 start = 1; + uint32 end = 2; + uint32 page_offset = 3; + string path = 4; +} + +// --------------------------------------------------------------------------- +// Vector indices (IVF partitions) +// --------------------------------------------------------------------------- + +// Headers for serialized IVF partition cache entries (`PartitionEntry`). +// +// Each header is followed by 64-byte-aligned Arrow IPC sections in a fixed, +// version-keyed order (sub-index, then any quantizer-specific arrays, then the +// quantizer storage batches). + +// Distance metric a quantizer's storage was built for. +enum DistanceType { + DISTANCE_TYPE_L2 = 0; + DISTANCE_TYPE_COSINE = 1; + DISTANCE_TYPE_DOT = 2; + DISTANCE_TYPE_HAMMING = 3; +} + +// Rotation applied by a RabitQ quantizer. +enum RotationType { + ROTATION_TYPE_MATRIX = 0; + ROTATION_TYPE_FAST = 1; +} + +// Estimator a RabitQ quantizer uses at query time. +enum RabitQueryEstimator { + RABIT_QUERY_ESTIMATOR_RESIDUAL_QUERY = 0; + RABIT_QUERY_ESTIMATOR_RAW_QUERY = 1; +} + +// Product quantizer. Sections: sub-index IPC, codebook IPC, storage IPC. +message PqPartitionHeader { + DistanceType distance_type = 1; + uint32 nbits = 2; + uint64 num_sub_vectors = 3; + uint64 dimension = 4; + bool transposed = 5; +} + +// Flat (float) and flat-binary quantizers. Sections: sub-index IPC, storage IPC. +message FlatPartitionHeader { + DistanceType distance_type = 1; + uint64 dim = 2; +} + +// Scalar quantizer. Sections: sub-index IPC, storage IPC (possibly multi-batch). 
+message SqPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint64 dim = 3; + double bounds_start = 4; + double bounds_end = 5; +} + +// Header for a serialized IVF index state (`IvfIndexState`), followed by +// three raw blobs: the IVF model protobuf, the quantizer's extra-metadata +// buffer (may be empty), and the auxiliary IVF model protobuf. +message IvfStateHeader { + string index_file_path = 1; + string uuid = 2; + string distance_type = 3; + repeated string sub_index_metadata = 4; + string sub_index_type = 5; + string quantization_type = 6; + // Per-quantizer `Q::Metadata` as JSON. Kept as a string because the metadata + // type is generic over the quantizer; the proto envelope still provides + // additive evolution for the surrounding fields. + string quantizer_metadata_json = 7; + string cache_key_prefix = 8; + uint64 index_file_size = 9; + uint64 aux_file_size = 10; +} + +// RabitQ quantizer. Sections: sub-index IPC, rotate-matrix IPC (Matrix rotation +// only), storage IPC. +message RabitPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint32 code_dim = 3; + RotationType rotation_type = 4; + // Fast-rotation sign vector; present only when rotation_type == + // ROTATION_TYPE_FAST (the Matrix case stores its rotation as an IPC section). + optional bytes fast_rotation_signs = 5; + // Estimator the RabitQ storage uses at query time (residual vs raw query). + RabitQueryEstimator query_estimator = 6; +} diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 888070a3c1f..20e1c2692d9 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -68,6 +68,13 @@ pub mod pbold { include!(concat!(env!("OUT_DIR"), "/lance.table.rs")); } +/// Protobuf headers for serialized index cache entries (FTS posting lists, +/// scalar indices, and IVF vector partitions). 
+pub mod cache_pb { + #![allow(clippy::use_self)] + include!(concat!(env!("OUT_DIR"), "/lance.index.cache.rs")); +} + /// Generic methods common across all types of secondary indices /// #[async_trait] @@ -312,6 +319,7 @@ impl IndexType { Self::IvfFlat => 4096, Self::IvfSq => 8192, Self::IvfPq => 8192, + Self::IvfRq => 4096, Self::IvfHnswFlat => 1 << 20, Self::IvfHnswSq => 1 << 20, Self::IvfHnswPq => 1 << 20, @@ -382,6 +390,11 @@ mod tests { assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION); } + #[test] + fn test_ivf_rq_target_partition_size() { + assert_eq!(IndexType::IvfRq.target_partition_size(), 4096); + } + #[test] fn test_index_type_try_from_i32_covers_all_variants() { let all = [ diff --git a/rust/lance-index/src/metrics.rs b/rust/lance-index/src/metrics.rs index 9e2161ae8f9..37e2c43d196 100644 --- a/rust/lance-index/src/metrics.rs +++ b/rust/lance-index/src/metrics.rs @@ -43,6 +43,19 @@ pub trait MetricsCollector: Send + Sync { /// /// The goal is to provide some visibility into the compute cost of the search fn record_comparisons(&self, num_comparisons: usize); + + /// Returns an optional sink for recording exact I/O statistics (bytes read, + /// IOPS, and requests) performed on behalf of this collector. + /// + /// Index implementations that read from a + /// [`lance_io::scheduler::ScanScheduler`] can attach the returned handle to + /// their file readers so the I/O performed for a single query is measured + /// and attributed here. The default returns `None`, meaning the caller does + /// not want I/O measured (and index implementations should then take their + /// normal, uninstrumented read path). 
+ fn io_stats(&self) -> Option { + None + } } /// A no-op metrics collector that does nothing diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index eaebbd1b338..a74318fc5c9 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -8,6 +8,7 @@ use arrow_array::{BooleanArray, ListArray, RecordBatch, UInt64Array}; use arrow_schema::{Field, Schema}; use async_trait::async_trait; use bytes::Bytes; +use datafusion::functions::regex::regexplike::RegexpLikeFunc; use datafusion::functions::string::contains::ContainsFunc; use datafusion::functions_nested::array_has; use datafusion::physical_plan::SendableRecordBatchStream; @@ -288,6 +289,22 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf { /// This is often useful when remapping or updating async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result; + /// Copy an index file from this store to a new name in another store, leaving the source intact + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + if name == new_name { + self.copy_index_file(name, dest_store).await + } else { + Err(Error::not_supported(format!( + "copying index file {name} to {new_name} is not supported by this index store" + ))) + } + } + /// Rename an index file async fn rename_index_file(&self, name: &str, new_name: &str) -> Result; @@ -633,9 +650,15 @@ impl AnyQuery for LabelListQuery { pub enum TextQuery { /// Retrieve all row ids where the text contains the given string StringContains(String), - // TODO: In the future we should be able to do string-insensitive contains - // as well as partial matches (e.g. LIKE 'foo%') and potentially even - // some regular expressions + /// Retrieve all row ids whose text matches the given regular expression. + /// + /// The pattern is a full regular expression (as accepted by `regexp_like`). 
+ /// The index returns a candidate superset that the scan rechecks, so any + /// pattern is sound; patterns with no usable trigram structure simply fall + /// back to rechecking every row. + Regex(String), + // TODO: In the future we should be able to do case-insensitive contains + // as well as partial matches (e.g. LIKE 'foo%'). } impl AnyQuery for TextQuery { @@ -656,6 +679,17 @@ impl AnyQuery for TextQuery { Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None), ], }), + // `regexp_like` returns Boolean directly, so the reconstructed + // expression can be used as-is for the recheck filter (no IsNotNull + // wrapper, unlike `regexp_match`). It is the semantic equivalent of + // the original predicate for the "does it match" question. + Self::Regex(pattern) => Expr::ScalarFunction(ScalarFunction { + func: Arc::new(RegexpLikeFunc::new().into()), + args: vec![ + Expr::Column(Column::new_unqualified(col)), + Expr::Literal(ScalarValue::Utf8(Some(pattern.clone())), None), + ], + }), } } @@ -935,6 +969,15 @@ impl OldIndexDataFilter { Self::RowIds(valid_row_ids) => *addrs &= valid_row_ids, } } + + /// True if this filter would keep no rows at all (its keep-set is empty), + /// letting a segment merge skip reading the source segment entirely. + pub fn keeps_nothing(&self) -> bool { + match self { + Self::Fragments { to_keep, .. 
} => to_keep.is_empty(), + Self::RowIds(valid_row_ids) => valid_row_ids.is_empty(), + } + } } impl UpdateCriteria { diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 4a212713e1f..8a7fea074c3 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -17,14 +17,13 @@ use bytes::Bytes; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_common::ScalarValue; use futures::{StreamExt, TryStreamExt, stream}; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::tokio::get_num_compute_intensive_cpus, }; @@ -201,6 +200,32 @@ impl BitmapIndexState { frag_reuse_index, ))) } + + /// Build a state directly from its parts, for codec tests in sibling + /// modules (e.g. the label-list index, which nests a bitmap state). + #[cfg(test)] + pub(crate) fn new_for_test( + index_map: BTreeMap, + null_map: RowAddrTreeMap, + value_type: DataType, + ) -> Result { + Ok(Self { + lookup_batch: build_lookup_batch(&index_map, &value_type)?, + null_map: Arc::new(null_map), + value_type, + index_map: Arc::new(index_map), + }) + } + + #[cfg(test)] + pub(crate) fn lookup_batch(&self) -> &RecordBatch { + &self.lookup_batch + } + + #[cfg(test)] + pub(crate) fn null_map(&self) -> &RowAddrTreeMap { + &self.null_map + } } fn build_lookup_batch( @@ -240,25 +265,27 @@ fn parse_lookup_batch(batch: &RecordBatch) -> Result, offsets: UInt64)] + /// RAW_BLOB : null_map (roaring tree map, portable encoding) + /// ARROW_IPC : (keys: , offsets: UInt64) /// ``` - /// The value type is recovered from the IPC stream schema. 
- fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + /// The value type is recovered from the IPC section schema. + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let mut null_bytes = Vec::with_capacity(self.null_map.serialized_size()); self.null_map.serialize_into(&mut null_bytes)?; - write_len_prefixed_bytes(writer, &null_bytes)?; - write_ipc_stream(&self.lookup_batch, writer)?; + w.write_raw(&null_bytes)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let null_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let null_bytes = r.read_raw()?; let null_map = Arc::new(RowAddrTreeMap::deserialize_from(null_bytes.as_ref())?); - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; let value_type = lookup_batch.schema().field(0).data_type().clone(); let index_map = Arc::new(parse_lookup_batch(&lookup_batch)?); Ok(Self { @@ -1246,6 +1273,19 @@ pub async fn merge_bitmap_indices( ))); } + // A segment whose filter keeps nothing contributes no postings; skip the + // state load entirely. (Remapping for deferred compaction happens inside + // `load_bitmap`, so the loaded postings already reference live fragments.) 
+ if old_data_filters[idx] + .as_ref() + .is_some_and(|f| f.keeps_nothing()) + { + progress + .stage_progress("merge_bitmap_segments", (idx + 1) as u64) + .await?; + continue; + } + let mut state = source_index.load_bitmap_index_state().await?; if let Some(old_data_filter) = &old_data_filters[idx] { state.retain(|_, postings| { @@ -1449,8 +1489,12 @@ mod tests { fn assert_state_roundtrips(state: &BitmapIndexState) { let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BitmapIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BitmapIndexState::CURRENT_VERSION); + let restored = BitmapIndexState::deserialize(&mut reader).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(&*restored.null_map, &*state.null_map); assert_eq!(restored.value_type, state.value_type); @@ -1484,6 +1528,53 @@ mod tests { assert_state_roundtrips(&empty_state); } + /// The lookup batch must decode zero-copy through the full envelope-bearing + /// [`CacheCodec`] even though the envelope pushes the IPC section to a + /// non-aligned starting offset. + #[test] + fn test_bitmap_index_state_lookup_is_zero_copy() { + const ALIGN: usize = 64; + let mut index_map = BTreeMap::new(); + for k in 0..32i32 { + index_map.insert( + OrderableScalarValue(ScalarValue::Int32(Some(k))), + k as usize, + ); + } + let state = BitmapIndexState { + lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(), + null_map: Arc::new(RowAddrTreeMap::new()), + value_type: DataType::Int32, + index_map: Arc::new(index_map), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + // Model a backend reading into a 64-byte-aligned buffer. 
+ let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_bitmap_lazy_loading_and_cache() { // Create a temporary directory for the index diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index e8e5c42a248..ab3f6c58075 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -15,6 +15,7 @@ use super::{ OldIndexDataFilter, SargableQuery, ScalarIndex, ScalarIndexParams, SearchResult, compute_next_prefix, }; +use crate::cache_pb::{BTreeIndexHeader, RangeToFile}; use crate::{Index, IndexType}; use crate::{ frag_reuse::FragReuseIndex, @@ -45,18 +46,23 @@ use datafusion::physical_plan::{ sorts::sort_preserving_merge::SortPreservingMergeExec, stream::RecordBatchStreamAdapter, union::UnionExec, }; -use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_physical_expr::{PhysicalSortExpr, expressions::Column}; +use datafusion_common::{DFSchema, DataFusionError, ScalarValue}; +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_physical_expr::{ + PhysicalExpr, PhysicalSortExpr, create_physical_expr, expressions::Column, +}; use futures::{ FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, future::BoxFuture, stream::{self}, }; -use lance_arrow::ipc::{read_ipc_stream_single_at, write_ipc_stream}; use lance_core::deepsize::DeepSizeOf; use 
lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::{ tokio::get_num_compute_intensive_cpus, @@ -589,7 +595,7 @@ impl Ord for OrderableScalarValue { } } (Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"), - (Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(), + (Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())), (Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)), (Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"), // What would a btree of unions even look like? May not be possible. @@ -1402,106 +1408,58 @@ impl BTreeIndexState { } impl CacheCodecImpl for BTreeIndexState { - /// Wire format (no stability guarantees yet — the cache is rebuilt from - /// source on any version mismatch): + const TYPE_ID: &'static str = "lance.scalar.BTreeIndexState"; + const CURRENT_VERSION: u32 = 1; + + /// Wire format: /// ```text - /// u64 batch_size (LE) - /// u8 has_ranges (0 = None, 1 = Some) - /// if has_ranges: - /// u32 entry_count (LE) - /// per entry: u32 start | u32 end | u32 offset | u32 path_len | path bytes - /// lookup batch (Arrow IPC stream) + /// HEADER : BTreeIndexHeader proto (batch_size + page-range mapping) + /// ARROW_IPC : page-lookup batch /// ``` - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { - writer.write_all(&self.batch_size.to_le_bytes())?; - match &self.ranges_to_files { - None => writer.write_all(&[0u8])?, - Some(ranges) => { - writer.write_all(&[1u8])?; - let count = u32::try_from(ranges.len()).map_err(|_| { - Error::io("BTreeIndexState: ranges_to_files exceeds u32::MAX entries") - })?; - writer.write_all(&count.to_le_bytes())?; - for (range, (path, page_offset)) in ranges.iter() { - 
writer.write_all(&range.start().to_le_bytes())?; - writer.write_all(&range.end().to_le_bytes())?; - writer.write_all(&page_offset.to_le_bytes())?; - let path_len = u32::try_from(path.len()).map_err(|_| { - Error::io("BTreeIndexState: ranges_to_files path exceeds u32::MAX bytes") - })?; - writer.write_all(&path_len.to_le_bytes())?; - writer.write_all(path.as_bytes())?; - } - } - } - write_ipc_stream(&self.lookup_batch, writer)?; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let ranges_to_files = match &self.ranges_to_files { + None => Vec::new(), + Some(ranges) => ranges + .iter() + .map(|(range, (path, page_offset))| RangeToFile { + start: *range.start(), + end: *range.end(), + page_offset: *page_offset, + path: path.clone(), + }) + .collect(), + }; + let header = BTreeIndexHeader { + batch_size: self.batch_size, + has_ranges_to_files: self.ranges_to_files.is_some(), + ranges_to_files, + }; + w.write_header(&header)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let batch_size = read_u64_le(data, &mut offset)?; - let has_ranges = read_u8(data, &mut offset)?; - let ranges_to_files = match has_ranges { - 0 => None, - 1 => { - let count = read_u32_le(data, &mut offset)? as usize; - let mut entries = Vec::with_capacity(count); - for _ in 0..count { - let start = read_u32_le(data, &mut offset)?; - let end = read_u32_le(data, &mut offset)?; - let page_offset = read_u32_le(data, &mut offset)?; - let path_len = read_u32_le(data, &mut offset)? as usize; - let path = read_bytes(data, &mut offset, path_len)?; - let path = std::str::from_utf8(&path) - .map_err(|e| Error::io(format!("BTreeIndexState path: {e}")))? 
- .to_string(); - entries.push((start..=end, (path, page_offset))); - } - Some(Arc::new(entries.into_iter().collect())) - } - other => { - return Err(Error::io(format!( - "BTreeIndexState: invalid has_ranges tag {other}" - ))); - } + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: BTreeIndexHeader = r.read_header()?; + let ranges_to_files = if header.has_ranges_to_files { + let map: RangeInclusiveMap = header + .ranges_to_files + .into_iter() + .map(|entry| (entry.start..=entry.end, (entry.path, entry.page_offset))) + .collect(); + Some(Arc::new(map)) + } else { + None }; - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; Ok(Self { lookup_batch, - batch_size, + batch_size: header.batch_size, ranges_to_files, }) } } -fn read_bytes(data: &bytes::Bytes, offset: &mut usize, len: usize) -> Result { - if data.len() < *offset + len { - return Err(Error::io(format!( - "BTreeIndexState: short read of {len} bytes at offset {offset} (have {})", - data.len() - ))); - } - let slice = data.slice(*offset..*offset + len); - *offset += len; - Ok(slice) -} - -fn read_u8(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 1)?; - Ok(bytes[0]) -} - -fn read_u32_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 4)?; - Ok(u32::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - -fn read_u64_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 8)?; - Ok(u64::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - /// Cache key for a [`BTreeIndexState`]. The cache it is used with is already /// namespaced per-index, so the key string is a constant. struct BTreeIndexStateKey; @@ -1595,6 +1553,66 @@ impl BTreeIndex { } } + /// For each key in `keys`, whether this index contains it — a batched + /// existence check returning a mask aligned to `keys`. 
+ /// + /// The per-key sibling of `search(Equals(..))`, but one call replaces N + /// probes: keys are grouped by page using the same page resolution as + /// [`ScalarIndex::search`] (`pages_eq`), each touched page is loaded once + /// (session-cached), and membership is tested against the page's values via + /// `FlatIndex::contains_values`. Avoids the per-key `SearchResult` / + /// `RowAddrTreeMap` allocation when the caller only wants a yes/no. + /// + /// Intended for primary-key dedup, where keys are non-null; a null key maps + /// to `false`. + pub async fn contains_keys( + &self, + keys: &[ScalarValue], + metrics: &dyn MetricsCollector, + ) -> Result> { + // Group each key (by input position) under every page whose value range + // could hold it. Mirrors `search`'s page selection so the two agree. + let mut by_page: HashMap> = HashMap::new(); + for (idx, key) in keys.iter().enumerate() { + if key.is_null() { + continue; + } + let ov = OrderableScalarValue(key.clone()); + for matches in self.page_lookup.pages_eq(&ov)? 
{ + by_page + .entry(matches.page_id()) + .or_default() + .push((idx, ov.clone())); + } + } + + let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); + let page_tasks = by_page.into_iter().map(|(page_number, entries)| { + let index_reader = index_reader.clone(); + async move { + let page = self.lookup_page(page_number, index_reader, metrics).await?; + let needles: Vec = + entries.iter().map(|(_, ov)| ov.clone()).collect(); + let present = page.contains_values(&needles)?; + Result::Ok((entries, present)) + } + }); + + let mut result = vec![false; keys.len()]; + let page_results: Vec<_> = stream::iter(page_tasks) + .buffer_unordered(get_num_compute_intensive_cpus()) + .try_collect() + .await?; + for (entries, present) in page_results { + for (idx, ov) in entries { + if present.contains(&ov) { + result[idx] = true; + } + } + } + Ok(result) + } + async fn lookup_page( &self, page_number: u32, @@ -1628,11 +1646,28 @@ impl BTreeIndex { FlatIndex::try_new(serialized_page) } + /// Compile a sargable predicate into a physical expr against the per-page + /// schema ([values, ids]). Built once in `search` and shared across pages so + /// a large IN-list is not re-materialized for every page. + fn compile_predicate(&self, query: &SargableQuery) -> Result> { + let schema = Arc::new(Schema::new(vec![ + Field::new(BTREE_VALUES_COLUMN, self.data_type.clone(), true), + Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false), + ])); + let df_schema = DFSchema::try_from(schema)?; + Ok(create_physical_expr( + &query.to_expr(BTREE_VALUES_COLUMN.to_string()), + &df_schema, + &ExecutionProps::default(), + )?) 
+ } + async fn search_page( &self, query: &SargableQuery, matches: Matches, index_reader: LazyIndexReader, + prebuilt: Option<&Arc>, metrics: &dyn MetricsCollector, ) -> Result { let subindex = self @@ -1640,13 +1675,12 @@ impl BTreeIndex { .await?; match matches { - Matches::Some(_) => { - // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the - // values that might be in the page. E.g. if we are searching for X IN [5, 3, 7] and five is in pages - // 1 and 2 and three is in page 2 and seven is in pages 8 and 9, then when searching page 2 we only need - // to search for X IN [5, 3] - subindex.search(query, metrics) - } + // For a large IsIn the predicate is compiled once (see `search`) and + // reused here, instead of rebuilding the whole IN-list per page. + Matches::Some(_) => match prebuilt { + Some(expr) => subindex.search_prebuilt(expr, metrics), + None => subindex.search(query, metrics), + }, Matches::All(_) => Ok(match query { // This means we hit an all-null page so just grab all row ids as true SargableQuery::IsNull() => subindex.all_ignore_nulls(), @@ -1809,7 +1843,7 @@ impl BTreeIndex { if old_data_filters.len() != segments.len() { return Err(Error::invalid_input(format!( "BTree merge: expected one old-data filter per source segment \ - ({} segments) but got {}", + (segments={}, filters={})", segments.len(), old_data_filters.len() ))); @@ -1837,13 +1871,19 @@ impl BTreeIndex { let mut inputs: Vec> = Vec::with_capacity(segments.len() + 1); for (segment, old_data_filter) in segments.iter().zip(old_data_filters) { + if old_data_filter.as_ref().is_some_and(|f| f.keeps_nothing()) { + continue; + } let stream = segment.data_stream().await?; + let stream = match segment.frag_reuse_index.clone() { + Some(frag_reuse_index) => remap_row_ids(stream, frag_reuse_index), + None => stream, + }; let stream = match old_data_filter.clone() { Some(filter) => filter_row_ids(stream, filter), None => stream, }; - let exec = 
Arc::new(OneShotExec::new(stream)); - inputs.push(exec); + inputs.push(Arc::new(OneShotExec::new(stream))); } inputs.push(Arc::new(OneShotExec::new(new_data))); @@ -1898,6 +1938,18 @@ fn filter_row_ids( Box::pin(RecordBatchStreamAdapter::new(schema, filtered)) } +fn remap_row_ids( + stream: SendableRecordBatchStream, + frag_reuse_index: Arc, +) -> SendableRecordBatchStream { + let schema = stream.schema(); + let remapped = stream.map(move |batch_result| { + let batch = batch_result?; + Ok(frag_reuse_index.remap_row_ids_record_batch(batch, 1)?) + }); + Box::pin(RecordBatchStreamAdapter::new(schema, remapped)) +} + fn wrap_bound(bound: &Bound) -> Bound { match bound { Bound::Unbounded => Bound::Unbounded, @@ -2113,13 +2165,27 @@ impl ScalarIndex for BTreeIndex { } } + // Compile a large IsIn predicate once and reuse it across every page; + // rebuilding the full IN-list per page is O(pages * values) and dominates + // the lookup for sets with many values. + let prebuilt = match query { + SargableQuery::IsIn(_) => Some(self.compile_predicate(query)?), + _ => None, + }; + let lazy_index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let page_tasks = pages .into_iter() .map(|page_index| { - self.search_page(query, page_index, lazy_index_reader.clone(), metrics) - .boxed() + self.search_page( + query, + page_index, + lazy_index_reader.clone(), + prebuilt.as_ref(), + metrics, + ) + .boxed() }) .collect::>(); debug!("Searching {} btree pages", page_tasks.len()); @@ -3295,7 +3361,23 @@ mod tests { }; use crate::scalar::registry::ScalarIndexPlugin; use arrow_array::RecordBatch; - use lance_core::cache::{CacheCodecImpl, CacheKey}; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey}; + + /// Serialize a `BTreeIndexState` body (no envelope) for tests. 
+ fn serialize_state(state: &BTreeIndexState) -> Vec { + let mut buf = Vec::new(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a `BTreeIndexState` body (no envelope) for tests. + fn deserialize_state(buf: Vec) -> lance_core::Result { + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BTreeIndexState::CURRENT_VERSION); + BTreeIndexState::deserialize(&mut reader) + } use rangemap::RangeInclusiveMap; lance_testing::define_stage_event_progress!( @@ -3319,6 +3401,37 @@ mod tests { assert!(size_of_many_i32 > 128 * 4); } + #[test] + fn test_orderable_dictionary_cmp() { + use arrow_schema::DataType; + use std::cmp::Ordering; + + let dict = |s: &str, key: DataType| { + OrderableScalarValue(ScalarValue::Dictionary( + Box::new(key), + Box::new(ScalarValue::Utf8(Some(s.to_string()))), + )) + }; + + // Dictionary scalars are ordered by their underlying value, regardless + // of the key type. This is exercised when loading a scalar index built + // on a dictionary-encoded column into a BTreeMap. + assert_eq!( + dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)), + Ordering::Less + ); + assert_eq!( + dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)), + Ordering::Equal + ); + + // A non-null dictionary value sorts after null. + assert_eq!( + dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)), + Ordering::Greater + ); + } + #[tokio::test] async fn test_null_ids() { let tmpdir = TempObjDir::default(); @@ -3436,6 +3549,86 @@ mod tests { } } + #[tokio::test] + async fn test_contains_keys_matches_search() { + let tmpdir = TempObjDir::default(); + let test_store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // 1000 distinct Int32 values [0, 1000), spread across many small pages + // (batch_size 64) so the keys below exercise multi-page grouping. 
+ let data = gen_batch() + .col("value", array::step::()) + .col("_rowid", array::step::()) + .into_df_exec(RowCount::from(100), BatchCount::from(10)); + let schema = data.schema(); + let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap()); + let plan = Arc::new(SortExec::new([sort_expr].into(), data)); + let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap(); + let stream = break_stream(stream, 64); + let stream = stream.map_err(DataFusionError::from); + let stream = + Box::pin(RecordBatchStreamAdapter::new(schema, stream)) as SendableRecordBatchStream; + + train_btree_index(stream, test_store.as_ref(), 64, None, None) + .await + .unwrap(); + let index = BTreeIndex::load(test_store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + // Present (range ends, mid, and adjacent values that straddle page + // boundaries), interleaved with absent (below/above range, and a gap). + let keys: Vec = vec![0, 999, 500, 1, 998, -1, 1000, 1500, 250, 251, 7, 64, 63, 65]; + let scalar_keys: Vec = + keys.iter().map(|k| ScalarValue::Int32(Some(*k))).collect(); + + let batched = index + .contains_keys(&scalar_keys, &NoOpMetricsCollector) + .await + .unwrap(); + + // Oracle: the per-key Equals search the batched path replaces. + let mut oracle = Vec::with_capacity(keys.len()); + for k in &scalar_keys { + let result = index + .search(&SargableQuery::Equals(k.clone()), &NoOpMetricsCollector) + .await + .unwrap(); + oracle.push(!result.row_addrs().is_empty()); + } + assert_eq!( + batched, oracle, + "contains_keys must agree with per-key Equals search; keys={keys:?}" + ); + + // And both must match ground truth: [0, 1000) present, others absent. + let expected: Vec = keys.iter().map(|k| (0..1000).contains(k)).collect(); + assert_eq!(batched, expected); + + // Empty input → empty mask. 
+ assert!( + index + .contains_keys(&[], &NoOpMetricsCollector) + .await + .unwrap() + .is_empty() + ); + + // A null key maps to false (and must not panic). + let with_null = vec![ScalarValue::Int32(Some(5)), ScalarValue::Int32(None)]; + assert_eq!( + index + .contains_keys(&with_null, &NoOpMetricsCollector) + .await + .unwrap(), + vec![true, false] + ); + } + #[tokio::test] async fn test_page_cache() { let tmpdir = TempObjDir::default(); @@ -5897,9 +6090,7 @@ mod tests { } fn assert_state_roundtrips(state: &BTreeIndexState) { - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(state)).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(restored.batch_size, state.batch_size); assert_eq!(restored.ranges_to_files, state.ranges_to_files); @@ -5968,9 +6159,7 @@ mod tests { batch_size: index.batch_size, ranges_to_files: index.ranges_to_files.clone(), }; - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(&state)).unwrap(); let reconstructed = restored .reconstruct(test_store.clone(), &LanceCache::no_cache(), None) .unwrap(); @@ -6006,18 +6195,57 @@ mod tests { assert_eq!(expected, actual); } + /// The lookup batch must decode zero-copy through the full envelope even + /// though the proto header pushes the IPC section to a non-aligned offset. #[test] - fn test_btree_index_state_rejects_invalid_has_ranges_tag() { - // u64 batch_size (any) then a bad has_ranges tag. 
+ fn test_btree_index_state_lookup_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let ranges: RangeInclusiveMap = + [(0..=99, ("part_0_page_file.lance".to_string(), 0))] + .into_iter() + .collect(); + let state = BTreeIndexState { + lookup_batch: sample_lookup_batch(), + batch_size: 8192, + ranges_to_files: Some(Arc::new(ranges)), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); let mut buf = Vec::new(); - buf.extend_from_slice(&1000u64.to_le_bytes()); - buf.push(7u8); - let err = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap_err(); - let msg = err.to_string(); - assert!( - msg.contains("has_ranges") && msg.contains("7"), - "expected error to mention the bad has_ranges tag, got: {msg}" - ); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + + #[test] + fn test_btree_index_state_rejects_truncated_header() { + // A header length prefix that overruns the buffer must error rather + // than panic or silently misread it. 
+ let mut buf = Vec::new(); + buf.extend_from_slice(&100u32.to_le_bytes()); // claims a 100-byte header + buf.extend_from_slice(&[0u8; 4]); // but only 4 bytes follow + assert!(deserialize_state(buf).is_err()); } #[tokio::test] diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 212ef6490be..744f6a3cb3c 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::{ops::Bound, sync::Arc}; use arrow_array::Array; @@ -11,19 +11,20 @@ use arrow_array::{ use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_physical_expr::create_physical_expr; +use datafusion_physical_expr::{PhysicalExpr, create_physical_expr}; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::{read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream}; use lance_core::Result; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; use roaring::RoaringBitmap; use tracing::instrument; +use datafusion_common::ScalarValue; + use crate::metrics::MetricsCollector; -use crate::scalar::btree::BTREE_VALUES_COLUMN; +use crate::scalar::btree::{BTREE_VALUES_COLUMN, OrderableScalarValue}; use crate::scalar::{AnyQuery, SargableQuery}; const VALUES_COL_IDX: usize = 0; @@ -83,6 +84,46 @@ impl FlatIndex { self.data.column(IDS_COL_IDX) } + fn values(&self) -> &ArrayRef { + self.data.column(VALUES_COL_IDX) + } + + /// Which of `needles` are present in this page. 
+ /// + /// Batched existence sibling of [`Self::search`]: it runs the same `IsIn` + /// predicate over the page's `values` column, but returns the matched + /// *values* rather than row addresses — so the caller can map each result + /// back to the input key it asked about. The page scan stays vectorized; + /// only the (small) matched subset is lifted into `ScalarValue`. + /// + /// Nulls: a null `values` entry never matches a (non-null) primary-key + /// needle, so it is simply absent from the result. + pub(crate) fn contains_values( + &self, + needles: &[OrderableScalarValue], + ) -> Result> { + if needles.is_empty() { + return Ok(BTreeSet::new()); + } + let query = SargableQuery::IsIn(needles.iter().map(|v| v.0.clone()).collect()); + let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); + let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + let predicate = expr.evaluate(&self.data)?; + let predicate = predicate.into_array(self.data.num_rows())?; + let predicate = predicate + .as_any() + .downcast_ref::() + .expect("Predicate should return boolean array"); + let matched = arrow_select::filter::filter(self.values(), predicate)?; + (0..matched.len()) + .map(|i| { + Ok(OrderableScalarValue(ScalarValue::try_from_array( + &matched, i, + )?)) + }) + .collect() + } + pub fn all(&self) -> NullableRowAddrSet { // Some rows will be in both sets but that is ok, null trumps true NullableRowAddrSet::new(self.all_addrs_map.clone(), self.null_addrs_map.clone()) @@ -196,7 +237,22 @@ impl FlatIndex { // No shortcut possible, need to actually evaluate the query let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + self.eval_expr(&expr) + } + /// Evaluate a predicate compiled once by the caller. 
Lets a large IsIn that + /// spans many pages build the physical expr a single time instead of + /// rebuilding the whole IN-list per page (the dominant cost of a big lookup). + pub fn search_prebuilt( + &self, + expr: &Arc, + metrics: &dyn MetricsCollector, + ) -> Result { + metrics.record_comparisons(self.data.num_rows()); + self.eval_expr(expr) + } + + fn eval_expr(&self, expr: &Arc) -> Result { let predicate = expr.evaluate(&self.data)?; let predicate = predicate.into_array(self.data.num_rows())?; let predicate = predicate @@ -236,32 +292,38 @@ impl FlatIndex { } impl CacheCodecImpl for FlatIndex { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.scalar.FlatIndex"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { // Format: - // [len-prefixed all_addrs_map][len-prefixed null_addrs_map][batch IPC stream] - writer.write_all(&(self.all_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.all_addrs_map.serialize_into(&mut *writer)?; + // RAW_BLOB : all_addrs_map (roaring tree map) + // RAW_BLOB : null_addrs_map (roaring tree map) + // ARROW_IPC : data batch + let mut all_addrs_bytes = Vec::with_capacity(self.all_addrs_map.serialized_size()); + self.all_addrs_map.serialize_into(&mut all_addrs_bytes)?; + w.write_raw(&all_addrs_bytes)?; - writer.write_all(&(self.null_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.null_addrs_map.serialize_into(&mut *writer)?; + let mut null_addrs_bytes = Vec::with_capacity(self.null_addrs_map.serialized_size()); + self.null_addrs_map.serialize_into(&mut null_addrs_bytes)?; + w.write_raw(&null_addrs_bytes)?; - write_ipc_stream(self.data.as_ref(), writer)?; + w.write_ipc(self.data.as_ref())?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result where Self: Sized, { - let mut offset = 0; - let all_addrs_bytes = read_len_prefixed_bytes_at(data, 
&mut offset)?; + let all_addrs_bytes = r.read_raw()?; let all_addrs_map = RowAddrTreeMap::deserialize_from(all_addrs_bytes.as_ref())?; - let null_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let null_addrs_bytes = r.read_raw()?; let null_addrs_map = RowAddrTreeMap::deserialize_from(null_addrs_bytes.as_ref())?; - let batch = read_ipc_stream_single_at(data, &mut offset)?; + let batch = r.read_ipc()?; let df_schema = DFSchema::try_from(batch.schema())?; @@ -309,8 +371,12 @@ mod tests { fn assert_roundtrips(index: &FlatIndex) { let mut buf = Vec::new(); - index.serialize(&mut buf).unwrap(); - let restored = FlatIndex::deserialize(&bytes::Bytes::from(buf)).unwrap(); + index + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, FlatIndex::CURRENT_VERSION); + let restored = FlatIndex::deserialize(&mut reader).unwrap(); assert_eq!(restored.data, index.data); assert_eq!(restored.all_addrs_map, index.all_addrs_map); @@ -335,6 +401,41 @@ mod tests { assert_roundtrips(&FlatIndex::try_new(empty).unwrap()); } + /// The data batch must decode zero-copy through the full envelope-bearing + /// [`CacheCodec`], even though the two roaring blobs and the envelope push + /// the IPC section to a non-aligned starting offset. 
+ #[test] + fn test_flat_index_data_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let index = example_index(); + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(index); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.data.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "data batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_equality() { check_index(&SargableQuery::Equals(ScalarValue::from(100)), &[0]).await; diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 38a29e9c43c..053da5ae5e7 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -179,6 +179,18 @@ impl MultiQueryParser { pub fn add(&mut self, other: Box) { self.parsers.push(other); } + + /// Pick the first underlying parser whose `is_valid_reference` accepts `expr`. 
+ pub fn select( + &self, + expr: &Expr, + data_type: &DataType, + ) -> Option<(&dyn ScalarQueryParser, DataType)> { + self.parsers.iter().find_map(|p| { + p.is_valid_reference(expr, data_type) + .map(|dt| (p.as_ref(), dt)) + }) + } } impl ScalarQueryParser for MultiQueryParser { @@ -781,20 +793,28 @@ impl ScalarQueryParser for LabelListQueryParser { } } -/// A parser for indices that handle string contains queries +/// A parser for indices that handle string `contains` queries, and -- when +/// `supports_regex` is set -- `regexp_like` / `regexp_match` queries. #[derive(Debug, Clone)] pub struct TextQueryParser { index_name: String, index_type: String, needs_recheck: bool, + supports_regex: bool, } impl TextQueryParser { - pub fn new(index_name: String, index_type: String, needs_recheck: bool) -> Self { + pub fn new( + index_name: String, + index_type: String, + needs_recheck: bool, + supports_regex: bool, + ) -> Self { Self { index_name, index_type, needs_recheck, + supports_regex, } } } @@ -837,31 +857,156 @@ impl ScalarQueryParser for TextQueryParser { func: &ScalarUDF, args: &[Expr], ) -> Option<IndexedExpression> { - if args.len() != 2 { + // The first argument is the indexed column; the second is the substring + // / pattern. `contains` takes exactly two arguments; the regex functions + // optionally take a third flags argument. + if args.len() < 2 { return None; } - let scalar = maybe_scalar(&args[1], data_type)?; - match scalar { - ScalarValue::Utf8(Some(scalar_str)) | ScalarValue::LargeUtf8(Some(scalar_str)) => { - if func.name() == "contains" { - let query = TextQuery::StringContains(scalar_str); - Some(IndexedExpression::index_query_with_recheck( - column.to_string(), - self.index_name.clone(), - self.index_type.clone(), - Arc::new(query), - self.needs_recheck, - )) - } else { + // A non-string pattern cannot be handled. + let (ScalarValue::Utf8(Some(pattern)) | ScalarValue::LargeUtf8(Some(pattern))) = + maybe_scalar(&args[1], data_type)?
+ else { + return None; + }; + + let query = match func.name() { + "contains" if args.len() == 2 => TextQuery::StringContains(pattern), + "regexp_like" | "regexp_match" if self.supports_regex => { + let pattern = match args.get(2) { + Some(flags_expr) => apply_regex_flags(&pattern, flags_expr)?, + None => pattern, + }; + // If the pattern yields no usable trigram (e.g. `a.b`), leave it + // to a full scan instead of routing it to the index, which could + // only answer with an unsupported "recheck everything" result. + if !crate::scalar::ngram::regex_can_use_index(&pattern) { + return None; + } + TextQuery::Regex(pattern) + } + _ => return None, + }; + + Some(IndexedExpression::index_query_with_recheck( + column.to_string(), + self.index_name.clone(), + self.index_type.clone(), + Arc::new(query), + self.needs_recheck, + )) + } + + fn visit_like( + &self, + column: &str, + like: &Like, + pattern: &ScalarValue, + ) -> Option<IndexedExpression> { + // Infix LIKE is accelerated only by the ngram index (via its regex + // machinery). A plain-literal `regexp_like(col, 'foo')` is rewritten to + // `col LIKE '%foo%'` before it reaches the index, so this is the path + // that accelerates those. ILIKE is skipped because its case folding does + // not match the index's normalization. + if !self.supports_regex || like.case_insensitive { + return None; + } + let pattern_str = match pattern { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => s.as_str(), + _ => return None, + }; + // Translate the LIKE pattern into a loose regex used only for candidate + // generation; the original LIKE stays as the recheck filter, so the + // regex only needs to be a sound superset.
+ let regex = like_to_regex(pattern_str, like.escape_char)?; + if !crate::scalar::ngram::regex_can_use_index(&regex) { + return None; + } + Some(IndexedExpression { + scalar_query: Some(ScalarIndexExpr::Query(ScalarIndexSearch { + column: column.to_string(), + index_name: self.index_name.clone(), + index_type: self.index_type.clone(), + query: Arc::new(TextQuery::Regex(regex)), + needs_recheck: self.needs_recheck, + fragment_bitmap: None, + })), + refine_expr: Some(Expr::Like(like.clone())), + }) + } +} + +/// Translate a LIKE pattern into a regular expression used purely for ngram +/// candidate generation: `%` becomes `.*`, `_` becomes `.`, and literal +/// characters are regex-escaped. Returns `None` when no literal run is long +/// enough to yield a trigram (the index could not help, so a full scan is left +/// to handle it). +fn like_to_regex(pattern: &str, escape: Option<char>) -> Option<String> { + let mut regex = String::new(); + let mut run = 0usize; + let mut longest_run = 0usize; + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + let literal = if Some(c) == escape { + // The next character is escaped, i.e. a literal. + chars.next() + } else { + match c { + '%' => { + regex.push_str(".*"); + run = 0; None } + '_' => { + regex.push('.'); + run = 0; + None + } + other => Some(other), } - _ => { - // If the scalar is not a string, we cannot handle it - None + }; + if let Some(lit) = literal { + if regex_syntax::is_meta_character(lit) { + regex.push('\\'); + } + regex.push(lit); + // Only runs of alphanumeric characters can produce a trigram. + if lit.is_alphanumeric() { + run += 1; + longest_run = longest_run.max(run); + } else { + run = 0; } } } + (longest_run >= 3).then_some(regex) +} + +/// Fold the supported `regexp_like` / `regexp_match` flags into an inline prefix +/// on the pattern (e.g. flags `"i"` -> `"(?i)pattern"`).
Returns `None` for a +/// non-literal flags argument or an unrecognized flag, so the caller leaves the +/// predicate to a full recheck rather than risk changing its semantics. +fn apply_regex_flags(pattern: &str, flags_expr: &Expr) -> Option<String> { + let (Expr::Literal(ScalarValue::Utf8(Some(flags)), _) + | Expr::Literal(ScalarValue::LargeUtf8(Some(flags)), _)) = flags_expr + else { + return None; + }; + let mut inline = String::new(); + for flag in flags.chars() { + // Only flags expressible as an inline `(?...)` group in the regex crate + // (which the recheck uses) are safe to fold. + if ['i', 's', 'm', 'x'].contains(&flag) { + inline.push(flag); + } else { + return None; + } + } + if inline.is_empty() { + Some(pattern.to_string()) + } else { + Some(format!("(?{inline}){pattern}")) + } } /// A parser for indices that handle queries with the contains_tokens function @@ -1452,8 +1597,8 @@ fn maybe_indexed_column<'b>( ) -> Option<(String, DataType, &'b dyn ScalarQueryParser)> { // First try to extract the full nested column path for get_field expressions if let Some(nested_path) = extract_nested_column_path(expr) - && let Some((data_type, parser)) = index_info.get_index(&nested_path) - && let Some(data_type) = parser.is_valid_reference(expr, data_type) + && let Some((data_type, multi)) = index_info.get_index(&nested_path) + && let Some((parser, data_type)) = multi.select(expr, data_type) { return Some((nested_path, data_type, parser)); } @@ -1461,12 +1606,9 @@ fn maybe_indexed_column<'b>( match expr { Expr::Column(col) => { let col = col.name.as_str(); - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } Expr::ScalarFunction(udf) => { if udf.args.is_empty() { @@ -1474,12
+1616,9 @@ fn maybe_indexed_column<'b>( } // For non-get_field functions, fall back to old behavior let col = maybe_column(&udf.args[0])?; - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } _ => None, } @@ -1813,7 +1952,18 @@ fn visit_node( Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)), Expr::IsTrue(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, true)), Expr::IsNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, false)), - Expr::IsNotNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, true)), + Expr::IsNotNull(expr) => { + // `regexp_match(col, pat)` returns a list and is coerced to + // `IsNotNull(regexp_match(...))` before it reaches here. Unwrap that + // so the regex acceleration applies; everything else is a genuine + // IS NOT NULL check. + if let Expr::ScalarFunction(scalar_fn) = expr.as_ref() + && scalar_fn.func.name() == "regexp_match" + { + return Ok(visit_scalar_fn(scalar_fn, index_info)); + } + Ok(visit_is_null(expr.as_ref(), index_info, true)) + } Expr::Not(expr) => visit_not(expr.as_ref(), index_info, depth), Expr::BinaryExpr(binary_expr) => visit_binary_expr(binary_expr, index_info, depth), Expr::ScalarFunction(scalar_fn) => Ok(visit_scalar_fn(scalar_fn, index_info)), @@ -1833,7 +1983,7 @@ fn visit_node( pub trait IndexInformationProvider { /// Check if an index exists for `col` and, if so, return the data type of col /// as well as a query parser that can parse queries for that column - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)>; + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)>; /// The set of fragments covered by `(column, index_name)`. 
/// @@ -2015,11 +2165,18 @@ mod tests { struct ColInfo { data_type: DataType, - parser: Box, + parser: Box, } impl ColInfo { fn new(data_type: DataType, parser: Box) -> Self { + Self { + data_type, + parser: Box::new(MultiQueryParser::single(parser)), + } + } + + fn with_multi(data_type: DataType, parser: Box) -> Self { Self { data_type, parser } } } @@ -2041,7 +2198,7 @@ mod tests { } impl IndexInformationProvider for MockIndexInfoProvider { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) .map(|col_info| (&col_info.data_type, col_info.parser.as_ref())) @@ -2690,6 +2847,59 @@ mod tests { assert!(matches!(negated.upper, NullableRowAddrMask::BlockList(_))); } + #[test] + fn test_like_to_regex() { + // `%` -> `.*`, `_` -> `.`, with a literal run of at least three chars. + assert_eq!(like_to_regex("%foo%", None).as_deref(), Some(".*foo.*")); + assert_eq!(like_to_regex("foo%bar", None).as_deref(), Some("foo.*bar")); + assert_eq!(like_to_regex("foo_bar", None).as_deref(), Some("foo.bar")); + assert_eq!(like_to_regex("foobar", None).as_deref(), Some("foobar")); + + // Regex metacharacters in the literal portion are escaped. + assert_eq!( + like_to_regex("%a.bcd%", None).as_deref(), + Some(".*a\\.bcd.*") + ); + + // No literal run of three alphanumeric characters -> no index help. + assert_eq!(like_to_regex("%ab%", None), None); + assert_eq!(like_to_regex("%a%b%c%", None), None); + assert_eq!(like_to_regex("%", None), None); + + // The escape character makes the following character a literal. + assert_eq!( + like_to_regex(r"%foo\%bar%", Some('\\')).as_deref(), + Some(".*foo%bar.*") + ); + } + + #[test] + fn test_apply_regex_flags() { + fn flags(s: &str) -> Expr { + Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None) + } + + // Empty flags leave the pattern untouched (no inline group emitted). 
+ assert_eq!(apply_regex_flags("foo", &flags("")).as_deref(), Some("foo")); + // Supported flags are folded into an inline `(?...)` prefix. + assert_eq!( + apply_regex_flags("foo", &flags("i")).as_deref(), + Some("(?i)foo") + ); + assert_eq!( + apply_regex_flags("foo", &flags("is")).as_deref(), + Some("(?is)foo") + ); + // An unrecognized flag bails out so the caller leaves the predicate to a + // full recheck rather than risk changing its semantics. + assert_eq!(apply_regex_flags("foo", &flags("g")), None); + // A non-string (hence non-literal-flags) argument cannot be folded. + assert_eq!( + apply_regex_flags("foo", &Expr::Literal(ScalarValue::Int32(Some(1)), None)), + None + ); + } + #[test] fn test_extract_like_leading_prefix() { // Simple prefix patterns (no recheck needed) @@ -3157,4 +3367,75 @@ mod tests { assert_eq!(round_tripped.upper, RowAddrMask::from_allowed(upper_addrs)); assert_eq!(round_tripped_frags, fragments_covered); } + + /// Regression test: when two JSON indices target different paths on the same + /// column, a query against one path must be routed to its own index instead + /// of being intercepted by whichever parser was registered first. + #[test] + fn test_multi_json_indices_route_by_path() { + // Build a MultiQueryParser containing two JSON sub-parsers: one for + // path "$.a" and one for path "$.b". + let mut multi = MultiQueryParser::single(Box::new(JsonQueryParser::new( + "$.a".to_string(), + Box::new(SargableQueryParser::new( + "json_a_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + multi.add(Box::new(JsonQueryParser::new( + "$.b".to_string(), + Box::new(SargableQueryParser::new( + "json_b_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + + let index_info = MockIndexInfoProvider::new(vec![( + "json", + ColInfo::with_multi(DataType::LargeBinary, Box::new(multi)), + )]); + + // Query against path "$.b" must hit the "$.b" index. 
+ let expected_b = IndexedExpression::index_query( + "json".to_string(), + "json_b_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.b".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.b') = 'foo'", + Some(expected_b), + false, + ); + + // Query against path "$.a" must hit the "$.a" index. + let expected_a = IndexedExpression::index_query( + "json".to_string(), + "json_a_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.a".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.a') = 'foo'", + Some(expected_a), + false, + ); + + // Query against an unindexed path must not bind to either index. + check_no_index(&index_info, "json_extract(json, '$.c') = 'foo'"); + } } diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index 9677f7471ea..cdf19f0304c 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -1352,6 +1352,12 @@ impl ScalarIndex for FMIndexScalarIndex { Default::default(), ))) } + // Regex queries are routed only to the ngram index (the FM-index's + // query parser advertises `supports_regex = false`), so this is + // unreachable in practice; reject it explicitly rather than silently. 
+ TextQuery::Regex(_) => Err(Error::invalid_input( + "FMIndex does not support regular expression queries", + )), } } fn can_remap(&self) -> bool { @@ -1370,8 +1376,7 @@ impl ScalarIndex for FMIndexScalarIndex { dest: &dyn IndexStore, _old_data_filter: Option, ) -> Result { - let texts = collect_texts(new_data).await?; - let files = write_partitioned_fmindex(&texts, dest).await?; + let files = write_partitioned_fmindex_stream(new_data, dest).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, @@ -1390,8 +1395,14 @@ impl ScalarIndex for FMIndexScalarIndex { // ── Helpers ────────────────────────────────────────────────────────────────── -async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result)>> { - let mut texts = Vec::new(); +async fn write_partitioned_fmindex_stream( + mut stream: SendableRecordBatchStream, + store: &dyn IndexStore, +) -> Result> { + let mut files = Vec::new(); + let mut partition = Vec::with_capacity(PARTITION_SIZE); + let mut partition_id = 0; + while let Some(batch) = stream.next().await { let batch = batch?; // Prefer _rowaddr (global row address) over _rowid to ensure stable, @@ -1409,24 +1420,85 @@ async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result = bytes - .iter() - .map(|&b| { - if b == SENTINEL_BYTE || b == 0x00 { - b' ' - } else { - b - } - }) - .collect(); - texts.push((rid, sanitized)); + if let Some(bytes) = extract_sanitized_text_bytes(value_col.as_ref(), i)? 
{ + partition.push((rid, bytes)); + if partition.len() == PARTITION_SIZE { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + partition.clear(); + partition_id += 1; + } } } } - Ok(texts) + + if !partition.is_empty() { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + } else if files.is_empty() { + files.push(write_empty_fmindex_partition(store).await?); + } + + Ok(files) +} + +fn sanitize_text_bytes(bytes: &[u8]) -> Vec { + bytes + .iter() + .map(|&b| { + if b == SENTINEL_BYTE || b == 0x00 { + b' ' + } else { + b + } + }) + .collect() } +fn extract_sanitized_text_bytes( + array: &dyn arrow_array::Array, + index: usize, +) -> Result>> { + if array.is_null(index) { + return Ok(None); + } + match array.data_type() { + DataType::Utf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::LargeUtf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::Binary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + DataType::LargeBinary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + _ => Err(Error::invalid_input(format!( + "Fm does not support data type: {:?}", + array.data_type() + ))), + } +} + +#[cfg(test)] fn extract_text_bytes(array: &dyn arrow_array::Array, index: usize) -> Result>> { if array.is_null(index) { return Ok(None); @@ -1568,25 +1640,36 @@ async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> writer.finish_with_metadata(metadata).await } +#[cfg(test)] async fn write_partitioned_fmindex( texts: &[(u64, Vec)], store: &dyn IndexStore, ) -> Result> { - let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); - if refs.is_empty() { - let fm = FMIndex::build(&[])?; - 
return Ok(vec![ - write_fmindex(&fm, store, &fmindex_partition_path(0)).await?, - ]); + if texts.is_empty() { + return Ok(vec![write_empty_fmindex_partition(store).await?]); } let mut files = Vec::new(); - for (pid, chunk) in refs.chunks(PARTITION_SIZE).enumerate() { - let fm = FMIndex::build(chunk)?; - files.push(write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?); + for (pid, chunk) in texts.chunks(PARTITION_SIZE).enumerate() { + files.push(write_fmindex_partition(chunk, store, pid as u64).await?); } Ok(files) } +async fn write_fmindex_partition( + texts: &[(u64, Vec)], + store: &dyn IndexStore, + partition_id: u64, +) -> Result { + let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); + let fm = FMIndex::build(&refs)?; + write_fmindex(&fm, store, &fmindex_partition_path(partition_id)).await +} + +async fn write_empty_fmindex_partition(store: &dyn IndexStore) -> Result { + let fm = FMIndex::build(&[])?; + write_fmindex(&fm, store, &fmindex_partition_path(0)).await +} + // ── Plugin ─────────────────────────────────────────────────────────────────── #[derive(Debug, Default)] @@ -1623,8 +1706,7 @@ impl ScalarIndexPlugin for FMIndexPlugin { _fids: Option>, _progress: Arc, ) -> Result { - let texts = collect_texts(data).await?; - let files = write_partitioned_fmindex(&texts, store).await?; + let files = write_partitioned_fmindex_stream(data, store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, @@ -1645,6 +1727,9 @@ impl ScalarIndexPlugin for FMIndexPlugin { Some(Box::new(TextQueryParser::new( index_name, self.name().to_string(), + // needs_recheck: the FM-index returns exact substring matches. + false, + // supports_regex: regex acceleration is only implemented for ngram. 
false, ))) } @@ -1672,7 +1757,10 @@ impl ScalarIndexPlugin for FMIndexPlugin { #[cfg(test)] mod tests { use super::*; - use lance_core::cache::LanceCache; + use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, UInt64Array}; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use futures::stream; + use lance_core::{ROW_ADDR, cache::LanceCache}; use lance_io::object_store::ObjectStore; use object_store::path::Path; use std::sync::Arc; @@ -1885,11 +1973,10 @@ mod tests { #[test] fn test_sentinel_sanitization() { - // Text containing \xFF should be sanitized to space + // Text containing \xFF should be sanitized to space during training. let texts: Vec<(u64, &[u8])> = vec![(0, b"hello\xFFworld")]; let fm = FMIndex::build(&texts).unwrap(); - // The \xFF is replaced with space during collect_texts, but here we test build directly - // which doesn't sanitize. The search should still work. + // Build itself does not sanitize, but search should still work. 
let r = fm.search(b"hello"); assert!(r.contains(0)); } @@ -2061,11 +2148,6 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_plugin_train_and_load() { - use arrow_array::{StringArray, UInt64Array}; - use datafusion::physical_plan::stream::RecordBatchStreamAdapter; - use futures::stream; - use lance_core::ROW_ADDR; - let docs = vec!["hello world", "hello rust", "goodbye world"]; let row_addrs: Vec = vec![0, 1, 2]; let schema = Arc::new(arrow_schema::Schema::new(vec![ @@ -2128,6 +2210,88 @@ mod tests { } } + #[tokio::test(flavor = "multi_thread")] + async fn test_plugin_train_streams_multiple_partitions() { + fn training_batch( + schema: Arc, + start: usize, + len: usize, + ) -> RecordBatch { + let docs = vec!["x"; len]; + let row_addrs: Vec = (start..start + len).map(|i| i as u64).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(docs)), + Arc::new(UInt64Array::from(row_addrs)), + ], + ) + .unwrap() + } + + let total_rows = PARTITION_SIZE + 5; + let first_batch_rows = PARTITION_SIZE - 3; + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new( + crate::scalar::registry::VALUE_COLUMN_NAME, + DataType::Utf8, + false, + ), + arrow_schema::Field::new(ROW_ADDR, DataType::UInt64, false), + ])); + let batches = vec![ + Ok(training_batch(schema.clone(), 0, first_batch_rows)), + Ok(training_batch( + schema.clone(), + first_batch_rows, + total_rows - first_batch_rows, + )), + ]; + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + let stream = RecordBatchStreamAdapter::new(schema, stream::iter(batches)); + let req = FMIndexPlugin + .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false)) + .unwrap(); + let created = FMIndexPlugin + .train_index( + Box::pin(stream), + 
store.as_ref(), + req, + None, + Arc::new(crate::progress::NoopIndexBuildProgress), + ) + .await + .unwrap(); + + assert_eq!(created.files.len(), 2); + + let index = FMIndexPlugin + .load_index(store, &created.index_details, None, &LanceCache::no_cache()) + .await + .unwrap(); + let r = index + .search( + &TextQuery::StringContains("x".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(total_rows as u64)); + } + _ => panic!("expected exact result"), + } + } + #[test] fn test_build_wavelet_batch() { let texts: Vec<(u64, &[u8])> = vec![(0, b"hello world"), (1, b"test data")]; @@ -2139,8 +2303,6 @@ mod tests { #[test] fn test_extract_text_bytes_types() { - use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray}; - let utf8 = StringArray::from(vec!["hello"]); assert_eq!( extract_text_bytes(&utf8, 0).unwrap(), @@ -2158,6 +2320,11 @@ mod tests { extract_text_bytes(&binary, 0).unwrap(), Some(b"bytes".to_vec()) ); + let binary_with_sentinels = BinaryArray::from(vec![b"a\xFFb\0c" as &[u8]]); + assert_eq!( + extract_sanitized_text_bytes(&binary_with_sentinels, 0).unwrap(), + Some(b"a b c".to_vec()) + ); let large_binary = LargeBinaryArray::from(vec![b"large" as &[u8]]); assert_eq!( diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 24b1eb50203..93932f35332 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -407,11 +407,21 @@ impl InvertedIndexBuilder { ) -> Result> { let partition_id = self.next_partition_id() | self.fragment_mask.unwrap_or(0); builder.set_id(partition_id); - let files = builder.write(dest_store).await?; + let files = builder + .write_to(dest_store, self.partition_write_target()) + .await?; self.new_partitions.push(partition_id); Ok(files) } + fn partition_write_target(&self) -> PartitionWriteTarget { + 
if self.fragment_mask.is_some() { + PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + } + } + fn next_partition_id(&self) -> u64 { self.partitions .iter() @@ -523,7 +533,11 @@ impl InvertedIndexBuilder { if let Some(builder) = merged_tail_partitions { self.new_partitions.push(builder.id()); let mut builder = builder; - files.extend(builder.write(dest_store.as_ref()).await?); + files.extend( + builder + .write_to(dest_store.as_ref(), self.partition_write_target()) + .await?, + ); } log::info!("wait workers indexing elapsed: {:?}", start.elapsed()); Result::Ok(files) @@ -550,12 +564,16 @@ impl InvertedIndexBuilder { .await?; let mut builder = part.into_builder().await?; builder.remap(mapping).await?; - files.extend(builder.write(dest_store).await?); + files.extend( + builder + .write_to(dest_store, self.partition_write_target()) + .await?, + ); } if self.fragment_mask.is_none() { files.push(self.write_metadata(dest_store, &self.partitions).await?); } else { - // in distributed mode, the part_temp_metadata is written by the worker + // in distributed mode, the staged partition metadata is written by the worker for &partition_id in &self.partitions { files.push(self.write_part_metadata(dest_store, partition_id).await?); } @@ -709,26 +727,35 @@ impl InvertedIndexBuilder { .await?; let mut copied = 0; let mut files = Vec::new(); + let target = self.partition_write_target(); for part in self.partitions.iter() { files.push( self.src_store .as_ref() .expect("existing partitions require a source store") - .copy_index_file(&token_file_path(*part), dest_store) + .copy_index_file_to( + &token_file_path(*part), + &target.token_path(*part), + dest_store, + ) .await?, ); files.push( self.src_store .as_ref() .expect("existing partitions require a source store") - .copy_index_file(&posting_file_path(*part), dest_store) + .copy_index_file_to( + &posting_file_path(*part), + &target.posting_path(*part), + dest_store, + ) .await?, ); files.push( self.src_store 
.as_ref() .expect("existing partitions require a source store") - .copy_index_file(&doc_file_path(*part), dest_store) + .copy_index_file_to(&doc_file_path(*part), &target.doc_path(*part), dest_store) .await?, ); copied += 1; @@ -986,11 +1013,22 @@ impl InnerBuilder { } pub async fn write(&mut self, store: &dyn IndexStore) -> Result> { + self.write_to(store, PartitionWriteTarget::Final).await + } + + async fn write_to( + &mut self, + store: &dyn IndexStore, + target: PartitionWriteTarget, + ) -> Result> { let docs = Arc::new(std::mem::take(&mut self.docs)); let files = vec![ - self.write_posting_lists(store, docs.clone()).await?, - self.write_tokens(store).await?, - self.write_docs(store, docs).await?, + self.write_posting_lists(store, docs.clone(), &target.posting_path(self.id)) + .await?, + self.write_tokens(store, &target.token_path(self.id)) + .await?, + self.write_docs(store, docs, &target.doc_path(self.id)) + .await?, ]; Ok(files) } @@ -1000,11 +1038,12 @@ impl InnerBuilder { &mut self, store: &dyn IndexStore, docs: Arc, + path: &str, ) -> Result { let id = self.id; let mut writer = store .new_index_file( - &posting_file_path(self.id), + path, inverted_list_schema_for_version(self.with_position, self.format_version), ) .await?; @@ -1090,29 +1129,57 @@ impl InnerBuilder { } #[instrument(level = "debug", skip_all)] - async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result { + async fn write_tokens(&mut self, store: &dyn IndexStore, path: &str) -> Result { log::info!("writing tokens of partition {}", self.id); let tokens = std::mem::take(&mut self.tokens); let batch = tokens.to_batch(self.token_set_format)?; - let mut writer = store - .new_index_file(&token_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; writer.finish().await } #[instrument(level = "debug", skip_all)] - async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc) -> Result 
{ + async fn write_docs( + &mut self, + store: &dyn IndexStore, + docs: Arc, + path: &str, + ) -> Result { log::info!("writing docs of partition {}", self.id); let batch = docs.to_batch()?; - let mut writer = store - .new_index_file(&doc_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; writer.finish().await } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PartitionWriteTarget { + Final, + Staged, +} + +impl PartitionWriteTarget { + fn file_path(self, partition_id: u64, suffix: &str) -> String { + match self { + Self::Final => partition_file_path(partition_id, suffix), + Self::Staged => staged_partition_file_path(partition_id, suffix), + } + } + + fn token_path(self, partition_id: u64) -> String { + self.file_path(partition_id, TOKENS_FILE) + } + + fn posting_path(self, partition_id: u64) -> String { + self.file_path(partition_id, INVERT_LIST_FILE) + } + + fn doc_path(self, partition_id: u64) -> String { + self.file_path(partition_id, DOCS_FILE) + } +} + struct IndexWorker { tokenizer: Box, dest_store: Arc, @@ -1430,8 +1497,13 @@ impl IndexWorker { ); let written_partition_id = builder.id(); let mut builder = builder; + let target = if self.fragment_mask.is_some() { + PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + }; let files = builder - .write(self.dest_store.as_ref()) + .write_to(self.dest_store.as_ref(), target) .await .map_err(|err| { Error::execution(format!( @@ -1782,14 +1854,23 @@ pub(crate) fn doc_file_path(partition_id: u64) -> String { } pub(crate) fn part_metadata_file_path(partition_id: u64) -> String { - format!("part_{}_{}", partition_id, METADATA_FILE) + staged_partition_file_path(partition_id, METADATA_FILE) } const PARTITION_FILE_SUFFIXES: [&str; 3] = [TOKENS_FILE, INVERT_LIST_FILE, DOCS_FILE]; -// Each remapped file is renamed twice: first to a temp path (phase 1), then to -// its final path (phase 2). 
Keep in sync with the two rename loops below in -// `merge_metadata_files`. -const PARTITION_FILE_RENAME_PHASES: u64 = 2; +const STAGED_PARTITION_DIR: &str = "staging"; + +fn partition_file_path(partition_id: u64, suffix: &str) -> String { + format!("part_{}_{}", partition_id, suffix) +} + +fn staged_partition_file_path(partition_id: u64, suffix: &str) -> String { + format!( + "{}/{}", + STAGED_PARTITION_DIR, + partition_file_path(partition_id, suffix) + ) +} pub async fn merge_index_files( object_store: &ObjectStore, @@ -1797,33 +1878,65 @@ pub async fn merge_index_files( store: Arc, progress: Arc, ) -> Result<()> { - // List all partition metadata files in the index directory - let part_metadata_files = list_metadata_files(object_store, index_dir).await?; + let metadata_path = index_dir.clone().join(METADATA_FILE); + if object_store.exists(&metadata_path).await? { + return Ok(()); + } + + // List all staged partition metadata files in the index directory + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); + if part_metadata_files.is_empty() { + return Err(Error::invalid_input_source( + format!( + "No partition metadata files found in index directory: {}", + index_dir + ) + .into(), + )); + } // Call merge_metadata_files function for inverted index merge_metadata_files(store, &part_metadata_files, progress).await } -/// List and filter metadata files from the index directory -/// Returns partition metadata files -async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { - // List all partition metadata files in the index directory - let mut part_metadata_files = Vec::new(); - let mut list_stream = object_store.list(Some(index_dir.clone())); +async fn list_index_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let mut index_files = Vec::new(); + let mut list_stream = object_store.read_dir_all(index_dir, None); while let Some(item) = 
list_stream.next().await { match item { Ok(meta) => { - let file_name = meta.location.filename().unwrap_or_default(); - // Filter files matching the pattern part_*_metadata.lance - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - part_metadata_files.push(file_name.to_string()); - } + let location = meta.location.as_ref().trim_start_matches('/'); + let index_dir = index_dir.as_ref().trim_start_matches('/'); + let relative_path = location + .strip_prefix(index_dir) + .map(|s| s.trim_start_matches('/').to_string()) + .unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string()); + index_files.push(relative_path); } Err(err) => return Err(err), } } + Ok(index_files) +} + +fn metadata_files(index_files: &[String]) -> Vec { + index_files + .iter() + .filter(|file_name| { + file_name.starts_with(&format!("{}/part_", STAGED_PARTITION_DIR)) + && file_name.ends_with("_metadata.lance") + }) + .cloned() + .collect() +} + +#[cfg(test)] +async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); if part_metadata_files.is_empty() { return Err(Error::invalid_input_source( format!( @@ -1914,89 +2027,35 @@ async fn merge_metadata_files( progress.stage_complete("read_partition_metadata").await?; // Create ID mapping: sorted original IDs -> 0,1,2... 
- let mut sorted_ids = all_partitions.clone(); + let mut sorted_ids = all_partitions; sorted_ids.sort(); sorted_ids.dedup(); - let id_mapping: HashMap = sorted_ids + let id_mapping: Vec<(u64, u64)> = sorted_ids .iter() .enumerate() .map(|(new_id, &old_id)| (old_id, new_id as u64)) .collect(); - // Safe rename partition files using temporary files to avoid overwrite - let timestamp = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_secs(); - - let changed_partition_count = id_mapping - .iter() - .filter(|(old_id, new_id)| old_id != new_id) - .count() as u64; - let total_renames = changed_partition_count - * PARTITION_FILE_SUFFIXES.len() as u64 - * PARTITION_FILE_RENAME_PHASES; + let total_copies = id_mapping.len() as u64 * PARTITION_FILE_SUFFIXES.len() as u64; progress - .stage_start("remap_partition_files", Some(total_renames), "files") + .stage_start("remap_partition_files", Some(total_copies), "files") .await?; - // Phase 1: Move files to temporary locations - let mut temp_files: Vec<(String, String, String)> = Vec::new(); // (temp_path, old_path, final_path) - let mut renamed_files = 0u64; + let mut copied_files = 0u64; - for (&old_id, &new_id) in &id_mapping { - if old_id != new_id { - for suffix in PARTITION_FILE_SUFFIXES { - let old_path = format!("part_{}_{}", old_id, suffix); - let new_path = format!("part_{}_{}", new_id, suffix); - let temp_path = format!("temp_{}_{}", timestamp, old_path); - - // Move to temporary location first to avoid overwrite - if let Err(e) = store.rename_index_file(&old_path, &temp_path).await { - // Rollback phase 1: restore files from temp locations - for (temp_name, old_name, _) in temp_files.iter().rev() { - let _ = store.rename_index_file(temp_name, old_name).await; - } - return Err(Error::index(format!( - "Failed to move {} to temp {}: {}", - old_path, temp_path, e - ))); - } - temp_files.push((temp_path, old_path, new_path)); - renamed_files += 1; - progress - 
.stage_progress("remap_partition_files", renamed_files) - .await?; - } - } - } - - // Phase 2: Move from temporary to final locations - let mut completed_renames: Vec<(String, String)> = Vec::new(); // (final_path, temp_path) - - for (temp_path, _old_path, final_path) in &temp_files { - if let Err(e) = store.rename_index_file(temp_path, final_path).await { - // Rollback phase 2: restore completed renames and remaining temps - for (final_name, temp_name) in completed_renames.iter().rev() { - let _ = store.rename_index_file(final_name, temp_name).await; - } - // Restore remaining temp files to original locations - for (temp_name, orig_name, _) in temp_files.iter() { - if !completed_renames.iter().any(|(_, t)| t == temp_name) { - let _ = store.rename_index_file(temp_name, orig_name).await; - } - } - return Err(Error::index(format!( - "Failed to rename {} to {}: {}", - temp_path, final_path, e - ))); + for &(old_id, new_id) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(old_id, suffix); + let final_path = partition_file_path(new_id, suffix); + store + .copy_index_file_to(&staged_path, &final_path, store.as_ref()) + .await?; + copied_files += 1; + progress + .stage_progress("remap_partition_files", copied_files) + .await?; } - completed_renames.push((final_path.clone(), temp_path.clone())); - renamed_files += 1; - progress - .stage_progress("remap_partition_files", renamed_files) - .await?; } progress.stage_complete("remap_partition_files").await?; @@ -2023,10 +2082,15 @@ async fn merge_metadata_files( progress.stage_progress("write_merged_metadata", 1).await?; progress.stage_complete("write_merged_metadata").await?; - // Cleanup partition metadata files + // Cleanup staged partition metadata files for file_name in part_metadata_files { - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - let _ = store.delete_index_file(file_name).await; + let _ = 
store.delete_index_file(file_name).await; + } + for &(old_id, _) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let _ = store + .delete_index_file(&staged_partition_file_path(old_id, suffix)) + .await; } } @@ -2246,6 +2310,234 @@ mod tests { } } + #[derive(Debug, Clone)] + struct NoRenameStore { + inner: Arc, + final_delete_count: Option>, + } + + impl NoRenameStore { + fn new(inner: Arc) -> Self { + Self { + inner, + final_delete_count: None, + } + } + + fn with_final_delete_tracking(inner: Arc) -> Self { + Self { + inner, + final_delete_count: Some(Arc::new(AtomicUsize::new(0))), + } + } + + fn final_delete_count(&self) -> usize { + self.final_delete_count + .as_ref() + .map(|count| count.load(Ordering::SeqCst)) + .unwrap_or_default() + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for NoRenameStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for NoRenameStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(self.clone()) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + self.inner.new_index_file(name, schema).await + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, 
Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + Err(Error::internal(format!( + "merge_index_files should not rename partition file {name} to {new_name}" + ))) + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + if name.starts_with("part_") + && let Some(count) = &self.final_delete_count + { + count.fetch_add(1, Ordering::SeqCst); + } + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + #[derive(Debug)] + struct FailMetadataStore { + inner: Arc, + } + + impl FailMetadataStore { + fn new(inner: Arc) -> Self { + Self { inner } + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for FailMetadataStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for FailMetadataStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(Self { + inner: self.inner.clone(), + }) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + let writer = self.inner.new_index_file(name, schema).await?; + if name == METADATA_FILE { + Ok(Box::new(FailFinishWriter { inner: writer })) + } else { + Ok(writer) + } + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + name: &str, + 
new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + self.inner.rename_index_file(name, new_name).await + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + struct FailFinishWriter { + inner: Box, + } + + #[async_trait] + impl IndexWriter for FailFinishWriter { + async fn write_record_batch(&mut self, batch: RecordBatch) -> Result { + self.inner.write_record_batch(batch).await + } + + async fn add_global_buffer(&mut self, data: Bytes) -> Result { + self.inner.add_global_buffer(data).await + } + + async fn finish(&mut self) -> Result { + Err(Error::internal("injected metadata write failure")) + } + + async fn finish_with_metadata( + &mut self, + _metadata: HashMap, + ) -> Result { + Err(Error::internal("injected metadata write failure")) + } + } + #[derive(Debug)] struct CountingWriter { path: String, @@ -2412,12 +2704,446 @@ mod tests { let store = CountingStore::new(); let docs = Arc::new(std::mem::take(&mut builder.docs)); - builder.write_posting_lists(&store, docs).await?; + builder + .write_posting_lists(&store, docs, &posting_file_path(0)) + .await?; assert_eq!(store.write_count(), 1); Ok(()) } + async fn write_partition_file_marker( + store: &dyn IndexStore, + path: &str, + partition_id: u64, + ) -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "partition_id", + DataType::UInt64, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![partition_id]))], + )?; + let mut writer = store.new_index_file(path, schema).await?; + writer.write_record_batch(batch).await?; + writer.finish().await?; + Ok(()) + } + + async fn 
write_partition_files( + store: &dyn IndexStore, + partition_id: u64, + target: PartitionWriteTarget, + ) -> Result<()> { + write_partition_file_marker(store, &target.token_path(partition_id), partition_id).await?; + write_partition_file_marker(store, &target.posting_path(partition_id), partition_id) + .await?; + write_partition_file_marker(store, &target.doc_path(partition_id), partition_id).await?; + Ok(()) + } + + async fn read_partition_file_marker(store: &dyn IndexStore, path: &str) -> Result { + let reader = store.open_index_file(path).await?; + let batch = reader.read_range(0..1, None).await?; + let partition_ids = batch.column(0).as_primitive::(); + Ok(partition_ids.value(0)) + } + + async fn assert_partition_file_markers( + store: &dyn IndexStore, + partition_id: u64, + expected_marker: u64, + ) -> Result<()> { + assert_eq!( + read_partition_file_marker(store, &token_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &posting_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &doc_file_path(partition_id)).await?, + expected_marker + ); + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_remaps_staged_partitions_without_rename() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::new(base_store.clone())); + let partitions = vec![5_u64, 1_u64, (17_u64 << 32) | 2]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + 
metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + let metadata_reader = base_store.open_index_file(METADATA_FILE).await?; + let metadata = &metadata_reader.schema().metadata; + let written_partitions: Vec = serde_json::from_str( + metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + expected_partitions.dedup(); + let remapped_partitions = (0..expected_partitions.len() as u64).collect::>(); + assert_eq!(written_partitions, remapped_partitions); + + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(base_store.as_ref(), new_id as u64, *old_id).await?; + assert!( + base_store + .open_index_file(&part_metadata_file_path(*old_id)) + .await + .is_err(), + "partition metadata should be cleaned up after final metadata is written" + ); + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + base_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned up after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_rewrites_partial_final_files_from_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::with_final_delete_tracking( + base_store.clone(), + )); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); 
+ + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + for suffix in PARTITION_FILE_SUFFIXES { + write_partition_file_marker(base_store.as_ref(), &partition_file_path(1, suffix), 999) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store.clone(), + noop_progress(), + ) + .await?; + + assert_partition_file_markers(base_store.as_ref(), 0, 1).await?; + assert_partition_file_markers(base_store.as_ref(), 1, 5).await?; + assert_eq!( + store.final_delete_count(), + 0, + "merge should overwrite final partition files without deleting them first" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_from_existing_copies_existing_partitions_to_staging_and_finalizes() + -> Result<()> { + let object_store = Arc::new(ObjectStore::local()); + let source_dir = TempDir::default(); + let dest_dir = TempDir::default(); + let source_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + source_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let dest_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + dest_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let merge_store = Arc::new(NoRenameStore::new(dest_store.clone())); + let fragment_mask = 7_u64 << 32; + let partitions = vec![fragment_mask | 5, fragment_mask | 1]; + + for partition_id in &partitions { + write_partition_files( + source_store.as_ref(), + *partition_id, + PartitionWriteTarget::Final, + ) + .await?; + } + + let builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + Some(source_store.clone()), + partitions.clone(), + TokenSetFormat::default(), + Some(fragment_mask), + RoaringBitmap::new(), + ); + builder.write(dest_store.as_ref()).await?; + + for partition_id in &partitions { + 
assert_partition_file_markers(source_store.as_ref(), *partition_id, *partition_id) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(dest_store.as_ref(), &staged_path).await?, + *partition_id + ); + assert!( + dest_store + .open_index_file(&partition_file_path(*partition_id, suffix)) + .await + .is_err(), + "distributed existing partition should be staged instead of copied to root" + ); + } + dest_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &dest_dir.obj_path(), + merge_store, + noop_progress(), + ) + .await?; + + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(dest_store.as_ref(), new_id as u64, *old_id).await?; + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + dest_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_keeps_staging_when_final_metadata_write_fails() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let failing_store = Arc::new(FailMetadataStore::new(base_store.clone())); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + 
PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + let err = merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + failing_store, + noop_progress(), + ) + .await + .unwrap_err(); + assert!( + err.to_string().contains("metadata write failure"), + "expected injected metadata failure, got: {err}" + ); + + for partition_id in &partitions { + base_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(base_store.as_ref(), &staged_path).await?, + *partition_id + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_build_writes_partition_data_to_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = ObjectStore::local(); + let store = Arc::new(LanceIndexStore::new( + object_store.into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + let fragment_mask = 7_u64 << 32; + let batch = make_doc_batch("hello world", fragment_mask); + let stream = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)])); + let stream = Box::pin(stream); + let mut builder = InvertedIndexBuilder::new_with_fragment_mask( + InvertedIndexParams::default(), + Some(fragment_mask), + ); + builder.update(stream, store.as_ref(), None).await?; + + let part_metadata_files = + list_metadata_files(&ObjectStore::local(), &index_dir.obj_path()).await?; + assert_eq!(part_metadata_files.len(), 1); + assert!( + part_metadata_files[0].starts_with("staging/part_"), + "partition metadata should be written to staging" + ); + let reader = store.open_index_file(&part_metadata_files[0]).await?; + let partition_ids: Vec = serde_json::from_str( + reader + .schema() + .metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + 
assert_eq!(partition_ids.len(), 1); + let partition_id = partition_ids[0]; + + store + .open_index_file(&staged_partition_file_path(partition_id, TOKENS_FILE)) + .await?; + assert!( + store + .open_index_file(&partition_file_path(partition_id, METADATA_FILE)) + .await + .is_err(), + "distributed build-only metadata should not be written to root partition metadata paths" + ); + assert!( + store + .open_index_file(&token_file_path(partition_id)) + .await + .is_err(), + "distributed build-only data should not be written to final partition paths" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_is_noop_when_metadata_exists() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + vec![42], + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + metadata_builder + .write_metadata(store.as_ref(), &[42]) + .await?; + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + Ok(()) + } + #[tokio::test] async fn test_build_only_path_writes_partitions_as_is() -> Result<()> { let src_dir = TempDir::default(); @@ -2856,7 +3582,6 @@ mod tests { } }) .collect::>(); - let read_start = tags .iter() .position(|e| e == "start:read_partition_metadata") @@ -2894,8 +3619,8 @@ mod tests { ); assert_eq!( remap_progress.last().copied().unwrap_or_default(), - 12, - "expected remap_partition_files progress to cover both rename phases" + 6, + "expected remap_partition_files progress to cover staged-to-final copies" ); assert!( tags.iter().any(|e| e == "progress:write_merged_metadata"), diff --git a/rust/lance-index/src/scalar/inverted/cache_codec.rs 
b/rust/lance-index/src/scalar/inverted/cache_codec.rs index 74cfc98ef7b..a676455d5c9 100644 --- a/rust/lance-index/src/scalar/inverted/cache_codec.rs +++ b/rust/lance-index/src/scalar/inverted/cache_codec.rs @@ -4,16 +4,24 @@ //! Cache codec impls for FTS index entries. //! //! Serializes [`PostingList`] and [`Positions`] cache values for persistent -//! cache backends. The format is a small variant tag plus a JSON header for -//! scalar metadata, with Arrow-backed payload sections written as zero-copy -//! Arrow IPC streams via [`lance_arrow::ipc`]. The raw byte buffer inside -//! [`SharedPositionStream`] is written via [`write_len_prefixed_bytes`] and -//! read back via [`read_len_prefixed_bytes_at`] -- both zero-copy slices into -//! the input `Bytes` allocation. +//! cache backends, behind the stabilized envelope written by +//! [`CacheCodec`](lance_core::cache::CacheCodec). //! -//! This is the FTS counterpart of `partition_serde.rs` for vector indices. +//! Every variant uses a protobuf header (see `protos-cache/cache.proto`, with the +//! tail/position codecs and position-storage kind as proto enums) followed by +//! 64-byte-aligned Arrow IPC sections and, where applicable, raw blobs: +//! +//! - the compressed posting list: an IPC section for `blocks`, then the +//! position sections (legacy IPC, or shared block-offsets IPC + a raw blob of +//! the [`SharedPositionStream`] byte buffer, which has its own portable +//! encoding); +//! - the plain posting list: an IPC section of `(row_ids, frequencies)`, then +//! an optional legacy position IPC section; +//! - the standalone [`Positions`] codec: the position sections alone. +//! +//! All sections read back zero-copy via [`lance_arrow::ipc`]. This is the FTS +//! counterpart of `partition_serde.rs` for vector indices. 
-use std::io::Write; use std::sync::Arc; use arrow_array::cast::AsArray; @@ -22,14 +30,14 @@ use arrow_array::{ Array, Float32Array, LargeBinaryArray, ListArray, RecordBatch, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; -use serde::{Deserialize, Serialize}; + +use crate::cache_pb::{ + CompressedPostingHeader, PlainPostingHeader, PositionStorage as PbPositionStorage, + PositionStreamCodec as PbPositionStreamCodec, PositionsHeader, PostingListGroupHeader, + PostingTailCodec as PbPostingTailCodec, +}; use super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -43,86 +51,43 @@ use super::index::{ const POSTING_VARIANT_PLAIN: u8 = 0; const POSTING_VARIANT_COMPRESSED: u8 = 1; -const POSITIONS_TAG_NONE: u8 = 0; -const POSITIONS_TAG_LEGACY: u8 = 1; -const POSITIONS_TAG_SHARED: u8 = 2; - -const POSTING_TAIL_CODEC_FIXED32: u8 = 0; -const POSTING_TAIL_CODEC_VARINT_DELTA: u8 = 1; - -const POSITION_STREAM_CODEC_VARINT_DOC_DELTA: u8 = 0; -const POSITION_STREAM_CODEC_PACKED_DELTA: u8 = 1; - // --------------------------------------------------------------------------- -// Codec enum byte mappings +// Codec enum mappings // --------------------------------------------------------------------------- -fn posting_tail_codec_to_u8(c: PostingTailCodec) -> u8 { - match c { - PostingTailCodec::Fixed32 => POSTING_TAIL_CODEC_FIXED32, - PostingTailCodec::VarintDelta => POSTING_TAIL_CODEC_VARINT_DELTA, - } -} +// Posting lists carry their discriminants as protobuf enums in the header; +// these map to/from the in-memory Rust enums. 
-fn u8_to_posting_tail_codec(v: u8) -> Result { - match v { - POSTING_TAIL_CODEC_FIXED32 => Ok(PostingTailCodec::Fixed32), - POSTING_TAIL_CODEC_VARINT_DELTA => Ok(PostingTailCodec::VarintDelta), - _ => Err(Error::io(format!("unknown posting tail codec: {v}"))), +fn posting_tail_codec_to_proto(c: PostingTailCodec) -> PbPostingTailCodec { + match c { + PostingTailCodec::Fixed32 => PbPostingTailCodec::Fixed32, + PostingTailCodec::VarintDelta => PbPostingTailCodec::VarintDelta, } } -fn position_stream_codec_to_u8(c: PositionStreamCodec) -> u8 { +fn proto_to_posting_tail_codec(c: PbPostingTailCodec) -> PostingTailCodec { match c { - PositionStreamCodec::VarintDocDelta => POSITION_STREAM_CODEC_VARINT_DOC_DELTA, - PositionStreamCodec::PackedDelta => POSITION_STREAM_CODEC_PACKED_DELTA, + PbPostingTailCodec::Fixed32 => PostingTailCodec::Fixed32, + PbPostingTailCodec::VarintDelta => PostingTailCodec::VarintDelta, } } -fn u8_to_position_stream_codec(v: u8) -> Result { - match v { - POSITION_STREAM_CODEC_VARINT_DOC_DELTA => Ok(PositionStreamCodec::VarintDocDelta), - POSITION_STREAM_CODEC_PACKED_DELTA => Ok(PositionStreamCodec::PackedDelta), - _ => Err(Error::io(format!("unknown position stream codec: {v}"))), +fn position_stream_codec_to_proto(c: PositionStreamCodec) -> PbPositionStreamCodec { + match c { + PositionStreamCodec::VarintDocDelta => PbPositionStreamCodec::VarintDocDelta, + PositionStreamCodec::PackedDelta => PbPositionStreamCodec::PackedDelta, } } -// --------------------------------------------------------------------------- -// Header / tag I/O helpers (mirrors partition_serde.rs) -// --------------------------------------------------------------------------- - -fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let bytes = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &bytes)?; - Ok(()) -} - -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, 
offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes) - .map_err(|e| Error::io(format!("failed to deserialize cache header: {e}"))) -} - -fn write_u8(writer: &mut dyn Write, value: u8) -> Result<()> { - writer - .write_all(&[value]) - .map_err(|e| Error::io(format!("failed to write tag byte: {e}"))) -} - -fn read_u8(data: &Bytes, offset: &mut usize) -> Result { - let bytes = data.as_ref(); - if *offset >= bytes.len() { - return Err(Error::io( - "truncated cache entry: missing tag byte".to_string(), - )); +fn proto_to_position_stream_codec(c: PbPositionStreamCodec) -> PositionStreamCodec { + match c { + PbPositionStreamCodec::VarintDocDelta => PositionStreamCodec::VarintDocDelta, + PbPositionStreamCodec::PackedDelta => PositionStreamCodec::PackedDelta, } - let v = bytes[*offset]; - *offset += 1; - Ok(v) } // --------------------------------------------------------------------------- -// Position storage serde (shared by PostingList variants and Positions) +// Position storage sections (shared by PostingList variants and Positions) // --------------------------------------------------------------------------- const POSITION_LIST_COLUMN: &str = "position_list"; @@ -131,33 +96,36 @@ const ROW_IDS_COLUMN: &str = "row_ids"; const FREQUENCIES_COLUMN: &str = "frequencies"; const BLOCKS_COLUMN: &str = "blocks"; -#[derive(Serialize, Deserialize)] -struct SharedPositionsHeader { - codec: u8, +fn legacy_positions_batch(list: &ListArray) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + POSITION_LIST_COLUMN, + list.data_type().clone(), + list.is_nullable(), + )])); + Ok(RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?) +} + +fn read_legacy_positions(r: &mut CacheEntryReader<'_>) -> Result { + let batch = r.read_ipc()?; + Ok(batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
+ .clone()) } -fn write_position_storage( - writer: &mut dyn Write, +/// Write the position sections (the bytes after the header) for `storage`. The +/// caller's header proto carries the storage kind and shared-stream codec. +fn write_position_sections( + w: &mut CacheEntryWriter<'_>, storage: &CompressedPositionStorage, ) -> Result<()> { match storage { CompressedPositionStorage::LegacyPerDoc(list) => { - write_u8(writer, POSITIONS_TAG_LEGACY)?; - let schema = Arc::new(Schema::new(vec![Field::new( - POSITION_LIST_COLUMN, - list.data_type().clone(), - list.is_nullable(), - )])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&legacy_positions_batch(list)?)?; } CompressedPositionStorage::SharedStream(stream) => { - write_u8(writer, POSITIONS_TAG_SHARED)?; - let header = SharedPositionsHeader { - codec: position_stream_codec_to_u8(stream.codec()), - }; - write_json_header(writer, &header)?; - let offsets = UInt32Array::from(stream.block_offsets().to_vec()); let schema = Arc::new(Schema::new(vec![Field::new( BLOCK_OFFSETS_COLUMN, @@ -165,55 +133,42 @@ fn write_position_storage( false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(offsets)])?; - write_ipc_stream(&batch, writer)?; - - write_len_prefixed_bytes(writer, stream.bytes())?; + w.write_ipc(&batch)?; + w.write_raw(stream.bytes())?; } } Ok(()) } -fn read_position_storage( - data: &Bytes, - offset: &mut usize, - tag: u8, -) -> Result { - match tag { - POSITIONS_TAG_LEGACY => { - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - let list = batch - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
- .clone(); - Ok(CompressedPositionStorage::LegacyPerDoc(list)) - } - POSITIONS_TAG_SHARED => { - let header: SharedPositionsHeader = read_json_header(data, offset)?; - let codec = u8_to_position_stream_codec(header.codec)?; - - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; +/// Read the position sections for the given `storage` kind and (for shared +/// streams) `stream_codec`. Returns `None` only when `storage` is +/// [`PbPositionStorage::None`]. +fn read_position_sections( + r: &mut CacheEntryReader<'_>, + storage: PbPositionStorage, + stream_codec: PositionStreamCodec, +) -> Result> { + match storage { + PbPositionStorage::None => Ok(None), + PbPositionStorage::Legacy => Ok(Some(CompressedPositionStorage::LegacyPerDoc( + read_legacy_positions(r)?, + ))), + PbPositionStorage::Shared => { + let batch = r.read_ipc()?; let block_offsets = batch .column(0) .as_primitive_opt::() .ok_or_else(|| Error::io("block_offsets column is not UInt32".to_string()))? .values() .to_vec(); - - // Zero copy: read_len_prefixed_bytes_at returns a Bytes slice - // backed by the same allocation as `data`, and SharedPositionStream - // now stores its byte buffer as Bytes -- no copy on read. - let bytes = - read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - - Ok(CompressedPositionStorage::SharedStream( - SharedPositionStream::new(codec, block_offsets, bytes), - )) + // Zero copy: read_raw returns a Bytes slice backed by the same + // allocation as the input, and SharedPositionStream stores its byte + // buffer as Bytes -- no copy on read. 
+ let bytes = r.read_raw()?; + Ok(Some(CompressedPositionStorage::SharedStream( + SharedPositionStream::new(stream_codec, block_offsets, bytes), + ))) } - other => Err(Error::io(format!("unknown positions tag: {other}"))), } } @@ -221,50 +176,45 @@ fn read_position_storage( // PostingList codec // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PlainPostingHeader { - max_score: Option, -} - -#[derive(Serialize, Deserialize)] -struct CompressedPostingHeader { - max_score: f32, - length: u32, - posting_tail_codec: u8, -} - impl CacheCodecImpl for PostingList { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingList"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { match self { Self::Plain(plain) => { - write_u8(writer, POSTING_VARIANT_PLAIN)?; - serialize_plain(writer, plain) + w.write_u8(POSTING_VARIANT_PLAIN)?; + serialize_plain(w, plain) } Self::Compressed(compressed) => { - write_u8(writer, POSTING_VARIANT_COMPRESSED)?; - serialize_compressed(writer, compressed) + w.write_u8(POSTING_VARIANT_COMPRESSED)?; + serialize_compressed(w, compressed) } } } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let variant = read_u8(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let variant = r.read_u8()?; match variant { - POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(data, &mut offset)?)), - POSTING_VARIANT_COMPRESSED => { - Ok(Self::Compressed(deserialize_compressed(data, &mut offset)?)) - } + POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(r)?)), + POSTING_VARIANT_COMPRESSED => Ok(Self::Compressed(deserialize_compressed(r)?)), other => Err(Error::io(format!("unknown PostingList variant: {other}"))), } } } -fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<()> { +fn serialize_plain(w: &mut 
CacheEntryWriter<'_>, plain: &PlainPostingList) -> Result<()> { + // Plain postings carry only per-doc legacy positions (or none). + let position_storage = if plain.positions.is_some() { + PbPositionStorage::Legacy + } else { + PbPositionStorage::None + }; let header = PlainPostingHeader { max_score: plain.max_score, + position_storage: position_storage as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let row_ids = UInt64Array::new(plain.row_ids.clone(), None); let frequencies = Float32Array::new(plain.frequencies.clone(), None); @@ -273,26 +223,18 @@ fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<( Field::new(FREQUENCIES_COLUMN, DataType::Float32, false), ])); let batch = RecordBatch::try_new(schema, vec![Arc::new(row_ids), Arc::new(frequencies)])?; - write_ipc_stream(&batch, writer)?; - - match &plain.positions { - Some(list) => { - // Plain postings can only carry per-doc legacy positions; reuse - // the shared encoder. - write_position_storage( - writer, - &CompressedPositionStorage::LegacyPerDoc(list.clone()), - )?; - } - None => write_u8(writer, POSITIONS_TAG_NONE)?, + w.write_ipc(&batch)?; + + if let Some(list) = &plain.positions { + w.write_ipc(&legacy_positions_batch(list)?)?; } Ok(()) } -fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result { - let header: PlainPostingHeader = read_json_header(data, offset)?; +fn deserialize_plain(r: &mut CacheEntryReader<'_>) -> Result { + let header: PlainPostingHeader = r.read_header()?; - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let row_ids = batch .column(0) .as_primitive_opt::() @@ -306,19 +248,13 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result None, - POSITIONS_TAG_LEGACY => match read_position_storage(data, offset, positions_tag)? 
{ - CompressedPositionStorage::LegacyPerDoc(list) => Some(list), - CompressedPositionStorage::SharedStream(_) => { - unreachable!("shared stream tag was read as legacy variant (this is a bug)") - } - }, - other => { - return Err(Error::io(format!( - "Plain posting list cannot have positions tag {other}" - ))); + let positions = match header.position_storage() { + PbPositionStorage::None => None, + PbPositionStorage::Legacy => Some(read_legacy_positions(r)?), + PbPositionStorage::Shared => { + return Err(Error::io( + "Plain posting list cannot have a shared position stream".to_string(), + )); } }; @@ -330,13 +266,33 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result Result<()> { +/// The compressed posting list is serialized with a protobuf header followed +/// by 64-byte-aligned Arrow IPC sections (for the `blocks`, and for shared +/// position block-offsets) and a raw blob (for the shared position byte +/// stream, which already has its own portable encoding). +fn serialize_compressed( + w: &mut CacheEntryWriter<'_>, + posting: &CompressedPostingList, +) -> Result<()> { + let (position_storage, position_stream_codec) = match &posting.positions { + None => (PbPositionStorage::None, PbPositionStreamCodec::default()), + Some(CompressedPositionStorage::LegacyPerDoc(_)) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + Some(CompressedPositionStorage::SharedStream(stream)) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = CompressedPostingHeader { max_score: posting.max_score, length: posting.length, - posting_tail_codec: posting_tail_codec_to_u8(posting.posting_tail_codec), + posting_tail_codec: posting_tail_codec_to_proto(posting.posting_tail_codec) as i32, + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let schema = 
Arc::new(Schema::new(vec![Field::new( BLOCKS_COLUMN, @@ -344,20 +300,19 @@ fn serialize_compressed(writer: &mut dyn Write, posting: &CompressedPostingList) false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(posting.blocks.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&batch)?; - match &posting.positions { - Some(storage) => write_position_storage(writer, storage)?, - None => write_u8(writer, POSITIONS_TAG_NONE)?, + if let Some(storage) = &posting.positions { + write_position_sections(w, storage)?; } Ok(()) } -fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result { - let header: CompressedPostingHeader = read_json_header(data, offset)?; - let posting_tail_codec = u8_to_posting_tail_codec(header.posting_tail_codec)?; +fn deserialize_compressed(r: &mut CacheEntryReader<'_>) -> Result { + let header: CompressedPostingHeader = r.read_header()?; + let posting_tail_codec = proto_to_posting_tail_codec(header.posting_tail_codec()); - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let blocks = batch .column(0) .as_any() @@ -365,12 +320,8 @@ fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result Result Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingListGroup"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let count = u32::try_from(self.posting_lists.len()) .map_err(|_| Error::io("posting list group too large to serialize".to_string()))?; - writer - .write_all(&count.to_le_bytes()) - .map_err(|e| Error::io(format!("failed to write group count: {e}")))?; + w.write_header(&PostingListGroupHeader { count })?; for posting in &self.posting_lists { - let mut buf = Vec::new(); - posting.serialize(&mut buf)?; - write_len_prefixed_bytes(writer, &buf)?; + posting.serialize(w)?; } Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - if data.len() < 4 { - 
return Err(Error::io( - "truncated posting list group: missing count".to_string(), - )); - } - let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize; - offset += 4; - let mut posting_lists = Vec::with_capacity(count); - for _ in 0..count { - let entry = read_len_prefixed_bytes_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; - posting_lists.push(PostingList::deserialize(&entry)?); + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PostingListGroupHeader = r.read_header()?; + let mut posting_lists = Vec::with_capacity(header.count as usize); + for _ in 0..header.count { + posting_lists.push(PostingList::deserialize(r)?); } Ok(Self::new(posting_lists)) } @@ -428,20 +371,35 @@ impl CacheCodecImpl for PostingListGroup { // --------------------------------------------------------------------------- impl CacheCodecImpl for Positions { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - write_position_storage(writer, &self.0) + const TYPE_ID: &'static str = "lance.fts.Positions"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let (position_storage, position_stream_codec) = match &self.0 { + CompressedPositionStorage::LegacyPerDoc(_) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + CompressedPositionStorage::SharedStream(stream) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = PositionsHeader { + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, + }; + w.write_header(&header)?; + write_position_sections(w, &self.0) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let tag = read_u8(data, &mut offset)?; - if tag == POSITIONS_TAG_NONE { - return Err(Error::io( - "Positions cache entry cannot encode the None variant".to_string(), - )); - } - let storage = read_position_storage(data, &mut offset, 
tag)?; - Ok(Self(storage)) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PositionsHeader = r.read_header()?; + let stream_codec = proto_to_position_stream_codec(header.position_stream_codec()); + read_position_sections(r, header.position_storage(), stream_codec)? + .map(Self) + .ok_or_else(|| { + Error::io("Positions cache entry cannot encode the None variant".to_string()) + }) } } @@ -455,7 +413,8 @@ mod tests { use arrow_array::LargeBinaryArray; use arrow_array::builder::{Int32Builder, ListBuilder}; use bytes::Bytes; - use lance_core::cache::CacheCodecImpl; + use lance_core::Result; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use super::super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -502,16 +461,26 @@ mod tests { } } - fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + /// Serialize a codec body (no envelope) into a standalone buffer. + fn body_bytes(entry: &T) -> Bytes { let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - PostingList::deserialize(&Bytes::from(buf)).unwrap() + let mut w = CacheEntryWriter::new(&mut buf); + entry.serialize(&mut w).unwrap(); + Bytes::from(buf) + } + + /// Deserialize a codec body (no envelope) at the current build's version. 
+ fn from_body(data: &Bytes) -> Result { + let mut r = CacheEntryReader::new(data, 0, T::CURRENT_VERSION); + T::deserialize(&mut r) + } + + fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + from_body::(&body_bytes(entry)).unwrap() } fn roundtrip_positions(entry: &Positions) -> Positions { - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - Positions::deserialize(&Bytes::from(buf)).unwrap() + from_body::(&body_bytes(entry)).unwrap() } fn assert_slice_points_into_bytes(slice: &[u8], bytes: &Bytes) { @@ -652,13 +621,9 @@ mod tests { expected_stream.clone(), )), ); - let mut buf = Vec::new(); - PostingList::Compressed(posting) - .serialize(&mut buf) - .unwrap(); - let serialized = Bytes::from(buf); + let serialized = body_bytes(&PostingList::Compressed(posting)); - let restored = PostingList::deserialize(&serialized).unwrap(); + let restored = from_body::(&serialized).unwrap(); let PostingList::Compressed(restored) = restored else { panic!("expected Compressed variant"); }; @@ -695,9 +660,7 @@ mod tests { vec![plain.clone(), compressed, plain], ] { let group = PostingListGroup::new(members.clone()); - let mut buf = Vec::new(); - group.serialize(&mut buf).unwrap(); - let restored = PostingListGroup::deserialize(&Bytes::from(buf)).unwrap(); + let restored = from_body::(&body_bytes(&group)).unwrap(); assert_eq!(restored.posting_lists.len(), members.len()); for (a, b) in members.iter().zip(restored.posting_lists.iter()) { match (a, b) { @@ -743,9 +706,241 @@ mod tests { None, ); let entry = PostingList::Plain(plain); - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); + let mut buf = body_bytes(&entry).to_vec(); buf.truncate(buf.len() / 2); - assert!(PostingList::deserialize(&Bytes::from(buf)).is_err()); + assert!(from_body::(&Bytes::from(buf)).is_err()); + } + + /// Tests covering the stabilized envelope + compressed proto format, + /// exercised through the full type-erased [`CacheCodec`] (envelope + body). 
+ mod stable_format { + use std::sync::Arc; + + use arrow_array::Array; + use lance_core::cache::CacheCodec; + use prost::Message; + + use super::*; + use crate::cache_pb::{CompressedPostingHeader, PostingTailCodec as PbPostingTailCodec}; + + type ArcAny = Arc; + + fn codec() -> CacheCodec { + CacheCodec::from_impl::() + } + + /// Serialize an entry through the full codec (envelope + body). + fn serialize_entry(entry: PostingList) -> Vec { + let any: ArcAny = Arc::new(entry); + let mut buf = Vec::new(); + codec().serialize(&any, &mut buf).unwrap(); + buf + } + + /// A `Bytes` whose base address is 64-byte aligned, modelling a backend + /// that reads cache entries into an aligned buffer. + fn aligned_bytes(payload: &[u8]) -> Bytes { + const ALIGN: usize = 64; + let mut v = vec![0u8; payload.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + fn compressed_with_shared_positions() -> PostingList { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[9u8; 48][..]), Some(&[1u8; 48])]); + let stream = SharedPositionStream::new( + PositionStreamCodec::PackedDelta, + vec![0u32, 4, 11], + Bytes::from((0u8..64).collect::>()), + ); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + Some(CompressedPositionStorage::SharedStream(stream)), + )) + } + + /// The compressed `blocks` (an aligned IPC section) and the shared + /// position blob (a raw section) must both be borrowed zero-copy from + /// the input even though the envelope pushes them to a non-zero, + /// non-aligned starting offset. 
+ #[test] + fn compressed_sections_are_zero_copy_through_envelope() { + let serialized = aligned_bytes(&serialize_entry(compressed_with_shared_positions())); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Compressed(restored) = restored.as_ref() else { + panic!("expected Compressed"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + // blocks IPC section decoded in place (no realigning memcpy). + for buf in restored.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "blocks buffer was realigned out of the input — misaligned IPC section", + ); + } + // shared position raw blob borrowed in place. + let Some(CompressedPositionStorage::SharedStream(stream)) = &restored.positions else { + panic!("expected shared stream"); + }; + assert!(points_in(stream.bytes().as_ptr() as usize)); + } + + /// Every member of a `PostingListGroup` must also decode zero-copy. The + /// group writes its members inline so each member's IPC sections stay + /// 64-byte aligned within the entry; embedding members in per-member + /// sub-buffers would land them at arbitrary offsets and force a + /// realigning memcpy on load. 
+ #[test] + fn group_member_sections_are_zero_copy_through_envelope() { + let make_member = |fill: u8| { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[fill; 48][..]), Some(&[fill; 48])]); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + None, + )) + }; + let group = PostingListGroup::new(vec![make_member(9), make_member(1)]); + + let group_codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + group_codec.serialize(&any, &mut buf).unwrap(); + let serialized = aligned_bytes(&buf); + + let restored = group_codec.deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + assert_eq!(restored.posting_lists.len(), 2); + for member in &restored.posting_lists { + let PostingList::Compressed(member) = member else { + panic!("expected Compressed member"); + }; + for buf in member.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "group member blocks buffer was realigned out of the input — \ + misaligned IPC section", + ); + } + } + } + + /// The plain posting's row-id/frequency IPC section must also decode + /// zero-copy through the envelope + proto header. 
+ #[test] + fn plain_sections_are_zero_copy_through_envelope() { + let plain = PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from((0u64..64).collect::>()), + ScalarBuffer::from(vec![1.0f32; 64]), + Some(2.0), + None, + )); + let serialized = aligned_bytes(&serialize_entry(plain)); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Plain(restored) = restored.as_ref() else { + panic!("expected Plain"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + // The row_ids ScalarBuffer must borrow from the input allocation. + let ptr = restored.row_ids.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "row_ids buffer was realigned out of the input — misaligned IPC section", + ); + } + + /// Additive proto fields (lever #1) must not break decoding: an unknown + /// field number appended to the header is ignored. + #[test] + fn header_proto_ignores_unknown_fields() { + let header = CompressedPostingHeader { + max_score: 1.5, + length: 9, + posting_tail_codec: PbPostingTailCodec::VarintDelta as i32, + ..Default::default() + }; + let mut bytes = header.encode_to_vec(); + // Append an unknown field #15, varint wire type (0), value 7. + bytes.push(15 << 3); + bytes.push(7); + let decoded = CompressedPostingHeader::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded.length, 9); + assert_eq!(decoded.max_score, 1.5); + } + + /// An entry written by a different codec (foreign TYPE_ID) misses. + #[test] + fn foreign_type_id_is_miss() { + // A PostingListGroup entry carries a different TYPE_ID in its + // envelope; reading it as a PostingList must miss, not misread it. 
+ let group = PostingListGroup::new(vec![]); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + CacheCodec::from_impl::() + .serialize(&any, &mut buf) + .unwrap(); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// An entry written by a newer build (higher type_version) misses. + #[test] + fn future_type_version_is_miss() { + let mut buf = serialize_entry(compressed_with_shared_positions()); + // Patch the envelope's type_version (magic[4] + ver[1] + len[2] + + // type_id[N]) to a value beyond what this build understands. + let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let version_off = 4 + 1 + 2 + type_id_len; + buf[version_off..version_off + 4].copy_from_slice(&u32::MAX.to_le_bytes()); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// A pre-stabilization blob (no magic) self-heals to a miss. + #[test] + fn pre_stabilization_blob_is_miss() { + // Old format led with a u64 LE length prefix, never our magic. + let mut blob = (30u64).to_le_bytes().to_vec(); + blob.extend_from_slice(&[0u8; 30]); + assert!(codec().deserialize(&Bytes::from(blob)).hit().is_none()); + } + + /// A structurally-valid envelope whose body leads with an out-of-range + /// variant tag self-heals to a `BodyError` miss rather than panicking or + /// misreading the remaining bytes. + #[test] + fn unknown_posting_variant_is_miss() { + use lance_core::cache::{CacheDecode, CacheMissReason}; + + let mut buf = serialize_entry(compressed_with_shared_positions()); + // The variant tag is the first body byte, right after the envelope + // (magic[4] + ver[1] + type_id_len[2] + type_id[N] + type_version[4]). 
+ let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let variant_off = 4 + 1 + 2 + type_id_len + 4; + buf[variant_off] = 2; // neither PLAIN (0) nor COMPRESSED (1) + match codec().deserialize(&Bytes::from(buf)) { + CacheDecode::Miss(reason) => assert_eq!(reason, CacheMissReason::BodyError), + CacheDecode::Hit(_) => panic!("expected a BodyError miss, got a hit"), + } + } } } diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 56547c6510b..41a18c3bd68 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -37,11 +37,12 @@ use datafusion::physical_plan::metrics::Time; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use fst::{Automaton, IntoStreamer, Streamer}; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream}; -use itertools::Itertools; +use itertools::{Either, Itertools}; use lance_arrow::{RecordBatchExt, iter_str_array}; use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache}; use lance_core::deepsize::DeepSizeOf; use lance_core::error::{DataFusionResult, LanceOptionExt}; +use lance_core::utils::address::RowAddress; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS}; use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; @@ -2947,10 +2948,9 @@ impl DeepSizeOf for CompressedPositionStorage { #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct SharedPositionStream { codec: PositionStreamCodec, - block_offsets: Vec, - // Stored as `Bytes` so that the cache deserialization path can hand - // ownership of an IPC-decoded slice in without copying. Cloning the - // stream is then an `Arc` bump rather than an O(N) buffer copy. + block_offsets: Arc<[u32]>, + // Stored with shared ownership so cache hits can clone position streams + // without copying either offsets or bytes. 
bytes: bytes::Bytes, } @@ -2958,7 +2958,7 @@ impl SharedPositionStream { pub fn new(codec: PositionStreamCodec, block_offsets: Vec, bytes: bytes::Bytes) -> Self { Self { codec, - block_offsets, + block_offsets: Arc::from(block_offsets.into_boxed_slice()), bytes, } } @@ -2991,11 +2991,11 @@ impl SharedPositionStream { } pub fn block_offsets(&self) -> &[u32] { - &self.block_offsets + self.block_offsets.as_ref() } pub fn size(&self) -> usize { - self.block_offsets.capacity() * std::mem::size_of::() + self.bytes.len() + self.block_offsets.len() * std::mem::size_of::() + self.bytes.len() } } @@ -4615,18 +4615,25 @@ impl DocSet { self.row_ids[doc_id as usize] } - pub fn doc_id(&self, row_id: u64) -> Option { + /// Resolve a `row_id` to every `doc_id` it owns. + /// + /// A scalar column maps each row to a single document, but a + /// `list` column indexes every element as its own document, so a + /// single `row_id` can own several `doc_id`s sharing that key in `inv`. + /// The prefilter path (`flat_search`) walks an allow-list of row_ids and + /// must evaluate *all* of a row's documents; resolving to one `doc_id` + /// silently drops matches at non-last list positions (lancedb#3352). + pub fn doc_ids(&self, row_id: u64) -> impl Iterator + '_ { if self.inv.is_empty() { - // in legacy format, the row id is doc id - match self.row_ids.binary_search(&row_id) { - Ok(_) => Some(row_id), - Err(_) => None, - } + // in legacy format, the row id is doc id (one document per row) + let found = self.row_ids.binary_search(&row_id).is_ok(); + Either::Left(found.then_some(row_id).into_iter()) } else { - match self.inv.binary_search_by_key(&row_id, |x| x.0) { - Ok(idx) => Some(self.inv[idx].1 as u64), - Err(_) => None, - } + // `inv` is sorted by row_id, so the entries sharing this key form a + // contiguous run; yield the doc_id of each. 
+ let lo = self.inv.partition_point(|entry| entry.0 < row_id); + let hi = self.inv.partition_point(|entry| entry.0 <= row_id); + Either::Right(self.inv[lo..hi].iter().map(|entry| entry.1 as u64)) } } pub fn total_tokens_num(&self) -> u64 { @@ -4750,23 +4757,36 @@ impl DocSet { }); } - // if frag reuse happened, we'll need to remap the row_ids. And after row_ids been - // remapped, we'll need resort to make sure binary_search works. + // If frag reuse happened, remap the row_ids through it. Crucially we + // must NOT drop the rows the reuse index deleted, because the posting + // lists reference doc_ids *positionally* (a doc_id is an index into + // these arrays, fixed at build time). Dropping deleted rows would + // renumber every later doc_id and desync the posting lists, so wand + // would index `num_tokens`/`row_ids` out of bounds or score the wrong + // doc. Instead we tombstone deleted rows in place: their slot survives + // (so doc_ids stay aligned with the posting lists) carrying + // `RowAddress::TOMBSTONE_ROW`, which wand skips, and they are left out + // of `inv` so a row_id lookup never resolves to a deleted doc. The + // heavyweight physical remap (`DocSet::remap`) is what actually + // renumbers and compacts; this load-time path only has to stay + // consistent until then. 
if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() { let mut row_ids = Vec::with_capacity(row_id_col.len()); - let mut num_tokens = Vec::with_capacity(num_tokens_col.len()); - for (row_id, num_token) in row_id_col.values().iter().zip(num_tokens_col.values()) { - if let Some(new_row_id) = frag_reuse_index_ref.remap_row_id(*row_id) { - row_ids.push(new_row_id); - num_tokens.push(*num_token); + let num_tokens = num_tokens_col.values().to_vec(); + let mut inv = Vec::with_capacity(row_id_col.len()); + for (doc_id, row_id) in row_id_col.values().iter().enumerate() { + match frag_reuse_index_ref.remap_row_id(*row_id) { + Some(new_row_id) => { + row_ids.push(new_row_id); + inv.push((new_row_id, doc_id as u32)); + } + None => { + // Deleted: keep the slot (doc_ids must not shift) but + // tombstone it and leave it out of `inv`. + row_ids.push(RowAddress::TOMBSTONE_ROW); + } } } - - let mut inv: Vec<(u64, u32)> = row_ids - .iter() - .enumerate() - .map(|(doc_id, row_id)| (*row_id, doc_id as u32)) - .collect(); inv.sort_unstable_by_key(|entry| entry.0); let total_tokens = num_tokens.iter().map(|&x| x as u64).sum(); @@ -5475,6 +5495,21 @@ mod tests { ); } + #[test] + fn test_shared_position_stream_clone_shares_block_offsets() { + let stream = SharedPositionStream::new( + PositionStreamCodec::PackedDelta, + vec![0_u32, 4, 11], + bytes::Bytes::from_static(b"shared position bytes"), + ); + let original_offsets = stream.block_offsets().as_ptr(); + + let cloned = stream.clone(); + + assert_eq!(cloned.block_offsets(), stream.block_offsets()); + assert_eq!(cloned.block_offsets().as_ptr(), original_offsets); + } + #[test] fn test_posting_builder_roundtrip_shared_positions() { let entries = vec![ @@ -6446,6 +6481,16 @@ mod tests { ) -> Result { self.inner.copy_index_file(name, dest_store).await } + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, dest_store) 
+ .await + } async fn rename_index_file( &self, name: &str, diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 6024747025b..5a2a701dc73 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -355,16 +355,7 @@ impl InvertedIndexParams { builder = builder.filter_dynamic(Stemmer::new(self.language)); } if self.remove_stop_words { - let stop_word_filter = match &self.custom_stop_words { - Some(words) => StopWordFilter::remove(words.iter().cloned()), - None => StopWordFilter::new(self.language).ok_or_else(|| { - Error::invalid_input(format!( - "removing stop words for language {:?} is not supported yet", - self.language - )) - })?, - }; - builder = builder.filter_dynamic(stop_word_filter); + builder = builder.filter_dynamic(self.stop_word_filter()?); } if self.ascii_folding { builder = builder.filter_dynamic(AsciiFoldingFilter); @@ -382,6 +373,19 @@ impl InvertedIndexParams { } } + fn stop_word_filter(&self) -> Result { + match &self.custom_stop_words { + Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())), + None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()), + None => StopWordFilter::new(self.language).ok_or_else(|| { + Error::invalid_input(format!( + "removing stop words for language {:?} is not supported yet", + self.language + )) + }), + } + } + fn build_base_tokenizer(&self) -> Result { match self.base_tokenizer.as_str() { "simple" => Ok(TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()), @@ -503,4 +507,52 @@ mod tests { stream.process(&mut |token| tokens.push(token.text.clone())); assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]); } + + #[test] + fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() { + let mut tokenizer = InvertedIndexParams::default() + .stem(false) + .base_tokenizer("simple".to_string()) + .build() + .unwrap(); + let mut stream = 
tokenizer.token_stream_for_search("the 的 lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!( + tokens, + vec!["的".to_string(), "lance".to_string(), "data".to_string()] + ); + } + + #[test] + fn test_custom_stop_words_replace_language_builtins() { + let mut tokenizer = InvertedIndexParams::default() + .stem(false) + .custom_stop_words(Some(vec!["lance".to_string()])) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_search("the lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]); + } + + #[test] + fn test_icu_stop_words_use_all_builtin_lists() { + let mut tokenizer = InvertedIndexParams::default() + .stem(false) + .base_tokenizer("icu".to_string()) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_search("the 的 lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!(tokens, vec!["lance".to_string(), "data".to_string()]); + } } diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 609ec08041f..dc6d2a860fb 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -736,6 +736,15 @@ impl<'a, S: Scorer> Wand<'a, S> { } DocInfo::Located(doc) => doc.row_id, }; + // Skip docs the fragment-reuse remap deleted. They are tombstoned + // in the DocSet (slot kept so posting-list doc_ids stay aligned) + // and must not surface in results. 
+ if docs_has_row_ids && row_id == RowAddress::TOMBSTONE_ROW { + if self.operator == Operator::Or { + self.push_back_leads(doc.doc_id() + 1); + } + continue; + } if docs_has_row_ids && !mask.selected(row_id) { if self.operator == Operator::Or { self.push_back_leads(doc.doc_id() + 1); @@ -767,14 +776,15 @@ impl<'a, S: Scorer> Wand<'a, S> { self.score(doc_length) }; - let freqs = self.iter_term_freqs().collect(); if candidates.len() < limit { + let freqs = self.iter_term_freqs().collect(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { + let freqs = self.iter_term_freqs().collect(); candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); let kth = candidates.peek().unwrap().0.0.score.0; @@ -819,11 +829,16 @@ impl<'a, S: Scorer> Wand<'a, S> { } // we need to map the row ids to doc ids, and sort them, - // because WAND PostingIterator can't go back to the previous doc id + // because WAND PostingIterator can't go back to the previous doc id. + // A list column maps one row id to several doc ids, so expand every + // document the row owns — keying on a single doc id would drop matches + // at non-last list positions (lancedb#3352). 
let doc_ids = row_ids - .filter_map(|row_addr| { + .flat_map(|row_addr| { let row_id: u64 = row_addr.into(); - self.docs.doc_id(row_id).map(|doc_id| (doc_id, row_id)) + self.docs + .doc_ids(row_id) + .map(move |doc_id| (doc_id, row_id)) }) .sorted_unstable() .collect::>(); @@ -885,15 +900,16 @@ impl<'a, S: Scorer> Wand<'a, S> { self.collect_tail_matches(doc_id); let score = self.score(doc_length); - let freqs = self.iter_term_freqs().collect(); if candidates.len() < limit { + let freqs = self.iter_term_freqs().collect(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { + let freqs = self.iter_term_freqs().collect(); candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); let kth = candidates.peek().unwrap().0.0.score.0; @@ -967,41 +983,37 @@ impl<'a, S: Scorer> Wand<'a, S> { continue; } - let Some(doc) = self.lead.first().and_then(|posting| posting.doc()) else { + let Some(first_doc) = self.lead.first().and_then(|posting| posting.doc()) else { self.push_back_leads(target + 1); continue; }; - let doc_length = match &doc { + let doc_length = match &first_doc { DocInfo::Raw(doc) => self.docs.num_tokens(doc.doc_id), DocInfo::Located(doc) => self.docs.num_tokens_by_row_id(doc.row_id), }; - let mut lead_score = self - .lead - .iter() - .filter_map(|posting| { - posting.doc().map(|lead_doc| { - posting.score(&self.scorer, lead_doc.frequency(), doc_length) - }) - }) - .sum::(); + let mut lead_score = 0.0; + if let Some(first_posting) = self.lead.first() { + lead_score += first_posting.score(&self.scorer, first_doc.frequency(), doc_length); + } + for posting in self.lead.iter().skip(1) { + if let Some(lead_doc) = posting.doc() { + lead_score += posting.score(&self.scorer, lead_doc.frequency(), doc_length); + } + } while 
lead_score <= self.threshold { if lead_score + self.tail_max_score <= self.threshold { - self.push_back_leads(doc.doc_id() + 1); + self.push_back_leads(first_doc.doc_id() + 1); break; } if !self.advance_tail_top(target, doc_length, &mut lead_score) { - self.push_back_leads(doc.doc_id() + 1); + self.push_back_leads(first_doc.doc_id() + 1); break; } } if !self.lead.is_empty() { - return Ok(self - .lead - .first() - .and_then(|posting| posting.doc()) - .map(|doc| (doc, lead_score))); + return Ok(Some((first_doc, lead_score))); } } @@ -1392,10 +1404,9 @@ impl<'a, S: Scorer> Wand<'a, S> { }; self.tail_max_score -= upper_bound; posting.next(target); - match posting.doc().map(|doc| doc.doc_id()) { - Some(doc_id) if doc_id == target => { - let frequency = posting.doc().expect("posting must exist").frequency(); - *lead_score += posting.score(&self.scorer, frequency, doc_length); + match posting.doc() { + Some(doc) if doc.doc_id() == target => { + *lead_score += posting.score(&self.scorer, doc.frequency(), doc_length); self.lead.push(posting); } Some(_) => self.push_head(posting), @@ -1418,14 +1429,10 @@ impl<'a, S: Scorer> Wand<'a, S> { for tail_posting in tail.into_vec() { let mut posting = tail_posting.posting; posting.next(target); - match posting.doc().map(|doc| doc.doc_id()) { - Some(doc_id) if doc_id == target => { + match posting.doc() { + Some(doc) if doc.doc_id() == target => { if let (Some(doc_length), Some(score)) = (doc_length, score.as_deref_mut()) { - let frequency = posting - .doc() - .expect("posting moved to target should have doc") - .frequency(); - *score += posting.score(&self.scorer, frequency, doc_length); + *score += posting.score(&self.scorer, doc.frequency(), doc_length); } self.lead.push(posting) } @@ -2211,6 +2218,74 @@ mod tests { assert_eq!(matched, vec![2]); } + #[test] + fn test_doc_ids_resolves_every_document_a_row_owns() { + // A list column indexes each element as its own document, so + // one row id owns several doc ids. 
row 100 -> {0, 1}, row 101 -> {2}. + let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]); + let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]); + let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap(); + + assert_eq!(docs.doc_ids(100).collect::>(), vec![0, 1]); + assert_eq!(docs.doc_ids(101).collect::>(), vec![2]); + assert!(docs.doc_ids(999).next().is_none()); + + // legacy shape (row id == doc id) still resolves to a single document. + let mut legacy = DocSet::default(); + legacy.append(7, 1); + assert_eq!(legacy.doc_ids(7).collect::>(), vec![7]); + assert!(legacy.doc_ids(8).next().is_none()); + } + + #[rstest] + fn test_flat_search_finds_list_row_with_match_at_non_last_position( + #[values(false, true)] is_compressed: bool, + ) { + // row 100 owns two element-documents (doc 0, doc 1) that share its row + // id; row 101 owns doc 2. The query term lives only in doc 0 — the + // *non-last* element of row 100. Resolving the row to a single doc id + // would evaluate doc 1, miss the term, and drop the row (lancedb#3352). 
+ let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]); + let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]); + let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap(); + + let posting = PostingIterator::with_query_weight( + String::from("needle"), + 0, + 0, + 1.0, + generate_posting_list(vec![0], 1.0, None, is_compressed), + docs.len(), + ); + + let mut wand = Wand::new( + Operator::Or, + vec![posting].into_iter(), + &docs, + InverseDocLengthScorer, + ); + wand.threshold = 0.5; + + let selected = vec![RowAddress::from(100_u64)]; + let result = wand + .flat_search( + &FtsSearchParams::default(), + Box::new(selected.into_iter()), + &NoOpMetricsCollector, + ) + .unwrap(); + + // flat_search resolves the prefilter against the DocSet, so the single + // match comes back as a concrete RowId(100) rather than a deferred + // Pending addr. Asserting on the whole result avoids a never-taken + // match arm that would otherwise read as uncovered. 
+ let addrs = result.into_iter().map(|doc| doc.addr).collect::>(); + assert!( + matches!(addrs.as_slice(), [CandidateAddr::RowId(100)]), + "expected exactly row 100, got {addrs:?}" + ); + } + #[test] fn test_block_max_score_matches_stored_value() { let doc_ids = vec![0_u32]; diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index cf357d89585..8e07a607bff 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -18,8 +18,9 @@ use datafusion::execution::RecordBatchStream; use datafusion::physical_plan::{SendableRecordBatchStream, stream::RecordBatchStreamAdapter}; use datafusion_common::ScalarValue; use futures::{StreamExt, TryStream, TryStreamExt, stream::BoxStream}; -use lance_arrow::ipc::{read_len_prefixed_bytes_at, write_len_prefixed_bytes}; -use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache}; +use lance_core::cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, +}; use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::{Error, ROW_ID, Result}; @@ -532,27 +533,30 @@ impl LabelListIndexState { } impl CacheCodecImpl for LabelListIndexState { + const TYPE_ID: &'static str = "lance.scalar.LabelListIndexState"; + const CURRENT_VERSION: u32 = 1; + /// Wire format: /// ```text - /// [u64 list_nulls_len][list_nulls bytes] - /// [bitmap state bytes (self-delimiting)] + /// RAW_BLOB : list_nulls (roaring tree map, portable encoding) + /// /// ``` - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let mut nulls_bytes = Vec::with_capacity(self.list_nulls.serialized_size()); self.list_nulls.serialize_into(&mut nulls_bytes)?; - write_len_prefixed_bytes(writer, &nulls_bytes)?; - self.bitmap_state.serialize(writer)?; + w.write_raw(&nulls_bytes)?; + // The bitmap state writes its own 
self-delimiting body inline. + self.bitmap_state.serialize(w)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let nulls_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let nulls_bytes = r.read_raw()?; let list_nulls = Arc::new(RowAddrTreeMap::deserialize_from(nulls_bytes.as_ref())?); // The bitmap state is self-delimiting (length-prefixed null map + - // Arrow IPC stream with EOS marker), so we can hand the remaining - // tail to it directly. - let bitmap_state = BitmapIndexState::deserialize(&data.slice(offset..))?; + // Arrow IPC stream with EOS marker); it continues reading the body + // from where the null map left off. + let bitmap_state = BitmapIndexState::deserialize(r)?; Ok(Self { bitmap_state, list_nulls, @@ -728,3 +732,91 @@ impl ScalarIndexPlugin for LabelListIndexPlugin { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use datafusion_common::ScalarValue; + use lance_core::cache::CacheCodec; + use lance_core::utils::address::RowAddress; + + use super::super::bitmap::BitmapIndexState; + use super::super::btree::OrderableScalarValue; + use super::*; + + fn sample_state() -> LabelListIndexState { + let mut index_map = BTreeMap::new(); + for k in 0..32i32 { + index_map.insert( + OrderableScalarValue(ScalarValue::Int32(Some(k))), + k as usize, + ); + } + let mut bitmap_nulls = RowAddrTreeMap::new(); + bitmap_nulls.insert(RowAddress::new_from_parts(0, 3).into()); + let bitmap_state = + BitmapIndexState::new_for_test(index_map, bitmap_nulls, DataType::Int32).unwrap(); + + let mut list_nulls = RowAddrTreeMap::new(); + list_nulls.insert(RowAddress::new_from_parts(0, 9).into()); + LabelListIndexState { + bitmap_state, + list_nulls: Arc::new(list_nulls), + } + } + + #[test] + fn test_label_list_state_codec_roundtrip() { + let state = sample_state(); + let mut buf = Vec::new(); + state + .serialize(&mut CacheEntryWriter::new(&mut 
buf)) + .unwrap(); + let data = Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, LabelListIndexState::CURRENT_VERSION); + let restored = LabelListIndexState::deserialize(&mut reader).unwrap(); + + assert_eq!(&*restored.list_nulls, &*state.list_nulls); + assert_eq!( + restored.bitmap_state.lookup_batch(), + state.bitmap_state.lookup_batch() + ); + assert_eq!( + restored.bitmap_state.null_map(), + state.bitmap_state.null_map() + ); + } + + /// The nested bitmap lookup batch must decode zero-copy through the full + /// envelope, proving the leading `list_nulls` RAW_BLOB does not knock the + /// nested IPC section off its 64-byte boundary. + #[test] + fn test_label_list_nested_lookup_is_zero_copy() { + const ALIGN: usize = 64; + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(sample_state()); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.bitmap_state.lookup_batch().columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "nested bitmap lookup buffer was realigned — misaligned IPC section", + ); + } + } + } +} diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 562945b8f0d..2f82deb8403 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -99,6 +99,24 @@ impl LanceIndexStore { self.file_sizes = file_sizes; self } + + fn index_file_path(&self, name: &str) -> Result { + let relative_path = 
Path::parse(name).map_err(|err| { + Error::invalid_input(format!("invalid index file path {name:?}: {err}")) + })?; + if self.index_dir.is_root() { + return Ok(relative_path); + } + if relative_path.is_root() { + return Ok(self.index_dir.clone()); + } + Path::parse(format!( + "{}/{}", + self.index_dir.as_ref(), + relative_path.as_ref() + )) + .map_err(|err| Error::invalid_input(format!("invalid index file path {name:?}: {err}"))) + } } #[async_trait] @@ -397,7 +415,7 @@ impl IndexStore for LanceIndexStore { name: &str, schema: Arc, ) -> Result> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; let schema = schema.as_ref().try_into()?; let writer = self.object_store.create(&path).await?; let writer = current_writer::FileWriter::try_new( @@ -415,7 +433,7 @@ impl IndexStore for LanceIndexStore { } async fn open_index_file(&self, name: &str) -> Result> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; // Use cached file size if available, otherwise unknown (requires HEAD call) let cached_size = self .file_sizes @@ -436,7 +454,7 @@ impl IndexStore for LanceIndexStore { Err(e) => { // If the error is a version conflict we can try to read the file with v1 reader if let Error::VersionConflict { .. 
} = e { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; let file_reader = PreviousFileReader::try_new_self_described( &self.object_store, &path, @@ -452,7 +470,16 @@ impl IndexStore for LanceIndexStore { } async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result { - let path = self.index_dir.clone().join(name); + self.copy_index_file_to(name, name, dest_store).await + } + + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + let path = self.index_file_path(name)?; let other_store = dest_store.as_any().downcast_ref::(); match other_store { @@ -460,21 +487,21 @@ impl IndexStore for LanceIndexStore { // If both this store and the destination are lance stores we can use object_store's copy // This does blindly assume that both stores are using the same underlying object_store // but there is no easy way to verify this and it happens to always be true at the moment - let dest_path = dest_store.index_dir.clone().join(name); + let dest_path = dest_store.index_file_path(new_name)?; self.object_store.copy(&path, &dest_path).await?; let size_bytes = match self.file_sizes.get(name) { Some(size_bytes) => *size_bytes, None => self.object_store.size(&path).await?, }; Ok(IndexFile { - path: name.to_string(), + path: new_name.to_string(), size_bytes, }) } _ => { let reader = self.open_index_file(name).await?; let mut writer = dest_store - .new_index_file(name, Arc::new(reader.schema().into())) + .new_index_file(new_name, Arc::new(reader.schema().into())) .await?; for offset in (0..reader.num_rows()).step_by(4096) { @@ -488,8 +515,8 @@ impl IndexStore for LanceIndexStore { } async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { - let path = self.index_dir.clone().join(name); - let new_path = self.index_dir.clone().join(new_name); + let path = self.index_file_path(name)?; + let new_path = self.index_file_path(new_name)?; 
self.object_store.copy(&path, &new_path).await?; self.object_store.delete(&path).await?; let size_bytes = match self.file_sizes.get(name) { @@ -503,7 +530,7 @@ impl IndexStore for LanceIndexStore { } async fn delete_index_file(&self, name: &str) -> Result<()> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; self.object_store.delete(&path).await } diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index 72ef8d53a92..b452ef78c85 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -5,7 +5,10 @@ use std::any::Any; use std::collections::BTreeMap; use std::iter::once; use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use super::lance_format::LanceIndexStore; use super::{ @@ -49,6 +52,9 @@ use roaring::{RoaringBitmap, RoaringTreemap}; use serde::Serialize; use tracing::instrument; +mod ngram_regex; +pub(crate) use ngram_regex::regex_can_use_index; + const TOKENS_COL: &str = "tokens"; const POSTING_LIST_COL: &str = "posting_list"; const POSTINGS_FILENAME: &str = "ngram_postings.lance"; @@ -476,6 +482,45 @@ impl ScalarIndex for NGramIndex { let row_ids = NGramPostingList::intersect(list_refs); Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids))) } + TextQuery::Regex(pattern) => { + let trigram_query = ngram_regex::regex_to_trigram_query(pattern); + match &trigram_query { + // No usable trigram structure (e.g. `a.b`, `.*`): the index + // cannot prune, so every row must be rechecked. + ngram_regex::TrigramQuery::All => { + Ok(SearchResult::at_least(RowAddrTreeMap::new())) + } + // The pattern is provably unsatisfiable. 
+ ngram_regex::TrigramQuery::None => { + Ok(SearchResult::exact(RowAddrTreeMap::new())) + } + _ => { + let mut tokens = HashSet::new(); + ngram_regex::collect_tokens(&trigram_query, &mut tokens); + // Fetch the posting list for every trigram the condition + // references; a token absent from the index contributes + // an empty list, which `eval_trigram_query` handles. + let present = tokens.into_iter().filter_map(|token| { + self.tokens.get(&token).map(|offset| (token, *offset)) + }); + let lists = futures::stream::iter(present.map(|(token, offset)| { + self.list_reader + .ngram_list(offset, metrics) + .map(move |result| result.map(|list| (token, list))) + })) + .buffer_unordered(self.io_parallelism) + .try_collect::)>>() + .await?; + metrics.record_comparisons(lists.len()); + let bitmaps: HashMap = lists + .into_iter() + .map(|(token, list)| (token, list.bitmap.clone())) + .collect(); + let row_ids = ngram_regex::eval_trigram_query(&trigram_query, &bitmaps); + Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids))) + } + } + } } } @@ -1279,6 +1324,9 @@ impl ScalarIndexPlugin for NGramIndexPlugin { Some(Box::new(TextQueryParser::new( index_name, self.name().to_string(), + // needs_recheck: ngram results are an inexact candidate superset. + true, + // supports_regex: the ngram index can answer regex queries. true, ))) } @@ -1538,6 +1586,107 @@ mod tests { assert_eq!(expected, res); } + #[test_log::test(tokio::test)] + async fn test_ngram_regex_search() { + // Same corpus as test_basic_ngram_index. 
+ let data = StringArray::from_iter_values([ + "cat", // 0 + "dog", // 1 + "cat dog", // 2 + "dog cat", // 3 + "elephant", // 4 + "mouse", // 5 + "rhino", // 6 + "giraffe", // 7 + "rhinos nose", // 8 + ]); + let row_ids = UInt64Array::from_iter_values((0..data.len()).map(|i| i as u64)); + let schema = Arc::new(Schema::new(vec![ + Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + Field::new(ROW_ID, DataType::UInt64, false), + ])); + let data = + RecordBatch::try_new(schema.clone(), vec![Arc::new(data), Arc::new(row_ids)]).unwrap(); + let data = Box::pin(RecordBatchStreamAdapter::new( + schema, + stream::once(std::future::ready(Ok(data))), + )); + + let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap(); + let (index, _tmpdir) = do_train(builder, data).await; + + async fn search(index: &NGramIndex, pattern: &str) -> SearchResult { + index + .search( + &TextQuery::Regex(pattern.to_string()), + &NoOpMetricsCollector, + ) + .await + .unwrap() + } + + // A plain literal yields the same candidates as contains("cat"). + assert_eq!( + search(&index, "cat").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 2, 3])) + ); + + // Alternation -> union of each branch's rows. + assert_eq!( + search(&index, "(cat|dog)").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 2, 3])) + ); + + // AND across `.*`: must contain both the `rhino` and `nose` trigrams, so + // row 6 ("rhino") is correctly excluded and only row 8 survives. + assert_eq!( + search(&index, "rhino.*nose").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([8])) + ); + + // No derivable trigram -> recheck everything. + assert_eq!( + search(&index, "a.b").await, + SearchResult::at_least(RowAddrTreeMap::new()) + ); + + // A trigram that is absent from the index -> empty candidate set. 
+ assert_eq!( + search(&index, "zzz").await, + SearchResult::at_most(RowAddrTreeMap::new()) + ); + } + + #[test_log::test(tokio::test)] + async fn test_ngram_regex_search_nulls() { + // Rows: cat(0), dog(1), NULL(2), NULL(3), cat dog(4). + let data = simple_data_with_nulls(); + let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap(); + let (index, _tmpdir) = do_train(builder, data).await; + + // The NULL rows (2, 3) must never appear in the candidate set. + let res = index + .search(&TextQuery::Regex("cat".to_string()), &NoOpMetricsCollector) + .await + .unwrap(); + assert_eq!( + res, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 4])) + ); + + let res = index + .search( + &TextQuery::Regex("(cat|dog)".to_string()), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + assert_eq!( + res, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 4])) + ); + } + fn test_data_schema() -> Arc { Arc::new(Schema::new(vec![ Field::new(VALUE_COLUMN_NAME, DataType::Utf8, true), diff --git a/rust/lance-index/src/scalar/ngram/ngram_regex.rs b/rust/lance-index/src/scalar/ngram/ngram_regex.rs new file mode 100644 index 00000000000..ee67c479a71 --- /dev/null +++ b/rust/lance-index/src/scalar/ngram/ngram_regex.rs @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Deriving a trigram pre-filter from a regular expression. +//! +//! This is the query-side counterpart of the ngram index that lets us +//! accelerate `regexp_like` / `regexp_match` predicates the same way the index +//! already accelerates `contains`. The idea (the same one Postgres `pg_trgm` +//! and Russ Cox's Google Code Search use) is to derive, from the regex, a +//! boolean condition over trigram presence that is *necessary* for any string +//! to match, evaluate it against the inverted index, and let the scan recheck +//! the true regex on the surviving rows. +//! +//! 
The derived condition is a [`TrigramQuery`] -- an AND/OR tree of trigram +//! tokens. `AND` maps onto posting-list intersection and `OR` onto union, which +//! is exactly the set algebra the ngram index is built for. +//! +//! # Soundness +//! +//! The single invariant that matters is that the condition must never require a +//! trigram that a matching string could lack -- otherwise we would drop real +//! matches (a false negative, far worse than a false positive, which the recheck +//! removes). Everything here is therefore a conservative *over*-approximation: +//! when in doubt we emit [`TrigramQuery::All`] ("no constraint, recheck +//! everything"). Concretely: +//! +//! * Every trigram requirement is produced by [`trigrams_of_string`], which runs +//! the *same* tokenizer the index was built with, so a string shorter than a +//! trigram (or with no alphanumeric run) contributes no requirement. +//! * Character classes and case-insensitive folds are treated as a single +//! unknown character (`All`), because the index's normalization does not agree +//! with Unicode case folding (e.g. `(?i)c` also matches `ℂ`, which the index +//! does not fold to `c`). Literal runs -- the common case -- are fully used. +//! * When the exact / prefix / suffix string sets grow past a bound we first fold +//! their trigrams into the running condition and only then drop the strings, so +//! collapsing precision never removes a necessary trigram. + +use std::collections::{BTreeSet, HashMap, HashSet}; + +use regex_syntax::hir::{Class, Hir, HirKind}; +use roaring::RoaringTreemap; + +use super::{NGRAM_N, NGRAM_TOKENIZER, ngram_to_token, tokenize_visitor}; + +/// Maximum number of strings kept in an `exact` / `prefix` / `suffix` set before +/// it is folded into the trigram condition and dropped. +const MAX_SET_SIZE: usize = 16; +/// Maximum length (in characters) of a string kept in a set. Longer strings are +/// trimmed to a sound shorter affix. 
+const MAX_STRING_LEN: usize = 32; + +/// A boolean condition over trigram presence that is *necessary* for a regex to +/// match. `All` means "no constraint" and `None` means "unsatisfiable"; by +/// construction these only ever appear at the root of the tree. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum TrigramQuery { + /// No constraint: every row is a candidate (the scan must recheck all rows). + All, + /// Unsatisfiable: no row can match. + None, + /// The given trigram token must be present. + Trigram(u32), + /// Every child condition must hold (posting-list intersection). + And(Vec), + /// At least one child condition must hold (posting-list union). + Or(Vec), +} + +impl TrigramQuery { + /// Build an `AND` of conditions, applying identity (`All`), absorbing + /// (`None`), flattening, sorting and de-duplication so the result is + /// canonical and free of nested `All`/`None`. + fn and(items: Vec) -> Self { + let mut flat = Vec::with_capacity(items.len()); + for item in items { + match item { + Self::All => {} // identity + Self::None => return Self::None, // absorbing + Self::And(children) => flat.extend(children), // flatten + other => flat.push(other), + } + } + flat.sort(); + flat.dedup(); + match flat.len() { + 0 => Self::All, + 1 => flat.pop().unwrap(), + _ => Self::And(flat), + } + } + + /// Build an `OR` of conditions, applying absorbing (`All`), identity + /// (`None`), flattening, sorting and de-duplication. 
+ fn or(items: Vec) -> Self { + let mut flat = Vec::with_capacity(items.len()); + for item in items { + match item { + Self::All => return Self::All, // absorbing + Self::None => {} // identity + Self::Or(children) => flat.extend(children), // flatten + other => flat.push(other), + } + } + flat.sort(); + flat.dedup(); + match flat.len() { + 0 => Self::None, + 1 => flat.pop().unwrap(), + _ => Self::Or(flat), + } + } +} + +/// Information about the set of strings a sub-expression can match, used to +/// build a necessary trigram condition bottom-up. For every string `s` the +/// sub-expression matches: `s` is in `exact` (when it is `Some`), `s` starts +/// with some member of `prefix` and ends with some member of `suffix`, and `s` +/// satisfies `match_q`. +struct RegexInfo { + /// Whether the sub-expression can match the empty string. + emptyable: bool, + /// The complete set of strings the sub-expression matches, or `None` if that + /// set is unbounded / unknown. + exact: Option>, + /// Strings that every match must start with (empty = unknown). + prefix: BTreeSet, + /// Strings that every match must end with (empty = unknown). + suffix: BTreeSet, + /// A necessary trigram condition for the sub-expression. + match_q: TrigramQuery, +} + +impl RegexInfo { + /// The empty string (also used for zero-width anchors): matches only `""`. + fn empty_string() -> Self { + let empty = BTreeSet::from([String::new()]); + Self { + emptyable: true, + exact: Some(empty.clone()), + prefix: empty.clone(), + suffix: empty, + match_q: TrigramQuery::All, + } + } + + /// A fixed literal string. + fn literal(s: &str) -> Self { + let set = BTreeSet::from([s.to_string()]); + Self { + emptyable: s.is_empty(), + exact: Some(set.clone()), + prefix: set.clone(), + suffix: set, + match_q: trigrams_of_string(s), + } + } + + /// A single unknown character (a character class we cannot pin down). 
+ fn any_char() -> Self { + Self { + emptyable: false, + exact: None, + prefix: BTreeSet::new(), + suffix: BTreeSet::new(), + match_q: TrigramQuery::All, + } + } + + /// Enforce the size/length bounds, folding any information about to be + /// discarded into `match_q` first so that precision loss never drops a + /// necessary trigram. Idempotent. + fn bound(&mut self) { + let oversized_exact = self.exact.as_ref().is_some_and(|exact| { + exact.len() > MAX_SET_SIZE || exact.iter().any(|s| s.chars().count() > MAX_STRING_LEN) + }); + if oversized_exact { + let exact = self.exact.take().expect("checked above"); + self.fold_into_match(&exact); + } + + self.prefix = self + .prefix + .iter() + .map(|s| leading(s, MAX_STRING_LEN)) + .collect(); + if self.prefix.len() > MAX_SET_SIZE { + let prefix = std::mem::take(&mut self.prefix); + self.fold_into_match(&prefix); + } + + self.suffix = self + .suffix + .iter() + .map(|s| trailing(s, MAX_STRING_LEN)) + .collect(); + if self.suffix.len() > MAX_SET_SIZE { + let suffix = std::mem::take(&mut self.suffix); + self.fold_into_match(&suffix); + } + } + + /// AND the trigrams of `set` (a complete set of possible affixes/strings) + /// into `match_q`. Sound because the set is exhaustive for its role. + fn fold_into_match(&mut self, set: &BTreeSet) { + let folded = trigrams_of_set(set.iter()); + let current = std::mem::replace(&mut self.match_q, TrigramQuery::All); + self.match_q = TrigramQuery::and(vec![current, folded]); + } +} + +/// AND together the trigrams of `s`. Reuses the index's own tokenizer so the +/// tokens are normalized (lowercase, ASCII-folded, alphanumeric-bounded) +/// exactly as they were stored. Returns `All` if `s` yields no trigram (too +/// short, or no run of three alphanumeric characters). 
+fn trigrams_of_string(s: &str) -> TrigramQuery { + let mut tokens = Vec::new(); + tokenize_visitor(&NGRAM_TOKENIZER, s, |ngram| { + tokens.push(TrigramQuery::Trigram(ngram_to_token(ngram, NGRAM_N))); + }); + TrigramQuery::and(tokens) +} + +/// OR together the trigram conditions of each string in `set`. An empty set +/// means "unknown" and yields `All` (no constraint); if any member yields `All` +/// the whole OR is `All`. +fn trigrams_of_set<'a>(set: impl IntoIterator) -> TrigramQuery { + let queries: Vec<_> = set.into_iter().map(|s| trigrams_of_string(s)).collect(); + if queries.is_empty() { + return TrigramQuery::All; + } + TrigramQuery::or(queries) +} + +/// Concatenate every string in `a` with every string in `b`. +fn cross_concat(a: &BTreeSet, b: &BTreeSet) -> BTreeSet { + let mut out = BTreeSet::new(); + for x in a { + for y in b { + out.insert(format!("{x}{y}")); + } + } + out +} + +/// The first `n` characters of `s` (a sound shorter prefix). +fn leading(s: &str, n: usize) -> String { + s.chars().take(n).collect() +} + +/// The last `n` characters of `s` (a sound shorter suffix). +fn trailing(s: &str, n: usize) -> String { + let count = s.chars().count(); + s.chars().skip(count.saturating_sub(n)).collect() +} + +/// If `class` matches exactly one scalar value, return that character. +fn singleton_char(class: &Class) -> Option { + match class { + Class::Unicode(u) => { + let ranges = u.ranges(); + match ranges { + [r] if r.start() == r.end() => Some(r.start()), + _ => None, + } + } + Class::Bytes(b) => { + let ranges = b.ranges(); + match ranges { + [r] if r.start() == r.end() && r.start() < 0x80 => Some(r.start() as char), + _ => None, + } + } + } +} + +/// Compute the [`RegexInfo`] for `hir` bottom-up. +fn analyze(hir: &Hir) -> RegexInfo { + let mut info = match hir.kind() { + // Zero-width: the empty match. Anchors (^, $, \b) carry no trigram. 
+ HirKind::Empty | HirKind::Look(_) => RegexInfo::empty_string(), + HirKind::Literal(lit) => match std::str::from_utf8(&lit.0) { + Ok(s) => RegexInfo::literal(s), + // A literal that is not valid UTF-8 cannot be reasoned about here. + Err(_) => RegexInfo::any_char(), + }, + HirKind::Class(class) => match singleton_char(class) { + Some(ch) => RegexInfo::literal(ch.encode_utf8(&mut [0u8; 4])), + None => RegexInfo::any_char(), + }, + HirKind::Repetition(rep) => { + let inner = analyze(&rep.sub); + let at_least_one = rep.min >= 1; + RegexInfo { + emptyable: !at_least_one || inner.emptyable, + // We do not unroll bounded repetitions, so the matched set is + // unbounded as far as we are concerned. + exact: None, + prefix: if at_least_one { + inner.prefix.clone() + } else { + BTreeSet::new() + }, + suffix: if at_least_one { + inner.suffix.clone() + } else { + BTreeSet::new() + }, + // Only a required occurrence (min >= 1) contributes; the single + // inner match is necessary, never multiplied. + match_q: if at_least_one { + inner.match_q + } else { + TrigramQuery::All + }, + } + } + HirKind::Capture(cap) => analyze(&cap.sub), + HirKind::Concat(subs) => analyze_concat(subs), + HirKind::Alternation(subs) => analyze_alternation(subs), + }; + info.bound(); + info +} + +fn analyze_concat(subs: &[Hir]) -> RegexInfo { + let mut acc = RegexInfo::empty_string(); + for sub in subs { + acc = concat_info(acc, analyze(sub)); + } + acc +} + +/// Combine two adjacent sub-expressions. This is the subtle part: it recovers +/// trigrams that straddle the junction via the cross product of `acc.suffix` and +/// `next.prefix`. +fn concat_info(acc: RegexInfo, next: RegexInfo) -> RegexInfo { + let emptyable = acc.emptyable && next.emptyable; + + // Trigrams spanning the junction (computed from the pre-merge affixes). 
+ let boundary = if acc.suffix.is_empty() || next.prefix.is_empty() { + TrigramQuery::All + } else { + trigrams_of_set(cross_concat(&acc.suffix, &next.prefix).iter()) + }; + + // exact = acc.exact x next.exact, only while both are finite and small. + let exact = match (&acc.exact, &next.exact) { + (Some(a), Some(b)) if a.len().saturating_mul(b.len()) <= MAX_SET_SIZE => { + Some(cross_concat(a, b)) + } + _ => None, + }; + + // A match starts with acc's full string (when known) then next's prefix, + // otherwise with acc's own prefix. + let prefix = match &acc.exact { + Some(a) if !next.prefix.is_empty() => cross_concat(a, &next.prefix), + Some(a) => a.clone(), + None => acc.prefix.clone(), + }; + + // Mirror image for the suffix (driven by the right side). + let suffix = match &next.exact { + Some(b) if !acc.suffix.is_empty() => cross_concat(&acc.suffix, b), + Some(b) => b.clone(), + None => next.suffix.clone(), + }; + + let match_q = TrigramQuery::and(vec![acc.match_q, next.match_q, boundary]); + + let mut info = RegexInfo { + emptyable, + exact, + prefix, + suffix, + match_q, + }; + info.bound(); + info +} + +fn analyze_alternation(subs: &[Hir]) -> RegexInfo { + let infos: Vec = subs.iter().map(analyze).collect(); + + let emptyable = infos.iter().any(|i| i.emptyable); + + let exact = if infos.iter().all(|i| i.exact.is_some()) { + Some( + infos + .iter() + .flat_map(|i| i.exact.as_ref().unwrap().iter().cloned()) + .collect(), + ) + } else { + None + }; + + // A common prefix exists only if every branch contributes one. 
+ let prefix = if infos.iter().all(|i| !i.prefix.is_empty()) { + infos + .iter() + .flat_map(|i| i.prefix.iter().cloned()) + .collect() + } else { + BTreeSet::new() + }; + let suffix = if infos.iter().all(|i| !i.suffix.is_empty()) { + infos + .iter() + .flat_map(|i| i.suffix.iter().cloned()) + .collect() + } else { + BTreeSet::new() + }; + + let match_q = TrigramQuery::or(infos.into_iter().map(|i| i.match_q).collect()); + + RegexInfo { + emptyable, + exact, + prefix, + suffix, + match_q, + } +} + +/// Derive a necessary trigram condition from a regular expression pattern. +/// +/// Returns [`TrigramQuery::All`] when no useful condition can be derived (an +/// unparsable pattern, or one with no trigram-able literal structure such as +/// `a.b` or `.*`); callers must treat that as "recheck everything". +pub fn regex_to_trigram_query(pattern: &str) -> TrigramQuery { + // An unparsable pattern cannot be accelerated; rechecking is still safe. + let Ok(hir) = regex_syntax::parse(pattern) else { + return TrigramQuery::All; + }; + let info = analyze(&hir); + + let mut conditions = vec![info.match_q]; + if let Some(exact) = &info.exact { + if exact.is_empty() { + // The expression matches nothing. + return TrigramQuery::None; + } + conditions.push(trigrams_of_set(exact.iter())); + } + conditions.push(trigrams_of_set(info.prefix.iter())); + conditions.push(trigrams_of_set(info.suffix.iter())); + TrigramQuery::and(conditions) +} + +/// Whether a regular expression yields any trigram condition the index can use +/// to prune candidates. When it does not (e.g. `a.b`, `.*`, or a case-insensitive +/// pattern), callers should leave the predicate to a full scan rather than route +/// it to the index, which would otherwise have to ask the scan to recheck every +/// row -- a path the index result type (`AtLeast`) does not support. 
+pub fn regex_can_use_index(pattern: &str) -> bool { + regex_to_trigram_query(pattern) != TrigramQuery::All +} + +/// Collect the distinct trigram tokens referenced anywhere in the tree. +pub fn collect_tokens(query: &TrigramQuery, out: &mut HashSet) { + match query { + TrigramQuery::Trigram(token) => { + out.insert(*token); + } + TrigramQuery::And(items) | TrigramQuery::Or(items) => { + for item in items { + collect_tokens(item, out); + } + } + TrigramQuery::All | TrigramQuery::None => {} + } +} + +/// Evaluate the tree against a map of `trigram token -> posting list`. A token +/// missing from the map contributes an empty set (sound: a required trigram that +/// is absent everywhere yields no rows; an absent OR branch contributes +/// nothing). `All` / `None` are handled by the caller before evaluation. +pub fn eval_trigram_query( + query: &TrigramQuery, + bitmaps: &HashMap, +) -> RoaringTreemap { + match query { + TrigramQuery::Trigram(token) => bitmaps.get(token).cloned().unwrap_or_default(), + TrigramQuery::And(items) => { + let mut iter = items.iter(); + let mut acc = match iter.next() { + Some(first) => eval_trigram_query(first, bitmaps), + None => return RoaringTreemap::new(), + }; + for item in iter { + if acc.is_empty() { + break; + } + acc &= &eval_trigram_query(item, bitmaps); + } + acc + } + TrigramQuery::Or(items) => { + let mut acc = RoaringTreemap::new(); + for item in items { + acc |= &eval_trigram_query(item, bitmaps); + } + acc + } + TrigramQuery::All | TrigramQuery::None => RoaringTreemap::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A single trigram condition, hashed the same way the index hashes it. 
+ fn tri(trigram: &str) -> TrigramQuery { + TrigramQuery::Trigram(ngram_to_token(trigram, NGRAM_N)) + } + + fn q(pattern: &str) -> TrigramQuery { + regex_to_trigram_query(pattern) + } + + #[test] + fn test_single_literal_trigram() { + assert_eq!(q("foo"), tri("foo")); + } + + #[test] + fn test_multi_trigram_literal() { + assert_eq!( + q("foobar"), + TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")]) + ); + } + + #[test] + fn test_wildcard_splits_into_and() { + // `.*` breaks the literal run; both sides are required. + assert_eq!( + q("foo.*bar"), + TrigramQuery::and(vec![tri("foo"), tri("bar")]) + ); + } + + #[test] + fn test_alternation_is_or() { + assert_eq!( + q("(cat|dog)"), + TrigramQuery::or(vec![tri("cat"), tri("dog")]) + ); + } + + #[test] + fn test_anchors_are_transparent() { + assert_eq!( + q("^rhino"), + TrigramQuery::and(vec![tri("rhi"), tri("hin"), tri("ino")]) + ); + assert_eq!(q("nose$"), TrigramQuery::and(vec![tri("nos"), tri("ose")])); + } + + #[test] + fn test_boundary_trigram_recovered_across_groups() { + // A capturing group is not merged into the adjacent literals, so this + // exercises the suffix x prefix cross product that recovers the `foo` + // trigram straddling the `(o)` group boundary in "foobar". + assert_eq!( + q("fo(o)bar"), // spellchecker:disable-line + TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")]) + ); + } + + #[test] + fn test_no_trigram_yields_all() { + // No run of three literal characters anywhere. + assert_eq!(q("a.b"), TrigramQuery::All); + assert_eq!(q(".*"), TrigramQuery::All); + // Every alternation branch is shorter than a trigram, so we must not + // require either two-character branch as a (non-existent) trigram. + assert_eq!(q("fo|ba"), TrigramQuery::All); // spellchecker:disable-line + } + + #[test] + fn test_case_insensitive_not_accelerated() { + // Unicode case folding (e.g. 
`(?i)c` also matches U+2102) does not agree + // with the index's normalization, so case-insensitive patterns are left + // unaccelerated (correct via recheck) rather than risk a false negative. + assert_eq!(q("(?i)Cat"), TrigramQuery::All); + } + + #[test] + fn test_unparsable_pattern_yields_all() { + assert_eq!(q("("), TrigramQuery::All); + } + + #[test] + fn test_large_alternation_stays_bounded() { + // More than MAX_SET_SIZE branches: must still produce a sound OR without + // panicking or exploding. + let pattern = (0..40) + .map(|i| format!("aa{i:02}zz")) + .collect::>() + .join("|"); + let result = q(&pattern); + // Each branch shares the trigram `aa0`/`aa1`/... and `zz`-ish endings; + // the important property is that it is a sound non-empty condition. + assert_ne!(result, TrigramQuery::None); + } + + #[test] + fn test_plus_requires_inner() { + // `(abc)+` must contain at least one `abc`. + assert_eq!(q("(abc)+"), tri("abc")); + } + + #[test] + fn test_optional_group_is_not_required() { + // `(foo)?bar` -> foo optional, bar required. + assert_eq!(q("(foo)?bar"), tri("bar")); + } + + #[test] + fn test_eval_and_or_with_missing_tokens() { + let foo = ngram_to_token("foo", NGRAM_N); + let bar = ngram_to_token("bar", NGRAM_N); + let mut bitmaps = HashMap::new(); + bitmaps.insert(foo, RoaringTreemap::from_iter([1u64, 2, 3])); + bitmaps.insert(bar, RoaringTreemap::from_iter([2u64, 3, 4])); + // `baz` is absent from the index. + + // AND intersects. + let and = TrigramQuery::and(vec![tri("foo"), tri("bar")]); + assert_eq!( + eval_trigram_query(&and, &bitmaps), + RoaringTreemap::from_iter([2u64, 3]) + ); + + // OR unions. + let or = TrigramQuery::or(vec![tri("foo"), tri("bar")]); + assert_eq!( + eval_trigram_query(&or, &bitmaps), + RoaringTreemap::from_iter([1u64, 2, 3, 4]) + ); + + // A missing token is empty: it zeroes an AND but is harmless in an OR. 
+ let and_missing = TrigramQuery::and(vec![tri("foo"), tri("baz")]); + assert!(eval_trigram_query(&and_missing, &bitmaps).is_empty()); + let or_missing = TrigramQuery::or(vec![tri("foo"), tri("baz")]); + assert_eq!( + eval_trigram_query(&or_missing, &bitmaps), + RoaringTreemap::from_iter([1u64, 2, 3]) + ); + } + + #[test] + fn test_collect_tokens() { + let query = TrigramQuery::and(vec![ + tri("foo"), + TrigramQuery::or(vec![tri("bar"), tri("baz")]), + ]); + let mut tokens = HashSet::new(); + collect_tokens(&query, &mut tokens); + assert_eq!( + tokens, + HashSet::from([ + ngram_to_token("foo", NGRAM_N), + ngram_to_token("bar", NGRAM_N), + ngram_to_token("baz", NGRAM_N), + ]) + ); + } +} diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 9f2228740c2..8e7e20c211a 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -151,26 +151,11 @@ impl ZoneMapIndex { Self::zone_has_finite_min(zone) && !(zone.max.is_null() || Self::scalar_is_nan(&zone.max)) } - fn finite_value_may_be_in_zone(value: &ScalarValue, zone: &ZoneMapStatistics) -> bool { - if !Self::zone_has_finite_min(zone) || value < &zone.min { - return false; - } - - if Self::scalar_is_nan(&zone.max) { - // A NaN max means this zone had both NaNs and finite values. The - // finite max is not persisted, so keep the zone as a false positive - // instead of using total ordering to prune it. - return true; - } - - !zone.max.is_null() && value <= &zone.max - } - /// Evaluates whether a zone could potentially contain values matching the query. /// - /// NaN query values use the explicit `nan_count`. When the stored max is - /// NaN we do not treat it as a finite upper bound; that representation means - /// the zone had finite values plus NaNs, and the finite max was not persisted. + /// NaN query values use the explicit `nan_count`. 
For finite query values, + /// `ScalarValue` total ordering keeps finite values below a stored NaN max, + /// so zones with finite values plus NaNs remain conservative false positives. fn evaluate_zone_against_query( &self, zone: &ZoneMapStatistics, @@ -206,7 +191,7 @@ impl ZoneMapIndex { return Ok(false); } - Ok(Self::finite_value_may_be_in_zone(target, zone)) + Ok(target >= &zone.min && target <= &zone.max) } SargableQuery::Range(start, end) => { // Zone overlaps with query range if there's any intersection between @@ -336,22 +321,28 @@ impl ZoneMapIndex { ScalarValue::Float16(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } ScalarValue::Float32(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } ScalarValue::Float64(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } _ => { @@ -1438,6 +1429,17 @@ mod tests { ); } + let zone = &index.zones[0]; + assert!(matches!( + zone.max, + ScalarValue::Float32(Some(value)) if value.is_nan() + )); + let finite_target = ScalarValue::Float32(Some(1000.0)); + assert!( + finite_target >= zone.min && finite_target <= zone.max, + "ScalarValue total ordering keeps finite values below NaN max" + ); + // Test search for NaN values using Equals with NaN let query = SargableQuery::Equals(ScalarValue::Float32(Some(f32::NAN))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index d0df2fcb7e2..3c5a6601a8a 100644 --- a/rust/lance-index/src/vector.rs +++ 
b/rust/lance-index/src/vector.rs @@ -419,6 +419,14 @@ pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index { /// the index type of this vector index. fn sub_index_type(&self) -> (SubIndexType, QuantizationType); + + /// The cumulative I/O performed while opening this index (file footers, IVF + /// centroids, quantization metadata). This is a one-time cost; it is + /// reported once, on the query that actually opens the index, and is `None` + /// for index implementations that do not track it. + fn open_io_stats(&self) -> Option { + None + } } // it can be an IVF index or a partition of IVF index diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index 0fdd918edab..7a47fa88d54 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -18,6 +18,9 @@ use crate::vector::bq::storage::RabitQuantizationMetadata; use crate::vector::quantizer::QuantizerBuildParams; pub mod builder; +pub(crate) mod dist_table_quant; +pub mod ex_dot; +pub mod prune; pub mod rotation; pub mod storage; pub mod transform; diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index 178a6bb5435..9eb7fc76903 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -25,7 +25,7 @@ use crate::vector::bq::transform::{ SCALE_FACTORS_FIELD, }; use crate::vector::bq::{ - RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rabit_ex_code_bytes, + RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rotation::{apply_fast_rotation, fast_rotation_signs_len, random_fast_rotation_signs}, validate_rq_num_bits, }; @@ -78,21 +78,6 @@ fn pack_sign_bits(codes: &mut [u8], rotated: &[f32]) { } } -#[inline] -fn pack_ex_code_bits(codes: &mut [u8], ex_values: &[u8], ex_bits: u8) { - codes.fill(0); - let ex_bits = ex_bits as usize; - for (dim_idx, &value) in ex_values.iter().enumerate() { - let bit_offset = dim_idx * 
ex_bits; - for bit_idx in 0..ex_bits { - if (value >> bit_idx) & 1 != 0 { - let dst_bit = bit_offset + bit_idx; - codes[dst_bit / u8::BITS as usize] |= 1u8 << (dst_bit % u8::BITS as usize); - } - } - } -} - const EX_QUANTIZATION_EPSILON: f32 = 1.0e-5; const EX_TIGHT_START: [f32; 9] = [0.0, 0.15, 0.20, 0.52, 0.59, 0.71, 0.75, 0.77, 0.81]; @@ -200,7 +185,7 @@ fn quantize_ex_code( *ex_code_value = ex_code; } - pack_ex_code_bits(ex_code_dst, ex_code_values_dst, ex_bits); + crate::vector::bq::ex_dot::pack_blocked_row(ex_code_values_dst, ex_bits, ex_code_dst); residual_dot_code } @@ -599,7 +584,11 @@ impl RabitQuantizer { .as_slice(); let code_dim = self.code_dim(); let code_bytes = rabit_binary_code_bytes(code_dim); - let ex_code_bytes = rabit_ex_code_bytes(code_dim, ex_bits)?; + let ex_code_bytes = if ex_bits == 0 { + 0 + } else { + crate::vector::bq::ex_dot::blocked_ex_code_bytes(code_dim, ex_bits) + }; let mut encoded_codes = vec![0u8; n * code_bytes]; let mut encoded_ex_codes = (ex_bits != 0).then(|| vec![0u8; n * ex_code_bytes]); @@ -901,7 +890,7 @@ mod tests { use lance_linalg::distance::DistanceType; use rstest::rstest; - use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN; #[rstest] #[case(8)] @@ -978,14 +967,14 @@ mod tests { assert!( !fields .iter() - .any(|field| field.name() == RABIT_EX_CODE_COLUMN) + .any(|field| field.name() == RABIT_BLOCKED_EX_CODE_COLUMN) ); let q = RabitQuantizer::new_with_rotation::(3, 128, RQRotationType::Fast); let fields = q.extra_fields(); for expected in [ ERROR_FACTORS_FIELD.name().as_str(), - RABIT_EX_CODE_COLUMN, + RABIT_BLOCKED_EX_CODE_COLUMN, EX_ADD_FACTORS_FIELD.name().as_str(), EX_SCALE_FACTORS_FIELD.name().as_str(), ] { @@ -1095,7 +1084,8 @@ mod tests { .unwrap() .as_fixed_size_list() .value_length(), - 32 + // dim=32 is padded to one 64-dim block at ex_bits=8. 
+ 64 ); } diff --git a/rust/lance-index/src/vector/bq/dist_table_quant.rs b/rust/lance-index/src/vector/bq/dist_table_quant.rs new file mode 100644 index 00000000000..22196f06edb --- /dev/null +++ b/rust/lance-index/src/vector/bq/dist_table_quant.rs @@ -0,0 +1,935 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! SIMD kernels for quantizing the RaBitQ FastScan distance table. +//! +//! Once per (query, probed partition) the `dim * 4`-entry `f32` distance +//! table is quantized into `u8` (fast/normal approx modes) or `u16` +//! (accurate mode) FastScan LUT entries: a min/max pass over the table +//! followed by an affine quantize-and-narrow pass. Both passes are branchy +//! in scalar form, so they get the same runtime-dispatch treatment as +//! [`super::ex_dot`]: explicit AVX-512/AVX2 kernels on x86_64 and a portable +//! fold elsewhere that LLVM auto-vectorizes (NEON is part of the aarch64 +//! baseline). +//! +//! Table values are sums of rotated-query components: always finite, never +//! NaN, so lanewise IEEE `min`/`max` matches `total_cmp` ordering. The only +//! divergence is the sign of zero, which callers cannot observe: `d - qmin` +//! and the `qmin == qmax` early-out are arithmetically identical either way. +//! +//! Quantization rounds half-to-even so that the scalar fallback and the SIMD +//! kernels agree bit-exactly. All paths round with fixed-mode rounding, +//! independent of the dynamic MXCSR rounding mode native code may have +//! installed: the SIMD kernels use the converts' static rounding and the +//! scalar path (also the SIMD tails) rounds via `f32::floor` rather than +//! `f32::round_ties_even`, which can lower to an MXCSR-honoring instruction on +//! x86. Relative to the pre-SIMD implementation (`f32::round`, +//! half-away-from-zero) this can move a LUT entry by 1 on exact .5 ties, which +//! is within the table's inherent quantization error. 
+ +use std::mem::MaybeUninit; +use std::sync::LazyLock; + +use super::storage::SEGMENT_NUM_CODES; + +type MinMaxFn = fn(&[f32]) -> (f32, f32); +type QuantizeU8Fn = fn(&[f32], f32, f32, &mut [MaybeUninit]); +type QuantizeU16Fn = fn(&[f32], f32, f32, &mut [MaybeUninit]); + +/// How the caller reconstructs binary inner-product distances from the +/// FastScan accumulator sums computed against the quantized LUT. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum DistTableDequant { + /// Reconstruct each distance with the affine map + /// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin`. Returned whenever + /// that map is finite, including a zero/sub-resolution range — then the + /// LUT is zeroed and every distance collapses to the constant + /// `num_tables * qmin`. + Affine { qmin: f32, qmax: f32 }, + /// `num_tables * {qmin, qmax, qmax - qmin}` overflowed f32, so the affine + /// reconstruction would yield NaN/inf. The LUT is zeroed; the caller must + /// compute exact distances directly from the f32 table. + Exact, +} + +/// Quantize `dist_table` into `u8` FastScan LUT entries in the caller-owned +/// scratch buffer, returning how the caller must dequantize the FastScan +/// sums (see [`DistTableDequant`]). `dist_table` must be non-empty and all +/// values finite. +pub fn quantize_dist_table_into( + dist_table: &[f32], + quantized_dist_table: &mut Vec, +) -> DistTableDequant { + debug_assert!(!dist_table.is_empty(), "dist table must be non-empty"); + let (qmin, qmax) = min_max(dist_table); + if dequant_overflows(dist_table.len(), qmin, qmax) { + // The caller's affine reconstruction would be non-finite; it computes + // exact distances and ignores the LUT, but keep the buffer valid. + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Exact; + } + let factor = u8::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + // Zero or sub-u8-resolution range (e.g. 
an all-zeros query): the LUT + // carries no information, but the finite affine map sends every sum + // to the constant `num_tables * qmin`. + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Affine { qmin, qmax }; + } + quantized_dist_table.clear(); + quantized_dist_table.reserve(dist_table.len()); + quantize_u8( + dist_table, + qmin, + factor, + &mut quantized_dist_table.spare_capacity_mut()[..dist_table.len()], + ); + // SAFETY: the kernel initialized every element in the reserved range. + unsafe { + quantized_dist_table.set_len(dist_table.len()); + } + DistTableDequant::Affine { qmin, qmax } +} + +/// `u16` variant of [`quantize_dist_table_into`] for the accurate approx +/// mode. +pub fn quantize_dist_table_u16_into( + dist_table: &[f32], + quantized_dist_table: &mut Vec, +) -> DistTableDequant { + debug_assert!(!dist_table.is_empty(), "dist table must be non-empty"); + let (qmin, qmax) = min_max(dist_table); + if dequant_overflows(dist_table.len(), qmin, qmax) { + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Exact; + } + let factor = u16::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Affine { qmin, qmax }; + } + quantized_dist_table.clear(); + quantized_dist_table.reserve(dist_table.len()); + quantize_u16( + dist_table, + qmin, + factor, + &mut quantized_dist_table.spare_capacity_mut()[..dist_table.len()], + ); + // SAFETY: the kernel initialized every element in the reserved range. + unsafe { + quantized_dist_table.set_len(dist_table.len()); + } + DistTableDequant::Affine { qmin, qmax } +} + +/// Whether the caller's affine dequantization +/// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin` would overflow `f32` +/// for some row. 
Each row's reconstructed binary IP lies in +/// `[num_tables * qmin, num_tables * qmax]` and its quantized term is at most +/// `num_tables * (qmax - qmin)`, so if any of those is non-finite the table +/// must fall back to exact distances. The bound is scale-independent — the +/// `1 / SCALE` factor and the `q_sum <= num_tables * SCALE` range cancel. +/// Real dist tables are bounded sums of rotated-query components and never +/// approach this; the guard exists so a pathological query degrades to exact +/// distances instead of producing NaN. +fn dequant_overflows(table_len: usize, qmin: f32, qmax: f32) -> bool { + let num_tables = (table_len / SEGMENT_NUM_CODES) as f32; + !(num_tables * qmin).is_finite() + || !(num_tables * qmax).is_finite() + || !(num_tables * (qmax - qmin)).is_finite() +} + +fn min_max(values: &[f32]) -> (f32, f32) { + static KERNEL: LazyLock = LazyLock::new(select_min_max); + KERNEL(values) +} + +fn quantize_u8(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + static KERNEL: LazyLock = LazyLock::new(select_quantize_u8); + KERNEL(values, qmin, factor, out) +} + +fn quantize_u16(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + static KERNEL: LazyLock = LazyLock::new(select_quantize_u16); + KERNEL(values, qmin, factor, out) +} + +fn select_min_max() -> MinMaxFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::min_max_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::min_max_avx2_dispatch; + } + } + min_max_fold +} + +fn select_quantize_u8() -> QuantizeU8Fn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::quantize_u8_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::quantize_u8_avx2_dispatch; + } + } + quantize_u8_scalar +} + +fn select_quantize_u16() -> QuantizeU16Fn { + #[cfg(target_arch = "x86_64")] + { + if 
std::arch::is_x86_feature_detected!("avx512f") { + return x86::quantize_u16_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::quantize_u16_avx2_dispatch; + } + } + quantize_u16_scalar +} + +const FOLD_LANES: usize = 16; + +/// Portable 16-lane min/max fold; the scalar fallback and the aarch64 path. +/// The `if` comparisons (rather than `f32::min`/`max`, which carry NaN +/// bookkeeping) lower to lanewise min/max instructions on targets with +/// baseline SIMD. +fn min_max_fold(values: &[f32]) -> (f32, f32) { + let mut mins = [f32::INFINITY; FOLD_LANES]; + let mut maxs = [f32::NEG_INFINITY; FOLD_LANES]; + let mut chunks = values.chunks_exact(FOLD_LANES); + for chunk in &mut chunks { + let chunk: &[f32; FOLD_LANES] = chunk.try_into().expect("chunks_exact length"); + for (i, &v) in chunk.iter().enumerate() { + mins[i] = if v < mins[i] { v } else { mins[i] }; + maxs[i] = if v > maxs[i] { v } else { maxs[i] }; + } + } + let mut min = f32::INFINITY; + let mut max = f32::NEG_INFINITY; + for v in mins { + min = if v < min { v } else { min }; + } + for v in maxs { + max = if v > max { v } else { max }; + } + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) +} + +/// Round `x` to the nearest integer, ties to even — the same rule the SIMD +/// converts use — with fixed-mode operations only, so the result never +/// depends on the dynamic rounding mode native code may have installed. +/// +/// On x86, `f32::round_ties_even` can lower to an MXCSR-honoring instruction +/// (outside an SSE4.1 context), so nearest-even is built from `f32::floor`, +/// which is always fixed-mode. `x` is a non-negative quantization product, so +/// only the upward tie case is reachable, but the form is correct for any +/// finite `x` whose floor fits in `i64`. Elsewhere (e.g. 
aarch64) the standard +/// `round_ties_even` is already a fixed-mode instruction (`frintn`) that the +/// quantize loop — which has no dedicated SIMD kernel there — vectorizes, so +/// it is kept. +#[inline(always)] +fn round_ties_even_fixed(x: f32) -> f32 { + #[cfg(target_arch = "x86_64")] + { + let lower = x.floor(); + let frac = x - lower; + let round_up = frac > 0.5 || (frac == 0.5 && (lower as i64 & 1) != 0); + lower + f32::from(round_up) + } + #[cfg(not(target_arch = "x86_64"))] + { + x.round_ties_even() + } +} + +fn quantize_u8_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + debug_assert_eq!(values.len(), out.len()); + for (quantized, &d) in out.iter_mut().zip(values) { + quantized.write(round_ties_even_fixed((d - qmin) * factor) as u8); + } +} + +fn quantize_u16_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + debug_assert_eq!(values.len(), out.len()); + for (quantized, &d) in out.iter_mut().zip(values) { + quantized.write(round_ties_even_fixed((d - qmin) * factor) as u16); + } +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use std::arch::x86_64::*; + use std::mem::MaybeUninit; + + use super::{quantize_u8_scalar, quantize_u16_scalar}; + + pub(super) fn min_max_avx512_dispatch(values: &[f32]) -> (f32, f32) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { min_max_avx512(values) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn min_max_avx512(values: &[f32]) -> (f32, f32) { + // Two accumulators per direction break the lanewise min/max latency + // chain; they are reduced once at the end. + let mut min0 = _mm512_set1_ps(f32::INFINITY); + let mut min1 = min0; + let mut max0 = _mm512_set1_ps(f32::NEG_INFINITY); + let mut max1 = max0; + let mut chunks = values.chunks_exact(32); + for chunk in &mut chunks { + // SAFETY: the chunk holds 32 consecutive floats. 
+ let (v0, v1) = unsafe { + ( + _mm512_loadu_ps(chunk.as_ptr()), + _mm512_loadu_ps(chunk.as_ptr().add(16)), + ) + }; + min0 = _mm512_min_ps(min0, v0); + max0 = _mm512_max_ps(max0, v0); + min1 = _mm512_min_ps(min1, v1); + max1 = _mm512_max_ps(max1, v1); + } + let mut min = _mm512_reduce_min_ps(_mm512_min_ps(min0, min1)); + let mut max = _mm512_reduce_max_ps(_mm512_max_ps(max0, max1)); + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) + } + + pub(super) fn min_max_avx2_dispatch(values: &[f32]) -> (f32, f32) { + // SAFETY: only selected when AVX2 was detected. + unsafe { min_max_avx2(values) } + } + + #[target_feature(enable = "avx2")] + unsafe fn min_max_avx2(values: &[f32]) -> (f32, f32) { + let mut min0 = _mm256_set1_ps(f32::INFINITY); + let mut min1 = min0; + let mut max0 = _mm256_set1_ps(f32::NEG_INFINITY); + let mut max1 = max0; + let mut chunks = values.chunks_exact(16); + for chunk in &mut chunks { + // SAFETY: the chunk holds 16 consecutive floats. 
+ let (v0, v1) = unsafe { + ( + _mm256_loadu_ps(chunk.as_ptr()), + _mm256_loadu_ps(chunk.as_ptr().add(8)), + ) + }; + min0 = _mm256_min_ps(min0, v0); + max0 = _mm256_max_ps(max0, v0); + min1 = _mm256_min_ps(min1, v1); + max1 = _mm256_max_ps(max1, v1); + } + let mut min = reduce_min_avx2(_mm256_min_ps(min0, min1)); + let mut max = reduce_max_avx2(_mm256_max_ps(max0, max1)); + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) + } + + #[inline] + #[target_feature(enable = "avx2")] + fn reduce_min_avx2(v: __m256) -> f32 { + let halves = _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_min_ps(halves, _mm_movehl_ps(halves, halves)); + let single = _mm_min_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(single) + } + + #[inline] + #[target_feature(enable = "avx2")] + fn reduce_max_avx2(v: __m256) -> f32 { + let halves = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_max_ps(halves, _mm_movehl_ps(halves, halves)); + let single = _mm_max_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(single) + } + + /// Load 16 floats and affine-quantize them into `i32` lanes, rounding to + /// nearest-even with static rounding (`_MM_FROUND_TO_NEAREST_INT`) so the + /// result does not depend on the dynamic MXCSR rounding mode and matches + /// the scalar [`super::round_ties_even_fixed`]. + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn quantize16_epi32(src: *const f32, min: __m512, factor: __m512) -> __m512i { + // SAFETY: the caller guarantees 16 floats are readable at `src`. 
+ let v = unsafe { _mm512_loadu_ps(src) }; + let scaled = _mm512_mul_ps(_mm512_sub_ps(v, min), factor); + _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled) + } + + pub(super) fn quantize_u8_avx512_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { quantize_u8_avx512(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn quantize_u8_avx512( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm512_set1_ps(qmin); + let factor_v = _mm512_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. + unsafe { + let q = quantize16_epi32(src.add(i), min, factor_v); + // Unsigned-saturating i32 -> u8 narrow: lanes are in + // [0, 255] plus float epsilon, which saturation clips. + _mm_storeu_si128(dst.add(i).cast(), _mm512_cvtusepi32_epi8(q)); + } + } + quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + pub(super) fn quantize_u16_avx512_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { quantize_u16_avx512(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn quantize_u16_avx512( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm512_set1_ps(qmin); + let factor_v = _mm512_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. 
+ unsafe { + let q = quantize16_epi32(src.add(i), min, factor_v); + _mm256_storeu_si256(dst.add(i).cast(), _mm512_cvtusepi32_epi16(q)); + } + } + quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + /// Load 8 floats and affine-quantize them into `i32` lanes. AVX2 has no + /// embedded-rounding convert, so round to nearest-even explicitly with + /// `_mm256_round_ps` (which ignores MXCSR); the subsequent convert then + /// sees an integral value, so its dynamic rounding mode cannot change the + /// result, keeping it bit-identical to the scalar + /// [`super::round_ties_even_fixed`]. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn quantize8_epi32(src: *const f32, min: __m256, factor: __m256) -> __m256i { + // SAFETY: the caller guarantees 8 floats are readable at `src`. + let v = unsafe { _mm256_loadu_ps(src) }; + let scaled = _mm256_mul_ps(_mm256_sub_ps(v, min), factor); + let rounded = _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled); + _mm256_cvtps_epi32(rounded) + } + + pub(super) fn quantize_u8_avx2_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX2 was detected. + unsafe { quantize_u8_avx2(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx2")] + unsafe fn quantize_u8_avx2( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm256_set1_ps(qmin); + let factor_v = _mm256_set1_ps(factor); + // The 32->16 and 16->8 packs interleave the two 128-bit lanes; this + // permutation of 32-bit groups restores natural order. + let restore = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + let full = values.len() - values.len() % 32; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(32) { + // SAFETY: `i + 32 <= values.len() == out.len()`. 
+ unsafe { + let q0 = quantize8_epi32(src.add(i), min, factor_v); + let q1 = quantize8_epi32(src.add(i + 8), min, factor_v); + let q2 = quantize8_epi32(src.add(i + 16), min, factor_v); + let q3 = quantize8_epi32(src.add(i + 24), min, factor_v); + // Unsigned-saturating i32 -> u16 -> u8 narrows: lanes are in + // [0, 255] plus float epsilon, which saturation clips. + let lo = _mm256_packus_epi32(q0, q1); + let hi = _mm256_packus_epi32(q2, q3); + let bytes = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(lo, hi), restore); + _mm256_storeu_si256(dst.add(i).cast(), bytes); + } + } + quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + pub(super) fn quantize_u16_avx2_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX2 was detected. + unsafe { quantize_u16_avx2(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx2")] + unsafe fn quantize_u16_avx2( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm256_set1_ps(qmin); + let factor_v = _mm256_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. + unsafe { + let q0 = quantize8_epi32(src.add(i), min, factor_v); + let q1 = quantize8_epi32(src.add(i + 8), min, factor_v); + // The pack interleaves the 128-bit lanes as + // [q0_lo, q1_lo, q0_hi, q1_hi]; the 64-bit-lane permute + // restores [q0_lo, q0_hi, q1_lo, q1_hi]. 
+ let packed = _mm256_packus_epi32(q0, q1); + let words = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed); + _mm256_storeu_si256(dst.add(i).cast(), words); + } + } + quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use rstest::rstest; + + /// Straightforward scalar reference implementing the documented + /// semantics: `total_cmp` min/max plus nearest-even rounding. + fn reference_min_max(values: &[f32]) -> (f32, f32) { + let min = values + .iter() + .cloned() + .min_by(|a, b| a.total_cmp(b)) + .unwrap(); + let max = values + .iter() + .cloned() + .max_by(|a, b| a.total_cmp(b)) + .unwrap(); + (min, max) + } + + fn reference_u8(values: &[f32]) -> (DistTableDequant, Vec) { + let (qmin, qmax) = reference_min_max(values); + if dequant_overflows(values.len(), qmin, qmax) { + return (DistTableDequant::Exact, vec![0; values.len()]); + } + let factor = u8::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + return ( + DistTableDequant::Affine { qmin, qmax }, + vec![0; values.len()], + ); + } + let quantized = values + .iter() + .map(|&d| ((d - qmin) * factor).round_ties_even() as u8) + .collect(); + (DistTableDequant::Affine { qmin, qmax }, quantized) + } + + fn reference_u16(values: &[f32]) -> (DistTableDequant, Vec) { + let (qmin, qmax) = reference_min_max(values); + if dequant_overflows(values.len(), qmin, qmax) { + return (DistTableDequant::Exact, vec![0; values.len()]); + } + let factor = u16::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + return ( + DistTableDequant::Affine { qmin, qmax }, + vec![0; values.len()], + ); + } + let quantized = values + .iter() + .map(|&d| ((d - qmin) * factor).round_ties_even() as u16) + .collect(); + (DistTableDequant::Affine { qmin, qmax }, quantized) + } + + fn available_kernels() -> Vec<(&'static str, MinMaxFn, QuantizeU8Fn, QuantizeU16Fn)> { + // `mut` is only exercised on x86_64 
where extra kernels may be pushed. + #[allow(unused_mut)] + let mut kernels = vec![( + "scalar", + min_max_fold as MinMaxFn, + quantize_u8_scalar as QuantizeU8Fn, + quantize_u16_scalar as QuantizeU16Fn, + )]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") { + kernels.push(( + "avx2", + x86::min_max_avx2_dispatch, + x86::quantize_u8_avx2_dispatch, + x86::quantize_u16_avx2_dispatch, + )); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(( + "avx512", + x86::min_max_avx512_dispatch, + x86::quantize_u8_avx512_dispatch, + x86::quantize_u16_avx512_dispatch, + )); + } + } + kernels + } + + /// Every available kernel must agree bit-exactly with the reference on + /// the given input. + fn check_against_reference(values: &[f32]) { + let (expected_dequant_u8, expected_u8) = reference_u8(values); + let (expected_dequant_u16, expected_u16) = reference_u16(values); + let (expected_min, expected_max) = reference_min_max(values); + + for (name, min_max_fn, quantize_u8_fn, quantize_u16_fn) in available_kernels() { + let (qmin, qmax) = min_max_fn(values); + assert_eq!( + (qmin, qmax), + (expected_min, expected_max), + "kernel={name} len={}", + values.len() + ); + + // The quantize kernels are only invoked on the populated path, so + // mirror that guard before exercising them directly. + let overflows = dequant_overflows(values.len(), qmin, qmax); + let factor_u8 = u8::MAX as f32 / (qmax - qmin); + if !overflows && factor_u8.is_finite() { + let mut out_u8 = Vec::with_capacity(values.len()); + quantize_u8_fn( + values, + qmin, + factor_u8, + &mut out_u8.spare_capacity_mut()[..values.len()], + ); + // SAFETY: the kernel initialized every element. 
+ unsafe { out_u8.set_len(values.len()) }; + assert_eq!(out_u8, expected_u8, "kernel={name} len={}", values.len()); + } + + let factor_u16 = u16::MAX as f32 / (qmax - qmin); + if !overflows && factor_u16.is_finite() { + let mut out_u16 = Vec::with_capacity(values.len()); + quantize_u16_fn( + values, + qmin, + factor_u16, + &mut out_u16.spare_capacity_mut()[..values.len()], + ); + // SAFETY: the kernel initialized every element. + unsafe { out_u16.set_len(values.len()) }; + assert_eq!(out_u16, expected_u16, "kernel={name} len={}", values.len()); + } + } + + // The public entry points exercise the dispatched kernels, the + // dequantization classification, and the scratch-buffer handling. + let mut out_u8 = Vec::new(); + assert_eq!( + quantize_dist_table_into(values, &mut out_u8), + expected_dequant_u8, + "len={}", + values.len() + ); + assert_eq!(out_u8, expected_u8, "len={}", values.len()); + let mut out_u16 = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(values, &mut out_u16), + expected_dequant_u16, + "len={}", + values.len() + ); + assert_eq!(out_u16, expected_u16, "len={}", values.len()); + } + + #[rstest] + fn test_quantize_matches_reference( + #[values(1, 2, 15, 16, 17, 31, 32, 33, 63, 64, 100, 6144, 6160)] len: usize, + #[values(1.0, 1e-3, 1e4)] scale: f32, + ) { + let mut rng = SmallRng::seed_from_u64(42 + len as u64); + let values = (0..len) + .map(|_| rng.random_range(-scale..scale)) + .collect::>(); + check_against_reference(&values); + } + + /// Integer tables with range 510 (resp. 131070) make `factor` exactly + /// 0.5, so odd values land on exact .5 ties; all kernels must round them + /// to even and agree with each other. 
+ #[test] + fn test_exact_half_ties_round_to_even() { + let values = (0..=510).map(|v| v as f32).collect::>(); + check_against_reference(&values); + let mut quantized = Vec::new(); + assert_eq!( + quantize_dist_table_into(&values, &mut quantized), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 510.0 + } + ); + // Spot-check nearest-even: 0.5 -> 0, 1.5 -> 2, 127.5 -> 128, + // 254.5 -> 254. + assert_eq!(&quantized[..6], &[0, 0, 1, 2, 2, 2]); + assert_eq!(quantized[255], 128); + assert_eq!(quantized[509], 254); + assert_eq!(quantized[510], 255); + + // Integers up to 131070 are exactly representable in f32. + let values = (0..=510).map(|v| (v * 257) as f32).collect::>(); + check_against_reference(&values); + let mut quantized = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(&values, &mut quantized), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 131070.0 + } + ); + // value * 0.5 = 128.5 -> 128, 385.5 -> 386 under nearest-even. + assert_eq!(&quantized[..4], &[0, 128, 257, 386]); + assert_eq!(quantized[510], u16::MAX); + } + + #[test] + fn test_negative_and_mixed_sign_values() { + let mut rng = SmallRng::seed_from_u64(7); + let values = (0..1000) + .map(|_| rng.random_range(-100.0f32..-1.0)) + .collect::>(); + check_against_reference(&values); + let values = (0..999) + .map(|i| (i as f32 - 499.5) * 0.75) + .collect::>(); + check_against_reference(&values); + } + + #[rstest] + fn test_all_equal_input_zeroes_table(#[values(0.0, -7.25, 3.5)] value: f32) { + let values = vec![value; 100]; + check_against_reference(&values); + // Zero range: a zeroed LUT plus the finite affine map (every sum maps + // to `num_tables * value`). 
+ let expected = DistTableDequant::Affine { + qmin: value, + qmax: value, + }; + let mut quantized = vec![1u8; 5]; + assert_eq!(quantize_dist_table_into(&values, &mut quantized), expected); + assert_eq!(quantized, vec![0; 100]); + let mut quantized = vec![1u16; 5]; + assert_eq!( + quantize_dist_table_u16_into(&values, &mut quantized), + expected + ); + assert_eq!(quantized, vec![0; 100]); + } + + /// A finite sub-resolution range zeroes the LUT but still dequantizes + /// with the finite affine map (`Affine`), whereas a range whose + /// `num_tables`-scaled reconstruction overflows must signal `Exact` so the + /// caller computes exact distances instead of `0 * inf = NaN`. + #[test] + fn test_degenerate_range_classification() { + // factor = 255 / 1e-38 overflows to +inf, but the reconstruction + // (num_tables * {0, 1e-38}) stays finite -> Affine, zeroed LUT. + let mut tiny_range = vec![0.0f32; 32]; + tiny_range[1] = 1e-38; + // num_tables * (2e38 - (-2e38)) overflows f32 -> Exact. + let mut huge_range = vec![0.0f32; 32]; + huge_range[0] = -2e38; + huge_range[1] = 2e38; + // factor = 65535 / 1e-35 overflows only in the u16 variant; the u8 + // variant still quantizes normally. 
+ let mut u16_only = vec![0.0f32; 32]; + u16_only[1] = 1e-35; + + for values in [&tiny_range, &huge_range, &u16_only] { + check_against_reference(values); + } + let mut quantized_u8 = Vec::new(); + assert_eq!( + quantize_dist_table_into(&tiny_range, &mut quantized_u8), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-38 + } + ); + assert_eq!(quantized_u8, vec![0; 32]); + assert_eq!( + quantize_dist_table_into(&huge_range, &mut quantized_u8), + DistTableDequant::Exact + ); + assert_eq!(quantized_u8, vec![0; 32]); + let mut quantized_u16 = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(&u16_only, &mut quantized_u16), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-35 + } + ); + assert_eq!(quantized_u16, vec![0; 32]); + assert_eq!( + quantize_dist_table_into(&u16_only, &mut quantized_u8), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-35 + } + ); + assert_eq!(quantized_u8[1], u8::MAX); + } + + /// `-0.0 == 0.0` must keep taking the zero-range path (zeroed LUT, + /// `Affine`) even though SIMD min/max may pick either sign for the + /// extremes. + #[test] + fn test_signed_zero_mix_zeroes_table() { + let mut values = vec![0.0f32; 64]; + values.iter_mut().step_by(2).for_each(|v| *v = -0.0); + let mut quantized = Vec::new(); + match quantize_dist_table_into(&values, &mut quantized) { + DistTableDequant::Affine { qmin, qmax } => assert_eq!(qmin, qmax), + other => panic!("expected Affine, got {other:?}"), + } + assert_eq!(quantized, vec![0; 64]); + } + + /// Every quantizer — scalar, AVX2, AVX-512, including the SIMD kernels' + /// scalar tails — must round with fixed nearest-even, independent of the + /// dynamic MXCSR rounding mode. Run each with MXCSR forced to + /// round-toward-zero and require it still matches the nearest-even + /// reference (computed under the default mode). 
`factor == 0.5` puts odd + /// integers on exact .5 ties, where truncation (1.5 -> 1) and nearest-even + /// (1.5 -> 2) disagree, so a path that honored MXCSR would fail. The + /// length (511) is deliberately not a multiple of the SIMD step so the + /// kernels' scalar tails are exercised too. + #[cfg(target_arch = "x86_64")] + #[test] + #[allow(deprecated)] // _mm_getcsr/_mm_setcsr: no stable non-asm replacement. + fn test_quantize_rounding_ignores_mxcsr() { + use std::arch::x86_64::{_MM_ROUND_MASK, _MM_ROUND_TOWARD_ZERO, _mm_getcsr, _mm_setcsr}; + + let values = (0..=510).map(|v| v as f32).collect::>(); + // Computed under the default (nearest-even) rounding mode. + let (_, expected_u8) = reference_u8(&values); + let (_, expected_u16) = reference_u16(&values); + let factor_u8 = u8::MAX as f32 / 510.0; + let factor_u16 = u16::MAX as f32 / 510.0; + + for (name, _, quantize_u8_fn, quantize_u16_fn) in available_kernels() { + let mut out_u8 = Vec::with_capacity(values.len()); + let mut out_u16 = Vec::with_capacity(values.len()); + // SAFETY: SSE is baseline on x86_64. MXCSR is restored before any + // assertion so a failure cannot leak the truncating mode. + let saved = unsafe { _mm_getcsr() }; + unsafe { + _mm_setcsr((saved & !_MM_ROUND_MASK) | _MM_ROUND_TOWARD_ZERO); + quantize_u8_fn( + &values, + 0.0, + factor_u8, + &mut out_u8.spare_capacity_mut()[..values.len()], + ); + quantize_u16_fn( + &values, + 0.0, + factor_u16, + &mut out_u16.spare_capacity_mut()[..values.len()], + ); + _mm_setcsr(saved); + out_u8.set_len(values.len()); + out_u16.set_len(values.len()); + } + assert_eq!(out_u8, expected_u8, "kernel={name} under truncating MXCSR"); + assert_eq!( + out_u16, expected_u16, + "kernel={name} under truncating MXCSR" + ); + } + } + + /// The scratch buffer must be fully overwritten across reuses with + /// different lengths. 
+ #[test] + fn test_scratch_buffer_reuse() { + let mut rng = SmallRng::seed_from_u64(11); + let mut scratch_u8 = vec![7u8; 500]; + let mut scratch_u16 = vec![7u16; 500]; + for len in [48, 512, 16] { + let values = (0..len) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + quantize_dist_table_into(&values, &mut scratch_u8); + assert_eq!(scratch_u8, reference_u8(&values).1); + quantize_dist_table_u16_into(&values, &mut scratch_u16); + assert_eq!(scratch_u16, reference_u16(&values).1); + } + } +} diff --git a/rust/lance-index/src/vector/bq/ex_dot.rs b/rust/lance-index/src/vector/bq/ex_dot.rs new file mode 100644 index 00000000000..1aeb83ba40c --- /dev/null +++ b/rust/lance-index/src/vector/bq/ex_dot.rs @@ -0,0 +1,1078 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Inner-product kernels between an `f32` query and bit-packed RaBitQ ex-codes. +//! +//! Multi-bit RaBitQ reranking reduces to `sum_d query[d] * ex_code[d]`, where +//! `ex_code[d]` is an unsigned `ex_bits`-wide integer. Materializing a +//! `dim * 2^ex_bits` lookup table and gathering one entry per dimension is +//! cache-hostile (the table is 1MiB for `ex_bits=8`, `dim=1024`); these kernels +//! instead unpack the codes with shifts and masks and FMA them against the +//! query directly, following the kernel design of the RaBitQ reference library +//! (, Apache-2.0). +//! +//! Codes are stored in the *blocked* layout: dims are grouped into 64-dim +//! blocks (the last block zero-padded) and bit-interleaved within each block +//! so that the SIMD unpack emits codes in natural dim order: +//! +//! ```text +//! per 64-dim block (T = ex_bits - 1, the top bit; "run k" = dims 16k..16k+16): +//! 1 bit: [8B] bit i of the LE word = dim i +//! 2 bits: [16B] byte b = dims {b, b+16, b+32, b+48} at bit pairs 0/2/4/6 +//! 3 bits: [16B 2-bit plane as above][8B top-bit plane] +//! 
4 bits: [32B] byte 8j+b = dim 16j+b (low nibble) | dim 16j+8+b (high nibble) +//! 5 bits: [32B 4-bit plane: byte b = dims b|b+16; byte 16+b = dims b+32|b+48] +//! [8B top-bit plane] +//! 6 bits: [48B] byte 16k+b = dim 16k+b (6 low bits) | bits 2k..2k+2 of +//! dim 48+b (2 high bits) +//! 7 bits: [48B as 6 bits][8B top-bit plane] +//! 8 bits: [64B] identity +//! top-bit plane: top bit of dim 16k+b at bit 8*(b%8) + 2k + b/8 of a LE u64 +//! ``` +//! +//! Because unpack order is natural, the kernels read the rotated query +//! directly; it only needs zero-padding ([`pad_query_into`]) when the rotated +//! dim is not a multiple of 64. Legacy indexes store ex codes sequentially +//! (LSB-first bit stream) and are repacked once at load time +//! ([`repack_sequential_row`]); for `ex_bits` ∈ {1, 8} the two layouts agree +//! (modulo trailing padding, which the kernels tolerate) and rows are used as +//! stored. + +use std::sync::LazyLock; + +/// Dims are packed in blocks of this size; the query is zero-padded to a +/// whole number of blocks when the rotated dim is not already a multiple. +pub const EX_DOT_BLOCK_DIMS: usize = 64; + +/// `f32` length of the query consumed by the kernels. +pub fn padded_query_len(dim: usize) -> usize { + dim.next_multiple_of(EX_DOT_BLOCK_DIMS) +} + +/// Whether the legacy sequential layout of a row already matches the blocked +/// layout (modulo trailing zero padding, which the kernels tolerate), so +/// legacy rows can be consumed without repacking. +pub fn sequential_matches_blocked(ex_bits: u8) -> bool { + matches!(ex_bits, 1 | 8) +} + +/// Bytes per row of the blocked ex-code layout. +pub fn blocked_ex_code_bytes(dim: usize, ex_bits: u8) -> usize { + debug_assert!((1..=8).contains(&ex_bits)); + padded_query_len(dim) * ex_bits as usize / 8 +} + +/// Dimensions per unpacking group for the given code width. 
+fn group_dims(ex_bits: u8) -> usize { + match ex_bits { + 1 | 4 | 8 => 16, + _ => EX_DOT_BLOCK_DIMS, + } +} + +fn group_bytes(ex_bits: u8) -> usize { + group_dims(ex_bits) * ex_bits as usize / 8 +} + +/// Extract the `ex_bits`-wide code of `dim_idx` from a sequentially bit-packed +/// row (LSB-first, codes may straddle byte boundaries). +#[inline] +pub fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 { + debug_assert!(ex_bits > 0); + let bit_offset = dim_idx * ex_bits as usize; + let byte_idx = bit_offset / u8::BITS as usize; + let bit_shift = bit_offset % u8::BITS as usize; + let bits = row_codes[byte_idx] as u16 + | row_codes + .get(byte_idx + 1) + .map(|byte| (*byte as u16) << u8::BITS) + .unwrap_or_default(); + let mask = (1u16 << ex_bits) - 1; + ((bits >> bit_shift) & mask) as u8 +} + +/// Zero-pad the rotated query to a whole number of 64-dim blocks. Only needed +/// when `dim` is not a multiple of [`EX_DOT_BLOCK_DIMS`]; aligned queries are +/// passed to the kernels as-is. +pub fn pad_query_into(rotated_query: &[f32], out: &mut [f32]) { + debug_assert_eq!(out.len(), padded_query_len(rotated_query.len())); + out[..rotated_query.len()].copy_from_slice(rotated_query); + out[rotated_query.len()..].fill(0.0); +} + +/// Pack the top bit of each of 64 codes into a `u64` so kernels can position +/// it with two shifts per 16-code run: the top bit of dim `16k + b` is stored +/// at bit `8 * (b % 8) + 2k + b / 8`. +fn pack_top_plane(block_values: &[u8; 64], top_bit: u8) -> u64 { + let mut plane = 0u64; + for k in 0..4 { + for b in 0..16 { + let bit = (block_values[16 * k + b] >> top_bit) & 1; + plane |= (bit as u64) << (8 * (b % 8) + 2 * k + b / 8); + } + } + plane +} + +/// Shift `plane` so that its bit `8j + from_bit` lands at bit `8j + to_bit`. 
+#[inline(always)] +fn shift_plane(plane: u64, from_bit: usize, to_bit: usize) -> u64 { + if from_bit >= to_bit { + plane >> (from_bit - to_bit) + } else { + plane << (to_bit - from_bit) + } +} + +/// Pack one block of 64 code values (natural dim order) into the blocked +/// layout described in the module docs. +fn pack_block(ex_bits: u8, block_values: &[u8; 64], out: &mut [u8]) { + let v = block_values; + match ex_bits { + 1 => { + for (b, byte) in out[..8].iter_mut().enumerate() { + *byte = (0..8).fold(0, |acc, t| acc | ((v[8 * b + t] & 1) << t)); + } + } + 2 | 3 => { + for b in 0..16 { + out[b] = (v[b] & 0b11) + | ((v[16 + b] & 0b11) << 2) + | ((v[32 + b] & 0b11) << 4) + | ((v[48 + b] & 0b11) << 6); + } + if ex_bits == 3 { + out[16..24].copy_from_slice(&pack_top_plane(v, 2).to_le_bytes()); + } + } + 4 => { + for unit in 0..4 { + for b in 0..8 { + out[8 * unit + b] = + (v[16 * unit + b] & 0x0f) | ((v[16 * unit + 8 + b] & 0x0f) << 4); + } + } + } + 5 => { + for b in 0..16 { + out[b] = (v[b] & 0x0f) | ((v[16 + b] & 0x0f) << 4); + out[16 + b] = (v[32 + b] & 0x0f) | ((v[48 + b] & 0x0f) << 4); + } + out[32..40].copy_from_slice(&pack_top_plane(v, 4).to_le_bytes()); + } + 6 | 7 => { + // Runs 0..3 keep their 6 low bits in place; the fourth run's dims + // are split into three 2-bit pieces stored in the runs' top bits. + for k in 0..3 { + for b in 0..16 { + out[16 * k + b] = + (v[16 * k + b] & 0x3f) | (((v[48 + b] >> (2 * k)) & 0b11) << 6); + } + } + if ex_bits == 7 { + out[48..56].copy_from_slice(&pack_top_plane(v, 6).to_le_bytes()); + } + } + 8 => out[..64].copy_from_slice(v), + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +/// Pack one row of unpacked code values (one `u8` per dim) into the blocked +/// layout; the writer path. `out` must have [`blocked_ex_code_bytes`] bytes. 
+pub fn pack_blocked_row(values: &[u8], ex_bits: u8, out: &mut [u8]) { + debug_assert_eq!(out.len(), blocked_ex_code_bytes(values.len(), ex_bits)); + let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8; + let mut block_values = [0u8; 64]; + for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() { + let base = block * EX_DOT_BLOCK_DIMS; + let count = EX_DOT_BLOCK_DIMS.min(values.len() - base); + block_values[..count].copy_from_slice(&values[base..base + count]); + block_values[count..].fill(0); + pack_block(ex_bits, &block_values, out); + } +} + +/// Repack one legacy sequentially bit-packed row into the blocked layout. +/// `out` must have [`blocked_ex_code_bytes`] bytes. +pub fn repack_sequential_row(seq_row: &[u8], dim: usize, ex_bits: u8, out: &mut [u8]) { + debug_assert_eq!(out.len(), blocked_ex_code_bytes(dim, ex_bits)); + let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8; + let mut block_values = [0u8; 64]; + for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() { + block_values.fill(0); + let base = block * EX_DOT_BLOCK_DIMS; + let count = EX_DOT_BLOCK_DIMS.min(dim.saturating_sub(base)); + for (i, value) in block_values[..count].iter_mut().enumerate() { + *value = packed_ex_code_value(seq_row, base + i, ex_bits); + } + pack_block(ex_bits, &block_values, out); + } +} + +/// Unpack one code group into per-dim values (natural dim order). Reference +/// implementation for the SIMD unpackers; also the scalar fallback. 
+fn unpack_group(ex_bits: u8, group_codes: &[u8], out: &mut [u8; 64]) { + debug_assert_eq!(group_codes.len(), group_bytes(ex_bits)); + match ex_bits { + 1 => { + for (i, value) in out[..16].iter_mut().enumerate() { + *value = (group_codes[i / 8] >> (i % 8)) & 1; + } + } + 2 => { + for k in 0..4 { + for b in 0..16 { + out[16 * k + b] = (group_codes[b] >> (2 * k)) & 0b11; + } + } + } + 3 => { + let plane = u64::from_le_bytes(group_codes[16..24].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] = ((group_codes[b] >> (2 * k)) & 0b11) | ((top as u8) << 2); + } + } + } + 4 => { + for b in 0..8 { + out[b] = group_codes[b] & 0x0f; + out[8 + b] = group_codes[b] >> 4; + } + } + 5 => { + let plane = u64::from_le_bytes(group_codes[32..40].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let nibble = (group_codes[16 * (k / 2) + b] >> (4 * (k % 2))) & 0x0f; + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] = nibble | ((top as u8) << 4); + } + } + } + 6 | 7 => { + for k in 0..3 { + for b in 0..16 { + out[16 * k + b] = group_codes[16 * k + b] & 0x3f; + } + } + for b in 0..16 { + out[48 + b] = (group_codes[b] >> 6) + | ((group_codes[16 + b] >> 6) << 2) + | ((group_codes[32 + b] >> 6) << 4); + } + if ex_bits == 7 { + let plane = u64::from_le_bytes(group_codes[48..56].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] |= (top as u8) << 6; + } + } + } + } + 8 => out[..16].copy_from_slice(group_codes), + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +/// `sum_d query[d] * code[d]` for one row of blocked-layout codes. +/// +/// The query must cover a whole number of 64-dim blocks (the rotated query +/// as-is for aligned dims, otherwise zero-padded via [`pad_query_into`]); +/// `codes` is the blocked row slice. 
Rows shorter than the padded query +/// length are treated as zero-padded. +pub type ExDotFn = fn(&[f32], &[u8]) -> f32; + +/// Resolve the dot kernel for `ex_bits` once; the result can be cached by the +/// caller for per-candidate use. +pub fn ex_dot_kernel(ex_bits: u8) -> ExDotFn { + debug_assert!((1..=8).contains(&ex_bits)); + static KERNELS: LazyLock<[ExDotFn; 8]> = + LazyLock::new(|| std::array::from_fn(|i| select_ex_dot_kernel(i as u8 + 1))); + KERNELS[usize::from(ex_bits) - 1] +} + +fn select_ex_dot_kernel(ex_bits: u8) -> ExDotFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::avx512_kernel(ex_bits); + } + if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma") + { + return x86::avx2_kernel(ex_bits); + } + } + #[cfg(target_arch = "aarch64")] + { + // NEON is part of the aarch64 baseline. + return neon::kernel(ex_bits); + } + #[allow(unreachable_code)] + scalar_kernel(ex_bits) +} + +fn scalar_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => ex_dot_scalar::<1>, + 2 => ex_dot_scalar::<2>, + 3 => ex_dot_scalar::<3>, + 4 => ex_dot_scalar::<4>, + 5 => ex_dot_scalar::<5>, + 6 => ex_dot_scalar::<6>, + 7 => ex_dot_scalar::<7>, + 8 => ex_dot_scalar::<8>, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +fn ex_dot_scalar(ex_query: &[f32], codes: &[u8]) -> f32 { + let group_dims = group_dims(EX_BITS); + let bytes_per_group = group_bytes(EX_BITS); + debug_assert_eq!(ex_query.len() % EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * u8::BITS as usize <= ex_query.len() * EX_BITS as usize); + + let mut sum = 0.0f32; + let mut unpacked = [0u8; 64]; + let mut padded = [0u8; 56]; + for (group, query) in ex_query.chunks_exact(group_dims).enumerate() { + let start = group * bytes_per_group; + if start >= codes.len() { + // The remaining query lanes are zero padding. 
+ break; + } + let group_codes = if start + bytes_per_group <= codes.len() { + &codes[start..start + bytes_per_group] + } else { + let avail = codes.len() - start; + padded[..bytes_per_group].fill(0); + padded[..avail].copy_from_slice(&codes[start..]); + &padded[..bytes_per_group] + }; + unpack_group(EX_BITS, group_codes, &mut unpacked); + for (q, &code) in query.iter().zip(unpacked[..group_dims].iter()) { + sum += q * code as f32; + } + } + sum +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use super::ExDotFn; + use std::arch::x86_64::*; + + pub(super) fn avx2_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_avx2_dispatch, + 2 => dot_u2_avx2_dispatch, + 3 => dot_u3_avx2_dispatch, + 4 => dot_u4_avx2_dispatch, + 5 => dot_u5_avx2_dispatch, + 6 => dot_u6_avx2_dispatch, + 7 => dot_u7_avx2_dispatch, + 8 => dot_u8_avx2_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + pub(super) fn avx512_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_avx512_dispatch, + 2 => dot_u2_avx512_dispatch, + 3 => dot_u3_avx512_dispatch, + 4 => dot_u4_avx512_dispatch, + 5 => dot_u5_avx512_dispatch, + 6 => dot_u6_avx512_dispatch, + 7 => dot_u7_avx512_dispatch, + 8 => dot_u8_avx512_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + /// Broadcast a byte to the 8 bytes of a `u64`. + #[inline(always)] + fn splat_byte(byte: u8) -> u64 { + byte as u64 * 0x0101_0101_0101_0101 + } + + // Unpack helpers. They read exactly one group of code bytes and return + // runs of 16 codes matching the kernel-order query. Only SSE2 (baseline on + // x86_64) is required. + + /// 16 1-bit codes from 2 bytes: compare each replicated byte against + /// per-lane bit masks to turn set bits into 0/1 bytes. 
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn unpack_u1(ptr: *const u8) -> [__m128i; 1] {
    // Reads exactly two bytes: `ptr[0]` and `ptr[1]`.
    let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) };
    // Low 8 lanes all hold `b0`, high 8 lanes all hold `b1`.
    let bytes = _mm_set_epi64x(splat_byte(b1) as i64, splat_byte(b0) as i64);
    // Byte `j` of each 64-bit half is `1 << j`, so lane `j` tests bit `j`.
    let bit_select = _mm_set1_epi64x(0x8040_2010_0804_0201u64 as i64);
    let selected = _mm_cmpeq_epi8(_mm_and_si128(bytes, bit_select), bit_select);
    // The compare yields 0xFF / 0x00 per lane; reduce that to 1 / 0.
    [_mm_and_si128(selected, _mm_set1_epi8(1))]
}

/// 64 2-bit codes from 16 bytes: byte `b` holds dims `b`, `b + 16`,
/// `b + 32` and `b + 48` at bit pairs 0/2/4/6, so run `k` of the result is
/// bit pair `k` of every byte.
/// The 16-bit shifts drag bits across byte boundaries, which the per-byte
/// mask removes.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn unpack_u2(ptr: *const u8) -> [__m128i; 4] {
    // Unaligned 16-byte load; `ptr` must be readable for 16 bytes.
    let raw = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
    let mask = _mm_set1_epi8(0b11);
    [
        _mm_and_si128(raw, mask),
        _mm_and_si128(_mm_srli_epi16::<2>(raw), mask),
        _mm_and_si128(_mm_srli_epi16::<4>(raw), mask),
        _mm_and_si128(_mm_srli_epi16::<6>(raw), mask),
    ]
}

/// Position the top-bit plane (see [`super::pack_top_plane`]) of run `k`
/// at `top_bit` within each byte.
#[inline]
#[target_feature(enable = "sse2")]
fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> __m128i {
    // Within each plane byte, bit `2k` belongs to lanes 0..8 of run `k` and
    // bit `2k + 1` to lanes 8..16; move each to `top_bit`.
    let lo = super::shift_plane(plane, 2 * k, top_bit);
    let hi = super::shift_plane(plane, 2 * k + 1, top_bit);
    // Keep only the repositioned bit in every byte.
    _mm_and_si128(
        _mm_set_epi64x(hi as i64, lo as i64),
        _mm_set1_epi8(1 << top_bit),
    )
}

/// 64 3-bit codes from 24 bytes: the 2-bit plane decoded by [`unpack_u2`]
/// followed by an 8-byte top-bit plane supplying bit 2 of every code.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn unpack_u3(ptr: *const u8) -> [__m128i; 4] {
    let mut runs = unsafe { unpack_u2(ptr) };
    // The top-bit plane is the unaligned `u64` at byte offset 16.
    let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() };
    for (k, run) in runs.iter_mut().enumerate() {
        *run = _mm_or_si128(*run, top_plane_run(plane, k, 2));
    }
    runs
}

/// 16 4-bit codes from 8 bytes: the low nibbles are dims 0..8 of the run,
/// the high nibbles dims 8..16.
+ #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u4(ptr: *const u8) -> [__m128i; 1] { + let word = unsafe { (ptr as *const u64).read_unaligned() }; + let mask = 0x0f0f_0f0f_0f0f_0f0fu64; + [_mm_set_epi64x( + ((word >> 4) & mask) as i64, + (word & mask) as i64, + )] + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u5(ptr: *const u8) -> [__m128i; 4] { + let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; + let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; + let mask = _mm_set1_epi8(0x0f); + let mut runs = [ + _mm_and_si128(blk0, mask), + _mm_and_si128(_mm_srli_epi16::<4>(blk0), mask), + _mm_and_si128(blk1, mask), + _mm_and_si128(_mm_srli_epi16::<4>(blk1), mask), + ]; + for (k, run) in runs.iter_mut().enumerate() { + *run = _mm_or_si128(*run, top_plane_run(plane, k, 4)); + } + runs + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u6(ptr: *const u8) -> [__m128i; 4] { + let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; + let blk2 = unsafe { _mm_loadu_si128(ptr.add(32) as *const __m128i) }; + let mask6 = _mm_set1_epi8(0x3f); + let mask2 = _mm_set1_epi8(0b1100_0000u8 as i8); + let stolen = _mm_or_si128( + _mm_or_si128( + _mm_srli_epi16::<6>(_mm_and_si128(blk0, mask2)), + _mm_srli_epi16::<4>(_mm_and_si128(blk1, mask2)), + ), + _mm_srli_epi16::<2>(_mm_and_si128(blk2, mask2)), + ); + [ + _mm_and_si128(blk0, mask6), + _mm_and_si128(blk1, mask6), + _mm_and_si128(blk2, mask6), + stolen, + ] + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u7(ptr: *const u8) -> [__m128i; 4] { + let mut runs = unsafe { unpack_u6(ptr) }; + let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = _mm_or_si128(*run, top_plane_run(plane, k, 6)); + } + runs + 
} + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u8x16(ptr: *const u8) -> [__m128i; 1] { + [unsafe { _mm_loadu_si128(ptr as *const __m128i) }] + } + + /// FMA 16 code bytes against 16 query floats (AVX2: two 8-float halves). + #[inline] + #[target_feature(enable = "avx2", enable = "fma")] + unsafe fn fma16_avx2(codes: __m128i, query: *const f32, acc: &mut [__m256; 2]) { + let lo = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(codes)); + acc[0] = _mm256_fmadd_ps(lo, unsafe { _mm256_loadu_ps(query) }, acc[0]); + let hi = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128::<8>(codes))); + acc[1] = _mm256_fmadd_ps(hi, unsafe { _mm256_loadu_ps(query.add(8)) }, acc[1]); + } + + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn reduce_add_avx2(acc: [__m256; 2]) -> f32 { + let v = _mm256_add_ps(acc[0], acc[1]); + let halves = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_add_ps(halves, _mm_movehl_ps(halves, halves)); + let total = _mm_add_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(total) + } + + /// FMA 16 code bytes against 16 query floats (AVX-512: one 16-float lane). + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn fma16_avx512(codes: __m128i, query: *const f32, acc: &mut __m512) { + let values = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(codes)); + *acc = _mm512_fmadd_ps(values, unsafe { _mm512_loadu_ps(query) }, *acc); + } + + macro_rules! 
x86_dot_kernel { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "avx2", enable = "fma")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + // Two accumulators per run position break the FMA latency + // chain; they are summed once at the end. + let mut acc = [_mm256_setzero_ps(); 2]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx2( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + // Zero-pad the final partial code group on the stack. + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx2( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + unsafe { reduce_add_avx2(acc) } + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: only selected when AVX2 and FMA were detected. + unsafe { $name(ex_query, codes) } + } + }; + } + + macro_rules! 
x86_dot_kernel_avx512 { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "avx512f")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + // Alternating by group as well as run keeps two independent + // FMA chains even for the single-run widths. + let mut acc = [_mm512_setzero_ps(); 2]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx512( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc[(group + run) % 2], + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx512( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc[(full_groups + run) % 2], + ) + }; + } + } + _mm512_reduce_add_ps(_mm512_add_ps(acc[0], acc[1])) + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: only selected when AVX-512F was detected. 
+ unsafe { $name(ex_query, codes) } + } + }; + } + + x86_dot_kernel!(dot_u1_avx2, dot_u1_avx2_dispatch, unpack_u1, 1, 1); + x86_dot_kernel!(dot_u2_avx2, dot_u2_avx2_dispatch, unpack_u2, 2, 4); + x86_dot_kernel!(dot_u3_avx2, dot_u3_avx2_dispatch, unpack_u3, 3, 4); + x86_dot_kernel!(dot_u4_avx2, dot_u4_avx2_dispatch, unpack_u4, 4, 1); + x86_dot_kernel!(dot_u5_avx2, dot_u5_avx2_dispatch, unpack_u5, 5, 4); + x86_dot_kernel!(dot_u6_avx2, dot_u6_avx2_dispatch, unpack_u6, 6, 4); + x86_dot_kernel!(dot_u7_avx2, dot_u7_avx2_dispatch, unpack_u7, 7, 4); + x86_dot_kernel!(dot_u8_avx2, dot_u8_avx2_dispatch, unpack_u8x16, 8, 1); + + x86_dot_kernel_avx512!(dot_u1_avx512, dot_u1_avx512_dispatch, unpack_u1, 1, 1); + x86_dot_kernel_avx512!(dot_u2_avx512, dot_u2_avx512_dispatch, unpack_u2, 2, 4); + x86_dot_kernel_avx512!(dot_u3_avx512, dot_u3_avx512_dispatch, unpack_u3, 3, 4); + x86_dot_kernel_avx512!(dot_u4_avx512, dot_u4_avx512_dispatch, unpack_u4, 4, 1); + x86_dot_kernel_avx512!(dot_u5_avx512, dot_u5_avx512_dispatch, unpack_u5, 5, 4); + x86_dot_kernel_avx512!(dot_u6_avx512, dot_u6_avx512_dispatch, unpack_u6, 6, 4); + x86_dot_kernel_avx512!(dot_u7_avx512, dot_u7_avx512_dispatch, unpack_u7, 7, 4); + x86_dot_kernel_avx512!(dot_u8_avx512, dot_u8_avx512_dispatch, unpack_u8x16, 8, 1); +} + +#[cfg(target_arch = "aarch64")] +mod neon { + use super::ExDotFn; + use std::arch::aarch64::*; + + pub(super) fn kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_neon_dispatch, + 2 => dot_u2_neon_dispatch, + 3 => dot_u3_neon_dispatch, + 4 => dot_u4_neon_dispatch, + 5 => dot_u5_neon_dispatch, + 6 => dot_u6_neon_dispatch, + 7 => dot_u7_neon_dispatch, + 8 => dot_u8_neon_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u1(ptr: *const u8) -> [uint8x16_t; 1] { + let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) }; + let bytes = vcombine_u8(vdup_n_u8(b0), vdup_n_u8(b1)); + let bit_select = 
vreinterpretq_u8_u64(vdupq_n_u64(0x8040_2010_0804_0201)); + [vandq_u8(vtstq_u8(bytes, bit_select), vdupq_n_u8(1))] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u2(ptr: *const u8) -> [uint8x16_t; 4] { + let raw = unsafe { vld1q_u8(ptr) }; + let mask = vdupq_n_u8(0b11); + [ + vandq_u8(raw, mask), + vandq_u8(vshrq_n_u8::<2>(raw), mask), + vandq_u8(vshrq_n_u8::<4>(raw), mask), + vshrq_n_u8::<6>(raw), + ] + } + + #[inline] + #[target_feature(enable = "neon")] + fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> uint8x16_t { + let lo = super::shift_plane(plane, 2 * k, top_bit); + let hi = super::shift_plane(plane, 2 * k + 1, top_bit); + vandq_u8( + vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(lo), vcreate_u64(hi))), + vdupq_n_u8(1 << top_bit), + ) + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u3(ptr: *const u8) -> [uint8x16_t; 4] { + let mut runs = unsafe { unpack_u2(ptr) }; + let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 2)); + } + runs + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u4(ptr: *const u8) -> [uint8x16_t; 1] { + let word = unsafe { (ptr as *const u64).read_unaligned() }; + let mask = 0x0f0f_0f0f_0f0f_0f0fu64; + [vreinterpretq_u8_u64(vcombine_u64( + vcreate_u64(word & mask), + vcreate_u64((word >> 4) & mask), + ))] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u5(ptr: *const u8) -> [uint8x16_t; 4] { + let blk0 = unsafe { vld1q_u8(ptr) }; + let blk1 = unsafe { vld1q_u8(ptr.add(16)) }; + let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; + let mask = vdupq_n_u8(0x0f); + let mut runs = [ + vandq_u8(blk0, mask), + vshrq_n_u8::<4>(blk0), + vandq_u8(blk1, mask), + vshrq_n_u8::<4>(blk1), + ]; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 4)); + } + runs + } + + 
#[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u6(ptr: *const u8) -> [uint8x16_t; 4] { + let blk0 = unsafe { vld1q_u8(ptr) }; + let blk1 = unsafe { vld1q_u8(ptr.add(16)) }; + let blk2 = unsafe { vld1q_u8(ptr.add(32)) }; + let mask6 = vdupq_n_u8(0x3f); + let stolen = vorrq_u8( + vorrq_u8( + vshrq_n_u8::<6>(blk0), + vshlq_n_u8::<2>(vshrq_n_u8::<6>(blk1)), + ), + vshlq_n_u8::<4>(vshrq_n_u8::<6>(blk2)), + ); + [ + vandq_u8(blk0, mask6), + vandq_u8(blk1, mask6), + vandq_u8(blk2, mask6), + stolen, + ] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u7(ptr: *const u8) -> [uint8x16_t; 4] { + let mut runs = unsafe { unpack_u6(ptr) }; + let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 6)); + } + runs + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u8x16(ptr: *const u8) -> [uint8x16_t; 1] { + [unsafe { vld1q_u8(ptr) }] + } + + /// FMA 16 code bytes against 16 query floats over four 4-float lanes. + #[inline] + #[target_feature(enable = "neon")] + unsafe fn fma16_neon(codes: uint8x16_t, query: *const f32, acc: &mut [float32x4_t; 4]) { + let lo = vmovl_u8(vget_low_u8(codes)); + let hi = vmovl_u8(vget_high_u8(codes)); + let c0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let c1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))); + let c2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); + let c3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))); + unsafe { + acc[0] = vfmaq_f32(acc[0], c0, vld1q_f32(query)); + acc[1] = vfmaq_f32(acc[1], c1, vld1q_f32(query.add(4))); + acc[2] = vfmaq_f32(acc[2], c2, vld1q_f32(query.add(8))); + acc[3] = vfmaq_f32(acc[3], c3, vld1q_f32(query.add(12))); + } + } + + macro_rules! 
neon_dot_kernel { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "neon")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + let mut acc = [vdupq_n_f32(0.0); 4]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_neon( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + // Zero-pad the final partial code group on the stack. + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_neon( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + vaddvq_f32(vaddq_f32( + vaddq_f32(acc[0], acc[1]), + vaddq_f32(acc[2], acc[3]), + )) + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: NEON is part of the aarch64 baseline. 
+ unsafe { $name(ex_query, codes) } + } + }; + } + + neon_dot_kernel!(dot_u1_neon, dot_u1_neon_dispatch, unpack_u1, 1, 1); + neon_dot_kernel!(dot_u2_neon, dot_u2_neon_dispatch, unpack_u2, 2, 4); + neon_dot_kernel!(dot_u3_neon, dot_u3_neon_dispatch, unpack_u3, 3, 4); + neon_dot_kernel!(dot_u4_neon, dot_u4_neon_dispatch, unpack_u4, 4, 1); + neon_dot_kernel!(dot_u5_neon, dot_u5_neon_dispatch, unpack_u5, 5, 4); + neon_dot_kernel!(dot_u6_neon, dot_u6_neon_dispatch, unpack_u6, 6, 4); + neon_dot_kernel!(dot_u7_neon, dot_u7_neon_dispatch, unpack_u7, 7, 4); + neon_dot_kernel!(dot_u8_neon, dot_u8_neon_dispatch, unpack_u8x16, 8, 1); +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use rstest::rstest; + + /// Bit-pack code values sequentially (LSB-first), the on-disk ex-code layout. + fn pack_sequential(values: &[u8], ex_bits: u8) -> Vec { + let mut out = vec![0u8; (values.len() * ex_bits as usize).div_ceil(8)]; + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + out[bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + out[bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + out + } + + fn kernel_codes(values: &[u8], dim: usize, ex_bits: u8) -> Vec { + debug_assert_eq!(values.len(), dim); + let mut out = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)]; + pack_blocked_row(values, ex_bits, &mut out); + out + } + + fn available_kernels(ex_bits: u8) -> Vec<(&'static str, ExDotFn)> { + // `mut` is only exercised on x86_64 where extra kernels may be pushed. 
+ #[allow(unused_mut)] + let mut kernels = vec![ + ("scalar", scalar_kernel(ex_bits)), + ("dispatched", ex_dot_kernel(ex_bits)), + ]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") + && std::arch::is_x86_feature_detected!("fma") + { + kernels.push(("avx2", x86::avx2_kernel(ex_bits))); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(("avx512", x86::avx512_kernel(ex_bits))); + } + } + kernels + } + + #[rstest] + fn test_ex_dot_matches_reference( + #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8, + #[values(7, 16, 60, 64, 100, 128, 1024, 1536, 2048)] dim: usize, + ) { + let mut rng = SmallRng::seed_from_u64(42 + ex_bits as u64 * 1000 + dim as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let query = (0..dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + + let expected = query + .iter() + .zip(values.iter()) + .map(|(q, &c)| *q as f64 * c as f64) + .sum::(); + + let codes = kernel_codes(&values, dim, ex_bits); + let mut ex_query = vec![0.0; padded_query_len(dim)]; + pad_query_into(&query, &mut ex_query); + + let tolerance = 1e-3 * expected.abs().max(1.0); + for (name, kernel) in available_kernels(ex_bits) { + let actual = kernel(&ex_query, &codes) as f64; + assert!( + (actual - expected).abs() <= tolerance, + "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}" + ); + } + } + + #[rstest] + fn test_unpack_group_roundtrip(#[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8) { + let mut rng = SmallRng::seed_from_u64(7 + ex_bits as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..EX_DOT_BLOCK_DIMS) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let codes = kernel_codes(&values, EX_DOT_BLOCK_DIMS, ex_bits); + + // Unpacking each kernel group must reproduce the values in natural + // dim order. 
+ let dims = group_dims(ex_bits); + let bytes = group_bytes(ex_bits); + let mut unpacked = [0u8; 64]; + for group in 0..EX_DOT_BLOCK_DIMS / dims { + unpack_group( + ex_bits, + &codes[group * bytes..(group + 1) * bytes], + &mut unpacked, + ); + assert_eq!( + &unpacked[..dims], + &values[group * dims..(group + 1) * dims], + "ex_bits={ex_bits} group={group}" + ); + } + } + + /// The legacy sequential rows must repack into exactly what the writer + /// produces from the unpacked values. + #[rstest] + fn test_repack_sequential_matches_blocked( + #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8, + #[values(7, 64, 100, 1536)] dim: usize, + ) { + let mut rng = SmallRng::seed_from_u64(11 + ex_bits as u64 * 100 + dim as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let seq = pack_sequential(&values, ex_bits); + + let mut repacked = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)]; + repack_sequential_row(&seq, dim, ex_bits, &mut repacked); + assert_eq!(repacked, kernel_codes(&values, dim, ex_bits)); + + // For the widths where the sequential layout is already blocked + // (modulo trailing padding), the raw row must be a prefix. + if sequential_matches_blocked(ex_bits) { + assert_eq!(&repacked[..seq.len()], &seq); + assert!(repacked[seq.len()..].iter().all(|&byte| byte == 0)); + } + } + + /// Dense dim sweep for the bit-plane widths: every tail shape within the + /// 64-dim kernel group, plus multi-group sizes. 
+ #[rstest] + fn test_ex_dot_plane_widths_dense_dims(#[values(3, 5)] ex_bits: u8) { + let mut rng = SmallRng::seed_from_u64(97 + ex_bits as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + for dim in (1..=160).chain([255, 256, 1000, 1536, 2048]) { + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let query = (0..dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let expected = query + .iter() + .zip(values.iter()) + .map(|(q, &c)| *q as f64 * c as f64) + .sum::(); + + let codes = kernel_codes(&values, dim, ex_bits); + let mut ex_query = vec![0.0; padded_query_len(dim)]; + pad_query_into(&query, &mut ex_query); + let tolerance = 1e-3 * expected.abs().max(1.0); + for (name, kernel) in available_kernels(ex_bits) { + let actual = kernel(&ex_query, &codes) as f64; + assert!( + (actual - expected).abs() <= tolerance, + "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}" + ); + } + } + } + + #[test] + fn test_pad_query_pads_with_zeros() { + let query = vec![1.0f32; 100]; + let mut padded = vec![f32::NAN; padded_query_len(query.len())]; + pad_query_into(&query, &mut padded); + assert_eq!(padded.len(), 128); + assert_eq!(&padded[..100], &query[..]); + assert!(padded[100..].iter().all(|&value| value == 0.0)); + } +} diff --git a/rust/lance-index/src/vector/bq/prune.rs b/rust/lance-index/src/vector/bq/prune.rs new file mode 100644 index 00000000000..e67ab6642b8 --- /dev/null +++ b/rust/lance-index/src/vector/bq/prune.rs @@ -0,0 +1,527 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! SIMD kernels for the RaBitQ top-k lower-bound pruning scan. +//! +//! Multi-bit IVF_RQ search gates the exact ex-code rerank with a per-row +//! distance lower bound: after the binary FastScan fills the per-row binary +//! inner products, every row of the partition is classified against the query +//! 
upper bound and the current top-k heap threshold, and only the survivors +//! (typically well under 1%) are reranked. The classification is the per-row +//! formula of `RabitDistCalculator::raw_query_lower_bound`: +//! +//! ```text +//! lower_bound = (binary_ip - 0.5 * sum_q) * scale_factor +//! + add_factor + query_factor +//! - error_factor * query_error +//! ``` +//! +//! These kernels evaluate the formula and both comparisons for +//! [`PRUNE_LANES`] rows at a time, returning bit masks instead of values so +//! the caller can skip whole groups (the overwhelmingly common case) and run +//! the existing scalar rerank only for the surviving lanes. +//! +//! Correctness contract: +//! +//! - The lower bound is computed with exactly the operation order of the +//! scalar helper — multiplies and adds, never FMA. A fused multiply-add +//! rounds differently, which could prune a row the scalar code would have +//! kept; with bit-identical lower bounds the masks reproduce the scalar +//! `>=` decisions exactly, keeping heap contents and prune-stats counters +//! unchanged. +//! - Comparisons use ordered-quiet GE predicates (`_CMP_GE_OQ`), matching +//! scalar `>=`: a NaN lower bound is never pruned and falls through to the +//! exact rerank. +//! - The heap threshold may be a stale snapshot (it only ever tightens); the +//! caller re-checks surviving lanes against live values, so a stale +//! threshold can only over-select survivors, never wrongly prune. + +use std::sync::LazyLock; + +/// Rows classified per kernel invocation. +pub const PRUNE_LANES: usize = 16; + +/// Per-query constants of the lower-bound formula, mirroring +/// `RabitDistCalculator::raw_query_lower_bound` term by term. +#[derive(Debug, Clone, Copy)] +pub struct LowerBoundTerms { + /// `0.5 * sum_q`, subtracted from the binary inner product. + pub half_sum_q: f32, + pub query_factor: f32, + pub query_error: f32, +} + +/// Classify [`PRUNE_LANES`] rows against the pruning bounds. 
+/// +/// Arguments are the per-row binary inner products, scale factors, add +/// factors, and error factors, followed by the formula constants, the query +/// upper bound, and the heap threshold (`None` while the heap is not full, +/// which disables the heap mask). +/// +/// Returns `(pruned_upper_bound, pruned_heap)` masks: bit `i` of +/// `pruned_upper_bound` is set when `lower_bound[i] >= upper_bound`, and bit +/// `i` of `pruned_heap` is set when the row is not already pruned by the +/// upper bound and `lower_bound[i] >= heap_threshold`. Surviving rows are the +/// zero bits of the OR of both masks. +pub type PruneMaskFn = fn( + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + LowerBoundTerms, + f32, + Option, +) -> (u16, u16); + +/// Resolve the prune-mask kernel for the running CPU once; the result can be +/// cached by the caller for per-partition use. +pub fn prune_mask_kernel() -> PruneMaskFn { + static KERNEL: LazyLock = LazyLock::new(select_prune_mask_kernel); + *KERNEL +} + +fn select_prune_mask_kernel() -> PruneMaskFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::prune_masks_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::prune_masks_avx2_dispatch; + } + } + // On aarch64 the plain 16-wide loop auto-vectorizes to NEON (part of the + // baseline), so no dedicated kernel is needed. + prune_masks_portable +} + +/// Portable implementation; also the reference for the SIMD kernels. 
+fn prune_masks_portable( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, +) -> (u16, u16) { + let mut lower_bounds = [0.0f32; PRUNE_LANES]; + for lane in 0..PRUNE_LANES { + lower_bounds[lane] = ((dists[lane] - terms.half_sum_q) * scale_factors[lane] + + add_factors[lane] + + terms.query_factor) + - error_factors[lane] * terms.query_error; + } + let mut pruned_upper_bound = 0u16; + for (lane, lower_bound) in lower_bounds.iter().enumerate() { + pruned_upper_bound |= u16::from(*lower_bound >= upper_bound) << lane; + } + let mut pruned_heap = 0u16; + if let Some(threshold) = heap_threshold { + for (lane, lower_bound) in lower_bounds.iter().enumerate() { + pruned_heap |= u16::from(*lower_bound >= threshold) << lane; + } + pruned_heap &= !pruned_upper_bound; + } + (pruned_upper_bound, pruned_heap) +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use super::{LowerBoundTerms, PRUNE_LANES}; + use std::arch::x86_64::*; + + /// Lower bounds for 8 lanes with the scalar operation order (no FMA). 
+ #[inline] + #[target_feature(enable = "avx")] + fn lower_bounds_avx( + dists: __m256, + scale_factors: __m256, + add_factors: __m256, + error_factors: __m256, + half_sum_q: __m256, + query_factor: __m256, + query_error: __m256, + ) -> __m256 { + let binary_distance = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(dists, half_sum_q), scale_factors), + add_factors, + ), + query_factor, + ); + _mm256_sub_ps(binary_distance, _mm256_mul_ps(error_factors, query_error)) + } + + #[inline] + #[target_feature(enable = "avx")] + fn ge_mask_avx(lower_bounds_lo: __m256, lower_bounds_hi: __m256, bound: f32) -> u16 { + let bound = _mm256_set1_ps(bound); + let lo = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_lo, bound)); + let hi = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_hi, bound)); + (lo | (hi << 8)) as u16 + } + + #[target_feature(enable = "avx2")] + unsafe fn prune_masks_avx2( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + let half_sum_q = _mm256_set1_ps(terms.half_sum_q); + let query_factor = _mm256_set1_ps(terms.query_factor); + let query_error = _mm256_set1_ps(terms.query_error); + // SAFETY: the array references guarantee 16 readable floats each. 
+ let lower_bounds_lo = unsafe { + lower_bounds_avx( + _mm256_loadu_ps(dists.as_ptr()), + _mm256_loadu_ps(scale_factors.as_ptr()), + _mm256_loadu_ps(add_factors.as_ptr()), + _mm256_loadu_ps(error_factors.as_ptr()), + half_sum_q, + query_factor, + query_error, + ) + }; + let lower_bounds_hi = unsafe { + lower_bounds_avx( + _mm256_loadu_ps(dists.as_ptr().add(8)), + _mm256_loadu_ps(scale_factors.as_ptr().add(8)), + _mm256_loadu_ps(add_factors.as_ptr().add(8)), + _mm256_loadu_ps(error_factors.as_ptr().add(8)), + half_sum_q, + query_factor, + query_error, + ) + }; + let pruned_upper_bound = ge_mask_avx(lower_bounds_lo, lower_bounds_hi, upper_bound); + let pruned_heap = match heap_threshold { + Some(threshold) => { + ge_mask_avx(lower_bounds_lo, lower_bounds_hi, threshold) & !pruned_upper_bound + } + None => 0, + }; + (pruned_upper_bound, pruned_heap) + } + + pub(super) fn prune_masks_avx2_dispatch( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: only selected when AVX2 was detected. + unsafe { + prune_masks_avx2( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ) + } + } + + #[target_feature(enable = "avx512f")] + unsafe fn prune_masks_avx512( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: the array references guarantee 16 readable floats each. 
+ let (dists, scale_factors, add_factors, error_factors) = unsafe { + ( + _mm512_loadu_ps(dists.as_ptr()), + _mm512_loadu_ps(scale_factors.as_ptr()), + _mm512_loadu_ps(add_factors.as_ptr()), + _mm512_loadu_ps(error_factors.as_ptr()), + ) + }; + let binary_distance = _mm512_add_ps( + _mm512_add_ps( + _mm512_mul_ps( + _mm512_sub_ps(dists, _mm512_set1_ps(terms.half_sum_q)), + scale_factors, + ), + add_factors, + ), + _mm512_set1_ps(terms.query_factor), + ); + let lower_bounds = _mm512_sub_ps( + binary_distance, + _mm512_mul_ps(error_factors, _mm512_set1_ps(terms.query_error)), + ); + let pruned_upper_bound = + _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(upper_bound)); + let pruned_heap = match heap_threshold { + Some(threshold) => { + _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(threshold)) + & !pruned_upper_bound + } + None => 0, + }; + (pruned_upper_bound, pruned_heap) + } + + pub(super) fn prune_masks_avx512_dispatch( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { + prune_masks_avx512( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + fn available_kernels() -> Vec<(&'static str, PruneMaskFn)> { + // `mut` is only exercised on x86_64 where extra kernels may be pushed. 
+ #[allow(unused_mut)] + let mut kernels = vec![ + ("portable", prune_masks_portable as PruneMaskFn), + ("dispatched", prune_mask_kernel()), + ]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") { + kernels.push(("avx2", x86::prune_masks_avx2_dispatch)); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(("avx512", x86::prune_masks_avx512_dispatch)); + } + } + kernels + } + + /// Per-lane reference mirroring `raw_query_lower_bound` and the scalar + /// pruning checks of the top-k scan. + fn reference_masks( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + let mut pruned_upper_bound = 0u16; + let mut pruned_heap = 0u16; + for lane in 0..PRUNE_LANES { + let lower_bound = (dists[lane] - terms.half_sum_q) * scale_factors[lane] + + add_factors[lane] + + terms.query_factor + - error_factors[lane] * terms.query_error; + if lower_bound >= upper_bound { + pruned_upper_bound |= 1 << lane; + } else if heap_threshold.is_some_and(|threshold| lower_bound >= threshold) { + pruned_heap |= 1 << lane; + } + } + (pruned_upper_bound, pruned_heap) + } + + #[allow(clippy::too_many_arguments)] + fn assert_kernels_match_reference( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + case: &str, + ) { + let expected = reference_masks( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ); + for (name, kernel) in available_kernels() { + let actual = kernel( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ); + assert_eq!( + actual, expected, + "kernel={name} case={case}: masks 
{actual:04x?} != {expected:04x?}" + ); + } + } + + #[test] + fn test_prune_masks_match_reference_on_random_inputs() { + let mut rng = SmallRng::seed_from_u64(42); + for round in 0..200 { + let mut dists = [0.0f32; PRUNE_LANES]; + let mut scale_factors = [0.0f32; PRUNE_LANES]; + let mut add_factors = [0.0f32; PRUNE_LANES]; + let mut error_factors = [0.0f32; PRUNE_LANES]; + for lane in 0..PRUNE_LANES { + dists[lane] = rng.random_range(-100.0f32..100.0); + scale_factors[lane] = rng.random_range(-2.0f32..2.0); + add_factors[lane] = rng.random_range(-10.0f32..10.0); + error_factors[lane] = rng.random_range(0.0f32..5.0); + } + let terms = LowerBoundTerms { + half_sum_q: rng.random_range(-50.0f32..50.0), + query_factor: rng.random_range(-10.0f32..10.0), + query_error: rng.random_range(0.0f32..2.0), + }; + let upper_bound = rng.random_range(-50.0f32..50.0); + let heap_threshold = if round % 3 == 0 { + None + } else { + Some(rng.random_range(-50.0f32..50.0)) + }; + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + upper_bound, + heap_threshold, + &format!("random round {round}"), + ); + } + } + + #[test] + fn test_prune_masks_exact_boundaries() { + // With scale=1, err=0, half_sum_q=0, query_factor=0 the lower bound + // is the input itself, so bounds can be placed exactly on lanes. + let dists: [f32; PRUNE_LANES] = std::array::from_fn(|lane| lane as f32); + let scale_factors = [1.0f32; PRUNE_LANES]; + let add_factors = [0.0f32; PRUNE_LANES]; + let error_factors = [0.0f32; PRUNE_LANES]; + let terms = LowerBoundTerms { + half_sum_q: 0.0, + query_factor: 0.0, + query_error: 1.0, + }; + // Equality must prune (scalar uses `>=`): lanes 3.. hit the upper + // bound, lanes 1..3 hit only the heap threshold. 
+ let (pruned_upper_bound, pruned_heap) = prune_masks_portable( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + Some(1.0), + ); + assert_eq!(pruned_upper_bound, 0xfff8); + assert_eq!(pruned_heap, 0x0006); + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + Some(1.0), + "exact boundaries", + ); + // No heap threshold: only the upper-bound mask is set. + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + None, + "no heap threshold", + ); + } + + #[test] + fn test_prune_masks_nan_and_infinity_semantics() { + let mut dists = [0.0f32; PRUNE_LANES]; + dists[0] = f32::NAN; + dists[1] = f32::INFINITY; + dists[2] = f32::NEG_INFINITY; + dists[3] = 1.0; + let mut scale_factors = [1.0f32; PRUNE_LANES]; + scale_factors[4] = f32::NAN; + let add_factors = [0.0f32; PRUNE_LANES]; + let mut error_factors = [0.0f32; PRUNE_LANES]; + error_factors[5] = f32::INFINITY; + let terms = LowerBoundTerms { + half_sum_q: 0.0, + query_factor: 0.0, + query_error: 1.0, + }; + for (upper_bound, heap_threshold) in [ + (0.5, Some(0.0)), + (f32::INFINITY, Some(f32::NEG_INFINITY)), + (f32::NAN, Some(f32::NAN)), + (0.5, None), + ] { + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + upper_bound, + heap_threshold, + &format!("special values ub={upper_bound} thr={heap_threshold:?}"), + ); + } + // NaN lower bounds (lane 0 via a NaN binary inner product, lane 4 via + // a NaN scale factor) must never be pruned by either mask. 
+ let (pruned_upper_bound, pruned_heap) = prune_masks_portable( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 0.5, + Some(0.0), + ); + assert_eq!(pruned_upper_bound & 0b1_0001, 0); + assert_eq!(pruned_heap & 0b1_0001, 0); + } +} diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index bd70f176c5d..2f4fe69792a 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -17,7 +17,7 @@ use arrow_array::{ use arrow_schema::{DataType, Field, SchemaRef}; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use itertools::Itertools; +use itertools::{Itertools, izip}; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray, RecordBatchExt}; use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; @@ -41,6 +41,14 @@ use serde::{Deserialize, Serialize}; use crate::frag_reuse::FragReuseIndex; use crate::pb; use crate::vector::ApproxMode; +use crate::vector::bq::dist_table_quant::{ + DistTableDequant, quantize_dist_table_into, quantize_dist_table_u16_into, +}; +use crate::vector::bq::ex_dot::{ + EX_DOT_BLOCK_DIMS, ExDotFn, blocked_ex_code_bytes, ex_dot_kernel, pad_query_into, + padded_query_len, repack_sequential_row, sequential_matches_blocked, +}; +use crate::vector::bq::prune::{LowerBoundTerms, PRUNE_LANES, prune_mask_kernel}; use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place}; use crate::vector::bq::transform::{ ADD_FACTORS_COLUMN, ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, @@ -59,7 +67,14 @@ use crate::vector::storage::{ pub const RABIT_METADATA_KEY: &str = "lance:rabit"; pub const RABIT_CODE_COLUMN: &str = "_rabit_codes"; +/// Legacy ex-code column: sequential LSB-first bit stream per row. Read-only; +/// rows are repacked into the blocked layout at load time. 
pub const RABIT_EX_CODE_COLUMN: &str = "__ex_codes"; +/// Ex-code column in the blocked layout consumed by the ex-dot kernels (see +/// `ex_dot` module docs). Indexes written with this column cannot be read by +/// older versions, which fail with a missing-column error instead of +/// misinterpreting the bytes. +pub const RABIT_BLOCKED_EX_CODE_COLUMN: &str = "__blocked_ex_codes"; pub const SEGMENT_LENGTH: usize = 4; pub const SEGMENT_NUM_CODES: usize = 1 << SEGMENT_LENGTH; const RABIT_PRUNE_STATS_ENV: &str = "LANCE_RQ_PRUNE_STATS"; @@ -122,16 +137,28 @@ fn emit_rabit_prune_stats(message: &str) { ); } -fn record_rabit_prune_stats( +/// Per-scan tallies of the raw-query lower-bound gating, reported through +/// `record_rabit_prune_stats`. +#[derive(Default)] +struct RabitPruneCounters { candidates: usize, pruned_upper_bound: usize, pruned_heap: usize, exact: usize, exact_rejected: usize, -) { +} + +fn record_rabit_prune_stats(counters: &RabitPruneCounters) { if !rabit_prune_stats_enabled() { return; } + let RabitPruneCounters { + candidates, + pruned_upper_bound, + pruned_heap, + exact, + exact_rejected, + } = *counters; let stats = RABIT_PRUNE_STATS.get_or_init(RabitPruneStats::default); let calls = stats.calls.fetch_add(1, Ordering::Relaxed) + 1; @@ -210,10 +237,10 @@ pub fn rabit_ex_code_field(rotated_dim: usize, num_bits: u8) -> Result(&rotated_query, &mut dist_table); - let mut ex_dist_table = vec![0.0; ex_dist_table_len]; - build_ex_dist_table_direct_into(&rotated_query, ex_bits, &mut ex_dist_table); + // The kernels consume the rotated query directly; a zero-padded copy + // is only needed when the rotated dim is not block-aligned. 
+ let mut ex_query = Vec::new(); + if ex_bits > 0 && !code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { + ex_query.resize(padded_query_len(code_dim), 0.0); + pad_query_into(&rotated_query, &mut ex_query); + } let sum_q = rotated_query.iter().copied().sum(); Ok(RabitRawQueryContext { @@ -370,7 +397,7 @@ impl RabitQuantizationMetadata { ex_bits, rotated_query, dist_table, - ex_dist_table, + ex_query, sum_q, }) } @@ -462,6 +489,10 @@ pub struct RabitQuantizationStorage { add_factors: Float32Array, scale_factors: Float32Array, error_factors: Option, + // ex codes in the blocked kernel layout; always aliases the batch column + // (legacy sequential batches are normalized at load, replacing the + // sequential column with the repacked one, so rewrites emit the blocked + // format). ex_codes: Option, packed_ex_codes: Option, ex_add_factors: Option, @@ -560,12 +591,17 @@ impl RabitQuantizationStorage { let RabitDistCalculatorParts { dim, dist_table, - ex_dist_table, + ex_query, sum_q, query_factor, query_error, approx_mode, } = parts; + let ex_code_len = self + .ex_codes + .as_ref() + .map(|codes| codes.value_length() as usize) + .unwrap_or_default(); let ex_codes = self .ex_codes .as_ref() @@ -579,10 +615,11 @@ impl RabitQuantizationStorage { self.metadata.num_bits, self.metadata.query_estimator, dist_table, - ex_dist_table, + ex_query, sum_q, self.codes.values().as_primitive::().values(), ex_codes, + ex_code_len, self.add_factors.values(), self.scale_factors.values(), self.error_factors @@ -767,25 +804,56 @@ fn copy_subtract_f32(lhs: &[f32], rhs: &[f32], output: &mut [f32]) { struct RabitDistCalculatorParts<'a> { dim: usize, dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + ex_query: Cow<'a, [f32]>, sum_q: f32, query_factor: f32, query_error: f32, approx_mode: ApproxMode, } +/// Loop-invariant inputs of the raw-query multi-bit top-k scans: the row +/// count, the resolved ex-code state for exact reranking, and the query +/// bounds. 
+struct RawQueryTopkContext<'a> { + n: usize, + k: usize, + ex_bits: u8, + ex_codes: &'a [u8], + ex_add_factors: &'a [f32], + ex_scale_factors: &'a [f32], + query_lower_bound: f32, + query_upper_bound: f32, +} + +/// Pick the query slice the ex-dot kernels consume: the rotated query itself +/// when the dim is block-aligned, otherwise a zero-padded copy. +fn kernel_query<'a>(rotated_query: &'a [f32], padded: &'a [f32]) -> &'a [f32] { + if rotated_query.len().is_multiple_of(EX_DOT_BLOCK_DIMS) { + rotated_query + } else { + padded + } +} + pub struct RabitDistCalculator<'a> { dim: usize, num_bits: u8, query_estimator: RabitQueryEstimator, // n * d / 8 binary-code bytes codes: &'a [u8], + // per-row ex codes in the blocked kernel layout ex_codes: Option<&'a [u8]>, + // bytes per ex-code row; legacy rows for layout-compatible widths may be + // shorter than the blocked size, which the kernels treat as zero padding + ex_code_len: usize, // this is a flattened 2D array of size d/4 * 16, // we split the query codes into d/4 chunks, each chunk is with 4 elements, // then dist_table[i][j] is the distance between the i-th query code and the code j dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + // the rotated query, zero-padded to a 64-dim multiple when needed; also + // the source for the FastScan ex LUT on the legacy bypass path + ex_query: Cow<'a, [f32]>, + ex_dot: Option, add_factors: &'a [f32], scale_factors: &'a [f32], error_factors: Option<&'a [f32]>, @@ -807,10 +875,11 @@ impl<'a> RabitDistCalculator<'a> { num_bits: u8, query_estimator: RabitQueryEstimator, dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + ex_query: Cow<'a, [f32]>, sum_q: f32, codes: &'a [u8], ex_codes: Option<&'a [u8]>, + ex_code_len: usize, add_factors: &'a [f32], scale_factors: &'a [f32], error_factors: Option<&'a [f32]>, @@ -821,14 +890,17 @@ impl<'a> RabitDistCalculator<'a> { query_error: f32, approx_mode: ApproxMode, ) -> Self { + let ex_dot = (num_bits > 1).then(|| 
ex_dot_kernel(num_bits - 1)); Self { dim, num_bits, query_estimator, codes, ex_codes, + ex_code_len, dist_table, - ex_dist_table, + ex_query, + ex_dot, add_factors, scale_factors, error_factors, @@ -843,6 +915,34 @@ impl<'a> RabitDistCalculator<'a> { } } + /// `sum_d query[d] * ex_code[d]` for the candidate's packed ex codes. + #[inline] + fn ex_code_dot(&self, ex_codes: &[u8], id: usize) -> f32 { + let ex_dot = self + .ex_dot + .expect("raw-query multi-bit RQ requires an ex-dot kernel"); + ex_dot( + self.ex_query.as_ref(), + &ex_codes[id * self.ex_code_len..(id + 1) * self.ex_code_len], + ) + } + + /// Fill `dists[0..n]` with exact per-row binary distances computed + /// directly from the f32 dist table — the fallback when the quantized + /// reconstruction scale would be non-finite ([`DistTableDequant::Exact`]). + #[allow(clippy::uninit_vec)] + fn fill_exact_binary_distances(&self, n: usize, code_len: usize, dists: &mut Vec) { + dists.clear(); + dists.reserve(n); + // SAFETY: the loop initializes every element in [0, n). + unsafe { + dists.set_len(n); + } + dists.iter_mut().enumerate().for_each(|(id, dist)| { + *dist = compute_single_rq_distance(self.codes, id, n, code_len, &self.dist_table); + }); + } + #[allow(clippy::uninit_vec)] fn binary_distances_with_scratch( &self, @@ -864,7 +964,16 @@ impl<'a> RabitDistCalculator<'a> { ); } - let (qmin, qmax) = quantize_dist_table_into(&self.dist_table, quantized_dists_table); + let (qmin, qmax) = match quantize_dist_table_into(&self.dist_table, quantized_dists_table) { + DistTableDequant::Affine { qmin, qmax } => (qmin, qmax), + DistTableDequant::Exact => { + // The affine reconstruction would be non-finite; compute every + // binary distance exactly and report no SIMD rows so the + // ex-rerank caller takes the per-row path for all of them. 
+ self.fill_exact_binary_distances(n, code_len, dists); + return 0; + } + }; let remainder = n % BATCH_SIZE; let simd_len = n - remainder; quantized_dists.clear(); @@ -924,7 +1033,16 @@ impl<'a> RabitDistCalculator<'a> { hacc_dist_table: &mut Vec, quantized_dists: &mut Vec, ) -> usize { - let (qmin, qmax) = quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table); + let (qmin, qmax) = + match quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table) { + DistTableDequant::Affine { qmin, qmax } => (qmin, qmax), + DistTableDequant::Exact => { + // See binary_distances_with_scratch: non-finite affine + // scale falls back to exact per-row distances. + self.fill_exact_binary_distances(n, code_len, dists); + return 0; + } + }; simd::dist_table::transfer_4bit_dist_table_u16(quantized_dist_table, hacc_dist_table); let remainder = n % BATCH_SIZE; let simd_len = n - remainder; @@ -1030,8 +1148,6 @@ impl<'a> RabitDistCalculator<'a> { let ex_scale_factors = self .ex_scale_factors .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = - rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated"); let code_scale = (1u32 << ex_bits) as f32; let code_bias = -(code_scale - 0.5); @@ -1039,12 +1155,11 @@ impl<'a> RabitDistCalculator<'a> { self.packed_ex_codes .map(|packed_ex_codes| { let fastscan_len = simd_len; - let fastscan_code_len = ex_fastscan_code_len(self.dim, ex_bits) - .expect("RabitQ num_bits should be validated"); + let fastscan_code_len = self.ex_code_len; let (qmin, qmax, quantization_max) = quantize_ex_fastscan_dist_table_into( - self.dim, ex_bits, - &self.ex_dist_table, + self.ex_code_len, + self.ex_query.as_ref(), quantized_dists_table, ); quantized_dists.clear(); @@ -1088,14 +1203,7 @@ impl<'a> RabitDistCalculator<'a> { .enumerate() .skip(fastscan_len) .for_each(|(id, dist)| { - let ex_dist = compute_single_rq_ex_distance( - ex_codes, - id, - ex_code_len, - ex_bits, - self.dim, - 
&self.ex_dist_table, - ); + let ex_dist = self.ex_code_dot(ex_codes, id); let full_dot = code_scale * *dist + ex_dist + code_bias * self.sum_q; *dist = full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor; }); @@ -1121,44 +1229,37 @@ impl<'a> RabitDistCalculator<'a> { id: usize, binary_ip: f32, ex_bits: u8, - ex_code_len: usize, ex_codes: &[u8], ex_add_factors: &[f32], ex_scale_factors: &[f32], ) -> f32 { - let ex_dist = compute_single_rq_ex_distance( - ex_codes, - id, - ex_code_len, - ex_bits, - self.dim, - &self.ex_dist_table, - ); + let ex_dist = self.ex_code_dot(ex_codes, id); let code_bias = -((1u32 << ex_bits) as f32 - 0.5); let full_dot = (1u32 << ex_bits) as f32 * binary_ip + ex_dist + code_bias * self.sum_q; full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor } + /// Compute the binary inner products into `dists` and resolve the inputs + /// shared by the raw-query multi-bit top-k scans. Returns `None` when the + /// partition has no rows. 
#[allow(clippy::too_many_arguments)] - fn accumulate_raw_query_multi_bit_topk_with_scratch( + fn raw_query_multi_bit_topk_context( &self, k: usize, lower_bound: Option, upper_bound: Option, - row_ids: impl Iterator, - res: &mut BinaryHeap>, dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, hacc_quantized_dists: &mut Vec, - ) { + ) -> Option> { let code_len = rabit_binary_code_bytes(self.dim); let n = self.codes.len() / code_len; if n == 0 { dists.clear(); quantized_dists.clear(); hacc_quantized_dists.clear(); - return; + return None; } self.binary_distances_with_scratch( @@ -1170,77 +1271,233 @@ impl<'a> RabitDistCalculator<'a> { hacc_quantized_dists, ); - let ex_bits = self.num_bits - 1; - let ex_codes = self - .ex_codes - .expect("raw-query multi-bit RQ requires ex codes"); - let ex_add_factors = self - .ex_add_factors - .expect("raw-query multi-bit RQ requires ex add factors"); - let ex_scale_factors = self - .ex_scale_factors - .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = - rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated"); - let query_lower_bound = lower_bound.unwrap_or(f32::MIN); - let query_upper_bound = upper_bound.unwrap_or(f32::MAX); + Some(RawQueryTopkContext { + n, + k, + ex_bits: self.num_bits - 1, + ex_codes: self + .ex_codes + .expect("raw-query multi-bit RQ requires ex codes"), + ex_add_factors: self + .ex_add_factors + .expect("raw-query multi-bit RQ requires ex add factors"), + ex_scale_factors: self + .ex_scale_factors + .expect("raw-query multi-bit RQ requires ex scale factors"), + query_lower_bound: lower_bound.unwrap_or(f32::MIN), + query_upper_bound: upper_bound.unwrap_or(f32::MAX), + }) + } + + /// Process one candidate row given its lower bound: the bound checks, + /// the exact rerank, and the heap update shared by the sparse scan and + /// the dense scan's surviving lanes and tail. 
+ #[inline] + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_row( + &self, + ctx: &RawQueryTopkContext<'_>, + id: usize, + row_id: u64, + binary_ip: f32, + raw_lower_bound: f32, + res: &mut BinaryHeap>, + max_dist: &mut Option, + counters: &mut RabitPruneCounters, + ) { + if raw_lower_bound >= ctx.query_upper_bound { + counters.pruned_upper_bound += 1; + return; + } + if res.len() >= ctx.k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) { + counters.pruned_heap += 1; + return; + } + + counters.exact += 1; + let dist = self.raw_query_multi_bit_exact_distance( + id, + binary_ip, + ctx.ex_bits, + ctx.ex_codes, + ctx.ex_add_factors, + ctx.ex_scale_factors, + ); + if dist < ctx.query_lower_bound || dist >= ctx.query_upper_bound { + counters.exact_rejected += 1; + return; + } + let dist = OrderedFloat(dist); + if res.len() < ctx.k { + res.push(OrderedNode::new(row_id, dist)); + if res.len() == ctx.k { + *max_dist = res.peek().map(|node| node.dist); + } + } else if max_dist.is_some_and(|max_dist| max_dist > dist) { + res.pop(); + res.push(OrderedNode::new(row_id, dist)); + *max_dist = res.peek().map(|node| node.dist); + } + } + + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_topk_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_ids: impl Iterator, + res: &mut BinaryHeap>, + dists: &mut Vec, + quantized_dists: &mut Vec, + quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, + ) { + let Some(ctx) = self.raw_query_multi_bit_topk_context( + k, + lower_bound, + upper_bound, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ) else { + return; + }; let mut max_dist = res.peek().map(|node| node.dist); - let mut candidates = 0; - let mut pruned_upper_bound = 0; - let mut pruned_heap = 0; - let mut exact = 0; - let mut exact_rejected = 0; + let mut counters = RabitPruneCounters::default(); for (id, row_id) in row_ids { let 
Some(binary_ip) = dists.get(id).copied() else { continue; }; - candidates += 1; + counters.candidates += 1; let Some(raw_lower_bound) = self.raw_query_lower_bound(id, binary_ip) else { continue; }; - if raw_lower_bound >= query_upper_bound { - pruned_upper_bound += 1; - continue; - } - if res.len() >= k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) { - pruned_heap += 1; - continue; + self.accumulate_raw_query_multi_bit_row( + &ctx, + id, + row_id, + binary_ip, + raw_lower_bound, + res, + &mut max_dist, + &mut counters, + ); + } + record_rabit_prune_stats(&counters); + } + + /// Top-k scan over all rows `0..n` in order: classify [`PRUNE_LANES`] + /// rows at a time with the SIMD lower-bound kernel and run the scalar + /// rerank only for the surviving lanes. + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_topk_dense_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_id: impl Fn(u32) -> u64, + res: &mut BinaryHeap>, + dists: &mut Vec, + quantized_dists: &mut Vec, + quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, + ) { + let Some(ctx) = self.raw_query_multi_bit_topk_context( + k, + lower_bound, + upper_bound, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ) else { + return; + }; + let dists = dists.as_slice(); + debug_assert_eq!(dists.len(), ctx.n); + let scale_factors = &self.scale_factors[..ctx.n]; + let add_factors = &self.add_factors[..ctx.n]; + let error_factors = &self + .error_factors + .expect("raw-query lower-bound gating requires error factors")[..ctx.n]; + // Same expression as `raw_query_lower_bound` with `error_factors` + // already resolved; the masks below match it bit for bit. 
+ let lower_bound_of = |id: usize, binary_ip: f32| { + self.raw_query_binary_distance(id, binary_ip) - error_factors[id] * self.query_error + }; + let terms = LowerBoundTerms { + half_sum_q: 0.5 * self.sum_q, + query_factor: self.query_factor, + query_error: self.query_error, + }; + let prune_masks = prune_mask_kernel(); + let mut max_dist = res.peek().map(|node| node.dist); + let mut counters = RabitPruneCounters::default(); + + let (dist_groups, dist_tail) = dists.as_chunks::(); + let (scale_groups, _) = scale_factors.as_chunks::(); + let (add_groups, _) = add_factors.as_chunks::(); + let (error_groups, _) = error_factors.as_chunks::(); + for (group, (dist16, scale16, add16, error16)) in + izip!(dist_groups, scale_groups, add_groups, error_groups).enumerate() + { + counters.candidates += PRUNE_LANES; + // The heap threshold only ever tightens, so this group-start + // snapshot can only over-select survivors (which the per-row + // processing below re-checks against live values), never prune a + // row the scalar scan would have kept. 
+ let heap_threshold = (res.len() >= ctx.k) + .then(|| max_dist.map(|max_dist| max_dist.0)) + .flatten(); + let (pruned_upper_bound, pruned_heap) = prune_masks( + dist16, + scale16, + add16, + error16, + terms, + ctx.query_upper_bound, + heap_threshold, + ); + counters.pruned_upper_bound += pruned_upper_bound.count_ones() as usize; + counters.pruned_heap += pruned_heap.count_ones() as usize; + let mut survivors = !(pruned_upper_bound | pruned_heap); + while survivors != 0 { + let lane = survivors.trailing_zeros() as usize; + survivors &= survivors - 1; + let id = group * PRUNE_LANES + lane; + let binary_ip = dists[id]; + self.accumulate_raw_query_multi_bit_row( + &ctx, + id, + row_id(id as u32), + binary_ip, + lower_bound_of(id, binary_ip), + res, + &mut max_dist, + &mut counters, + ); } + } - exact += 1; - let dist = self.raw_query_multi_bit_exact_distance( + let tail_start = ctx.n - dist_tail.len(); + for (offset, binary_ip) in dist_tail.iter().copied().enumerate() { + let id = tail_start + offset; + counters.candidates += 1; + self.accumulate_raw_query_multi_bit_row( + &ctx, id, + row_id(id as u32), binary_ip, - ex_bits, - ex_code_len, - ex_codes, - ex_add_factors, - ex_scale_factors, + lower_bound_of(id, binary_ip), + res, + &mut max_dist, + &mut counters, ); - if dist < query_lower_bound || dist >= query_upper_bound { - exact_rejected += 1; - continue; - } - let dist = OrderedFloat(dist); - if res.len() < k { - res.push(OrderedNode::new(row_id, dist)); - if res.len() == k { - max_dist = res.peek().map(|node| node.dist); - } - } else if max_dist.is_some_and(|max_dist| max_dist > dist) { - res.pop(); - res.push(OrderedNode::new(row_id, dist)); - max_dist = res.peek().map(|node| node.dist); - } } - record_rabit_prune_stats( - candidates, - pruned_upper_bound, - pruned_heap, - exact, - exact_rejected, - ); + record_rabit_prune_stats(&counters); } fn raw_query_lower_bound_gating_disabled_reason(&self) -> Option<&'static str> { @@ -1276,33 +1533,6 @@ where 
dist_table } -fn build_ex_dist_table_direct(rotated_query: &[f32], ex_bits: u8) -> Vec { - if ex_bits == 0 { - return Vec::new(); - } - let entries_per_dim = 1usize << ex_bits; - let mut dist_table = vec![0.0; rotated_query.len() * entries_per_dim]; - build_ex_dist_table_direct_into(rotated_query, ex_bits, &mut dist_table); - dist_table -} - -fn build_ex_dist_table_direct_into(rotated_query: &[f32], ex_bits: u8, dist_table: &mut [f32]) { - if ex_bits == 0 { - debug_assert!(dist_table.is_empty()); - return; - } - let entries_per_dim = 1usize << ex_bits; - debug_assert_eq!(dist_table.len(), rotated_query.len() * entries_per_dim); - for (query_value, table) in rotated_query - .iter() - .zip(dist_table.chunks_exact_mut(entries_per_dim)) - { - for (code, value) in table.iter_mut().enumerate() { - *value = *query_value * code as f32; - } - } -} - fn build_dist_table_direct_into(qc: &[T::Native], dist_table: &mut [f32]) where T::Native: AsPrimitive, @@ -1339,95 +1569,20 @@ where }) } -// Quantize the distance table into a caller-owned buffer. -#[inline] -fn quantize_dist_table_into(dist_table: &[f32], quantized_dist_table: &mut Vec) -> (f32, f32) { - let (qmin, qmax) = dist_table - .iter() - .cloned() - .minmax_by(|a, b| a.total_cmp(b)) - .into_option() - .unwrap(); - // this happens if the query is all zeros - if qmin == qmax { - quantized_dist_table.clear(); - quantized_dist_table.resize(dist_table.len(), 0); - return (qmin, qmax); - } - let factor = 255.0 / (qmax - qmin); - quantized_dist_table.clear(); - quantized_dist_table.reserve(dist_table.len()); - let spare = quantized_dist_table.spare_capacity_mut(); - for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) { - quantized.write(((d - qmin) * factor).round() as u8); - } - // SAFETY: every element in the reserved range was initialized in the loop above. 
- unsafe { - quantized_dist_table.set_len(dist_table.len()); - } - - (qmin, qmax) -} - -#[inline] -fn quantize_dist_table_u16_into( - dist_table: &[f32], - quantized_dist_table: &mut Vec, -) -> (f32, f32) { - let (qmin, qmax) = dist_table - .iter() - .cloned() - .minmax_by(|a, b| a.total_cmp(b)) - .into_option() - .unwrap(); - if qmin == qmax { - quantized_dist_table.clear(); - quantized_dist_table.resize(dist_table.len(), 0); - return (qmin, qmax); - } - - let factor = u16::MAX as f32 / (qmax - qmin); - quantized_dist_table.clear(); - quantized_dist_table.reserve(dist_table.len()); - let spare = quantized_dist_table.spare_capacity_mut(); - for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) { - quantized.write(((d - qmin) * factor).round() as u16); - } - // SAFETY: every element in the reserved range was initialized in the loop above. - unsafe { - quantized_dist_table.set_len(dist_table.len()); - } - - (qmin, qmax) -} - -#[inline] -fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 { - debug_assert!(ex_bits > 0); - let bit_offset = dim_idx * ex_bits as usize; - let byte_idx = bit_offset / u8::BITS as usize; - let bit_shift = bit_offset % u8::BITS as usize; - let bits = row_codes[byte_idx] as u16 - | row_codes - .get(byte_idx + 1) - .map(|byte| (*byte as u16) << u8::BITS) - .unwrap_or_default(); - let mask = (1u16 << ex_bits) - 1; - ((bits >> bit_shift) & mask) as u8 -} - +/// Build the u8 FastScan LUT for the ex codes directly from the rotated +/// query (`ex_query`, natural dim order, padding dims zero): the underlying +/// per-dim table is the pure multiplication `q[d] * code`, so no intermediate +/// `dim * 2^ex_bits` table is materialized. 
fn quantize_ex_fastscan_dist_table_into( - dim: usize, ex_bits: u8, - ex_dist_table: &[f32], + ex_code_len: usize, + ex_query: &[f32], quantized_dist_table: &mut Vec, ) -> (f32, f32, f32) { debug_assert!(supports_ex_fastscan(ex_bits)); - let entries_per_dim = 1usize << ex_bits; - debug_assert_eq!(ex_dist_table.len(), dim * entries_per_dim); - let num_split_tables = - ex_fastscan_code_len(dim, ex_bits).expect("RabitQ num_bits should be validated") * 2; + // One split table per code nibble of the row. + let num_split_tables = ex_code_len * 2; let quantization_max = (u16::MAX as usize / num_split_tables) .min(u8::MAX as usize) .max(1) as f32; @@ -1436,7 +1591,7 @@ fn quantize_ex_fastscan_dist_table_into( let mut qmax = f32::NEG_INFINITY; for table_idx in 0..num_split_tables { for code in 0..SEGMENT_NUM_CODES { - let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code); + let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code); qmin = qmin.min(value); qmax = qmax.max(value); } @@ -1452,7 +1607,7 @@ fn quantize_ex_fastscan_dist_table_into( let factor = quantization_max / (qmax - qmin); for table_idx in 0..num_split_tables { for code in 0..SEGMENT_NUM_CODES { - let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code); + let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code); quantized_dist_table.push(((value - qmin) * factor).round() as u8); } } @@ -1465,91 +1620,153 @@ fn supports_ex_fastscan(ex_bits: u8) -> bool { matches!(ex_bits, 2 | 4 | 8) } -#[inline] -fn ex_fastscan_code_len(dim: usize, ex_bits: u8) -> Option { - match ex_bits { - 2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(), - _ => None, - } -} - +/// The FastScan LUT value for one nibble of a blocked-layout code byte: +/// `table_idx / 2` is the byte position within a row and `table_idx % 2` +/// selects its low/high nibble (see the `ex_dot` module docs for the +/// byte-to-dim mapping per width). 
Dims beyond the query length (block +/// padding) contribute zero. #[inline] fn ex_fastscan_dist_table_value( - dim: usize, + ex_query: &[f32], ex_bits: u8, - ex_dist_table: &[f32], table_idx: usize, code: usize, ) -> f32 { + let query = |dim_idx: usize| ex_query.get(dim_idx).copied().unwrap_or(0.0); + let byte_idx = table_idx / 2; + let high_nibble = table_idx % 2 == 1; match ex_bits { 2 => { - let dim_idx = table_idx * 2; - let low = code & 0b11; - let high = (code >> 2) & 0b11; - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, low) - + ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx + 1, high) + // byte 16g+b = dims {64g+b, +16, +32, +48} at bit pairs; the low + // nibble covers the first two dims, the high nibble the last two. + let dim_idx = 64 * (byte_idx / 16) + byte_idx % 16 + 32 * usize::from(high_nibble); + let low = (code & 0b11) as f32; + let high = ((code >> 2) & 0b11) as f32; + query(dim_idx) * low + query(dim_idx + 16) * high + } + 4 => { + // byte 32g+8j+b = dim 64g+16j+b (low nibble) | dim +8 (high). + let in_block = byte_idx % 32; + let dim_idx = 64 * (byte_idx / 32) + + 16 * (in_block / 8) + + in_block % 8 + + 8 * usize::from(high_nibble); + query(dim_idx) * code as f32 } - 4 => ex_dist_table_value(ex_dist_table, dim, ex_bits, table_idx, code), 8 => { - let dim_idx = table_idx / 2; - if table_idx.is_multiple_of(2) { - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code) + // byte = dim identity; the high nibble carries code bits 4..8. 
+ let code = if high_nibble { + code << SEGMENT_LENGTH } else { - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code << SEGMENT_LENGTH) - } + code + }; + query(byte_idx) * code as f32 } _ => unreachable!("unsupported RabitQ ex_bits={ex_bits} for FastScan"), } } -#[inline] -fn ex_dist_table_value( - ex_dist_table: &[f32], - dim: usize, - ex_bits: u8, - dim_idx: usize, - code: usize, -) -> f32 { - if dim_idx >= dim { - return 0.0; - } - let entries_per_dim = 1usize << ex_bits; - ex_dist_table[dim_idx * entries_per_dim + code] -} - -#[inline] -fn compute_single_rq_ex_distance( - ex_codes: &[u8], - id: usize, - ex_code_len: usize, - ex_bits: u8, - dim: usize, - ex_dist_table: &[f32], -) -> f32 { - if ex_bits == 0 { - return 0.0; - } - let entries_per_dim = 1usize << ex_bits; - let row_codes = &ex_codes[id * ex_code_len..(id + 1) * ex_code_len]; - (0..dim) - .map(|dim_idx| { - let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize; - ex_dist_table[dim_idx * entries_per_dim + code] - }) - .sum() -} - +/// Transpose ex codes for the FastScan bulk path. That path is only reachable +/// when lower-bound gating is disabled, i.e. for legacy indexes without error +/// factors; gated indexes rerank per candidate with the ex-dot kernels and +/// never touch this copy, so skip the transpose (and its resident memory). fn maybe_pack_ex_codes( ex_codes: Option<&FixedSizeListArray>, ex_bits: u8, + error_factors: Option<&Float32Array>, ) -> Option { let ex_codes = ex_codes?; + if error_factors.is_some() { + return None; + } match ex_bits { 2 | 4 | 8 => Some(pack_codes(ex_codes)), _ => None, } } +/// Bring legacy sequential ex codes into the blocked kernel layout: rows are +/// repacked, except for the widths whose layouts agree byte-for-byte (then +/// the column is used as stored). 
+fn blocked_ex_codes_from_sequential( + seq_codes: &FixedSizeListArray, + dim: usize, + ex_bits: u8, +) -> Result { + if sequential_matches_blocked(ex_bits) + && seq_codes.value_length() as usize == blocked_ex_code_bytes(dim, ex_bits) + { + return Ok(seq_codes.clone()); + } + let seq_code_len = seq_codes.value_length() as usize; + let seq_values = seq_codes.values().as_primitive::().values(); + let blocked_code_len = blocked_ex_code_bytes(dim, ex_bits); + let mut blocked_values = vec![0u8; seq_codes.len() * blocked_code_len]; + for (seq_row, blocked_row) in seq_values + .chunks_exact(seq_code_len) + .zip(blocked_values.chunks_exact_mut(blocked_code_len)) + { + repack_sequential_row(seq_row, dim, ex_bits, blocked_row); + } + Ok(FixedSizeListArray::try_new_from_values( + UInt8Array::from(blocked_values), + blocked_code_len as i32, + )?) +} + +/// Load the ex-code column of an index batch into the blocked kernel layout, +/// accepting both the blocked format and the legacy sequential format. Legacy +/// batches are normalized in place (the sequential column is replaced by the +/// blocked one), so rewrites — remap, optimize merges — always emit the +/// blocked format and legacy indexes upgrade on their next rewrite. 
+pub(crate) fn load_blocked_ex_codes( + batch: RecordBatch, + rotated_dim: usize, + num_bits: u8, +) -> Result<(RecordBatch, FixedSizeListArray)> { + let ex_bits = rabit_ex_bits(num_bits)?; + if let Some(column) = batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN) { + let codes = column.as_fixed_size_list().clone(); + let expected_bytes = blocked_ex_code_bytes(rotated_dim, ex_bits); + if codes.value_length() as usize != expected_bytes { + return Err(Error::invalid_input(format!( + "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", + RABIT_BLOCKED_EX_CODE_COLUMN, + codes.value_length(), + rotated_dim, + ex_bits, + expected_bytes + ))); + } + return Ok((batch, codes)); + } + let column = batch.column_by_name(RABIT_EX_CODE_COLUMN).ok_or_else(|| { + Error::invalid_input(format!( + "RabitQ num_bits={} requires {} column", + num_bits, RABIT_BLOCKED_EX_CODE_COLUMN + )) + })?; + let codes = column.as_fixed_size_list().clone(); + let expected_bytes = rabit_ex_code_bytes(rotated_dim, ex_bits)?; + if codes.value_length() as usize != expected_bytes { + return Err(Error::invalid_input(format!( + "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", + RABIT_EX_CODE_COLUMN, + codes.value_length(), + rotated_dim, + ex_bits, + expected_bytes + ))); + } + let blocked = blocked_ex_codes_from_sequential(&codes, rotated_dim, ex_bits)?; + let ex_code_field = rabit_ex_code_field(rotated_dim, num_bits)? + .expect("multi-bit RabitQ always has an ex-code field"); + let batch = batch + .drop_column(RABIT_EX_CODE_COLUMN)? 
+ .try_with_column(ex_code_field, Arc::new(blocked.clone()))?; + Ok((batch, blocked)) +} + impl DistCalculator for RabitDistCalculator<'_> { #[inline(always)] fn distance(&self, id: u32) -> f32 { @@ -1580,13 +1797,10 @@ impl DistCalculator for RabitDistCalculator<'_> { let ex_scale_factors = self .ex_scale_factors .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = rabit_ex_code_bytes(self.dim, ex_bits) - .expect("RabitQ num_bits should be validated"); self.raw_query_multi_bit_exact_distance( id, dist, ex_bits, - ex_code_len, ex_codes, ex_add_factors, ex_scale_factors, @@ -1690,13 +1904,11 @@ impl DistCalculator for RabitDistCalculator<'_> { return; } - let code_len = rabit_binary_code_bytes(self.dim); - let n = self.codes.len() / code_len; - self.accumulate_raw_query_multi_bit_topk_with_scratch( + self.accumulate_raw_query_multi_bit_topk_dense_with_scratch( k, lower_bound, upper_bound, - (0..n).map(|id| (id, row_id(id as u32))), + row_id, res, dists, quantized_dists, @@ -1865,8 +2077,6 @@ impl VectorStore for RabitQuantizationStorage { let code_dim = self.code_dim(); let rotated_qr = self.rotate_query_vector(code_dim, &qr); let dist_table = build_dist_table_direct::(&rotated_qr); - let ex_bits = self.metadata.num_bits - 1; - let ex_dist_table = build_ex_dist_table_direct(&rotated_qr, ex_bits); let query_factor = match self.metadata.query_estimator { RabitQueryEstimator::ResidualQuery => self.residual_query_factor(dist_q_c), RabitQueryEstimator::RawQuery => self.raw_query_factor(dist_q_c, &rotated_qr, None), @@ -1877,12 +2087,21 @@ impl VectorStore for RabitQuantizationStorage { self.raw_query_error_for_gating(dist_q_c, &rotated_qr, None) } }; - let sum_q = rotated_qr.into_iter().sum(); + let sum_q = rotated_qr.iter().copied().sum(); + // The kernels read the rotated query directly; only unaligned dims + // need a zero-padded copy. 
+ let ex_query = if code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { + rotated_qr + } else { + let mut padded = vec![0.0; padded_query_len(code_dim)]; + pad_query_into(&rotated_qr, &mut padded); + padded + }; self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, dist_table: Cow::Owned(dist_table), - ex_dist_table: Cow::Owned(ex_dist_table), + ex_query: Cow::Owned(ex_query), sum_q, query_factor, query_error, @@ -1921,7 +2140,10 @@ impl VectorStore for RabitQuantizationStorage { return self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, dist_table: Cow::Borrowed(&raw_query.dist_table), - ex_dist_table: Cow::Borrowed(&raw_query.ex_dist_table), + ex_query: Cow::Borrowed(kernel_query( + &raw_query.rotated_query, + &raw_query.ex_query, + )), sum_q: raw_query.sum_q, query_factor, query_error, @@ -1931,18 +2153,20 @@ impl VectorStore for RabitQuantizationStorage { let dist_table_len = code_dim * 4; let ex_bits = self.metadata.num_bits - 1; - let ex_dist_table_len = if ex_bits == 0 { + // The kernels read the rotated query in place; a zero-padded copy is + // only needed when the rotated dim is not block-aligned. 
+ let ex_query_table_len = if ex_bits == 0 || code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { 0 } else { - code_dim * (1usize << ex_bits) + padded_query_len(code_dim) }; - f32_scratch.resize(code_dim + dist_table_len + ex_dist_table_len, 0.0); + f32_scratch.resize(code_dim + dist_table_len + ex_query_table_len, 0.0); let query_factor; let query_error; let sum_q = { let (rotated_qr, remaining) = f32_scratch.split_at_mut(code_dim); - let (dist_table, ex_dist_table) = remaining.split_at_mut(dist_table_len); + let (dist_table, ex_query) = remaining.split_at_mut(dist_table_len); match residual { Some(QueryResidual::Centroid(residual_centroid)) => { self.rotate_query_vector_into( @@ -1981,17 +2205,20 @@ impl VectorStore for RabitQuantizationStorage { } }; build_dist_table_direct_into::(rotated_qr, dist_table); - build_ex_dist_table_direct_into(rotated_qr, ex_bits, ex_dist_table); + if ex_query_table_len > 0 { + pad_query_into(rotated_qr, ex_query); + } rotated_qr.iter().copied().sum() }; + let ex_query_start = code_dim + dist_table_len; self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, - dist_table: Cow::Borrowed(&f32_scratch[code_dim..code_dim + dist_table_len]), - ex_dist_table: Cow::Borrowed( - &f32_scratch - [code_dim + dist_table_len..code_dim + dist_table_len + ex_dist_table_len], - ), + dist_table: Cow::Borrowed(&f32_scratch[code_dim..ex_query_start]), + ex_query: Cow::Borrowed(kernel_query( + &f32_scratch[..code_dim], + &f32_scratch[ex_query_start..ex_query_start + ex_query_table_len], + )), sum_q, query_factor, query_error, @@ -2155,6 +2382,38 @@ pub fn unpack_codes(codes: &FixedSizeListArray) -> FixedSizeListArray { FixedSizeListArray::try_new_from_values(UInt8Array::from(unpacked), code_len as i32).unwrap() } +/// Build a row-id remapping for the rows present in this partition from a +/// fragment-reuse index, mirroring the PQ storage frag-reuse path. 
+/// +/// Returns `None` when there is nothing to do (no fragment-reuse index, or the +/// index leaves every present row id unchanged), so callers keep the zero-cost +/// no-op path. Otherwise, returns a `HashMap` mapping every affected old row id +/// to `Some(new_id)` for surviving rows or `None` for rows whose covering +/// fragment was compacted away, suitable for `RabitQuantizationStorage::remap`. +fn build_frag_reuse_mapping( + fri: Option<&FragReuseIndex>, + row_ids: &UInt64Array, +) -> Option>> { + let fri = fri?; + if fri.row_id_maps.is_empty() { + return None; + } + let mut mapping: HashMap> = HashMap::new(); + for row_id in row_ids.values().iter() { + match fri.remap_row_id(*row_id) { + Some(new_id) if new_id == *row_id => {} + mapped => { + mapping.insert(*row_id, mapped); + } + } + } + if mapping.is_empty() { + None + } else { + Some(mapping) + } +} + #[async_trait] impl QuantizerStorage for RabitQuantizationStorage { type Metadata = RabitQuantizationMetadata; @@ -2163,7 +2422,7 @@ impl QuantizerStorage for RabitQuantizationStorage { batch: RecordBatch, metadata: &Self::Metadata, distance_type: DistanceType, - _fri: Option>, + fri: Option>, ) -> Result { let distance_type = match (metadata.query_estimator, distance_type) { (RabitQueryEstimator::RawQuery, DistanceType::Cosine) => DistanceType::L2, @@ -2192,31 +2451,14 @@ impl QuantizerStorage for RabitQuantizationStorage { .column_by_name(ERROR_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); let ex_bits = rabit_ex_bits(metadata.num_bits)?; + let mut batch = batch; let mut ex_codes = None; let mut ex_add_factors = None; let mut ex_scale_factors = None; if ex_bits != 0 { - let codes = batch - .column_by_name(RABIT_EX_CODE_COLUMN) - .ok_or_else(|| { - Error::invalid_input(format!( - "RabitQ num_bits={} requires {} column", - metadata.num_bits, RABIT_EX_CODE_COLUMN - )) - })? 
- .as_fixed_size_list() - .clone(); - let expected_ex_code_bytes = rabit_ex_code_bytes(metadata.rotated_dim(), ex_bits)?; - if codes.value_length() as usize != expected_ex_code_bytes { - return Err(Error::invalid_input(format!( - "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", - RABIT_EX_CODE_COLUMN, - codes.value_length(), - metadata.rotated_dim(), - ex_bits, - expected_ex_code_bytes - ))); - } + let (normalized_batch, codes) = + load_blocked_ex_codes(batch, metadata.rotated_dim(), metadata.num_bits)?; + batch = normalized_batch; ex_codes = Some(codes); ex_add_factors = Some( batch @@ -2246,16 +2488,19 @@ impl QuantizerStorage for RabitQuantizationStorage { if batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some() || batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some() || batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() { return Err(Error::invalid_input( "RabitQ num_bits=1 raw-query indexes must not contain ex-code columns" .to_string(), )); } - } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() { + } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() + { return Err(Error::invalid_input(format!( - "RabitQ num_bits={} does not support {} column", - metadata.num_bits, RABIT_EX_CODE_COLUMN + "RabitQ num_bits={} does not support ex-code columns", + metadata.num_bits ))); } @@ -2270,9 +2515,10 @@ impl QuantizerStorage for RabitQuantizationStorage { let mut metadata = metadata.clone(); metadata.packed = true; - let packed_ex_codes = maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits); + let packed_ex_codes = + maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref()); - Ok(Self { + let storage = Self { metadata, batch, distance_type, @@ -2285,7 +2531,12 @@ impl QuantizerStorage for RabitQuantizationStorage { packed_ex_codes, 
ex_add_factors, ex_scale_factors, - }) + }; + + match build_frag_reuse_mapping(fri.as_deref(), &storage.row_ids) { + Some(mapping) => storage.remap(&mapping), + None => Ok(storage), + } } fn metadata(&self) -> &Self::Metadata { @@ -2353,11 +2604,18 @@ impl QuantizerStorage for RabitQuantizationStorage { let error_factors = batch .column_by_name(ERROR_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); - let ex_codes = batch - .column_by_name(RABIT_EX_CODE_COLUMN) - .map(|codes| codes.as_fixed_size_list().clone()); + let ex_bits = rabit_ex_bits(self.metadata.num_bits)?; + let (batch, ex_codes) = if ex_bits == 0 { + (batch, None) + } else { + // `self.batch` is already normalized at load, so this is a + // zero-copy column lookup. + let (batch, codes) = + load_blocked_ex_codes(batch, self.metadata.rotated_dim(), self.metadata.num_bits)?; + (batch, Some(codes)) + }; let packed_ex_codes = - maybe_pack_ex_codes(ex_codes.as_ref(), rabit_ex_bits(self.metadata.num_bits)?); + maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref()); let ex_add_factors = batch .column_by_name(EX_ADD_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); @@ -2490,6 +2748,9 @@ mod tests { use arrow_array::{ArrayRef, Float32Array, Float64Array, UInt64Array}; use lance_core::ROW_ID; use lance_linalg::distance::DistanceType; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use rstest::rstest; use crate::vector::bq::{RQRotationType, builder::RabitQuantizer}; use crate::vector::quantizer::{Quantization, QuantizerStorage}; @@ -2695,7 +2956,7 @@ mod tests { assert!(rabit_ex_code_field(128, 1).unwrap().is_none()); let ex_field = rabit_ex_code_field(128, 9).unwrap().unwrap(); - assert_eq!(ex_field.name(), RABIT_EX_CODE_COLUMN); + assert_eq!(ex_field.name(), RABIT_BLOCKED_EX_CODE_COLUMN); let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { panic!("ex-code field should be FixedSizeList"); }; @@ -2898,6 +3159,229 @@ mod tests { 
assert_eq!(distances, vec![104.0, 22.0]); } + /// Exercise the ex-dot kernel through the storage API for every ex width, + /// including the widths without FastScan support ({1, 3, 5, 6, 7}), and a + /// dim that is not a multiple of the 64-dim kernel group. + /// + /// The dim must be a multiple of 8: the binary distance stage consumes + /// two 4-dim segments per code byte and ignores trailing dims otherwise. + #[test] + fn test_raw_query_multi_bit_distance_matches_reference_for_all_ex_widths() { + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + // 72 exercises the kernels' padded-tail path; 1536 is a production + // embedding dim exercising the full-group path. Both the blocked + // format and the legacy sequential format must produce the same + // distances. + for (code_dim, num_rows) in [(72usize, 33usize), (1536, 33)] { + for num_bits in 2..=9u8 { + for legacy_format in [false, true] { + let ex_bits = num_bits - 1; + let mut rng = SmallRng::seed_from_u64(num_bits as u64); + + let sign_bits = (0..num_rows * code_dim) + .map(|_| rng.random_bool(0.5)) + .collect::>(); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let ex_values = (0..num_rows * code_dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + + let code_len = rabit_binary_code_bytes(code_dim); + let mut code_bytes = vec![0u8; num_rows * code_len]; + for (row, bits) in sign_bits.chunks_exact(code_dim).enumerate() { + for (dim, &bit) in bits.iter().enumerate() { + code_bytes[row * code_len + dim / 8] |= (bit as u8) << (dim % 8); + } + } + let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format { + let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_values.chunks_exact(code_dim).enumerate() { + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + ex_code_bytes[row * 
ex_code_len + bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |= + (bits >> 8) as u8; + } + } + } + (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + } else { + let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_code_bytes + .chunks_exact_mut(ex_code_len) + .zip(ex_values.chunks_exact(code_dim)) + { + crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row); + } + (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + }; + + let identity = Float32Array::from_iter_values((0..code_dim).flat_map(|row| { + (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 }) + })); + let rotate_mat = + FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap(); + let metadata = RabitQuantizationMetadata { + rotate_mat: Some(rotate_mat), + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Matrix, + code_dim: code_dim as u32, + num_bits, + packed: false, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(code_bytes), + code_len as i32, + ) + .unwrap(); + let ex_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_code_bytes), + ex_code_len as i32, + ) + .unwrap(); + let ex_add_factors = (0..num_rows) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let ex_scale_factors = (0..num_rows) + .map(|_| rng.random_range(0.1f32..1.0)) + .collect::>(); + let batch = RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..num_rows as u64)) as ArrayRef, + ), + (RABIT_CODE_COLUMN, Arc::new(codes) as ArrayRef), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, + ), + (ex_code_column, 
Arc::new(ex_codes) as ArrayRef), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(ex_add_factors.clone())) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(ex_scale_factors.clone())) as ArrayRef, + ), + ]) + .unwrap(); + let storage = RabitQuantizationStorage::try_from_batch( + batch, + &metadata, + DistanceType::L2, + None, + ) + .unwrap(); + + let query = (0..code_dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let sum_q = query.iter().sum::(); + let calc = storage.dist_calculator( + Arc::new(Float32Array::from(query.clone())) as ArrayRef, + 0.0, + ); + + let code_scale = (1u32 << ex_bits) as f32; + let code_bias = -(code_scale - 0.5); + let expected = (0..num_rows) + .map(|row| { + let binary_ip = (0..code_dim) + .map(|dim| { + query[dim] * sign_bits[row * code_dim + dim] as u8 as f32 + }) + .sum::(); + let ex_dist = (0..code_dim) + .map(|dim| query[dim] * ex_values[row * code_dim + dim] as f32) + .sum::(); + let full_dot = code_scale * binary_ip + ex_dist + code_bias * sum_q; + full_dot * ex_scale_factors[row] + ex_add_factors[row] + }) + .collect::>(); + + for (row, &want) in expected.iter().enumerate() { + let got = calc.distance(row as u32); + assert!( + (got - want).abs() <= 1e-3 * want.abs().max(1.0), + "num_bits={num_bits} row={row}: {got} != {want}" + ); + } + + let mut distances = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + assert_eq!(distances.len(), num_rows); + // The bulk path quantizes the binary LUT to u8, and that error is + // amplified by 2^ex_bits in the multi-bit estimate, so the value + // assertions need a quantization-aware bound. The FastScan ex + // widths additionally quantize the ex LUT and are covered by + // `test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes`. 
+ if !matches!(ex_bits, 2 | 4 | 8) { + // Worst-case |error| of one u8-quantized binary LUT lookup is + // (table range) / 255 / 2, accumulated over one lookup per + // 8-dim pair of segments. + let num_tables = code_dim.div_ceil(4); + let mut table_min = f32::INFINITY; + let mut table_max = f32::NEG_INFINITY; + for segment in query.chunks(4) { + for subset in 0..16usize { + let value = segment + .iter() + .enumerate() + .filter(|(idx, _)| subset & (1 << idx) != 0) + .map(|(_, q)| *q) + .sum::(); + table_min = table_min.min(value); + table_max = table_max.max(value); + } + } + let binary_bound = + code_scale * num_tables as f32 * (table_max - table_min) / 255.0 / 2.0 + * ex_scale_factors.iter().fold(0.0f32, |max, &s| max.max(s)); + for (row, (&got, &want)) in + distances.iter().zip(expected.iter()).enumerate() + { + assert!( + (got - want).abs() <= binary_bound + 1e-3, + "num_bits={num_bits} row={row} (distance_all): {got} != {want} (bound {binary_bound})" + ); + } + // Rows past the SIMD batch use the exact binary path, so the + // final remainder row must match the per-candidate distance. + let remainder_row = num_rows - 1; + let got = distances[remainder_row]; + let want = calc.distance(remainder_row as u32); + assert!( + (got - want).abs() <= 1e-3 * want.abs().max(1.0), + "num_bits={num_bits} remainder row (distance_all): {got} != {want}" + ); + } + } + } + } + } + #[test] fn test_fast_approx_mode_uses_one_bit_scores_for_multi_bit_raw_query() { let code_dim = 8usize; @@ -3061,10 +3545,17 @@ mod tests { assert_eq!(hacc_accum_len, num_rows); } - fn assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits: u8) { - let code_dim = 8usize; + fn assert_raw_query_multi_bit_distance_all_uses_fastscan( + num_bits: u8, + legacy_format: bool, + with_error_factors: bool, + ) { + // Not a multiple of 64, so the padded-tail LUT entries are exercised; + // a multiple of 8 as the binary stage requires. 
+ let code_dim = 72usize; let num_rows = BATCH_SIZE + 1; let ex_bits = rabit_ex_bits(num_bits).unwrap(); + let max_code = ((1u16 << ex_bits) - 1) as u8; let identity = Float32Array::from_iter_values( (0..code_dim) .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })), @@ -3081,16 +3572,42 @@ mod tests { packed: false, query_estimator: RabitQueryEstimator::RawQuery, }; + let code_len = rabit_binary_code_bytes(code_dim); let codes = FixedSizeListArray::try_new_from_values( - UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 13) as u8)), - 1, + UInt8Array::from_iter_values((0..num_rows * code_len).map(|idx| (idx * 13) as u8)), + code_len as i32, ) .unwrap(); - let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let ex_values = (0..num_rows * code_dim) + .map(|idx| ((idx * 37) % (max_code as usize + 1)) as u8) + .collect::>(); + let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format { + let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_values.chunks_exact(code_dim).enumerate() { + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + ex_code_bytes[row * ex_code_len + bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + } + (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + } else { + let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_code_bytes + .chunks_exact_mut(ex_code_len) + .zip(ex_values.chunks_exact(code_dim)) + { + crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row); + } + (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + }; let ex_codes = FixedSizeListArray::try_new_from_values( - 
UInt8Array::from_iter_values( - (0..num_rows * ex_code_len).map(|idx| (idx * 37 % 251) as u8), - ), + UInt8Array::from(ex_code_bytes), ex_code_len as i32, ) .unwrap(); @@ -3108,7 +3625,7 @@ mod tests { SCALE_FACTORS_COLUMN, Arc::new(Float32Array::from(vec![1.0; num_rows])) as ArrayRef, ), - (RABIT_EX_CODE_COLUMN, Arc::new(ex_codes) as ArrayRef), + (ex_code_column, Arc::new(ex_codes) as ArrayRef), ( EX_ADD_FACTORS_COLUMN, Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, @@ -3119,12 +3636,30 @@ mod tests { ), ]) .unwrap(); + let batch = if with_error_factors { + batch + .try_with_column( + crate::vector::bq::transform::ERROR_FACTORS_FIELD.clone(), + Arc::new(Float32Array::from(vec![1000.0; num_rows])) as ArrayRef, + ) + .unwrap() + } else { + batch + }; let storage = RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) .unwrap(); - assert!(storage.packed_ex_codes.is_some()); + // The FastScan transpose only exists for indexes that can reach the + // bulk bypass path (no error factors); gated indexes fall through to + // the exact per-row kernels in `distance_all`. + assert_eq!(storage.packed_ex_codes.is_some(), !with_error_factors); - let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef; + // A per-dim varying query so that any dim-mapping error in the + // FastScan LUT shows up as a value mismatch. 
+ let query_values = (0..code_dim) + .map(|dim| (dim % 11) as f32 * 0.3 - 1.5) + .collect::>(); + let query = Arc::new(Float32Array::from(query_values.clone())) as ArrayRef; let calc = storage.dist_calculator(query, 0.0); let mut distances = Vec::new(); let mut u16_scratch = Vec::new(); @@ -3140,15 +3675,57 @@ mod tests { assert_eq!(distances.len(), num_rows); assert_eq!(u16_scratch.len(), BATCH_SIZE); - assert_eq!( - u8_scratch.len(), - ex_fastscan_code_len(code_dim, ex_bits).unwrap() * 2 * SEGMENT_NUM_CODES + let loaded_ex_code_len = storage.ex_codes.as_ref().unwrap().value_length() as usize; + if with_error_factors { + // The gated path never builds the ex LUT; the scratch holds the + // binary LUT only. + assert_eq!(u8_scratch.len(), code_dim * 4); + } else { + assert_eq!(u8_scratch.len(), loaded_ex_code_len * 2 * SEGMENT_NUM_CODES); + } + + // The fastscan estimate differs from the exact path only by the u8 + // quantization of the binary LUT (amplified by 2^ex_bits) and of the + // ex LUT, so bound the comparison by those quantization errors. 
+ let mut table_min = f32::INFINITY; + let mut table_max = f32::NEG_INFINITY; + for segment in query_values.chunks(4) { + for subset in 0..SEGMENT_NUM_CODES { + let value = segment + .iter() + .enumerate() + .filter(|(idx, _)| subset & (1 << idx) != 0) + .map(|(_, q)| *q) + .sum::(); + table_min = table_min.min(value); + table_max = table_max.max(value); + } + } + let code_scale = (1u32 << ex_bits) as f32; + let binary_bound = + code_scale * code_dim.div_ceil(4) as f32 * (table_max - table_min) / 510.0; + let mut padded_query = vec![0.0f32; crate::vector::bq::ex_dot::padded_query_len(code_dim)]; + crate::vector::bq::ex_dot::pad_query_into(&query_values, &mut padded_query); + let mut quantized_table = Vec::new(); + let (ex_qmin, ex_qmax, ex_qcap) = quantize_ex_fastscan_dist_table_into( + ex_bits, + loaded_ex_code_len, + &padded_query, + &mut quantized_table, ); + // Without the FastScan transpose the ex stage is exact, so only the + // binary LUT quantization remains. + let ex_bound = if with_error_factors { + 0.0 + } else { + (loaded_ex_code_len * 2) as f32 * (ex_qmax - ex_qmin) / ex_qcap / 2.0 + }; + let bound = (binary_bound + ex_bound) * 1.5 + 1e-3; for (id, distance) in distances.iter().take(BATCH_SIZE).enumerate() { let exact = calc.distance(id as u32); assert!( - (*distance - exact).abs() < 10.0, - "distance_all fastscan mismatch for id {id}: actual={distance}, exact={exact}" + (*distance - exact).abs() <= bound, + "distance_all fastscan mismatch for id {id} (num_bits={num_bits} legacy={legacy_format}): actual={distance}, exact={exact}, bound={bound}" ); } assert_eq!(distances[BATCH_SIZE], calc.distance(BATCH_SIZE as u32)); @@ -3156,8 +3733,108 @@ mod tests { #[test] fn test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes() { - for num_bits in [3, 9] { - assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits); + for num_bits in [3, 5, 9] { + for legacy_format in [false, true] { + assert_raw_query_multi_bit_distance_all_uses_fastscan( 
+ num_bits, + legacy_format, + false, + ); + } + // Gated indexes (with error factors) skip the FastScan artifacts + // and score the bulk path with the exact kernels. + assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits, false, true); + } + } + + /// A dist table whose `num_tables`-scaled reconstruction overflows `f32` + /// must fall back to exact distances rather than the affine dequant's + /// `0 * inf = NaN`. Covers both the u8 (Normal) and u16 (Accurate) LUT + /// paths end-to-end through `distance_all`, asserting the result is + /// NaN-free and bit-identical to the always-exact per-row computation. + #[rstest] + fn test_degenerate_dist_table_falls_back_to_exact_distances( + #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode, + ) { + let code_dim = 8usize; + let num_rows = BATCH_SIZE + 5; + let num_bits = 3; + let ex_bits = rabit_ex_bits(num_bits).unwrap(); + let identity = Float32Array::from_iter_values( + (0..code_dim) + .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })), + ); + let rotate_mat = + FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap(); + let metadata = RabitQuantizationMetadata { + rotate_mat: Some(rotate_mat), + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Matrix, + code_dim: code_dim as u32, + num_bits, + packed: false, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 19) as u8)), + rabit_binary_code_bytes(code_dim) as i32, + ) + .unwrap(); + let ex_codes = make_test_ex_codes(num_rows, code_dim, num_bits); + let batch = make_test_batch_with_ex(codes, ex_codes); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef; + + let mut calc = 
storage.dist_calculator(query, 4.0); + calc.approx_mode = approx_mode; + // num_tables = (code_dim * 4) / SEGMENT_NUM_CODES = 2; the extrema sum + // (qmax - qmin = 4e38) overflows when scaled by num_tables, so the + // quantizer returns `Exact`. Per-row sums stay finite (each row reads + // one entry per segment), so the exact path is well-defined. + let mut degenerate = vec![0.0f32; code_dim * 4]; + degenerate[0] = -2e38; + degenerate[1] = 2e38; + calc.dist_table = Cow::Owned(degenerate); + + let code_len = rabit_binary_code_bytes(code_dim); + let ex_codes = calc.ex_codes.unwrap(); + let ex_add_factors = calc.ex_add_factors.unwrap(); + let ex_scale_factors = calc.ex_scale_factors.unwrap(); + let expected = (0..num_rows) + .map(|id| { + let binary_ip = compute_single_rq_distance( + calc.codes, + id, + num_rows, + code_len, + &calc.dist_table, + ); + calc.raw_query_multi_bit_exact_distance( + id, + binary_ip, + ex_bits, + ex_codes, + ex_add_factors, + ex_scale_factors, + ) + }) + .collect::>(); + + let actual = calc.distance_all(0); + assert_eq!(actual.len(), num_rows); + for id in 0..num_rows { + assert!( + !actual[id].is_nan(), + "approx_mode={approx_mode:?} id={id}: degenerate table produced NaN" + ); + assert_eq!( + actual[id].to_bits(), + expected[id].to_bits(), + "approx_mode={approx_mode:?} id={id}: distance_all must match the exact path" + ); } } @@ -3239,7 +3916,6 @@ mod tests { id, binary_ip, ex_bits, - ex_code_len, ex_codes, ex_add_factors, ex_scale_factors, @@ -3289,6 +3965,200 @@ mod tests { } } + /// Inputs crafted so the top-k scan outcomes are fully determined by the + /// factor columns: with zero scale factors, a zero query factor, and a + /// query error of one, the lower bound is + /// `add_factors[id] - error_factors[id]`, and with zero ex scale factors + /// the exact distance is `ex_add_factors[id]`, regardless of the random + /// codes and query. 
+ struct CraftedTopkData { + codes: Vec, + ex_codes: Vec, + dist_table: Vec, + ex_query: Vec, + scale_factors: Vec, + add_factors: Vec, + error_factors: Vec, + ex_scale_factors: Vec, + ex_add_factors: Vec, + } + + const CRAFTED_TOPK_DIM: usize = 64; + const CRAFTED_TOPK_NUM_BITS: u8 = 5; + + impl CraftedTopkData { + fn new( + exact_dists: &[f32], + lower_bound_margins: &[f32], + error_factors: Vec, + rng: &mut SmallRng, + ) -> Self { + let n = exact_dists.len(); + let code_len = rabit_binary_code_bytes(CRAFTED_TOPK_DIM); + let ex_code_len = blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1); + let add_factors = izip!(exact_dists, lower_bound_margins, &error_factors) + .map(|(dist, margin, error)| dist - margin + error) + .collect(); + Self { + codes: (0..n * code_len).map(|_| rng.random()).collect(), + ex_codes: (0..n * ex_code_len).map(|_| rng.random()).collect(), + dist_table: (0..CRAFTED_TOPK_DIM * 4) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect(), + ex_query: (0..CRAFTED_TOPK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect(), + scale_factors: vec![0.0; n], + add_factors, + error_factors, + ex_scale_factors: vec![0.0; n], + ex_add_factors: exact_dists.to_vec(), + } + } + + fn calculator(&self, approx_mode: ApproxMode) -> RabitDistCalculator<'_> { + RabitDistCalculator::new( + CRAFTED_TOPK_DIM, + CRAFTED_TOPK_NUM_BITS, + RabitQueryEstimator::RawQuery, + Cow::Borrowed(self.dist_table.as_slice()), + Cow::Borrowed(self.ex_query.as_slice()), + 0.7, + &self.codes, + Some(&self.ex_codes), + blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1), + &self.add_factors, + &self.scale_factors, + Some(&self.error_factors), + Some(&self.ex_add_factors), + Some(&self.ex_scale_factors), + None, + 0.0, + 1.0, + approx_mode, + ) + } + } + + fn canonical_heap_rows(heap: BinaryHeap>) -> Vec<(u32, u64)> { + let mut rows = heap + .into_iter() + .map(|node| (node.dist.0.to_bits(), node.id)) + .collect::>(); + rows.sort_unstable(); + rows 
+ } + + /// The dense (SIMD-pruned) scan must reproduce the sparse scalar scan + /// exactly: identical heap contents including row ids, and the k smallest + /// in-bounds exact distances overall. + #[rstest] + fn test_raw_query_multi_bit_topk_dense_matches_sparse( + #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode, + #[values("descending", "ascending", "random", "duplicates", "duplicate_ties")] + ordering: &str, + ) { + for n in [1usize, 15, 16, 17, 100, 4109] { + let mut rng = SmallRng::seed_from_u64(n as u64 * 31 + ordering.len() as u64); + let exact_dists: Vec = match ordering { + // Improving rows force constant heap updates. + "descending" => (0..n).map(|id| (n - id) as f32).collect(), + // Worsening rows force mass pruning, the common regime. + "ascending" => (0..n).map(|id| id as f32).collect(), + "random" => (0..n).map(|_| rng.random_range(0.0..n as f32)).collect(), + "duplicates" => (0..n).map(|id| (id % 7) as f32).collect(), + // Lower bound equals the distance, so heap-threshold and + // upper-bound comparisons hit exact `>=` ties. 
+ "duplicate_ties" => (0..n).map(|id| (id % 5) as f32).collect(), + _ => unreachable!(), + }; + let (margins, error_factors) = if ordering == "duplicate_ties" { + (vec![0.0; n], vec![0.0; n]) + } else if ordering == "random" { + ( + (0..n).map(|_| rng.random_range(0.0f32..2.0)).collect(), + (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(), + ) + } else { + ( + vec![1.0; n], + (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(), + ) + }; + let data = CraftedTopkData::new(&exact_dists, &margins, error_factors, &mut rng); + let calc = data.calculator(approx_mode); + assert!( + calc.raw_query_lower_bound_gating_disabled_reason() + .is_none() + ); + + let max_dist = exact_dists.iter().fold(0.0f32, |acc, dist| acc.max(*dist)); + for k in [1usize, 10, n + 7] { + for bounds in [(None, None), (Some(max_dist * 0.25), Some(max_dist * 0.7))] { + let (lower_bound, upper_bound) = bounds; + let mut dense_heap = BinaryHeap::new(); + let mut sparse_heap = BinaryHeap::new(); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + // Two passes sharing the heap, as IVF partition probing + // does: the second pass starts with a full, tight heap. 
+ for pass in 0..2u64 { + let offset = pass * n as u64; + calc.accumulate_topk_with_scratch( + k, + lower_bound, + upper_bound, + |id| id as u64 + offset, + &mut dense_heap, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + calc.accumulate_filtered_topk_with_scratch( + k, + lower_bound, + upper_bound, + (0..n as u32).map(|id| (id, id as u64 + offset)), + |_| true, + &mut sparse_heap, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + } + let dense = canonical_heap_rows(dense_heap); + let sparse = canonical_heap_rows(sparse_heap); + assert_eq!( + dense, sparse, + "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}" + ); + + // The distance multiset must be the k smallest in-bounds + // distances over both passes. Row ids are not compared: + // evictions among tied maxima depend on heap layout. + let query_lower_bound = lower_bound.unwrap_or(f32::MIN); + let query_upper_bound = upper_bound.unwrap_or(f32::MAX); + let mut expected = (0..2 * n) + .map(|row| exact_dists[row % n]) + .filter(|dist| *dist >= query_lower_bound && *dist < query_upper_bound) + .map(|dist| dist.to_bits()) + .collect::>(); + expected.sort_unstable(); + expected.truncate(k); + let actual = dense.iter().map(|(dist, _)| *dist).collect::>(); + assert_eq!( + actual, expected, + "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}" + ); + } + } + } + } + #[test] fn test_raw_query_one_bit_distance_uses_binary_factors_without_ex_columns() { let code_dim = 8usize; @@ -3457,7 +4327,8 @@ mod tests { ) .unwrap_err(); assert!( - err.to_string().contains("requires __ex_codes column"), + err.to_string() + .contains("requires __blocked_ex_codes column"), "{}", err ); @@ -3501,9 +4372,11 @@ mod tests { .unwrap(); assert!(storage.metadata().packed); + // Legacy batches are normalized to the blocked column at load. 
let stored_batch = storage.to_batches().unwrap().next().unwrap(); + assert!(stored_batch.column_by_name(RABIT_EX_CODE_COLUMN).is_none()); assert_eq!( - stored_batch[RABIT_EX_CODE_COLUMN] + stored_batch[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), 64 @@ -3571,11 +4444,19 @@ mod tests { #[test] fn test_remap_preserves_multi_bit_rq_split_columns() { + // num_bits=9 keeps sequential ex codes; num_bits 4/6/8 (ex_bits + // 3/5/7) also exercise the bit-plane repack rebuild in `remap`. + for num_bits in [4, 6, 8, 9u8] { + test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits); + } + } + + fn test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits: u8) { let original_codes = make_test_codes(50, 64); let code_dim = original_codes.value_length() as usize * 8; - let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9); + let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, num_bits); let mut metadata = make_test_metadata(code_dim); - metadata.num_bits = 9; + metadata.num_bits = num_bits; let storage = RabitQuantizationStorage::try_from_batch( make_test_batch_with_ex(original_codes.clone(), ex_codes), &metadata, @@ -3599,11 +4480,14 @@ mod tests { ); assert_eq!(remapped_row_ids, expected_row_ids.values()); + // Legacy batches are normalized to the blocked format at load, so the + // remapped batch carries the blocked column. + let ex_code_len = blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap()); assert_eq!( - remapped_batch[RABIT_EX_CODE_COLUMN] + remapped_batch[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), - 64 + ex_code_len as i32 ); assert_eq!( &remapped_batch[EX_ADD_FACTORS_COLUMN] @@ -3623,5 +4507,20 @@ mod tests { .values()[..5], &[0.25, 1.25, 2.25, 4.25, 5.25] ); + + // The remapped storage must hold the same kernel-layout ex codes as a + // storage freshly loaded from the remapped batch. 
+ let reloaded = RabitQuantizationStorage::try_from_batch( + remapped_batch, + &remapped.metadata, + DistanceType::L2, + None, + ) + .unwrap(); + assert_eq!(remapped.ex_codes, reloaded.ex_codes); + assert_eq!( + remapped.ex_codes.as_ref().unwrap().value_length() as usize, + blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap()) + ); } } diff --git a/rust/lance-index/src/vector/bq/transform.rs b/rust/lance-index/src/vector/bq/transform.rs index c2fc0608102..c87695e14cd 100644 --- a/rust/lance-index/src/vector/bq/transform.rs +++ b/rust/lance-index/src/vector/bq/transform.rs @@ -17,7 +17,9 @@ use tracing::instrument; use crate::vector::bq::builder::RabitQuantizer; use crate::vector::bq::rabit_ex_bits; -use crate::vector::bq::storage::{RABIT_CODE_COLUMN, RABIT_EX_CODE_COLUMN, RabitQueryEstimator}; +use crate::vector::bq::storage::{ + RABIT_BLOCKED_EX_CODE_COLUMN, RABIT_CODE_COLUMN, RabitQueryEstimator, +}; use crate::vector::quantizer::Quantization; use crate::vector::transform::Transformer; use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; @@ -281,7 +283,7 @@ impl Transformer for RQTransformer { #[instrument(name = "RQTransformer::transform", level = "debug", skip_all)] fn transform(&self, batch: &RecordBatch) -> Result { let has_split_codes = self.rq.num_bits() == 1 - || (batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || (batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() && batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some() && batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some()); if batch.column_by_name(RABIT_CODE_COLUMN).is_some() && has_split_codes { @@ -494,7 +496,8 @@ mod tests { use crate::vector::bq::RQRotationType; use crate::vector::bq::builder::RabitQuantizer; - use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::bq::ex_dot::blocked_ex_code_bytes; + use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN; use crate::vector::transform::Transformer; use 
crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; @@ -535,15 +538,19 @@ mod tests { .unwrap(); let transformed = transformer.transform(&batch).unwrap(); - assert!(transformed.column_by_name(RABIT_EX_CODE_COLUMN).is_some()); + assert!( + transformed + .column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN) + .is_some() + ); assert_eq!( - transformed[RABIT_EX_CODE_COLUMN] + transformed[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), - 3 + blocked_ex_code_bytes(8, 3) as i32 ); assert!( - transformed[RABIT_EX_CODE_COLUMN] + transformed[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .values() .as_primitive::() diff --git a/rust/lance-index/src/vector/distributed/index_merger.rs b/rust/lance-index/src/vector/distributed/index_merger.rs index 5f59985673e..70371ad4794 100755 --- a/rust/lance-index/src/vector/distributed/index_merger.rs +++ b/rust/lance-index/src/vector/distributed/index_merger.rs @@ -1440,6 +1440,25 @@ pub async fn merge_partial_vector_auxiliary_files( ))); } + // Shards written by older lance versions carry sequential ex + // codes; normalize every batch to the blocked layout before + // concatenation so mixed-version shards merge correctly + // (concat_batches combines columns by position and would + // otherwise mix the two layouts silently). 
+ let batches = match rq_meta.as_ref() { + Some(meta) if meta.num_bits > 1 => batches + .into_iter() + .map(|batch| { + crate::vector::bq::storage::load_blocked_ex_codes( + batch, + meta.rotated_dim(), + meta.num_bits, + ) + .map(|(batch, _)| batch) + }) + .collect::>>()?, + _ => batches, + }; let schema = batches[0].schema(); let partition_batch = concat_batches(&schema, batches.iter())?; if let Some(w) = v2w_opt.as_mut() { @@ -1527,7 +1546,7 @@ mod tests { use prost::Message; use crate::vector::bq::RQRotationType; - use crate::vector::bq::storage::{RABIT_EX_CODE_COLUMN, RabitQueryEstimator}; + use crate::vector::bq::storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQueryEstimator}; use crate::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; lance_testing::define_stage_event_progress!( RecordingProgress, @@ -2529,11 +2548,14 @@ mod tests { let batch = batch.unwrap(); if !checked_split_columns { let schema = batch.schema(); - let ex_code_field = schema.field_with_name(RABIT_EX_CODE_COLUMN).unwrap(); + let ex_code_field = schema + .field_with_name(RABIT_BLOCKED_EX_CODE_COLUMN) + .unwrap(); let DataType::FixedSizeList(_, ex_code_bytes) = ex_code_field.data_type() else { panic!("RQ ex-code field should be FixedSizeList"); }; - assert_eq!(*ex_code_bytes, 6); + // code_dim=16 padded to one 64-dim block at ex_bits=3. 
+ assert_eq!(*ex_code_bytes, 24); assert!(schema.field_with_name(ERROR_FACTORS_FIELD.name()).is_ok()); assert!(schema.field_with_name(EX_ADD_FACTORS_COLUMN).is_ok()); assert!(schema.field_with_name(EX_SCALE_FACTORS_COLUMN).is_ok()); diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs index 68747713aac..de5a7ac28bd 100644 --- a/rust/lance-index/src/vector/pq/storage.rs +++ b/rust/lance-index/src/vector/pq/storage.rs @@ -221,7 +221,7 @@ impl ProductQuantizationStorage { "Row ID column not found from PQ storage".to_string(), )); }; - let row_ids: Arc = row_ids + let mut row_ids: Arc = row_ids .as_primitive_opt::() .ok_or(Error::index( "Row ID column is not of type UInt64".to_string(), @@ -293,6 +293,11 @@ impl ProductQuantizationStorage { .as_primitive::() .clone() .into(); + // Refresh the stored row ids from the remapped batch. Without this + // the storage keeps the pre-remap (compacted-away) addresses while + // its codes are remapped, so search returns stale row ids and the + // take fails with "fragment ... does not exist". 
+ row_ids = batch[ROW_ID].as_primitive::().clone().into(); } let distance_type = match distance_type { diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index b036e187b77..a14308197ed 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -14,10 +14,12 @@ use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::FilterExpression; use lance_file::reader::FileReader; use lance_io::ReadBatchParams; +use lance_io::scheduler::IoStats; use lance_linalg::distance::DistanceType; use prost::Message; use std::{ any::Any, + borrow::Cow, collections::BinaryHeap, mem::size_of, ops::{Deref, DerefMut}, @@ -249,7 +251,10 @@ pub struct RabitRawQueryContext { pub ex_bits: u8, pub rotated_query: Vec, pub dist_table: Vec, - pub ex_dist_table: Vec, + /// The rotated query zero-padded to a 64-dim multiple for the ex-dot + /// kernels; empty when `code_dim` is already aligned (the kernels then + /// read `rotated_query` directly). + pub ex_query: Vec, pub sum_q: f32, } @@ -620,15 +625,29 @@ impl IvfQuantizationStorage { self.ivf.num_partitions() } - pub async fn load_partition(&self, part_id: usize) -> Result { + /// Load a partition's quantization storage, optionally measuring the exact + /// I/O it performs into `io_stats`. + /// + /// When `io_stats` is `Some`, the partition is read through a reader whose + /// scheduler also records into the sink (a cheap clone that shares all + /// cached metadata, so no file is re-opened). When `None`, the normal + /// uninstrumented reader is used. 
+ pub async fn load_partition( + &self, + part_id: usize, + io_stats: Option, + ) -> Result { let range = self.ivf.row_range(part_id); let batch = if range.is_empty() { let schema = self.reader.schema(); let arrow_schema = arrow_schema::Schema::from(schema.as_ref()); RecordBatch::new_empty(Arc::new(arrow_schema)) } else { - let batches = self - .reader + let reader = match &io_stats { + Some(io_stats) => Cow::Owned(self.reader.with_io_stats(io_stats.recorder())), + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(range), u32::MAX, diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index 4f43cb00668..efe4b9b0c24 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -15,6 +15,7 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use tokio::sync::Notify; +use lance_core::utils::io_stats::IoStatsRecorder; use lance_core::utils::parse::str_is_truthy; use lance_core::{Error, Result}; @@ -475,8 +476,25 @@ impl StatsCollector { Ordering::Relaxed, ); } + + /// Add already-aggregated counts (e.g. a snapshot captured from another + /// scheduler) into these counters. + fn add(&self, iops: u64, requests: u64, bytes_read: u64) { + self.iops.fetch_add(iops, Ordering::Relaxed); + self.requests.fetch_add(requests, Ordering::Relaxed); + self.bytes_read.fetch_add(bytes_read, Ordering::Relaxed); + } } +impl IoStatsRecorder for StatsCollector { + fn record_request(&self, request: &[Range]) { + // Inherent methods take precedence in resolution, so this delegates to + // the inherent `record_request` above rather than recursing. + Self::record_request(self, request) + } +} + +#[derive(Debug, Clone, Copy, Default)] pub struct ScanStats { pub iops: u64, pub requests: u64, @@ -493,6 +511,57 @@ impl ScanStats { } } +/// A shareable, cloneable handle to a set of cumulative I/O counters. +/// +/// All clones share the same underlying counters. This serves two purposes: +/// +/// 1. 
It backs each [`ScanScheduler`]'s own running totals. +/// 2. It can be attached to an individual [`FileScheduler`] (via +/// [`FileScheduler::with_io_stats`]) as a *secondary* sink, so a caller can +/// measure the exact bytes/IOPS performed through that file handle for a +/// bounded scope (e.g. a single query) without disturbing the scheduler's +/// global totals. Read the result back with [`IoStats::snapshot`]. +#[derive(Debug, Clone)] +pub struct IoStats(Arc); + +impl IoStats { + pub fn new() -> Self { + Self(Arc::new(StatsCollector::new())) + } + + /// Record a single completed request. `request` holds the byte ranges as + /// actually submitted to storage (post coalescing/splitting), so the counts + /// reflect physical I/O. + pub fn record_request(&self, request: &[Range]) { + self.0.record_request(request); + } + + /// Take an immutable snapshot of the current cumulative counters. + pub fn snapshot(&self) -> ScanStats { + ScanStats::new(self.0.as_ref()) + } + + /// Return this handle as a type-erased [`IoStatsRecorder`], suitable for + /// attaching to a file reader (e.g. `FileReader::with_io_stats`). The + /// returned recorder shares the same underlying counters as `self`. + pub fn recorder(&self) -> Arc { + self.0.clone() + } + + /// Add a snapshot of already-aggregated statistics into this sink. Used to + /// fold in I/O measured on a separate scheduler (e.g. the one-time reads + /// performed while opening an index). 
+ pub fn add_scan_stats(&self, stats: &ScanStats) { + self.0.add(stats.iops, stats.requests, stats.bytes_read); + } +} + +impl Default for IoStats { + fn default() -> Self { + Self::new() + } +} + enum IoQueueType { Standard(Arc), Lite(Arc), @@ -509,7 +578,7 @@ enum IoQueueType { pub struct ScanScheduler { object_store: Arc, io_queue: IoQueueType, - stats: Arc, + stats: IoStats, } impl Debug for ScanScheduler { @@ -606,7 +675,7 @@ impl ScanScheduler { Arc::new(Self { object_store, io_queue, - stats: Arc::new(StatsCollector::new()), + stats: IoStats::new(), }) } @@ -646,6 +715,7 @@ impl ScanScheduler { base_priority, max_iop_size, bypass_backpressure: false, + extra_stats: None, }) } @@ -791,7 +861,7 @@ impl ScanScheduler { } pub fn stats(&self) -> ScanStats { - ScanStats::new(self.stats.as_ref()) + self.stats.snapshot() } #[cfg(test)] @@ -829,6 +899,10 @@ pub struct FileScheduler { base_priority: u64, max_iop_size: u64, bypass_backpressure: bool, + /// Optional secondary statistics sink. When set, every request submitted + /// through this handle is also recorded here, in addition to the + /// scheduler's global totals. Used to measure per-scope I/O. + extra_stats: Option>, } fn is_close_together(range1: &Range, range2: &Range, block_size: u64) -> bool { @@ -899,6 +973,9 @@ impl FileScheduler { } self.root.stats.record_request(&updated_requests); + if let Some(extra_stats) = &self.extra_stats { + extra_stats.record_request(&updated_requests); + } let bytes_vec_fut = self.root.submit_request( self.reader.clone(), @@ -964,6 +1041,23 @@ impl FileScheduler { max_iop_size: self.max_iop_size, base_priority: priority, bypass_backpressure: self.bypass_backpressure, + extra_stats: self.extra_stats.clone(), + } + } + + /// Returns a copy of this scheduler that additionally records the I/O it + /// performs into `stats`, on top of the scheduler's global statistics. + /// + /// This is the mechanism for measuring exact per-scope (e.g. 
per-query) I/O: + /// attach a recorder here (e.g. via [`IoStats::recorder`]), perform the reads + /// through the returned handle, then read the totals back with + /// [`IoStats::snapshot`]. The returned handle is cheap to create (a few + /// `Arc` clones) and reuses the same underlying reader, so it does not + /// re-open the file. + pub fn with_io_stats(&self, stats: Arc) -> Self { + Self { + extra_stats: Some(stats), + ..self.clone() } } @@ -1183,6 +1277,59 @@ mod tests { assert_eq!(11, scheduler.stats().iops); } + #[tokio::test] + async fn test_io_stats_sink() { + let tmp_file = TempObjFile::default(); + let obj_store = Arc::new(ObjectStore::local()); + + const DATA_SIZE: u64 = 1024 * 1024; + let mut some_data = vec![0; DATA_SIZE as usize]; + rand::rng().fill_bytes(&mut some_data); + obj_store.put(&tmp_file, &some_data).await.unwrap(); + + let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); + + // Attach a per-scope sink to one file handle. + let sink = IoStats::new(); + let file_scheduler = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap() + .with_io_stats(sink.recorder()); + + // Three reads within 4KiB coalesce into a single physical IOP. The sink + // and the scheduler's global totals must agree exactly, because both are + // recorded from the same post-coalescing request. + file_scheduler + .submit_request(vec![50_000..51_000, 52_000..53_000, 54_000..55_000], 0) + .await + .unwrap(); + + let global = scheduler.stats(); + let scoped = sink.snapshot(); + assert_eq!(1, scoped.iops); + assert_eq!(1, scoped.requests); + // Coalesced range 50_000..55_000 => 5000 physical bytes. + assert_eq!(5000, scoped.bytes_read); + assert_eq!(global.iops, scoped.iops); + assert_eq!(global.requests, scoped.requests); + assert_eq!(global.bytes_read, scoped.bytes_read); + + // A sibling handle without the sink: the global totals advance but the + // sink stays put, proving per-scope isolation. 
+ let other = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap(); + other.submit_request(vec![0..1000], 0).await.unwrap(); + + let global_after = scheduler.stats(); + let scoped_after = sink.snapshot(); + assert_eq!(global.bytes_read + 1000, global_after.bytes_read); + assert_eq!(scoped.bytes_read, scoped_after.bytes_read); + assert_eq!(scoped.iops, scoped_after.iops); + } + #[tokio::test] async fn test_priority() { let some_path = Path::parse("foo").unwrap(); diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index cf91deb69d7..6a188ec3c62 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -18,6 +18,7 @@ lance-arrow = { workspace = true } lance-core = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } +rayon = { workspace = true } [dev-dependencies] approx = { workspace = true } @@ -50,10 +51,6 @@ harness = false name = "cosine" harness = false -[[bench]] -name = "hamming" -harness = false - [[bench]] name = "norm_l2" harness = false diff --git a/rust/lance-linalg/benches/hamming.rs b/rust/lance-linalg/benches/hamming.rs deleted file mode 100644 index 9af3bf4614b..00000000000 --- a/rust/lance-linalg/benches/hamming.rs +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::iter::repeat_with; - -use std::hint::black_box; - -use criterion::{Criterion, criterion_group, criterion_main}; -use lance_linalg::distance::hamming::{hamming, hamming_scalar}; -use rand::Rng; - -const DIMENSION: usize = 1024; -const TOTAL: usize = 1024 * 1024; // 1M vectors - -fn bench_hamming(c: &mut Criterion) { - let mut rng = rand::rng(); - - let key = repeat_with(|| rng.random::()) - .take(DIMENSION) - .collect::>(); - let target = repeat_with(|| rng.random::()) - .take(TOTAL * DIMENSION) - .collect::>(); - - c.bench_function("hamming,scalar", |b| { - b.iter(|| { - black_box( - target - 
.chunks_exact(DIMENSION) - .map(|tgt| hamming_scalar(&key, tgt)) - .sum::(), - ); - }) - }); - - c.bench_function("hamming,auto_vec", |b| { - b.iter(|| { - black_box( - target - .chunks_exact(DIMENSION) - .map(|tgt| hamming(&key, tgt)) - .sum::(), - ); - }) - }); -} - -criterion_group!( - name=benches; - config = Criterion::default().significance_level(0.1).sample_size(10); - targets = bench_hamming); -criterion_main!(benches); diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs index a356d5c1225..23d1cae2d63 100644 --- a/rust/lance-linalg/src/distance.rs +++ b/rust/lance-linalg/src/distance.rs @@ -27,7 +27,11 @@ pub mod norm_l2; pub use cosine::*; pub use dot::*; -use hamming::hamming_distance_arrow_batch; +pub use hamming::{ + Cluster, ClusteringResult, PairwiseResult, UnionFind, cluster_edges, cluster_pairwise_result, + extract_hashes_from_fixed_list, hamming_distance_arrow_batch, hamming_u64, + pairwise_hamming_distance, pairwise_hamming_distance_parallel, +}; pub use l2::*; use lance_core::deepsize::DeepSizeOf; pub use norm_l2::*; diff --git a/rust/lance-linalg/src/distance/hamming.rs b/rust/lance-linalg/src/distance/hamming.rs index d8fd60f4054..a6f4b038195 100644 --- a/rust/lance-linalg/src/distance/hamming.rs +++ b/rust/lance-linalg/src/distance/hamming.rs @@ -2,14 +2,24 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors //! Hamming distance. +//! +//! This module provides hamming distance computation for binary vectors, +//! including SIMD-accelerated pairwise hamming distance for 64-bit hashes. 
+use std::collections::HashMap; use std::sync::Arc; -use crate::{Error, Result}; +use arrow_array::builder::{ListBuilder, UInt64Builder}; use arrow_array::cast::AsArray; use arrow_array::types::UInt8Type; -use arrow_array::{Array, FixedSizeListArray, Float32Array}; -use arrow_schema::DataType; +use arrow_array::{ + Array, ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, + RecordBatchReader, UInt32Array, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use rayon::prelude::*; + +use crate::{Error, Result}; pub trait Hamming { /// Hamming distance between two vectors. @@ -86,6 +96,640 @@ pub fn hamming_distance_arrow_batch( ))) } +/// Compute hamming distance between two 64-bit values using POPCNT. +#[inline(always)] +pub fn hamming_u64(a: u64, b: u64) -> u32 { + (a ^ b).count_ones() +} + +/// Result of pairwise hamming distance computation. +#[derive(Debug, Clone)] +pub struct PairwiseResult { + pub row_id_a: Vec, + pub row_id_b: Vec, + pub distances: Vec, +} + +impl PairwiseResult { + pub fn new() -> Self { + Self { + row_id_a: Vec::new(), + row_id_b: Vec::new(), + distances: Vec::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + row_id_a: Vec::with_capacity(capacity), + row_id_b: Vec::with_capacity(capacity), + distances: Vec::with_capacity(capacity), + } + } + + pub fn push(&mut self, a: u64, b: u64, dist: u32) { + self.row_id_a.push(a); + self.row_id_b.push(b); + self.distances.push(dist); + } + + pub fn len(&self) -> usize { + self.row_id_a.len() + } + + pub fn is_empty(&self) -> bool { + self.row_id_a.is_empty() + } + + pub fn extend(&mut self, other: Self) { + self.row_id_a.extend(other.row_id_a); + self.row_id_b.extend(other.row_id_b); + self.distances.extend(other.distances); + } + + /// Convert to Arrow RecordBatch, consuming self. 
+ pub fn into_record_batch(self) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("row_id_a", DataType::UInt64, false), + Field::new("row_id_b", DataType::UInt64, false), + Field::new("distance", DataType::UInt32, false), + ])); + + let row_id_a = Arc::new(UInt64Array::from(self.row_id_a)); + let row_id_b = Arc::new(UInt64Array::from(self.row_id_b)); + let distances = Arc::new(UInt32Array::from(self.distances)); + + RecordBatch::try_new(schema, vec![row_id_a, row_id_b, distances]) + .expect("Failed to create RecordBatch") + } +} + +impl Default for PairwiseResult { + fn default() -> Self { + Self::new() + } +} + +/// Compute hamming distances for a query against multiple targets. +/// Uses SIMD acceleration when available. +#[inline] +pub fn hamming_batch_u64(query: u64, targets: &[u64], results: &mut [u32]) { + debug_assert_eq!(targets.len(), results.len()); + hamming_batch_simd(query, targets, results); +} + +/// SIMD-accelerated batch hamming distance computation. +#[inline] +fn hamming_batch_simd(query: u64, targets: &[u64], results: &mut [u32]) { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") { + unsafe { + hamming_batch_avx512(query, targets, results); + } + return; + } + if is_x86_feature_detected!("avx2") { + unsafe { + hamming_batch_avx2(query, targets, results); + } + return; + } + } + + // Scalar fallback (LLVM auto-vectorizes well on Apple Silicon) + hamming_batch_scalar(query, targets, results); +} + +/// Scalar fallback using count_ones() which compiles to POPCNT. 
+#[inline] +fn hamming_batch_scalar(query: u64, targets: &[u64], results: &mut [u32]) { + // Unroll for better auto-vectorization + let n = targets.len(); + let chunks = n / 8; + let mut i = 0; + + for _ in 0..chunks { + results[i] = (query ^ targets[i]).count_ones(); + results[i + 1] = (query ^ targets[i + 1]).count_ones(); + results[i + 2] = (query ^ targets[i + 2]).count_ones(); + results[i + 3] = (query ^ targets[i + 3]).count_ones(); + results[i + 4] = (query ^ targets[i + 4]).count_ones(); + results[i + 5] = (query ^ targets[i + 5]).count_ones(); + results[i + 6] = (query ^ targets[i + 6]).count_ones(); + results[i + 7] = (query ^ targets[i + 7]).count_ones(); + i += 8; + } + + // Handle remainder + while i < n { + results[i] = (query ^ targets[i]).count_ones(); + i += 1; + } +} + +/// AVX-512 VPOPCNTDQ: Process 8 x 64-bit values at once. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")] +unsafe fn hamming_batch_avx512(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + let query_vec = _mm512_set1_epi64(query as i64); + + let chunks = n / 8; + let remainder = n % 8; + + for i in 0..chunks { + let offset = i * 8; + let targets_ptr = targets.as_ptr().add(offset) as *const __m512i; + let target_vec = _mm512_loadu_si512(targets_ptr); + + let xor_result = _mm512_xor_si512(query_vec, target_vec); + let popcount = _mm512_popcnt_epi64(xor_result); + let popcount_32 = _mm512_cvtepi64_epi32(popcount); + + _mm256_storeu_si256( + results.as_mut_ptr().add(offset) as *mut __m256i, + popcount_32, + ); + } + + if remainder > 0 { + let offset = chunks * 8; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// AVX2 popcount using lookup table (Harley-Seal / PSHUFB method). 
+#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn hamming_batch_avx2(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + + let lookup = _mm256_setr_epi8( + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, + 3, 4, + ); + let low_mask = _mm256_set1_epi8(0x0f); + let query_vec = _mm256_set1_epi64x(query as i64); + + let chunks = n / 4; + let remainder = n % 4; + + for i in 0..chunks { + let offset = i * 4; + let targets_ptr = targets.as_ptr().add(offset) as *const __m256i; + let target_vec = _mm256_loadu_si256(targets_ptr); + + let xor_result = _mm256_xor_si256(query_vec, target_vec); + + // Popcount using nibble lookup + let lo = _mm256_and_si256(xor_result, low_mask); + let hi = _mm256_and_si256(_mm256_srli_epi16(xor_result, 4), low_mask); + let popcnt_lo = _mm256_shuffle_epi8(lookup, lo); + let popcnt_hi = _mm256_shuffle_epi8(lookup, hi); + let popcnt_bytes = _mm256_add_epi8(popcnt_lo, popcnt_hi); + let popcount = _mm256_sad_epu8(popcnt_bytes, _mm256_setzero_si256()); + + let results_ptr = results.as_mut_ptr().add(offset); + *results_ptr = _mm256_extract_epi32::<0>(popcount) as u32; + *results_ptr.add(1) = _mm256_extract_epi32::<2>(popcount) as u32; + *results_ptr.add(2) = _mm256_extract_epi32::<4>(popcount) as u32; + *results_ptr.add(3) = _mm256_extract_epi32::<6>(popcount) as u32; + } + + if remainder > 0 { + let offset = chunks * 4; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// Compute pairwise hamming distances for all pairs of hashes. +/// +/// Returns pairs where distance <= threshold (if provided). 
+/// +/// # Arguments +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `threshold` - Optional maximum distance to include in results +pub fn pairwise_hamming_distance( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let num_pairs = n * (n - 1) / 2; + let mut result = PairwiseResult::with_capacity(num_pairs.min(1_000_000)); + + for i in 0..n { + for j in (i + 1)..n { + let dist = hamming_u64(hashes[i], hashes[j]); + if dist <= threshold { + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Compute pairwise hamming distances in parallel using rayon + SIMD. +/// +/// Uses chunked parallelization for balanced workload distribution. +pub fn pairwise_hamming_distance_parallel( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let total_pairs = n * (n - 1) / 2; + + // For small datasets, use sequential to avoid thread overhead + if total_pairs < 10_000 { + return pairwise_hamming_distance(hashes, row_ids, Some(threshold)); + } + + let threads = rayon::current_num_threads(); + let pairs_per_chunk = total_pairs.div_ceil(threads); + let chunks = compute_balanced_chunks(n, pairs_per_chunk); + + let results: Vec = chunks + .into_par_iter() + .map(|(start_row, end_row)| { + process_row_range(hashes, row_ids, threshold, start_row, end_row) + }) + .collect(); + + let mut combined = PairwiseResult::new(); + for r in results { + combined.extend(r); + } + combined +} + +/// Compute balanced chunks for parallel processing. 
+fn compute_balanced_chunks(n: usize, target_pairs_per_chunk: usize) -> Vec<(usize, usize)> { + let mut chunks = Vec::new(); + let mut current_start = 0; + let mut current_pairs = 0; + + for i in 0..n { + let pairs_for_row = n - i - 1; + current_pairs += pairs_for_row; + + if current_pairs >= target_pairs_per_chunk || i == n - 1 { + chunks.push((current_start, i + 1)); + current_start = i + 1; + current_pairs = 0; + } + } + + chunks +} + +/// Process a range of rows for pairwise comparison using SIMD. +fn process_row_range( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: u32, + start_row: usize, + end_row: usize, +) -> PairwiseResult { + let n = hashes.len(); + let mut result = PairwiseResult::new(); + + for i in start_row..end_row { + let remaining = n - i - 1; + if remaining == 0 { + continue; + } + + let mut distances = vec![0u32; remaining]; + hamming_batch_u64(hashes[i], &hashes[i + 1..], &mut distances); + + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + for (j_offset, &dist) in distances.iter().enumerate() { + if dist <= threshold { + let j = i + 1 + j_offset; + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Extract u64 hashes from a FixedSizeList Arrow array. 
+pub fn extract_hashes_from_fixed_list(array: &FixedSizeListArray) -> Result> { + let list_size = array.value_length(); + if list_size != 8 { + return Err(Error::InvalidArgumentError(format!( + "Expected FixedSizeList with size 8, got size {}", + list_size + ))); + } + + let values = array + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::InvalidArgumentError("Expected UInt8Array values in FixedSizeList".to_string()) + })?; + + let n = array.len(); + let mut hashes = Vec::with_capacity(n); + + for i in 0..n { + let start = i * 8; + let bytes = &values.values()[start..start + 8]; + let mut arr = [0u8; 8]; + arr.copy_from_slice(bytes); + hashes.push(u64::from_le_bytes(arr)); + } + + Ok(hashes) +} + +/// Union-Find data structure with path compression for clustering. +pub struct UnionFind { + parent: HashMap, + rank: HashMap, +} + +impl UnionFind { + pub fn new() -> Self { + Self { + parent: HashMap::new(), + rank: HashMap::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + parent: HashMap::with_capacity(capacity), + rank: HashMap::with_capacity(capacity), + } + } + + /// Find the root of a node with path compression. + pub fn find(&mut self, x: u64) -> u64 { + if let std::collections::hash_map::Entry::Vacant(e) = self.parent.entry(x) { + e.insert(x); + self.rank.insert(x, 0); + return x; + } + + let mut current = x; + let mut path = Vec::new(); + + while self.parent[¤t] != current { + path.push(current); + current = self.parent[¤t]; + } + let root = current; + + for node in path { + self.parent.insert(node, root); + } + + root + } + + /// Union two nodes, using union by rank. 
+ pub fn union(&mut self, a: u64, b: u64) -> bool { + let root_a = self.find(a); + let root_b = self.find(b); + + if root_a == root_b { + return false; + } + + let rank_a = self.rank[&root_a]; + let rank_b = self.rank[&root_b]; + + if rank_a < rank_b { + self.parent.insert(root_a, root_b); + } else if rank_a > rank_b { + self.parent.insert(root_b, root_a); + } else if root_a < root_b { + self.parent.insert(root_b, root_a); + *self.rank.get_mut(&root_a).unwrap() += 1; + } else { + self.parent.insert(root_a, root_b); + *self.rank.get_mut(&root_b).unwrap() += 1; + } + + true + } + + pub fn nodes(&self) -> impl Iterator { + self.parent.keys() + } + + pub fn len(&self) -> usize { + self.parent.len() + } + + pub fn is_empty(&self) -> bool { + self.parent.is_empty() + } +} + +impl Default for UnionFind { + fn default() -> Self { + Self::new() + } +} + +/// A cluster with representative and duplicates. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Cluster { + /// The representative row ID (smallest in the cluster). + pub representative: u64, + /// List of duplicate row IDs (excludes the representative). + pub duplicates: Vec, +} + +impl Cluster { + pub fn size(&self) -> usize { + 1 + self.duplicates.len() + } +} + +/// Result of the clustering operation. +#[derive(Debug, Clone)] +pub struct ClusteringResult { + /// List of clusters, each with a representative and duplicates. + pub clusters: Vec, +} + +impl ClusteringResult { + pub fn num_clusters(&self) -> usize { + self.clusters.len() + } + + pub fn num_duplicates(&self) -> usize { + self.clusters.iter().map(|c| c.duplicates.len()).sum() + } + + pub fn num_unique(&self) -> usize { + self.clusters.len() + } + + /// Get the schema for clustering result batches. 
+ pub fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("representative", DataType::UInt64, false), + Field::new( + "duplicates", + DataType::List(Arc::new(Field::new("item", DataType::UInt64, true))), + false, + ), + ])) + } + + /// Convert to Arrow RecordBatch with columns: + /// - `representative`: `UInt64` + /// - `duplicates`: `List` + pub fn to_record_batch(&self) -> RecordBatch { + let schema = Self::schema(); + + let mut representatives = Vec::with_capacity(self.clusters.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in &self.clusters { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(schema, vec![representative_array, duplicates_array]) + .expect("Failed to create RecordBatch") + } + + /// Convert to a RecordBatchReader that yields batches of the specified size. 
+ /// + /// # Arguments + /// * `batch_size` - Number of clusters per batch (default: 10000) + pub fn into_reader(self, batch_size: Option) -> Box { + let batch_size = batch_size.unwrap_or(10_000); + let schema = Self::schema(); + + if self.clusters.is_empty() { + // Return empty reader + let batches: Vec> = vec![]; + return Box::new(RecordBatchIterator::new(batches, schema)); + } + + let batches: Vec> = self + .clusters + .chunks(batch_size) + .map(|chunk| { + let mut representatives = Vec::with_capacity(chunk.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in chunk { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(Self::schema(), vec![representative_array, duplicates_array]) + }) + .collect(); + + Box::new(RecordBatchIterator::new(batches, schema)) + } +} + +/// Cluster edges using union-find algorithm. +/// +/// Takes a list of edges (row_id_a, row_id_b) and groups connected nodes +/// into clusters. Each cluster has a representative (smallest row ID) +/// and a list of duplicates. 
+pub fn cluster_edges(edges: I) -> ClusteringResult +where + I: IntoIterator, +{ + let mut uf = UnionFind::new(); + + for (a, b) in edges { + uf.union(a, b); + } + + let mut clusters_map: HashMap> = HashMap::new(); + let nodes: Vec = uf.nodes().copied().collect(); + + for node in nodes { + let root = uf.find(node); + clusters_map.entry(root).or_default().push(node); + } + + let mut clusters = Vec::new(); + for (_root, mut members) in clusters_map { + members.sort_unstable(); + + if members.len() > 1 { + let representative = *members.iter().min().unwrap(); + let duplicates: Vec = members + .into_iter() + .filter(|&m| m != representative) + .collect(); + + clusters.push(Cluster { + representative, + duplicates, + }); + } + } + + clusters.sort_by_key(|c| c.representative); + + ClusteringResult { clusters } +} + +/// Cluster edges from PairwiseResult. +pub fn cluster_pairwise_result(result: &PairwiseResult) -> ClusteringResult { + let edges = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .map(|(&a, &b)| (a, b)); + + cluster_edges(edges) +} + #[cfg(test)] mod tests { use super::*; @@ -102,4 +746,677 @@ mod tests { let y = vec![0b1101_1010, 0b1010_1010, 0b1010_1001]; assert_eq!(hamming(&x, &y), 2.0); } + + #[test] + fn test_hamming_u64() { + assert_eq!(hamming_u64(0, 0), 0); + assert_eq!(hamming_u64(0, 1), 1); + assert_eq!(hamming_u64(0b1111, 0b0000), 4); + assert_eq!(hamming_u64(u64::MAX, 0), 64); + assert_eq!(hamming_u64(0xAAAAAAAAAAAAAAAA, 0x5555555555555555), 64); + } + + #[test] + fn test_hamming_batch_u64() { + let query = 0u64; + let targets: Vec = (0..128).collect(); + let mut results = vec![0u32; 128]; + + hamming_batch_u64(query, &targets, &mut results); + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[3], 2); // 0b11 has 2 bits set + assert_eq!(results[7], 3); // 0b111 has 3 bits set + } + + #[test] + fn test_pairwise_basic() { + let hashes = vec![0b0000u64, 0b0001, 0b0011, 0b0111]; + let result = 
pairwise_hamming_distance(&hashes, None, None); + + assert_eq!(result.len(), 6); // C(4,2) = 6 pairs + assert!(result.distances.iter().all(|&d| d <= 3)); + } + + #[test] + fn test_pairwise_with_threshold() { + let hashes = vec![0b0000u64, 0b0001, 0b1111]; + let result = pairwise_hamming_distance(&hashes, None, Some(1)); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 0); + assert_eq!(result.row_id_b[0], 1); + assert_eq!(result.distances[0], 1); + } + + #[test] + fn test_pairwise_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001]; + let row_ids = vec![100u64, 200u64]; + let result = pairwise_hamming_distance(&hashes, Some(&row_ids), None); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 100); + assert_eq!(result.row_id_b[0], 200); + } + + #[test] + fn test_pairwise_parallel() { + let hashes: Vec = (0..100).collect(); + let result_seq = pairwise_hamming_distance(&hashes, None, None); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, None); + + assert_eq!(result_seq.len(), result_par.len()); + } + + #[test] + fn test_union_find_basic() { + let mut uf = UnionFind::new(); + + assert_eq!(uf.find(1), 1); + assert_eq!(uf.find(2), 2); + assert_eq!(uf.find(3), 3); + + assert!(uf.union(1, 2)); + assert_eq!(uf.find(1), uf.find(2)); + + assert!(uf.union(2, 3)); + assert_eq!(uf.find(1), uf.find(3)); + + assert!(!uf.union(1, 3)); + } + + #[test] + fn test_cluster_edges_simple() { + let edges = vec![(1, 2), (2, 3), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 1) + .unwrap(); + assert_eq!(c1.duplicates.len(), 2); + assert!(c1.duplicates.contains(&2)); + assert!(c1.duplicates.contains(&3)); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 4) + .unwrap(); + assert_eq!(c2.duplicates.len(), 1); + assert!(c2.duplicates.contains(&5)); + } + + #[test] + fn test_cluster_pairwise_result() { + 
let hashes = vec![0b0000u64, 0b0001, 0b0011]; // distances: (0,1)=1, (0,2)=2, (1,2)=1 + let pairwise = pairwise_hamming_distance(&hashes, None, Some(1)); // threshold 1 + + // Only pairs with distance <= 1: (0,1) and (1,2) + assert_eq!(pairwise.len(), 2); + + let clustering = cluster_pairwise_result(&pairwise); + // All three should be in one cluster since 0-1-2 are connected + assert_eq!(clustering.num_clusters(), 1); + assert_eq!(clustering.clusters[0].representative, 0); + assert_eq!(clustering.clusters[0].duplicates.len(), 2); + } + + #[test] + fn test_into_record_batch() { + let hashes = vec![0b0000u64, 0b0001, 0b0011]; + let result = pairwise_hamming_distance(&hashes, None, None); + let batch = result.into_record_batch(); + + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.schema().field(0).name(), "row_id_a"); + assert_eq!(batch.schema().field(1).name(), "row_id_b"); + assert_eq!(batch.schema().field(2).name(), "distance"); + } + + // ========================================================================= + // Additional tests from pairwise-hamming reference implementation + // ========================================================================= + + /// Reference implementation for validation - simple O(n²) nested loop + fn reference_pairwise(hashes: &[u64], threshold: Option) -> Vec<(usize, usize, u32)> { + let threshold = threshold.unwrap_or(u32::MAX); + let mut results = Vec::new(); + for i in 0..hashes.len() { + for j in (i + 1)..hashes.len() { + let dist = (hashes[i] ^ hashes[j]).count_ones(); + if dist <= threshold { + results.push((i, j, dist)); + } + } + } + results + } + + /// Convert PairwiseResult to sorted vec for comparison + fn result_to_sorted_vec(result: &PairwiseResult) -> Vec<(u64, u64, u32)> { + let mut v: Vec<_> = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .zip(result.distances.iter()) + .map(|((&a, &b), &d)| (a, b, d)) + .collect(); + v.sort(); + v + } + + #[test] + fn 
test_pairwise_correctness_small() { + // Deterministic hashes with known distances + let hashes = vec![ + 0b0000_0000u64, // 0 + 0b0000_0001u64, // 1 bit from 0 + 0b0000_0011u64, // 2 bits from 0, 1 bit from 1 + 0b0000_0111u64, // 3 bits from 0, 2 bits from 1, 1 bit from 2 + 0b0000_1111u64, // 4 bits from 0, 3 bits from 1, 2 bits from 2, 1 bit from 3 + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + let reference = reference_pairwise(&hashes, None); + + assert_eq!(result.len(), reference.len()); + assert_eq!(result.len(), 10); // C(5,2) = 10 pairs + + // Verify specific distances + let result_vec = result_to_sorted_vec(&result); + for (i, j, expected_dist) in &reference { + let found = result_vec + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + assert!(found.is_some(), "Missing pair ({}, {})", i, j); + assert_eq!( + found.unwrap().2, + *expected_dist, + "Wrong distance for pair ({}, {})", + i, + j + ); + } + } + + #[test] + fn test_pairwise_correctness_1000_deterministic() { + // Generate deterministic hashes using simple linear pattern + let hashes: Vec = (0u64..1000) + .map(|i| i.wrapping_mul(0x123456789ABCDEF)) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(10)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(10)); + let reference = reference_pairwise(&hashes, Some(10)); + + // Both implementations should match reference + assert_eq!( + result_seq.len(), + reference.len(), + "Sequential result count mismatch" + ); + assert_eq!( + result_par.len(), + reference.len(), + "Parallel result count mismatch" + ); + + // Verify all pairs match + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + + for (i, j, dist) in &reference { + let seq_found = seq_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + let par_found = par_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as 
u64); + + assert!( + seq_found.is_some(), + "Sequential missing pair ({}, {})", + i, + j + ); + assert!(par_found.is_some(), "Parallel missing pair ({}, {})", i, j); + assert_eq!(seq_found.unwrap().2, *dist); + assert_eq!(par_found.unwrap().2, *dist); + } + } + + #[test] + fn test_pairwise_correctness_10000_deterministic() { + // Larger test with 10K hashes + let hashes: Vec = (0u64..10_000) + .map(|i| { + // Mix bits using a simple hash-like transformation + let x = i.wrapping_mul(0xDEADBEEFCAFEBABE); + x ^ (x >> 17) ^ (x << 13) + }) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(5)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(5)); + + // Both should find the same number of pairs + assert_eq!( + result_seq.len(), + result_par.len(), + "10K test: sequential found {} pairs, parallel found {} pairs", + result_seq.len(), + result_par.len() + ); + + // Verify they contain the same pairs (sorted comparison) + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + assert_eq!(seq_sorted, par_sorted, "10K test: pair contents differ"); + } + + #[test] + fn test_pairwise_total_pairs_count() { + // Without threshold, should return exactly n*(n-1)/2 pairs + for n in [10, 50, 100, 500] { + let hashes: Vec = (0..n).map(|i| i as u64).collect(); + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + let expected = n * (n - 1) / 2; + assert_eq!( + result.len(), + expected, + "n={}: expected {} pairs, got {}", + n, + expected, + result.len() + ); + } + } + + #[test] + fn test_pairwise_threshold_filtering() { + // All identical hashes should have distance 0 + let hashes = vec![0xABCDEF0123456789u64; 100]; + let result = pairwise_hamming_distance_parallel(&hashes, None, Some(0)); + + // All pairs should be included (distance 0) + assert_eq!(result.len(), 100 * 99 / 2); + assert!(result.distances.iter().all(|&d| d == 0)); + + // With threshold 0 and 
all different hashes, should find fewer pairs + let different_hashes: Vec = (0u64..100).collect(); + let result2 = pairwise_hamming_distance_parallel(&different_hashes, None, Some(0)); + // Only pairs with identical values should match (none in this case except 0^0) + assert!(result2.len() < 100 * 99 / 2); + } + + #[test] + fn test_pairwise_row_ids_preserved() { + let hashes: Vec = (0u64..100).collect(); + let row_ids: Vec = (1000u64..1100).collect(); // offset row IDs + + let result = pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(5)); + + // All row IDs should be in range [1000, 1100) + for &id in &result.row_id_a { + assert!((1000..1100).contains(&id), "row_id_a {} out of range", id); + } + for &id in &result.row_id_b { + assert!((1000..1100).contains(&id), "row_id_b {} out of range", id); + } + // row_id_a should always be less than row_id_b (upper triangular) + for (&a, &b) in result.row_id_a.iter().zip(result.row_id_b.iter()) { + assert!(a < b, "Expected row_id_a < row_id_b, got {} >= {}", a, b); + } + } + + #[test] + fn test_pairwise_distance_bounds() { + // All distances should be in [0, 64] for u64 hashes + let hashes: Vec = (0u64..1000).map(|i| i.wrapping_mul(0x123456789)).collect(); + + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + + for &d in &result.distances { + assert!(d <= 64, "Distance {} exceeds maximum 64", d); + } + } + + #[test] + fn test_pairwise_symmetry() { + // Hamming distance is symmetric: d(a,b) = d(b,a) + let hashes: Vec = vec![ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xAAAAAAAAAAAAAAAA, + 0x5555555555555555, + 0x123456789ABCDEF0, + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + + // For each pair (i,j), verify distance matches manual calculation + for idx in 0..result.len() { + let i = result.row_id_a[idx] as usize; + let j = result.row_id_b[idx] as usize; + let dist = result.distances[idx]; + + let expected = (hashes[i] ^ hashes[j]).count_ones(); + 
assert_eq!(dist, expected, "Distance mismatch for pair ({}, {})", i, j); + } + } + + #[test] + fn test_balanced_chunks() { + // Verify chunks are reasonably balanced + let n = 10000; + let total_pairs = n * (n - 1) / 2; + let target_per_chunk = total_pairs / 16; + + let chunks = compute_balanced_chunks(n, target_per_chunk); + + // Should have roughly 16 chunks + assert!( + chunks.len() >= 14 && chunks.len() <= 18, + "Expected ~16 chunks, got {}", + chunks.len() + ); + + // Each chunk should have roughly equal work + for (start, end) in &chunks { + let mut chunk_pairs = 0usize; + for i in *start..*end { + chunk_pairs += n - i - 1; + } + // Allow 20% deviation from target + let lower = target_per_chunk * 80 / 100; + // last chunk may be smaller + assert!( + chunk_pairs >= lower || *end == n, + "Chunk [{}, {}) has {} pairs, expected ~{}", + start, + end, + chunk_pairs, + target_per_chunk + ); + } + + // Chunks should cover all rows without gaps + assert_eq!(chunks[0].0, 0); + assert_eq!(chunks.last().unwrap().1, n); + for i in 1..chunks.len() { + assert_eq!(chunks[i].0, chunks[i - 1].1, "Gap between chunks"); + } + } + + // ========================================================================= + // SIMD-specific tests + // ========================================================================= + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_popcount() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); // 0 ^ 0 = 0 bits + assert_eq!(results[1], 1); // 0 ^ 1 = 1 bit + assert_eq!(results[2], 2); // 0 ^ 3 = 2 bits + assert_eq!(results[3], 3); // 0 ^ 7 = 3 bits + assert_eq!(results[4], 4); // 0 ^ 15 = 4 bits + assert_eq!(results[5], 5); // 0 ^ 31 = 5 bits + assert_eq!(results[6], 6); // 0 ^ 63 = 6 bits + assert_eq!(results[7], 7); // 0 ^ 127 
= 7 bits + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_max_distance() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![u64::MAX; 4]; + let mut results = vec![0u32; 4]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + for &r in &results { + assert_eq!(r, 64); + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx512_popcount() { + if !is_x86_feature_detected!("avx512vpopcntdq") || !is_x86_feature_detected!("avx512f") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx512(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[2], 2); + assert_eq!(results[3], 3); + assert_eq!(results[4], 4); + assert_eq!(results[5], 5); + assert_eq!(results[6], 6); + assert_eq!(results[7], 7); + } + + // ========================================================================= + // Additional clustering tests + // ========================================================================= + + #[test] + fn test_union_find_path_compression() { + let mut uf = UnionFind::new(); + + // Create a chain: 1 -> 2 -> 3 -> 4 -> 5 + uf.union(4, 5); + uf.union(3, 4); + uf.union(2, 3); + uf.union(1, 2); + + // All should have the same root + let root = uf.find(1); + assert_eq!(uf.find(2), root); + assert_eq!(uf.find(3), root); + assert_eq!(uf.find(4), root); + assert_eq!(uf.find(5), root); + } + + #[test] + fn test_cluster_edges_single_cluster() { + // All connected: 1-2-3-4-5 + let edges = vec![(1, 2), (2, 3), (3, 4), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + let cluster = &result.clusters[0]; + assert_eq!(cluster.representative, 1); + assert_eq!(cluster.duplicates.len(), 4); + assert_eq!(cluster.size(), 5); + } + + #[test] + fn test_cluster_edges_no_duplicates() { + // No edges 
means no clusters + let edges: Vec<(u64, u64)> = vec![]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 0); + assert_eq!(result.num_duplicates(), 0); + } + + #[test] + fn test_cluster_edges_self_loop() { + // Self-loop shouldn't create a cluster (size 1) + let edges = vec![(1, 1), (2, 3)]; + let result = cluster_edges(edges); + + // Only {2,3} should be a cluster + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 2); + } + + #[test] + fn test_cluster_edges_duplicate_edges() { + // Duplicate edges should be handled correctly + let edges = vec![(1, 2), (1, 2), (2, 3), (2, 3), (3, 1)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].size(), 3); + } + + #[test] + fn test_cluster_edges_large() { + // Create 100 clusters of size 10 each + let mut edges = Vec::new(); + for cluster_id in 0..100u64 { + let base = cluster_id * 10; + for i in 0..9 { + edges.push((base + i, base + i + 1)); + } + } + + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 100); + for cluster in &result.clusters { + assert_eq!(cluster.size(), 10); + assert_eq!(cluster.duplicates.len(), 9); + } + } + + #[test] + fn test_cluster_edges_random_order() { + // Same edges in different order should produce same result + let edges1 = vec![(1, 2), (2, 3), (4, 5), (3, 4)]; + let edges2 = vec![(4, 5), (1, 2), (3, 4), (2, 3)]; + let edges3 = vec![(3, 4), (4, 5), (2, 3), (1, 2)]; + + let r1 = cluster_edges(edges1); + let r2 = cluster_edges(edges2); + let r3 = cluster_edges(edges3); + + // All should produce the same single cluster + assert_eq!(r1.num_clusters(), 1); + assert_eq!(r2.num_clusters(), 1); + assert_eq!(r3.num_clusters(), 1); + + assert_eq!(r1.clusters[0].representative, 1); + assert_eq!(r2.clusters[0].representative, 1); + assert_eq!(r3.clusters[0].representative, 1); + + assert_eq!(r1.clusters[0].size(), 5); + assert_eq!(r2.clusters[0].size(), 5); + 
assert_eq!(r3.clusters[0].size(), 5); + } + + #[test] + fn test_cluster_edges_non_contiguous_ids() { + // Row IDs don't need to be contiguous + let edges = vec![(100, 200), (200, 500), (1000, 2000)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 100) + .unwrap(); + assert_eq!(c1.duplicates, vec![200, 500]); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 1000) + .unwrap(); + assert_eq!(c2.duplicates, vec![2000]); + } + + #[test] + fn test_cluster_representative_is_minimum() { + // Representative should always be the minimum row ID in cluster + let edges = vec![ + (5, 3), + (3, 7), + (7, 1), // 1 is minimum + (100, 50), + (50, 75), // 50 is minimum + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&7)) + .unwrap(); + assert_eq!(c1.representative, 1); + + let c2 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&100)) + .unwrap(); + assert_eq!(c2.representative, 50); + } + + #[test] + fn test_cluster_duplicates_sorted() { + // Duplicates should be sorted + let edges = vec![(1, 5), (1, 3), (1, 7), (1, 2)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 1); + assert_eq!(result.clusters[0].duplicates, vec![2, 3, 5, 7]); + } + + #[test] + fn test_clustering_result_stats() { + let edges = vec![ + (1, 2), + (2, 3), // cluster of 3 + (10, 20), + (20, 30), + (30, 40), // cluster of 4 + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + assert_eq!(result.num_duplicates(), 5); // 2 + 3 + assert_eq!(result.num_unique(), 2); + } } diff --git a/rust/lance-namespace-datafusion/tests/sql.rs b/rust/lance-namespace-datafusion/tests/sql.rs index e49cd7e58e3..5332e831cb6 100755 --- 
a/rust/lance-namespace-datafusion/tests/sql.rs +++ b/rust/lance-namespace-datafusion/tests/sql.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +#![recursion_limit = "256"] + use std::sync::Arc; use arrow_array::{Int32Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray}; diff --git a/rust/lance-namespace-impls/BENCHMARK.md b/rust/lance-namespace-impls/BENCHMARK.md new file mode 100644 index 00000000000..074ec303347 --- /dev/null +++ b/rust/lance-namespace-impls/BENCHMARK.md @@ -0,0 +1,73 @@ +# `__manifest` commit benchmark + +Measures how fast the copy-on-write directory catalog commits `__manifest` mutations as +the manifest scales, with the inline scalar indices on or off. + +The catalog commits every mutation by rewriting the whole `__manifest` (copy-on-write) +and atomically writing a new manifest version. This benchmark characterises: + +- **Continuous commit** — a single process commits `N` times into a manifest already + holding `rows` entries (per-commit latency + throughput). +- **Concurrent commit** — `C` processes commit continuously for a fixed duration against + a manifest of `rows` entries (steady, contended TPS). + +## Binary: `examples/manifest_bench.rs` + +``` +manifest_bench seed-large --root <root> --count <rows> --inline-optimization <true|false> \ + [--storage-option aws_region=us-east-1] +manifest_bench run --root <root> --operation write-create-namespace \ + --concurrency 1 --operations 100 --initial-entries <rows> --inline-optimization <true|false> # continuous +manifest_bench run --root <root> --operation write-create-namespace \ + --concurrency 50 --duration-secs 30 --initial-entries <rows> --inline-optimization <true|false> # concurrent +``` + +- `seed-large` bootstraps a manifest to `count` rows by writing the Lance dataset + directly (O(rows) once) and then triggering one CoW rewrite so the on-disk state + matches the steady catalog form (single fragment; inline indices when enabled). +- `run` spawns `--concurrency` worker subprocesses. 
With `--operations` it runs a fixed + commit budget (continuous); with `--duration-secs` each worker commits until the + deadline (steady TPS). It prints one JSON `BenchResult` per concurrency level with + throughput and p50/p90/p99 latency. +- The committed operation (`--operation`) defaults to `write-create-namespace`, the + cheapest pure-`__manifest` mutation (no table data). `write-create-table` / + `write-declare-table` are also available. + +S3 requires the default `dir-aws` feature (on by default) and AWS credentials in the +environment; pass `--storage-option aws_region=<region>`. + +## Sweep panel: `benches/manifest_commit_sweep.sh` + +Runs the full panel — sizes × {inline index, no index} × {continuous, concurrent×C} — +with per-run S3-copy isolation (each run starts at exactly the bootstrapped size), +JSONL results, a `summary.csv`, and resume support. + +```bash +cargo build --release --example manifest_bench -p lance-namespace-impls +S3_BASE=s3://<bucket>/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ + rust/lance-namespace-impls/benches/manifest_commit_sweep.sh +``` + +Default panel (override via env): `SIZES="1000 2000 5000 10000 20000 50000 100000 200000 +500000 1000000"`, `CONCURRENCY="10 20 50 100 120 150 200"`, `INLINE_VARIANTS="true false"`, +`CONT_OPS=100`, `CONC_DURATION_SECS=30`. Results land in `$OUT_DIR` (default +`~/manifest_cow_bench_<RUN_ID>`). + +## Representative results + +EC2 `c7i.48xlarge`, S3 `us-east-1`, op `write-create-namespace`. The catalog is a +single-writer-throughput system: per-commit cost scales ~O(rows) and throughput does **not** +scale with concurrency (every commit is a serialized `__manifest` version bump). + +Continuous (1 process, 100 commits), ops/s — inline index vs no index: + +| rows | inline | no index | +|---:|---:|---:| +| 1,000 | 2.0 | 3.5 | +| 100,000 | 1.1 | 2.1 | +| 1,000,000 | 0.34 | 0.53 | + +Concurrent steady TPS is flat across C=10..200 (e.g. inline @100k ≈ 1.4–1.5 ops/s at every C; +@1M ≈ 0.3 ops/s). 
Conflicts that exceed the retry budget surface as errors and grow with C +(≈0 at C≤20, climbing at C≥100) — the contention ceiling, not data loss. No-index commits run +~1.5–2× faster (no per-commit index build) at the cost of unindexed reads. diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 53ff79fb333..27b9a4bc0e2 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -51,6 +51,8 @@ object_store = { workspace = true } arrow = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } +datafusion-common = { workspace = true } +datafusion-physical-plan = { workspace = true } # REST adapter implementation dependencies (optional, enabled by "rest-adapter" feature) axum = { workspace = true, optional = true } @@ -66,6 +68,8 @@ serde_json = { workspace = true } futures.workspace = true log.workspace = true rand.workspace = true +roaring.workspace = true +uuid.workspace = true # Shared credential vending dependencies sha2 = { version = "0.10", optional = true } @@ -75,6 +79,11 @@ base64 = { version = "0.22", optional = true } aws-sdk-sts = { version = "1.38.0", optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] } aws-config = { workspace = true, optional = true } +# Pin: time 0.3.48 conflicts with aws-smithy-types (E0119: conflicting `From` impls), which this +# crate pulls in via the AWS credential vendor. Capping time here forces the workspace resolver to +# 0.3.47 even for no-lock builds. Not used directly; remove once the upstream conflict is resolved. 
+time = "=0.3.47" + # GCP credential vending dependencies (optional, enabled by "credential-vendor-gcp" feature) ring = { version = "0.17", optional = true } rustls-pki-types = { version = "1", optional = true } @@ -96,6 +105,11 @@ rstest.workspace = true lance-table.workspace = true lance-arrow = { workspace = true } lance = { workspace = true } +serde = { workspace = true, features = ["derive"] } + +[[example]] +name = "manifest_bench" +path = "examples/manifest_bench.rs" [lints] workspace = true diff --git a/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh new file mode 100644 index 00000000000..7384ced4152 --- /dev/null +++ b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Copy-on-write __manifest commit benchmark sweep panel. +# +# Drives `cargo run --release --example manifest_bench` across a panel of: +# - bootstrap manifest sizes (rows already in __manifest) +# - inline scalar indices on vs off +# - continuous commit (single process, N commits) and +# concurrent commit (C processes, steady TPS over a fixed duration) +# +# Each run is isolated: a "golden" manifest is bootstrapped once per (size, index) +# and server-side-copied to a fresh S3 prefix per run, so every run starts at exactly +# the bootstrapped size. Results are written as JSONL (one BenchResult per line) and +# summarised to CSV. The sweep is resumable: completed runs are skipped. +# +# Usage: +# S3_BASE=s3://jack-devland-build/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ +# ./manifest_commit_sweep.sh +# +# Env knobs (defaults match the requested panel): +# SIZES, CONCURRENCY, INLINE_VARIANTS, CONT_OPS, CONC_DURATION_SECS, +# AWS_REGION, OUT_DIR, BIN +# +# Resilient by design: a single failed run is logged and skipped rather than aborting +# the sweep, and re-running fills the gaps (completed runs are detected and skipped). 
+set -uo pipefail + +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" +S3_BASE="${S3_BASE:?set S3_BASE, e.g. s3://jack-devland-build/manifest-cow-bench/$RUN_ID}" +AWS_REGION="${AWS_REGION:-us-east-1}" +export AWS_REGION AWS_DEFAULT_REGION="$AWS_REGION" + +REPO_ROOT="${REPO_ROOT:-$HOME/oss/lance}" +BIN="${BIN:-$REPO_ROOT/target/release/examples/manifest_bench}" +OUT_DIR="${OUT_DIR:-$HOME/manifest_cow_bench_${RUN_ID}}" +RESULTS="$OUT_DIR/results.jsonl" +PROGRESS="$OUT_DIR/progress.log" +mkdir -p "$OUT_DIR" + +SIZES=(${SIZES:-1000 2000 5000 10000 20000 50000 100000 200000 500000 1000000}) +CONCURRENCY=(${CONCURRENCY:-10 20 50 100 120 150 200}) +INLINE_VARIANTS=(${INLINE_VARIANTS:-true false}) +CONT_OPS="${CONT_OPS:-100}" +CONC_DURATION_SECS="${CONC_DURATION_SECS:-30}" +STORAGE_OPT=(--storage-option "aws_region=${AWS_REGION}") + +log() { printf '%s %s\n' "$(date -u +%H:%M:%S)" "$*" | tee -a "$PROGRESS"; } + +# Skip a run if its tag already appears in results.jsonl (resume support). +done_already() { grep -q "\"bench_tag\":\"$1\"" "$RESULTS" 2>/dev/null; } + +# Append a result line, tagging it so reruns can resume and we can pivot later. +record() { + local tag="$1"; shift + # shellcheck disable=SC2016 + python3 -c 'import json,sys; d=json.load(sys.stdin); d["bench_tag"]=sys.argv[1]; print(json.dumps(d))' \ + "$tag" >> "$RESULTS" +} + +s3_copy() { aws s3 cp --recursive --quiet "$1" "$2" --region "$AWS_REGION"; } +s3_rm() { aws s3 rm --recursive --quiet "$1" --region "$AWS_REGION" || true; } + +# Backstops for unattended runs: cap any single run and clear leaked worker processes +# (a killed coordinator can orphan its worker children) before the next run. 
+RUN_TIMEOUT="${RUN_TIMEOUT:-1200}"
+clear_stragglers() { pkill -f 'examples/manifest_bench worker' 2>/dev/null || true; sleep 1; }
+
+# True when the golden prefix still holds objects. The sweep deletes the golden manifests
+# when it finishes, so a bootstrap marker alone does not prove the golden copy still exists.
+golden_exists() { aws s3 ls "$1/" --region "$AWS_REGION" >/dev/null 2>&1; }
+
+# Run one isolated benchmark and record its result line(s) under a tag.
+#   $1 = bench tag, $2 = label used in log lines, remaining args = mode-specific flags.
+# Reads the loop variables `golden`, `rows` and `inline`. A failed copy, or a failed or
+# timed-out run, is logged and skipped so the rest of the sweep keeps going.
+run_one() {
+  local tag="$1" label="$2"; shift 2
+  local run_prefix="${S3_BASE}/run/${tag}"
+  clear_stragglers
+  if ! s3_copy "$golden" "$run_prefix"; then
+    log "$label COPY FAILED tag=$tag (skipping this run)"
+    s3_rm "$run_prefix"
+    return 1
+  fi
+  # With pipefail (set at the top of the script) a non-zero exit of the benchmark or of
+  # `timeout` fails the whole pipeline, which is what triggers the log line below.
+  timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \
+    "$@" --initial-entries "$rows" \
+    --inline-optimization "$inline" "${STORAGE_OPT[@]}" \
+    2>>"$PROGRESS" | while read -r line; do record "$tag" <<<"$line"; done \
+    || log "$label FAILED tag=$tag (see $PROGRESS)"
+  s3_rm "$run_prefix"
+}
+
+for inline in "${INLINE_VARIANTS[@]}"; do
+  for rows in "${SIZES[@]}"; do
+    golden="${S3_BASE}/golden/inline_${inline}_rows_${rows}"
+    boot_tag="boot_inline_${inline}_rows_${rows}"
+
+    # Re-bootstrap when the marker is missing OR the golden copy is gone (e.g. a previous
+    # sweep already ran its final cleanup and this invocation is filling gaps).
+    if ! done_already "$boot_tag" || ! golden_exists "$golden"; then
+      log "BOOTSTRAP inline=$inline rows=$rows -> $golden"
+      s3_rm "$golden"
+      if "$BIN" seed-large --root "$golden" --count "$rows" \
+        --inline-optimization "$inline" "${STORAGE_OPT[@]}"; then
+        echo "{\"bench_tag\":\"$boot_tag\"}" >> "$RESULTS"
+      else
+        log "BOOTSTRAP FAILED inline=$inline rows=$rows (skipping this size)"
+        continue
+      fi
+    else
+      log "skip bootstrap $boot_tag (done)"
+    fi
+
+    # ---- Continuous: single process, CONT_OPS commits ----
+    cont_tag="cont_inline_${inline}_rows_${rows}"
+    if ! done_already "$cont_tag"; then
+      log "CONTINUOUS inline=$inline rows=$rows ops=$CONT_OPS"
+      run_one "$cont_tag" CONTINUOUS --concurrency 1 --operations "$CONT_OPS"
+    else
+      log "skip continuous $cont_tag (done)"
+    fi
+
+    # ---- Concurrent: C processes, steady TPS over CONC_DURATION_SECS ----
+    for c in "${CONCURRENCY[@]}"; do
+      conc_tag="conc_inline_${inline}_rows_${rows}_c_${c}"
+      if done_already "$conc_tag"; then log "skip concurrent $conc_tag (done)"; continue; fi
+      log "CONCURRENT inline=$inline rows=$rows c=$c dur=${CONC_DURATION_SECS}s"
+      run_one "$conc_tag" CONCURRENT --concurrency "$c" --duration-secs "$CONC_DURATION_SECS"
+    done
+  done
+done
+
+# ---- Summarise to CSV ----
+CSV="$OUT_DIR/summary.csv"
+python3 - "$RESULTS" "$CSV" <<'PY'
+import json, sys, csv
+rows = []
+with open(sys.argv[1]) as f:
+    for lineno, line in enumerate(f, 1):
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            d = json.loads(line)
+        except json.JSONDecodeError as e:
+            # A killed run can leave a truncated line; skip it instead of losing the summary.
+            print(f"skipping malformed line {lineno}: {e}", file=sys.stderr)
+            continue
+        if "throughput_ops_per_sec" not in d:
+            continue  # bootstrap marker
+        mode = "continuous" if d["duration_secs"] == 0 else "concurrent"
+        rows.append({
+            "mode": mode, "variant": d["variant"], "initial_entries": d["initial_entries"],
+            "concurrency": d["concurrency"], "duration_secs": d["duration_secs"],
+            "ops": d["total_operations"], "errors": d["errors"],
+            "tps": round(d["throughput_ops_per_sec"], 3),
+            "avg_ms": round(d["avg_latency_ms"], 2), "p50_ms": round(d["p50_latency_ms"], 2),
+            "p90_ms": round(d["p90_latency_ms"], 2), "p99_ms": round(d["p99_latency_ms"], 2),
+        })
+rows.sort(key=lambda r: (r["mode"], r["variant"], r["initial_entries"], r["concurrency"]))
+with open(sys.argv[2], "w", newline="") as f:
+    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else [])
+    w.writeheader(); w.writerows(rows)
+print(f"wrote {len(rows)} rows to {sys.argv[2]}")
+PY
+
+log "SWEEP COMPLETE. Results: $RESULTS Summary: $CSV"
+# One prefix per call so both are removed regardless of how many arguments s3_rm honours.
+s3_rm "${S3_BASE}/golden" 2>/dev/null || true
+s3_rm "${S3_BASE}/run" 2>/dev/null || true
diff --git a/rust/lance-namespace-impls/examples/manifest_bench.rs b/rust/lance-namespace-impls/examples/manifest_bench.rs new file mode 100644 index 00000000000..4841f2471d7 --- /dev/null +++ b/rust/lance-namespace-impls/examples/manifest_bench.rs @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Copy-on-write `__manifest` directory-catalog commit benchmark (S3 capable). +//! +//! Measures how fast the directory catalog commits `__manifest` mutations as the +//!
manifest scales, with the inline scalar indices on or off. +//! +//! Modes: +//! seed-large — bootstrap a `__manifest` with N rows (direct dataset write + one +//! CoW rewrite to build indices) +//! run — coordinator: spawn `--concurrency` worker processes committing for +//! either a fixed op count (continuous) or a fixed duration (steady TPS) +//! worker — (internal) a single committing process spawned by `run` +//! +//! Examples: +//! # Bootstrap 100k rows with inline indices +//! manifest_bench seed-large --root s3://bucket/bench/p --count 100000 \ +//! --inline-optimization true --storage-option aws_region=us-east-1 +//! +//! # Continuous: 100 commits, single process +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 1 --operations 100 --initial-entries 100000 --inline-optimization true +//! +//! # Concurrent steady TPS: 50 processes committing for 30s +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 50 --duration-secs 30 --initial-entries 100000 --inline-optimization true + +// A CLI benchmark tool: workers emit JSON latency records on stdout and progress on +// stderr, so stdout/stderr printing is intentional here. 
+#![allow(clippy::print_stdout, clippy::print_stderr)]
+
+use std::collections::HashMap;
+use std::io::{BufRead, BufReader};
+use std::process::{Command, Stdio};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use arrow::array::builder::{ListBuilder, StringBuilder};
+use arrow::array::{RecordBatch, RecordBatchIterator, StringArray};
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use bytes::Bytes;
+use lance::dataset::{InsertBuilder, WriteMode, WriteParams};
+use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION;
+use lance_namespace::LanceNamespace;
+use lance_namespace::models::{
+    CreateNamespaceRequest, CreateTableRequest, DeclareTableRequest, DescribeTableRequest,
+    ListNamespacesRequest, ListTablesRequest,
+};
+use lance_namespace_impls::DirectoryNamespaceBuilder;
+use serde::{Deserialize, Serialize};
+
+/// One timed operation, emitted by a worker as a single JSON line on stdout.
+#[derive(Serialize, Deserialize, Clone)]
+struct LatencyRecord {
+    operation: String,
+    latency_ms: f64,
+    error: bool,
+}
+
+/// Aggregated outcome of one benchmark run, printed by the coordinator as JSON.
+#[derive(Serialize)]
+struct BenchResult {
+    variant: String,
+    operation: String,
+    concurrency: usize,
+    initial_entries: usize,
+    duration_secs: u64,
+    total_operations: usize,
+    total_duration_ms: f64,
+    throughput_ops_per_sec: f64,
+    avg_latency_ms: f64,
+    p50_latency_ms: f64,
+    p90_latency_ms: f64,
+    p99_latency_ms: f64,
+    min_latency_ms: f64,
+    max_latency_ms: f64,
+    errors: usize,
+}
+
+/// Nearest-rank percentile of an ascending-sorted slice.
+///
+/// `p` is a fraction (`0.50` for the median). The rank is `(len - 1) * p` rounded to
+/// the nearest index and clamped to the last element. Returns `0.0` for an empty slice.
+fn percentile(sorted: &[f64], p: f64) -> f64 {
+    if sorted.is_empty() {
+        return 0.0;
+    }
+    let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
+
+/// Fold raw latency samples into a [`BenchResult`].
+///
+/// `total_operations` and the throughput are derived from `latencies.len()`, so only
+/// the samples passed in are counted; `errors` is reported separately and unchanged.
+/// Every latency statistic is `0.0` when `latencies` is empty, and the throughput is
+/// `0.0` when `wall_duration` is zero.
+#[allow(clippy::too_many_arguments)]
+fn compute_result(
+    variant: &str,
+    operation: &str,
+    concurrency: usize,
+    initial_entries: usize,
+    duration_secs: u64,
+    wall_duration: Duration,
+    mut latencies: Vec<f64>,
+    errors: usize,
+) -> BenchResult {
+    // `total_cmp` is a total order over f64, so a stray NaN sample can no longer panic
+    // the coordinator the way `partial_cmp(..).unwrap()` would.
+    latencies.sort_unstable_by(f64::total_cmp);
+    let total = latencies.len();
+    let total_ms = wall_duration.as_secs_f64() * 1000.0;
+    let throughput = if total_ms > 0.0 {
+        total as f64 / (total_ms / 1000.0)
+    } else {
+        0.0
+    };
+    BenchResult {
+        variant: variant.to_string(),
+        operation: operation.to_string(),
+        concurrency,
+        initial_entries,
+        duration_secs,
+        total_operations: total,
+        total_duration_ms: total_ms,
+        throughput_ops_per_sec: throughput,
+        avg_latency_ms: if total > 0 {
+            latencies.iter().sum::<f64>() / total as f64
+        } else {
+            0.0
+        },
+        p50_latency_ms: percentile(&latencies, 0.50),
+        p90_latency_ms: percentile(&latencies, 0.90),
+        p99_latency_ms: percentile(&latencies, 0.99),
+        min_latency_ms: latencies.first().copied().unwrap_or(0.0),
+        max_latency_ms: latencies.last().copied().unwrap_or(0.0),
+        errors,
+    }
+}
+
+/// Serialize a fixed three-row (`id: Int32`, `name: Utf8`) batch as an Arrow IPC stream.
+///
+/// The bytes are the table payload for the `write-create-table` operation.
+fn create_test_ipc_data() -> Vec<u8> {
+    use arrow::array::Int32Array;
+    use arrow_ipc::writer::StreamWriter;
+
+    let schema = Arc::new(ArrowSchema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, false),
+    ]));
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 2, 3])),
+            Arc::new(StringArray::from(vec!["a", "b", "c"])),
+        ],
+    )
+    .unwrap();
+    let mut buffer = Vec::new();
+    {
+        // Scoped so the writer's mutable borrow of `buffer` ends before it is returned.
+        let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap();
+        writer.write(&batch).unwrap();
+        writer.finish().unwrap();
+    }
+    buffer
+}
+
+/// The `__manifest` schema used by the copy-on-write directory catalog:
+/// `object_id`, `object_type`, `location`, `metadata` (Utf8), `base_objects` (List<Utf8>).
+fn manifest_schema() -> Arc<ArrowSchema> {
+    // `object_id` carries the unenforced-primary-key position metadata (value "0").
+    let primary_key_metadata = HashMap::from([(
+        LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_string(),
+        "0".to_string(),
+    )]);
+    let object_id =
+        Field::new("object_id", DataType::Utf8, false).with_metadata(primary_key_metadata);
+    let base_object_item = Arc::new(Field::new("object_id", DataType::Utf8, true));
+
+    let fields = vec![
+        object_id,
+        Field::new("object_type", DataType::Utf8, false),
+        Field::new("location", DataType::Utf8, true),
+        Field::new("metadata", DataType::Utf8, true),
+        Field::new("base_objects", DataType::List(base_object_item), true),
+    ];
+    Arc::new(ArrowSchema::new(fields))
+}
+
+/// Build a directory namespace rooted at `root` with directory listing disabled.
+///
+/// `inline_optimization` is forwarded as `inline_optimization_enabled`, and every
+/// entry of `storage_options` is forwarded under a `storage.` key prefix. Panics if
+/// the builder cannot be created from the properties or the namespace fails to build.
+async fn build_namespace(
+    root: &str,
+    inline_optimization: bool,
+    storage_options: &HashMap<String, String>,
+) -> Box<dyn LanceNamespace> {
+    let fixed_properties = [
+        ("root", root.to_string()),
+        ("dir_listing_enabled", "false".to_string()),
+        (
+            "inline_optimization_enabled",
+            inline_optimization.to_string(),
+        ),
+    ];
+    let prefixed_storage = storage_options
+        .iter()
+        .map(|(key, value)| (format!("storage.{}", key), value.clone()));
+    let properties: HashMap<String, String> = fixed_properties
+        .into_iter()
+        .map(|(key, value)| (key.to_string(), value))
+        .chain(prefixed_storage)
+        .collect();
+
+    let namespace = DirectoryNamespaceBuilder::from_properties(properties, None)
+        .expect("Failed to create namespace builder from properties")
+        .build()
+        .await
+        .expect("Failed to build namespace");
+    Box::new(namespace)
+}
+
+// ──────────────────── seed-large mode ────────────────────
+// Bootstrap a `__manifest` with N rows by writing the Lance dataset directly (fast,
+// O(N) once), then trigger a single CoW rewrite via the namespace so the on-disk state
+// matches what the catalog produces (single fragment + inline indices when enabled).
+ +const SEED_LARGE_BATCH_SIZE: usize = 50_000; + +fn generate_manifest_batch(start_idx: usize, batch_size: usize, total_count: usize) -> RecordBatch { + let ns_count = total_count / 3; + let actual_size = batch_size.min(total_count - start_idx); + + let mut object_ids = Vec::with_capacity(actual_size); + let mut object_types = Vec::with_capacity(actual_size); + let mut locations: Vec> = Vec::with_capacity(actual_size); + let mut metadatas: Vec> = Vec::with_capacity(actual_size); + + for i in start_idx..start_idx + actual_size { + if i < ns_count { + object_ids.push(format!("ns_{}", i)); + object_types.push("namespace".to_string()); + locations.push(None); + metadatas.push(None); + } else { + let table_idx = i - ns_count; + object_ids.push(format!("table_{}", table_idx)); + object_types.push("table".to_string()); + locations.push(Some(format!("table_{}", table_idx))); + metadatas.push(Some(r#"{"bench":"true"}"#.to_string())); + } + } + + // base_objects is null for every bootstrapped row. 
+ let mut base_objects_builder = ListBuilder::new(StringBuilder::new()) + .with_field(Arc::new(Field::new("object_id", DataType::Utf8, true))); + for _ in 0..actual_size { + base_objects_builder.append_null(); + } + + RecordBatch::try_new( + manifest_schema(), + vec![ + Arc::new(StringArray::from(object_ids)), + Arc::new(StringArray::from(object_types)), + Arc::new(StringArray::from( + locations.iter().map(|l| l.as_deref()).collect::>(), + )), + Arc::new(StringArray::from( + metadatas.iter().map(|m| m.as_deref()).collect::>(), + )), + Arc::new(base_objects_builder.finish()), + ], + ) + .expect("Failed to create manifest batch") +} + +async fn seed_large( + root: &str, + count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let manifest_uri = format!("{}/{}", root, "__manifest"); + eprintln!("Seed-large: writing {} rows to {}", count, manifest_uri); + + let schema = manifest_schema(); + let mut batches = Vec::new(); + let mut offset = 0; + while offset < count { + let batch_size = SEED_LARGE_BATCH_SIZE.min(count - offset); + batches.push(generate_manifest_batch(offset, batch_size, count)); + offset += batch_size; + } + eprintln!(" generated {} batches", batches.len()); + + let mut write_params = WriteParams { + mode: WriteMode::Create, + ..WriteParams::default() + }; + if !storage_options.is_empty() { + let accessor = Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + ); + write_params.store_params = Some(lance_io::object_store::ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..Default::default() + }); + } + + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + InsertBuilder::new(manifest_uri.as_str()) + .with_params(&write_params) + .execute_stream(reader) + .await + .expect("Failed to write manifest dataset"); + eprintln!(" wrote Lance dataset"); + + // Trigger one CoW rewrite so the manifest is in steady catalog form (single 
+ // fragment; inline indices when enabled). For the no-index variant the first real + // commit performs this rewrite instead. + if inline_optimization { + eprintln!(" triggering initial CoW rewrite to build indices..."); + let start = Instant::now(); + let ns = build_namespace(root, true, storage_options).await; + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec!["__seed_trigger__".to_string()]); + ns.create_namespace(req) + .await + .expect("Failed to trigger CoW rewrite"); + eprintln!( + " CoW rewrite with index build took {:.1}s", + start.elapsed().as_secs_f64() + ); + } + + let ns_count = count / 3; + eprintln!( + "Seed-large complete: {} rows ({} namespaces, {} tables)", + count, + ns_count, + count - ns_count + ); +} + +// ──────────────────── worker mode ──────────────────── + +#[allow(clippy::too_many_arguments)] +async fn worker( + root: &str, + operation: &str, + operations: usize, + duration_secs: u64, + warmup: usize, + worker_id: usize, + table_count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let ns = build_namespace(root, inline_optimization, storage_options).await; + let ipc_data = Bytes::from(create_test_ipc_data()); + + if operation.starts_with("warm-read") { + for _ in 0..warmup { + let _ = + run_operation(ns.as_ref(), operation, worker_id, 0, table_count, &ipc_data).await; + } + } + + let emit = |op_idx: usize, start: Instant, err: bool| { + let record = LatencyRecord { + operation: operation.to_string(), + latency_ms: start.elapsed().as_secs_f64() * 1000.0, + error: err, + }; + let _ = op_idx; + println!("{}", serde_json::to_string(&record).unwrap()); + }; + + if duration_secs > 0 { + // Steady-TPS mode: commit continuously until the deadline. 
+ let deadline = Instant::now() + Duration::from_secs(duration_secs); + let mut op_idx = 0; + while Instant::now() < deadline { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + op_idx += 1; + } + } else { + for op_idx in 0..operations { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + } + } +} + +async fn run_operation( + ns: &dyn LanceNamespace, + operation: &str, + worker_id: usize, + op_idx: usize, + table_count: usize, + ipc_data: &Bytes, +) -> Result<(), Box> { + match operation { + "cold-read-list-namespaces" | "warm-read-list-namespaces" => { + let mut req = ListNamespacesRequest::new(); + req.id = Some(vec![]); + ns.list_namespaces(req).await?; + } + "cold-read-list-tables" | "warm-read-list-tables" => { + let mut req = ListTablesRequest::new(); + req.id = Some(vec![]); + ns.list_tables(req).await?; + } + "cold-read-describe-table" | "warm-read-describe-table" => { + let table_idx = (worker_id * 1_000_000 + op_idx) % table_count.max(1); + let req = DescribeTableRequest { + id: Some(vec![format!("table_{}", table_idx)]), + ..Default::default() + }; + ns.describe_table(req).await?; + } + "write-create-namespace" => { + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec![format!("bench_w{}_{}", worker_id, op_idx)]); + ns.create_namespace(req).await?; + } + "write-create-table" => { + let mut req = CreateTableRequest::new(); + req.id = Some(vec![format!("bench_t{}_{}", worker_id, op_idx)]); + ns.create_table(req, ipc_data.clone()).await?; + } + "write-declare-table" => { + let req = DeclareTableRequest { + id: Some(vec![format!("bench_d{}_{}", worker_id, op_idx)]), + ..Default::default() + }; + ns.declare_table(req).await?; + } + _ => { + return Err(format!("unknown 
operation: {}", operation).into()); + } + } + Ok(()) +} + +// ──────────────────── run mode (coordinator) ──────────────────── + +#[allow(clippy::too_many_arguments)] +fn run_workers( + self_exe: &str, + root: &str, + operation: &str, + concurrency: usize, + operations: usize, + duration_secs: u64, + warmup: usize, + table_count: usize, + initial_entries: usize, + inline_optimization: bool, + variant: &str, + storage_options: &HashMap, +) -> BenchResult { + // Continuous mode splits a fixed op budget across workers; steady-TPS mode lets each + // worker run for the full duration. + let ops_per_worker = if duration_secs > 0 { + 0 + } else { + operations / concurrency.max(1) + }; + if duration_secs == 0 && ops_per_worker == 0 { + return compute_result( + variant, + operation, + concurrency, + initial_entries, + duration_secs, + Duration::ZERO, + vec![], + 0, + ); + } + + let wall_start = Instant::now(); + let children: Vec<_> = (0..concurrency) + .map(|worker_id| { + let mut cmd = Command::new(self_exe); + cmd.arg("worker") + .arg("--root") + .arg(root) + .arg("--operation") + .arg(operation) + .arg("--operations") + .arg(ops_per_worker.to_string()) + .arg("--duration-secs") + .arg(duration_secs.to_string()) + .arg("--warmup") + .arg(warmup.to_string()) + .arg("--worker-id") + .arg(worker_id.to_string()) + .arg("--table-count") + .arg(table_count.to_string()) + .arg("--inline-optimization") + .arg(inline_optimization.to_string()); + for (k, v) in storage_options { + cmd.arg("--storage-option").arg(format!("{}={}", k, v)); + } + cmd.stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .expect("Failed to spawn worker") + }) + .collect(); + + let mut all_latencies = Vec::new(); + let mut total_errors = 0; + for mut child in children { + let stdout = child.stdout.take().unwrap(); + for line in BufReader::new(stdout).lines() { + let line = line.expect("failed to read worker output"); + if let Ok(record) = serde_json::from_str::(&line) { + if record.error { + 
total_errors += 1; + } else { + all_latencies.push(record.latency_ms); + } + } + } + let status = child.wait().expect("failed to wait for worker"); + if !status.success() { + eprintln!("Worker exited with status: {}", status); + } + } + + compute_result( + variant, + operation, + concurrency, + initial_entries, + duration_secs, + wall_start.elapsed(), + all_latencies, + total_errors, + ) +} + +fn parse_concurrency_list(s: &str) -> Vec { + s.split(',') + .filter_map(|v| v.trim().parse::().ok()) + .filter(|v| *v > 0) + .collect() +} + +#[tokio::main] +async fn main() { + let args: Vec = std::env::args().collect(); + if args.len() < 2 { + eprintln!("Usage: manifest_bench [options]"); + std::process::exit(1); + } + + let mode = args[1].as_str(); + let mut root = String::new(); + let mut operation = String::new(); + let mut operations: usize = 100; + let mut duration_secs: u64 = 0; + let mut warmup: usize = 0; + let mut concurrency_list = vec![1]; + let mut count: usize = 1000; + let mut worker_id: usize = 0; + let mut table_count: usize = 667; + let mut initial_entries: usize = 0; + let mut inline_optimization = true; + let mut variant = String::new(); + let mut storage_options: HashMap = HashMap::new(); + + let mut i = 2; + while i < args.len() { + match args[i].as_str() { + "--root" => { + root = args[i + 1].clone(); + i += 2; + } + "--operation" => { + operation = args[i + 1].clone(); + i += 2; + } + "--operations" => { + operations = args[i + 1].parse().unwrap(); + i += 2; + } + "--duration-secs" => { + duration_secs = args[i + 1].parse().unwrap(); + i += 2; + } + "--warmup" => { + warmup = args[i + 1].parse().unwrap(); + i += 2; + } + "--concurrency" => { + concurrency_list = parse_concurrency_list(&args[i + 1]); + i += 2; + } + "--count" => { + count = args[i + 1].parse().unwrap(); + i += 2; + } + "--worker-id" => { + worker_id = args[i + 1].parse().unwrap(); + i += 2; + } + "--table-count" => { + table_count = args[i + 1].parse().unwrap(); + i += 2; + } + 
"--initial-entries" => { + initial_entries = args[i + 1].parse().unwrap(); + i += 2; + } + "--inline-optimization" => { + inline_optimization = args[i + 1].parse().unwrap(); + i += 2; + } + "--variant" => { + variant = args[i + 1].clone(); + i += 2; + } + "--storage-option" => { + if let Some((k, v)) = args[i + 1].split_once('=') { + storage_options.insert(k.to_string(), v.to_string()); + } + i += 2; + } + other => { + eprintln!("Unknown argument: {}", other); + std::process::exit(1); + } + } + } + + if variant.is_empty() { + variant = if inline_optimization { + "inline_index".to_string() + } else { + "no_index".to_string() + }; + } + + match mode { + "seed-large" => { + seed_large(&root, count, inline_optimization, &storage_options).await; + } + "worker" => { + worker( + &root, + &operation, + operations, + duration_secs, + warmup, + worker_id, + table_count, + inline_optimization, + &storage_options, + ) + .await; + } + "run" => { + let self_exe = std::env::current_exe() + .expect("failed to get self exe path") + .to_string_lossy() + .to_string(); + let op = if operation.is_empty() { + "write-create-namespace" + } else { + operation.as_str() + }; + + eprintln!("=== Manifest commit benchmark ==="); + eprintln!( + "variant={} op={} root={} initial_entries={} concurrency={:?} operations={} duration_secs={}", + variant, op, root, initial_entries, concurrency_list, operations, duration_secs + ); + + for &concurrency in &concurrency_list { + let result = run_workers( + &self_exe, + &root, + op, + concurrency, + operations, + duration_secs, + warmup, + table_count, + initial_entries, + inline_optimization, + &variant, + &storage_options, + ); + eprintln!( + " c={} -> {:.2} ops/s ({} ops, {} errors, p50={:.0}ms p99={:.0}ms)", + concurrency, + result.throughput_ops_per_sec, + result.total_operations, + result.errors, + result.p50_latency_ms, + result.p99_latency_ms + ); + println!("{}", serde_json::to_string(&result).unwrap()); + } + eprintln!("=== complete ==="); + } + _ 
=> { + eprintln!("Unknown mode: {}. Use seed-large, run, or worker.", mode); + std::process::exit(1); + } + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 8859e4bc237..e97c5c836b7 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -7,6 +7,7 @@ //! that stores tables as Lance datasets in a filesystem directory structure. pub mod manifest; +pub mod manifest_feature_flags; use arrow::array::Float32Array; use arrow::record_batch::RecordBatchIterator; @@ -195,9 +196,6 @@ pub struct DirectoryNamespaceBuilder { dir_listing_enabled: bool, inline_optimization_enabled: bool, table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table instead of - /// relying on Lance's native version management. - table_version_storage_enabled: bool, /// When true, enables migration mode where the namespace checks the manifest first /// before falling back to directory listing for root-level tables. 
When false (default), /// root-level tables use directory listing directly without checking the manifest, @@ -233,10 +231,6 @@ impl std::fmt::Debug for DirectoryNamespaceBuilder { "table_version_tracking_enabled", &self.table_version_tracking_enabled, ) - .field( - "table_version_storage_enabled", - &self.table_version_storage_enabled, - ) .field( "dir_listing_to_manifest_migration_enabled", &self.dir_listing_to_manifest_migration_enabled, @@ -273,7 +267,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled: true, // Default to enabled for backwards compatibility inline_optimization_enabled: true, table_version_tracking_enabled: false, // Default to disabled - table_version_storage_enabled: false, // Default to disabled dir_listing_to_manifest_migration_enabled: false, // Default to disabled credential_vendor_properties: HashMap::new(), context_provider: None, @@ -313,11 +306,10 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable inline optimization of the __manifest table. + /// Enable or disable replacement index maintenance for the __manifest table. /// - /// When enabled (default), performs compaction and indexing on the __manifest table - /// after every write operation to maintain optimal performance. - /// When disabled, manual optimization must be performed separately. + /// When enabled (default), copy-on-write manifest rewrites build replacement indices + /// for fast reads. When disabled, rewrites only replace data files. pub fn inline_optimization_enabled(mut self, enabled: bool) -> Self { self.inline_optimization_enabled = enabled; self @@ -335,19 +327,6 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable table version management through the `__manifest` table. - /// - /// When enabled, table versions are tracked as `table_version` entries in the - /// `__manifest` Lance table. 
This enables: - /// - Centralized version tracking instead of per-table `_versions/` directories - /// - /// Requires `manifest_enabled` to be true. - /// When disabled (default), version storage uses per-table storage operations. - pub fn table_version_storage_enabled(mut self, enabled: bool) -> Self { - self.table_version_storage_enabled = enabled; - self - } - /// Create a DirectoryNamespaceBuilder from properties HashMap. /// /// This method parses a properties map into builder configuration. @@ -355,7 +334,7 @@ impl DirectoryNamespaceBuilder { /// - `root`: The root directory path (required) /// - `manifest_enabled`: Enable manifest-based table tracking (optional, default: true) /// - `dir_listing_enabled`: Enable directory listing for table discovery (optional, default: true) - /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true) + /// - `inline_optimization_enabled`: Enable replacement indices on __manifest rewrites (optional, default: true) /// - `storage.*`: Storage options (optional, prefix will be stripped) /// /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped): @@ -465,12 +444,6 @@ impl DirectoryNamespaceBuilder { .and_then(|v| v.parse::().ok()) .unwrap_or(false); - // Extract table_version_storage_enabled (default: false) - let table_version_storage_enabled = properties - .get("table_version_storage_enabled") - .and_then(|v| v.parse::().ok()) - .unwrap_or(false); - // Extract dir_listing_to_manifest_migration_enabled (default: false) let dir_listing_to_manifest_migration_enabled = properties .get("dir_listing_to_manifest_migration_enabled") @@ -517,7 +490,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled, inline_optimization_enabled, table_version_tracking_enabled, - table_version_storage_enabled, dir_listing_to_manifest_migration_enabled, credential_vendor_properties, context_provider: None, @@ -694,14 +666,6 @@ impl DirectoryNamespaceBuilder { /// 
- Connection to the storage backend fails /// - Storage options are invalid pub async fn build(self) -> Result { - // Validate: table_version_storage_enabled requires manifest_enabled - if self.table_version_storage_enabled && !self.manifest_enabled { - return Err(NamespaceError::InvalidInput { - message: "table_version_storage_enabled requires manifest_enabled=true".to_string(), - } - .into()); - } - let (object_store, base_path) = Self::initialize_object_store(&self.root, &self.storage_options, &self.session).await?; @@ -715,11 +679,16 @@ impl DirectoryNamespaceBuilder { self.dir_listing_enabled, self.inline_optimization_enabled, self.commit_retries, - self.table_version_storage_enabled, ) .await { Ok(ns) => Some(Arc::new(ns)), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // The manifest exists but was written with a feature flag this + // build does not understand. Refuse rather than silently + // degrading to a directory-listing view that ignores it. + return Err(e); + } Err(e) => { // Failed to initialize manifest namespace, fall back to directory listing only log::warn!( @@ -760,7 +729,6 @@ impl DirectoryNamespaceBuilder { dir_listing_to_manifest_migration_enabled: self .dir_listing_to_manifest_migration_enabled, table_version_tracking_enabled: self.table_version_tracking_enabled, - table_version_storage_enabled: self.table_version_storage_enabled, credential_vendor, context_provider: self.context_provider, vend_input_storage_options: self.vend_input_storage_options, @@ -843,8 +811,6 @@ pub struct DirectoryNamespace { /// When true, `describe_table` returns `managed_versioning: true` to indicate /// commits should go through namespace table version APIs. table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table. - table_version_storage_enabled: bool, /// Credential vendor created once during initialization. /// Used to vend temporary credentials for table access. 
credential_vendor: Option>, @@ -1413,6 +1379,11 @@ impl DirectoryNamespace { } return Ok(response); } + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. + return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } @@ -2143,6 +2114,7 @@ impl DirectoryNamespace { /// to the manifest to enable manifest-only mode: /// /// ```no_run + /// #![recursion_limit = "256"] /// # use lance_namespace_impls::DirectoryNamespaceBuilder; /// # async fn example() -> Result<(), Box> { /// // Create namespace with dual mode (manifest + directory listing) @@ -2211,18 +2183,16 @@ impl DirectoryNamespace { Ok(migrated_count) } - /// Delete physical manifest files for the given table version ranges (best-effort). + /// Delete physical manifest files for the given table version ranges. /// - /// This helper is used by `batch_delete_table_versions` in both the manifest-enabled - /// and non-manifest paths. It resolves each table's storage location, computes the - /// version file paths, and attempts to delete them. Errors are logged (best-effort) - /// when `best_effort` is true, or returned immediately when false. + /// This helper backs `batch_delete_table_versions`. It resolves each table's storage + /// location, computes the version file paths, and deletes them, returning an error on + /// the first failure. /// /// Returns the number of files successfully deleted. async fn delete_physical_version_files( &self, table_entries: &[TableDeleteEntry], - best_effort: bool, branch: Option<&str>, ) -> Result { let mut deleted_count = 0i64; @@ -2268,22 +2238,13 @@ impl DirectoryNamespace { } Err(object_store::Error::NotFound { .. 
}) => {} Err(e) => { - if best_effort { - log::warn!( - "Failed to delete manifest file for version {} of table {:?}: {:?}", - v, - te.table_id, - e - ); - } else { - return Err(NamespaceError::Internal { - message: format!( - "Failed to delete version {} for table at '{}': {}", - v, table_uri, e - ), - } - .into()); + return Err(NamespaceError::Internal { + message: format!( + "Failed to delete version {} for table at '{}': {}", + v, table_uri, e + ), } + .into()); } } } @@ -2650,6 +2611,11 @@ impl LanceNamespace for DirectoryNamespace { { match manifest_ns.table_exists(request.clone()).await { Ok(()) => return Ok(()), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. + return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } @@ -2927,20 +2893,6 @@ impl LanceNamespace for DirectoryNamespace { ) -> Result { self.record_op("list_table_versions"); let branch = Self::normalized_branch(request.branch.as_deref())?; - // The manifest catalog has no branch concept, so a branch lists its own - // version chain from storage under its tree path instead. - if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id = request.id.clone().unwrap_or_default(); - let want_descending = request.descending == Some(true); - return manifest_ns - .list_table_versions(&table_id, want_descending, request.limit) - .await; - } - - // Fallback when table_version_storage is not enabled: list from _versions/ directory let table_uri = self.resolve_table_location(&request.id).await?; let table_uri = match branch { Some(b) => self.resolve_branch_location(&table_uri, b).await?, @@ -3087,43 +3039,6 @@ impl LanceNamespace for DirectoryNamespace { ); } - // Also record in __manifest (best-effort). 
Branches aren't tracked there, - // so for a branch the storage manifest above is the only record. - if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id_str = - manifest::ManifestNamespace::str_object_id(&request.id.clone().unwrap_or_default()); - let object_id = - manifest::ManifestNamespace::build_version_object_id(&table_id_str, version as i64); - let metadata_json = serde_json::json!({ - "manifest_path": final_path.to_string(), - "manifest_size": manifest_size, - "e_tag": final_meta.e_tag, - "naming_scheme": request.naming_scheme.as_deref().unwrap_or("V2"), - }) - .to_string(); - - if let Err(e) = manifest_ns - .insert_into_manifest_with_metadata( - vec![manifest::ManifestEntry { - object_id, - object_type: manifest::ObjectType::TableVersion, - location: None, - metadata: Some(metadata_json), - }], - None, - ) - .await - { - log::warn!( - "Failed to record table version in __manifest (best-effort): {:?}", - e - ); - } - } - Ok(CreateTableVersionResponse { transaction_id: None, version: Some(Box::new(TableVersion { @@ -3143,18 +3058,6 @@ impl LanceNamespace for DirectoryNamespace { ) -> Result { self.record_op("describe_table_version"); let branch = Self::normalized_branch(request.branch.as_deref())?; - // When table_version_storage_enabled and a specific version is requested, - // query from __manifest to avoid opening the entire dataset. A branch has - // no manifest-catalog entry, so it resolves from storage instead. - if branch.is_none() - && self.table_version_storage_enabled - && let (Some(manifest_ns), Some(version)) = (&self.manifest_ns, request.version) - { - let table_id = request.id.clone().unwrap_or_default(); - return manifest_ns.describe_table_version(&table_id, version).await; - } - - // Fallback when table_version_storage is not enabled: inspect physical manifests directly. 
let table_uri = self.resolve_table_location(&request.id).await?; let table_uri = match branch { Some(b) => self.resolve_branch_location(&table_uri, b).await?, @@ -3206,9 +3109,9 @@ impl LanceNamespace for DirectoryNamespace { .map(|r| (r.start_version, r.end_version)) .collect(); - // Reject pathological bounded ranges up front: the manifest path below - // builds one id per version, so (0, i64::MAX) would exhaust memory. A - // through-latest range (end < 0) is bounded by the manifests that exist. + // Reject pathological bounded ranges up front: an explicit huge bounded + // range like (0, i64::MAX) is almost certainly a mistake. A through-latest + // range (end < 0) is bounded by the manifests that actually exist on storage. const MAX_VERSIONS_PER_REQUEST: i128 = 1_000_000; let requested: i128 = ranges .iter() @@ -3235,76 +3138,8 @@ impl LanceNamespace for DirectoryNamespace { ranges, }]; - let mut total_deleted_count = 0i64; - - // Branches are not tracked in the manifest catalog, so a branch skips the - // __manifest phase entirely and deletes its physical manifests directly. - if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - // Through-latest ranges (end_version < 0) would require enumerating the - // __manifest chain up to the latest version, which is not wired up here. - // Reject rather than silently delete physical files while leaving the - // __manifest records in place. - if table_entries - .iter() - .any(|te| te.ranges.iter().any(|&(_, e)| e < 0)) - { - return Err(NamespaceError::Unsupported { - message: "through-latest delete (end_version < 0) is not supported \ - for managed-versioning tables" - .to_string(), - } - .into()); - } - - // Phase 1 (atomic commit point): Delete version records from __manifest - // for ALL tables in a single atomic operation. 
This is the authoritative - // source of truth — once __manifest entries are removed, the versions - // are logically deleted across all tables atomically. - - // Collect all (table_id_str, ranges) for batch deletion - let mut all_object_ids: Vec = Vec::new(); - for te in &table_entries { - let table_id_str = manifest::ManifestNamespace::str_object_id( - &te.table_id.clone().unwrap_or_default(), - ); - for (start, end) in &te.ranges { - for version in *start..*end { - let object_id = manifest::ManifestNamespace::build_version_object_id( - &table_id_str, - version, - ); - all_object_ids.push(object_id); - } - } - } - - if !all_object_ids.is_empty() { - total_deleted_count = manifest_ns - .batch_delete_table_versions_by_object_ids(&all_object_ids) - .await?; - } - - // Phase 2: Delete physical manifest files (best-effort). - // Even if some file deletions fail, the versions are already removed from - // __manifest, so they won't be visible to readers. Leftover files are - // orphaned but harmless and can be cleaned up later. - let _ = self - .delete_physical_version_files(&table_entries, true, branch) - .await; - - return Ok(BatchDeleteTableVersionsResponse { - deleted_count: Some(total_deleted_count), - transaction_id: None, - }); - } - - // Direct path: delete physical files (no __manifest). Reached when storage - // tracking is off, or for any branch (which has no __manifest entries). 
- total_deleted_count = self - .delete_physical_version_files(&table_entries, false, branch) + let total_deleted_count = self + .delete_physical_version_files(&table_entries, branch) .await?; Ok(BatchDeleteTableVersionsResponse { @@ -5380,7 +5215,6 @@ mod tests { DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .ops_metrics_enabled(true) .build() .await @@ -5755,150 +5589,12 @@ mod tests { ); } - /// The managed `__manifest` delete path (the authoritative catalog) must honor - /// the exclusive end: `[min, max)` removes exactly min..max from `__manifest`, - /// keeping max. With storage tracking on, the writes register versions in - /// `__manifest` and `list_table_versions` reads it back, so this exercises the - /// Phase-1 path that the physical-path tests never reach. - #[tokio::test] - async fn test_batch_delete_managed_manifest_exclusive() { - use arrow::array::Int32Array; - use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; - - let temp = TempStdDir::default(); - let ns: Arc = Arc::new( - DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) - .manifest_enabled(true) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(), - ); - let table_id = vec!["users".to_string()]; - let schema = Arc::new(ArrowSchema::new(vec![Field::new( - "id", - DataType::Int32, - false, - )])); - let batch = |seed: i32| { - arrow::record_batch::RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from(vec![seed]))], - ) - .unwrap() - }; - - // Register v1, v2, v3 in __manifest via the managed write flow. 
- let mut ds = Dataset::write_into_namespace( - RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()), - ns.clone(), - table_id.clone(), - Some(WriteParams { - mode: WriteMode::Create, - ..Default::default() - }), - ) - .await - .unwrap(); - ds.append( - RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()), - None, - ) - .await - .unwrap(); - ds.append( - RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()), - None, - ) - .await - .unwrap(); - - let before = ns - .list_table_versions(ListTableVersionsRequest { - id: Some(table_id.clone()), - ..Default::default() - }) - .await - .unwrap() - .versions; - assert!( - before.len() >= 3, - "expected v1..v3 tracked in __manifest: {:?}", - before - ); - let min_v = before.iter().map(|v| v.version).min().unwrap(); - let max_v = before.iter().map(|v| v.version).max().unwrap(); - - // [min, max): exclusive end keeps max. - ns.batch_delete_table_versions(BatchDeleteTableVersionsRequest { - id: Some(table_id.clone()), - ranges: vec![VersionRange::new(min_v, max_v)], - ..Default::default() - }) - .await - .unwrap(); - - let after = ns - .list_table_versions(ListTableVersionsRequest { - id: Some(table_id.clone()), - ..Default::default() - }) - .await - .unwrap() - .versions; - assert_eq!( - after.len(), - 1, - "only the exclusive end (max) should remain in __manifest: {:?}", - after - ); - assert_eq!(after[0].version, max_v, "max must be kept"); - } - - /// On the managed path, a through-latest delete (`end_version < 0`) is rejected - /// rather than silently deleting physical files while leaving `__manifest` - /// records in place. 
- #[tokio::test] - async fn test_batch_delete_managed_rejects_through_latest() { - use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; - - let temp = TempStdDir::default(); - let ns: Arc = Arc::new( - DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) - .manifest_enabled(true) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(), - ); - - let err = ns - .batch_delete_table_versions(BatchDeleteTableVersionsRequest { - id: Some(vec!["users".to_string()]), - ranges: vec![VersionRange::new(0, -1)], - ..Default::default() - }) - .await; - assert!( - err.is_err(), - "through-latest delete must be rejected on the managed path" - ); - assert!( - err.unwrap_err().to_string().contains("not supported"), - "expected a not-supported error" - ); - } - /// Build a managed (manifest-tracked) namespace over `path`. async fn create_managed_namespace(path: &str) -> Arc { Arc::new( DirectoryNamespaceBuilder::new(path) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .build() .await .unwrap(), @@ -6328,7 +6024,6 @@ mod tests { DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .ops_metrics_enabled(true) .build() .await @@ -6474,49 +6169,6 @@ mod tests { ); } - /// With the manifest store enabled, branch ops must still bypass the catalog - /// fast-path and read the chain from `tree//_versions/`. Without the - /// `branch.is_none()` guard this would query `__manifest` (which has no - /// branch entries) and return the wrong result. The other branch tests use a - /// store-disabled namespace, so this pins the enabled path specifically. 
- #[tokio::test] - async fn test_branch_ops_skip_manifest_store_when_enabled() { - let temp_dir = TempStdDir::default(); - let namespace = DirectoryNamespaceBuilder::new(temp_dir.to_str().unwrap()) - .manifest_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(); - - create_scalar_table(&namespace, "users").await; - create_branch_with_commits(&namespace, "users", "exp", 2).await; - - // list resolves the branch chain from storage despite storage tracking - // being on (a successful result with tree/exp paths proves the bypass: - // the catalog has no "exp" entry, so the fast-path would not return these). - let branch_versions = list_versions(&namespace, "users", Some("exp")) - .await - .unwrap(); - assert!(branch_versions.len() >= 2); - assert!( - branch_versions - .iter() - .all(|v| v.manifest_path.contains("tree/exp")), - "branch versions must come from branch storage with the store enabled: {:?}", - branch_versions - ); - - // describe likewise resolves from the branch's storage. - let req = DescribeTableVersionRequest { - id: Some(vec!["users".to_string()]), - branch: Some("exp".to_string()), - ..Default::default() - }; - let resp = namespace.describe_table_version(req).await.unwrap(); - assert!(resp.version.manifest_path.contains("tree/exp")); - } - #[tokio::test] async fn test_create_table() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -11281,155 +10933,6 @@ mod tests { } } - /// Tests for multi-table transaction support via table_version_storage_enabled. 
- mod multi_table_transactions { - use super::*; - use futures::TryStreamExt; - use lance::dataset::builder::DatasetBuilder; - use lance_namespace::models::CreateTableVersionRequest; - - /// Helper to create a namespace with table_version_storage_enabled enabled - async fn create_managed_namespace(temp_path: &str) -> Arc { - Arc::new( - DirectoryNamespaceBuilder::new(temp_path) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .manifest_enabled(true) - .build() - .await - .unwrap(), - ) - } - - /// Helper to create a table and get its staging manifest path - async fn create_table_and_get_staging( - namespace: Arc, - table_name: &str, - ) -> (Vec, object_store::path::Path) { - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - let mut create_req = CreateTableRequest::new(); - create_req.id = Some(vec![table_name.to_string()]); - namespace - .create_table(create_req, bytes::Bytes::from(ipc_data)) - .await - .unwrap(); - - let table_id = vec![table_name.to_string()]; - let dataset = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) - .await - .unwrap() - .load() - .await - .unwrap(); - - // Find existing manifest and create a staging copy - let versions_path = dataset.versions_dir(); - let manifest_metas: Vec<_> = dataset - .object_store(None) - .await - .unwrap() - .inner - .list(Some(&versions_path)) - .try_collect() - .await - .unwrap(); - - let manifest_meta = manifest_metas - .iter() - .find(|m| { - m.location - .filename() - .map(|f| f.ends_with(".manifest")) - .unwrap_or(false) - }) - .expect("No manifest file found"); - - let manifest_data = dataset - .object_store(None) - .await - .unwrap() - .inner - .get(&manifest_meta.location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - - let staging_path = dataset - .versions_dir() - .join(format!("staging_{}", table_name)); - dataset - .object_store(None) - .await - .unwrap() - .inner - .put(&staging_path, 
manifest_data.into()) - .await - .unwrap(); - - (table_id, staging_path) - } - - #[tokio::test] - async fn test_table_version_storage_enabled_requires_manifest() { - // table_version_storage_enabled=true requires manifest_enabled=true - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); - - let result = DirectoryNamespaceBuilder::new(temp_path) - .table_version_storage_enabled(true) - .manifest_enabled(false) - .build() - .await; - - assert!( - result.is_err(), - "Should fail when table_version_storage_enabled=true but manifest_enabled=false" - ); - } - - #[tokio::test] - async fn test_create_table_version_records_in_manifest() { - // When table_version_storage_enabled is enabled, single create_table_version - // should also record the version in __manifest - let temp_dir = TempStrDir::default(); - let temp_path: &str = &temp_dir; - - let namespace = create_managed_namespace(temp_path).await; - let ns: Arc = namespace.clone(); - - let (table_id, staging_path) = - create_table_and_get_staging(ns.clone(), "table_managed").await; - - // Create version 2 - let mut create_req = CreateTableVersionRequest::new(2, staging_path.to_string()); - create_req.id = Some(table_id.clone()); - create_req.naming_scheme = Some("V2".to_string()); - let response = namespace.create_table_version(create_req).await.unwrap(); - - assert!(response.version.is_some()); - let version = response.version.unwrap(); - assert_eq!(version.version, 2); - - // Verify the version is recorded in __manifest by querying it - let manifest_ns = namespace.manifest_ns.as_ref().unwrap(); - let table_id_str = manifest::ManifestNamespace::str_object_id(&table_id); - let versions = manifest_ns - .query_table_versions(&table_id_str, false, None) - .await - .unwrap(); - - assert!( - !versions.is_empty(), - "Version should be recorded in __manifest" - ); - let (ver, _path) = &versions[0]; - assert_eq!(*ver, 2, "Recorded version should be 2"); - } - } - #[tokio::test] async fn 
test_list_all_tables() { use lance_namespace::models::ListTablesRequest; @@ -11783,6 +11286,40 @@ mod tests { ); } + #[tokio::test] + async fn test_manifest_reload_observes_new_version_from_other_namespace() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace_a = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_a, "alpha").await; + + let namespace_b = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_b, "beta").await; + + let response = namespace_a + .list_tables(ListTablesRequest { + id: Some(vec![]), + ..Default::default() + }) + .await + .unwrap(); + + let mut tables = response.tables; + tables.sort(); + assert_eq!(tables, vec!["alpha", "beta"]); + } + #[tokio::test] async fn test_migration_not_found_errors_include_table_id() { let temp_dir = TempStdDir::default(); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 0e22f1e8b69..aae924378da 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -6,52 +6,72 @@ //! This module provides a namespace implementation that uses a manifest table //! to track tables and nested namespaces. 
+use super::manifest_feature_flags::{ensure_readable, ensure_writable}; use arrow::array::builder::{ListBuilder, StringBuilder}; -use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray}; -use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use arrow::array::{Array, ListArray, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; use arrow_ipc::reader::StreamReader; use async_trait::async_trait; use bytes::Bytes; -use futures::{FutureExt, TryStreamExt, stream::StreamExt}; -use lance::dataset::optimize::{CompactionOptions, compact_files}; +use datafusion_common::DataFusionError; +use datafusion_physical_plan::{ + SendableRecordBatchStream, + stream::RecordBatchStreamAdapter as DatafusionRecordBatchStreamAdapter, +}; +use futures::{ + FutureExt, TryStreamExt, + stream::{self, StreamExt}, +}; +use lance::dataset::index::LanceIndexStoreExt; +use lance::dataset::transaction::{Operation, Transaction}; use lance::dataset::{ - DeleteBuilder, MergeInsertBuilder, ReadParams, WhenMatched, WhenNotMatched, WriteMode, - WriteParams, builder::DatasetBuilder, + InsertBuilder, ReadParams, WhenMatched, WriteMode, WriteParams, builder::DatasetBuilder, }; -use lance::index::DatasetIndexExt; use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; use lance_core::Error as LanceError; use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; -use lance_core::{Error, Result}; -use lance_index::IndexType; -use lance_index::optimize::OptimizeOptions; -use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; +use lance_core::{Error, ROW_ID, Result}; +use lance_index::progress::noop_progress; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::registry::VALUE_COLUMN_NAME; +use lance_index::scalar::{BuiltinIndexType, CreatedIndex, ScalarIndexParams}; use 
lance_io::object_store::{ObjectStore, ObjectStoreParams}; +use lance_io::stream::RecordBatchStream as LanceRecordBatchStream; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::models::{ CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTableVersionResponse, DropNamespaceRequest, - DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, - NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - TableVersion, + DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, + DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, + ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, + TableExistsRequest, }; use lance_namespace::schema::arrow_schema_to_json; +use lance_table::feature_flags::apply_feature_flags; +use lance_table::format::{Fragment, IndexMetadata, Manifest}; +use lance_table::io::commit::{ + CommitError, CommitHandler, commit_handler_from_url, write_manifest_file_to_path, +}; use object_store::{Error as ObjectStoreError, path::Path}; +use roaring::RoaringBitmap; use std::io::Cursor; +use std::time::{SystemTime, UNIX_EPOCH}; use std::{ - collections::HashMap, + collections::{BTreeMap, HashMap, HashSet}, hash::{DefaultHasher, Hash, Hasher}, ops::{Deref, DerefMut}, - sync::Arc, + sync::{Arc, Mutex as StdMutex, MutexGuard as StdMutexGuard}, }; use tokio::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use uuid::Uuid; const MANIFEST_TABLE_NAME: &str = "__manifest"; +const LANCE_DATA_DIR: &str = "data"; +const 
LANCE_INDICES_DIR: &str = "_indices"; const DELIMITER: &str = "$"; /// Bounded concurrency for per-table `_versions/` probes when filtering declared tables. /// Higher values reduce latency but increase burst load against the object store. @@ -64,24 +84,23 @@ const OBJECT_ID_INDEX_NAME: &str = "object_id_btree"; const OBJECT_TYPE_INDEX_NAME: &str = "object_type_bitmap"; /// LabelList index on the base_objects column for view dependencies const BASE_OBJECTS_INDEX_NAME: &str = "base_objects_label_list"; -/// Inline maintenance on the manifest table is expensive relative to a single-row mutation. -/// Wait until enough fragments accumulate before compacting files or merging indices. -const MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD: usize = 8; +// Each retry reloads and rewrites the full manifest. Match the regular Lance +// commit retry budget so multi-process namespace writes can make progress. +const DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES: u32 = 20; +const MANIFEST_INDEX_BATCH_SIZE: usize = 8192; /// Object types that can be stored in the manifest #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ObjectType { Namespace, Table, - TableVersion, } impl ObjectType { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Namespace => "namespace", Self::Table => "table", - Self::TableVersion => "table_version", } } @@ -89,7 +108,6 @@ impl ObjectType { match s { "namespace" => Ok(Self::Namespace), "table" => Ok(Self::Table), - "table_version" => Ok(Self::TableVersion), _ => Err(NamespaceError::Internal { message: format!("Invalid object type: {}", s), } @@ -152,7 +170,7 @@ pub struct TableInfo { pub struct ManifestEntry { /// The unique object identifier (e.g., table name or version object_id) pub object_id: String, - /// The type of the object (Namespace, Table, or TableVersion) + /// The type of the object (Namespace or Table) pub object_type: ObjectType, /// The storage location (e.g., directory name for tables) pub 
location: Option, @@ -160,6 +178,401 @@ pub struct ManifestEntry { pub metadata: Option, } +struct CopyOnWriteMutation { + result: T, + has_changes: bool, +} + +impl CopyOnWriteMutation { + fn updated(result: T) -> Self { + Self { + result, + has_changes: true, + } + } + + fn unchanged(result: T) -> Self { + Self { + result, + has_changes: false, + } + } +} + +struct ManifestIndexBuildInput { + index_name: &'static str, + column_name: &'static str, + params: ScalarIndexParams, + field: Field, + stream: SendableRecordBatchStream, +} + +struct ManifestTrainedIndex { + index_name: &'static str, + column_name: &'static str, + uuid: Uuid, + created_index: CreatedIndex, +} + +struct ManifestRowValue { + object_id: String, + object_type: ObjectType, + location: Option, + metadata: Option, + base_objects: Option>, +} + +struct ManifestOutputRow<'a> { + object_id: &'a str, + object_type: ObjectType, + location: Option<&'a str>, + metadata: Option<&'a str>, + base_objects: Option<&'a [String]>, +} + +#[derive(Default)] +struct ManifestIndexAccumulator { + object_ids: BTreeMap, u64>, + object_types: BTreeMap<&'static str, RoaringBitmap>, + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + row_count: u64, +} + +impl ManifestIndexAccumulator { + fn next_row_id(&self) -> Result { + if self.row_count >= u64::from(u32::MAX) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest rewrite exceeded maximum single-fragment row count: {}", + self.row_count + ), + } + .into()); + } + Ok(self.row_count) + } + + fn push(&mut self, row: &ManifestOutputRow<'_>) -> Result { + let row_id = self.next_row_id()?; + if self + .object_ids + .insert(Arc::::from(row.object_id), row_id) + .is_some() + { + return Err(NamespaceError::Internal { + message: format!("Manifest contains duplicate object_id '{}'", row.object_id), + } + .into()); + } + self.object_types + .entry(row.object_type.as_str()) + .or_default() + .insert(row_id as u32); + self.base_objects_values + 
.push(row.base_objects.map(|objects| objects.to_vec())); + self.base_objects_row_ids.push(row_id); + self.row_count += 1; + Ok(row_id) + } +} + +struct ManifestBatchBuilder { + object_ids: Vec, + object_types: Vec<&'static str>, + locations: Vec>, + metadatas: Vec>, + base_objects: Vec>>, +} + +impl ManifestBatchBuilder { + fn new() -> Self { + Self { + object_ids: Vec::new(), + object_types: Vec::new(), + locations: Vec::new(), + metadatas: Vec::new(), + base_objects: Vec::new(), + } + } + + fn is_empty(&self) -> bool { + self.object_ids.is_empty() + } + + fn append( + &mut self, + index_data: &mut ManifestIndexAccumulator, + row: ManifestOutputRow<'_>, + ) -> Result<()> { + index_data.push(&row)?; + self.object_ids.push(row.object_id.to_string()); + self.object_types.push(row.object_type.as_str()); + self.locations.push(row.location.map(ToString::to_string)); + self.metadatas.push(row.metadata.map(ToString::to_string)); + self.base_objects + .push(row.base_objects.map(|objects| objects.to_vec())); + Ok(()) + } + + fn finish(self) -> Result { + let base_objects_array = ManifestNamespace::base_objects_array(&self.base_objects); + RecordBatch::try_new( + ManifestNamespace::manifest_schema(), + vec![ + Arc::new(StringArray::from(self.object_ids)), + Arc::new(StringArray::from(self.object_types)), + Arc::new(StringArray::from(self.locations)), + Arc::new(StringArray::from(self.metadatas)), + Arc::new(base_objects_array), + ], + ) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest snapshot batch: {:?}", e), + }) + }) + } +} + +/// How to resolve a storage commit conflict (or an ambiguous commit error that did +/// not land) against the latest catalog state, without re-staging the full rewrite. +enum ConflictResolution { + /// Re-read the latest manifest and re-apply the mutation (upserts, version-range + /// deletes). The staged data/index files are discarded and a new rewrite is attempted. 
+ Retry, + /// Creating these object ids with fail-on-conflict semantics. If any of them now + /// exists in the latest manifest, the create lost the race and must fail with a + /// concurrent-modification error; otherwise retry the rewrite. + FailIfExists(Vec), + /// Deleting `object_id`. If it is already absent from the latest manifest the delete + /// has effectively happened, so return `output` as success; otherwise retry. + SucceedIfAbsent { object_id: String, output: O }, +} + +trait ManifestStreamMutation: Send { + type Output: Clone + Send + 'static; + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn finish(&self) -> CopyOnWriteMutation; + + /// Declares how a storage commit conflict should be resolved against the latest + /// committed catalog state. Defaults to re-reading and re-applying. 
+ fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::Retry + } +} + +struct ManifestRewriteShared { + mutation: M, + index_data: Option, + result: Option>, + error: Option, +} + +impl ManifestRewriteShared { + fn new(mutation: M) -> Self { + Self { + mutation, + index_data: Some(ManifestIndexAccumulator::default()), + result: None, + error: None, + } + } +} + +struct UpsertManifestMutation { + entries: Vec, + base_objects: Vec>>, + entry_positions: HashMap, + matched: Vec, + when_matched: WhenMatched, +} + +impl UpsertManifestMutation { + fn new( + entries: Vec, + base_objects: Option>, + when_matched: WhenMatched, + ) -> Self { + let entry_positions = entries + .iter() + .enumerate() + .map(|(index, entry)| (entry.object_id.clone(), index)) + .collect(); + let matched = vec![false; entries.len()]; + let mut entry_base_objects = vec![None; entries.len()]; + if !entry_base_objects.is_empty() { + entry_base_objects[0] = base_objects; + } + Self { + entries, + base_objects: entry_base_objects, + entry_positions, + matched, + when_matched, + } + } + + fn entry_row(&self, index: usize) -> ManifestOutputRow<'_> { + let entry = &self.entries[index]; + ManifestOutputRow { + object_id: &entry.object_id, + object_type: entry.object_type, + location: entry.location.as_deref(), + metadata: entry.metadata.as_deref(), + base_objects: self.base_objects[index].as_deref(), + } + } +} + +impl ManifestStreamMutation for UpsertManifestMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if let Some(index) = self.entry_positions.get(&row.object_id).copied() { + match self.when_matched { + WhenMatched::Fail => { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + row.object_id + ), + } + .into()); + } + WhenMatched::UpdateAll => { 
+ self.matched[index] = true; + output.append(index_data, self.entry_row(index))?; + return Ok(()); + } + _ => { + return Err(NamespaceError::Internal { + message: format!( + "Unsupported manifest rewrite matched action: {:?}", + self.when_matched + ), + } + .into()); + } + } + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + for index in 0..self.entries.len() { + if !self.matched[index] { + output.append(index_data, self.entry_row(index))?; + } + } + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + CopyOnWriteMutation::updated(()) + } + + fn conflict_resolution(&self) -> ConflictResolution { + match self.when_matched { + // Fail-on-conflict create: a concurrent writer may have created one of these + // ids. Re-applying would still fail, so check directly instead of re-staging. + WhenMatched::Fail => ConflictResolution::FailIfExists( + self.entries.iter().map(|e| e.object_id.clone()).collect(), + ), + // Metadata upsert is last-writer-wins: re-read and re-apply. 
+ _ => ConflictResolution::Retry, + } + } +} + +struct DeleteObjectMutation { + object_id: String, + deleted: bool, +} + +impl ManifestStreamMutation for DeleteObjectMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if row.object_id == self.object_id { + self.deleted = true; + return Ok(()); + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + _output: &mut ManifestBatchBuilder, + _index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + if self.deleted { + CopyOnWriteMutation::updated(()) + } else { + CopyOnWriteMutation::unchanged(()) + } + } + + fn conflict_resolution(&self) -> ConflictResolution { + // If a concurrent writer already removed the object, the delete is satisfied. + ConflictResolution::SucceedIfAbsent { + object_id: self.object_id.clone(), + output: (), + } + } +} + /// Information about a namespace stored in the manifest #[derive(Debug, Clone)] pub struct NamespaceInfo { @@ -171,13 +584,23 @@ pub struct NamespaceInfo { /// A wrapper around a Dataset that provides concurrent access. /// /// This can be cloned cheaply. It supports concurrent reads or exclusive writes. -/// The manifest dataset is always kept strongly consistent by reloading on each read. +/// The manifest dataset uses contiguous attached versions and this module never +/// runs old-version cleanup on it, allowing reads to check only the immediate +/// successor manifest before deciding whether a reload is needed. 
#[derive(Debug, Clone)] pub struct DatasetConsistencyWrapper(Arc>); impl DatasetConsistencyWrapper { /// Create a new wrapper with the given dataset. pub fn new(dataset: Dataset) -> Self { + debug_assert!( + !dataset + .manifest() + .config + .keys() + .any(|key| key.starts_with("lance.auto_cleanup.")), + "the directory manifest dataset must not enable old-version cleanup" + ); Self(Arc::new(RwLock::new(dataset))) } @@ -185,18 +608,35 @@ impl DatasetConsistencyWrapper { /// Always reloads to ensure strong consistency. pub async fn get(&self) -> Result> { self.reload().await?; - Ok(DatasetReadGuard { + let guard = DatasetReadGuard { guard: self.0.read().await, - }) + }; + // Refuse manifests written with a reader feature flag this build does + // not understand instead of misreading them. + ensure_readable(guard.metadata())?; + Ok(guard) + } + + /// Reload the dataset and return a reference. + pub async fn get_refreshed(&self) -> Result> { + self.reload().await?; + let guard = DatasetReadGuard { + guard: self.0.read().await, + }; + ensure_readable(guard.metadata())?; + Ok(guard) } /// Get a mutable reference to the dataset. /// Always reloads to ensure strong consistency. pub async fn get_mut(&self) -> Result> { self.reload().await?; - Ok(DatasetWriteGuard { + let guard = DatasetWriteGuard { guard: self.0.write().await, - }) + }; + ensure_readable(guard.metadata())?; + ensure_writable(guard.metadata())?; + Ok(guard) } /// Provide a known latest version of the dataset. @@ -221,21 +661,25 @@ impl DatasetConsistencyWrapper { dataset_uri, current_version ); - let latest_version = read_guard.latest_version_id().await.map_err(|e| { + // The directory manifest table uses contiguous attached versions and + // does not run old-version cleanup, so the immediate successor probe is + // enough to detect changes without resolving or loading the latest + // manifest on every namespace read. 
+ let has_successor_version = read_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; log::debug!( - "Reload got latest_version={} for uri={}, current_version={}", - latest_version, + "Reload checked successor_version_exists={} for uri={}, current_version={}", + has_successor_version, dataset_uri, current_version ); drop(read_guard); // If already up-to-date, return early - if latest_version == current_version { + if !has_successor_version { log::debug!("Already up-to-date for uri={}", dataset_uri); return Ok(()); } @@ -244,13 +688,13 @@ impl DatasetConsistencyWrapper { let mut write_guard = self.0.write().await; // Double-check after acquiring write lock (someone else might have reloaded) - let latest_version = write_guard.latest_version_id().await.map_err(|e| { + let has_successor_version = write_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; - if latest_version != write_guard.version().version { + if has_successor_version { write_guard.checkout_latest().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!("Failed to checkout latest: {:?}", e), @@ -306,8 +750,8 @@ pub struct ManifestNamespace { /// If true, root namespace tables use {table_name}.lance naming /// If false, they use namespace-prefixed names dir_listing_enabled: bool, - /// Whether to perform inline optimization (compaction and indexing) on the __manifest table - /// after every write. Defaults to true. + /// Whether copy-on-write manifest rewrites should build replacement indices. + /// Defaults to true. 
inline_optimization_enabled: bool, /// Number of retries for commit operations on the manifest table. /// If None, defaults to [`lance_table::io::commit::CommitConfig`] default (20). @@ -401,15 +845,10 @@ impl ManifestNamespace { dir_listing_enabled: bool, inline_optimization_enabled: bool, commit_retries: Option, - table_version_storage_enabled: bool, ) -> Result { - let manifest_dataset = Self::ensure_manifest_table_up_to_date( - &root, - &storage_options, - session.clone(), - table_version_storage_enabled, - ) - .await?; + let manifest_dataset = + Self::ensure_manifest_table_up_to_date(&root, &storage_options, session.clone()) + .await?; Ok(Self { root, @@ -473,34 +912,6 @@ impl ManifestNamespace { format!("table id '{}'", Self::str_object_id(table_id)) } - /// Format a version number as a zero-padded lexicographically sortable string. - /// - /// Versions are stored as 20-digit zero-padded integers (e.g., `00000000000000000001` - /// for version 1) so that string-based range queries and sorting work correctly. - pub fn format_table_version(version: i64) -> String { - format!("{:020}", version) - } - - /// Build the object_id for a table version entry. - /// - /// Format: `{table_object_id}${zero_padded_version}` - pub fn build_version_object_id(table_object_id: &str, version: i64) -> String { - format!( - "{}{}{}", - table_object_id, - DELIMITER, - Self::format_table_version(version) - ) - } - - /// Parse a version number from the version suffix of a table version object_id. - /// - /// The object_id is formatted as `{table_id}${zero_padded_version}`. 
- pub fn parse_version_from_object_id(object_id: &str) -> Option { - let (_namespace, name) = Self::parse_object_id(object_id); - name.parse::().ok() - } - /// Generate a new directory name in format: `_` /// The hash is used to (1) optimize object store throughput, /// (2) have high enough entropy in a short period of time to prevent issues like @@ -556,168 +967,392 @@ impl ManifestNamespace { Ok(full_url.to_string()) } - /// Perform inline optimization on the __manifest table. - /// - /// This method: - /// 1. Creates three indexes on the manifest table: - /// - BTREE index on object_id for fast lookups - /// - Bitmap index on object_type for filtering by type - /// - LabelList index on base_objects for view dependencies - /// 2. Runs file compaction to merge small files - /// 3. Optimizes existing indices - /// - /// This is called automatically after writes when inline_optimization_enabled is true. - async fn run_inline_optimization(&self) -> Result<()> { - if !self.inline_optimization_enabled { - return Ok(()); - } - - // Get a mutable reference to the dataset to perform optimization - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - let dataset: &mut Dataset = &mut dataset_guard; - - // Step 1: Create indexes if they don't already exist - let indices = dataset.load_indices().await?; - - // Check which indexes already exist - let has_object_id_index = indices.iter().any(|idx| idx.name == OBJECT_ID_INDEX_NAME); - let has_object_type_index = indices.iter().any(|idx| idx.name == OBJECT_TYPE_INDEX_NAME); - let has_base_objects_index = indices - .iter() - .any(|idx| idx.name == BASE_OBJECTS_INDEX_NAME); - - // Create BTREE index on object_id - if !has_object_id_index { - log::debug!( - "Creating BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); - let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); - if let Err(e) = dataset - .create_index( - &["object_id"], - IndexType::BTree, - 
Some(OBJECT_ID_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create BTREE index on object_id for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); + fn string_list_array(values: &[Option>], child_name: &str) -> ListArray { + let string_builder = StringBuilder::new(); + let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( + child_name, + DataType::Utf8, + true, + ))); + for value in values { + match value { + Some(objects) => { + for object in objects { + list_builder.values().append_value(object); + } + list_builder.append(true); + } + None => list_builder.append_null(), } } + list_builder.finish() + } - // Create Bitmap index on object_type - if !has_object_type_index { - log::debug!( - "Creating Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["object_type"], - IndexType::Bitmap, - Some(OBJECT_TYPE_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create Bitmap index on object_type for __manifest table: {:?}. 
Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - } - } + fn base_objects_array(values: &[Option>]) -> ListArray { + Self::string_list_array(values, "object_id") + } - // Create LabelList index on base_objects - if !has_base_objects_index { - log::debug!( - "Creating LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["base_objects"], - IndexType::LabelList, - Some(BASE_OBJECTS_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create LabelList index on base_objects for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - } - } + fn value_row_id_schema(value_field: Field) -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + value_field, + Field::new(ROW_ID, DataType::UInt64, false), + ])) + } - let should_compact_and_optimize = - dataset.count_fragments() >= MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD; + fn string_row_id_batch( + schema: SchemaRef, + values: Vec, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(values)), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - if !should_compact_and_optimize { - return Ok(()); - } + fn list_row_id_batch( + schema: SchemaRef, + values: Vec>>, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(Self::string_list_array(&values, "item")), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - // Step 2: Run file compaction - log::debug!("Running file compaction on __manifest table"); - match compact_files(dataset, CompactionOptions::default(), None).await { - 
Ok(compaction_metrics) => { - if compaction_metrics.fragments_removed > 0 { - log::info!( - "Compacted __manifest table: removed {} fragments, added {} fragments", - compaction_metrics.fragments_removed, - compaction_metrics.fragments_added - ); + fn object_id_index_stream(object_ids: BTreeMap, u64>) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let stream = stream::unfold( + (object_ids.into_iter(), false, schema), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value.to_string()); + row_ids.push(row_id); } - } - Err(e) => { - log::warn!( - "Failed to compact files for __manifest table: {:?}. Continuing with optimization.", - e - ); - } - } - - // Step 3: Optimize indices - log::debug!("Optimizing indices on __manifest table"); - match dataset.optimize_indices(&OptimizeOptions::default()).await { - Ok(_) => { - log::info!("Successfully optimized indices on __manifest table"); - } - Err(e) => { - log::warn!( - "Failed to optimize indices on __manifest table: {:?}. 
Continuing anyway.", - e - ); - } - } - - Ok(()) + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) } - /// Get the manifest schema + fn object_type_index_stream( + object_types: BTreeMap<&'static str, RoaringBitmap>, + ) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let entries = object_types + .into_iter() + .map(|(value, bitmap)| { + ( + value, + Box::new(bitmap.into_iter()) as Box + Send>, + ) + }) + .collect::>() + .into_iter(); + let stream = stream::unfold( + (entries, None, false, schema), + |(mut entries, mut current, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + while values.len() < MANIFEST_INDEX_BATCH_SIZE { + if current.is_none() { + current = entries.next(); + } + let Some((value, iter)) = current.as_mut() else { + break; + }; + if let Some(row_id) = iter.next() { + values.push((*value).to_string()); + row_ids.push(u64::from(row_id)); + } else { + current = None; + } + } + + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| 
DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } + + fn base_objects_index_stream( + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + ) -> SendableRecordBatchStream { + let schema = Self::value_row_id_schema(Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + )); + let stream_schema = schema.clone(); + let stream = stream::unfold( + ( + base_objects_values.into_iter().zip(base_objects_row_ids), + false, + schema, + ), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value); + row_ids.push(row_id); + } + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } + + async fn train_manifest_index( + dataset: &Dataset, + registry: Arc, + input: ManifestIndexBuildInput, + index_uuid: Uuid, + ) -> Result { + let index_store = LanceIndexStore::from_dataset_for_new(dataset, &index_uuid)?; + let plugin = registry.get_plugin_by_name(&input.params.index_type)?; + let training_request = plugin + .new_training_request(input.params.params.as_deref().unwrap_or("{}"), &input.field)?; + let created_index = plugin + .train_index( + input.stream, + &index_store, + 
training_request, + None, + noop_progress(), + ) + .await?; + Ok(ManifestTrainedIndex { + index_name: input.index_name, + column_name: input.column_name, + uuid: index_uuid, + created_index, + }) + } + + fn manifest_index_metadata( + lance_schema: &lance_core::datatypes::Schema, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + trained_index: ManifestTrainedIndex, + ) -> Result { + Ok(IndexMetadata { + uuid: trained_index.uuid, + fields: vec![lance_schema.field_id(trained_index.column_name)?], + name: trained_index.index_name.to_string(), + dataset_version, + fragment_bitmap: Some(fragment_bitmap.clone()), + index_details: Some(Arc::new(trained_index.created_index.index_details)), + index_version: trained_index.created_index.index_version as i32, + created_at: None, + base_id: None, + files: Some(trained_index.created_index.files), + }) + } + + fn manifest_fragment_bitmap(manifest: &Manifest) -> Result { + let mut bitmap = RoaringBitmap::new(); + for fragment in manifest.fragments.iter() { + let fragment_id = u32::try_from(fragment.id).map_err(|_| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Manifest fragment id {} exceeds u32", fragment.id), + }) + })?; + bitmap.insert(fragment_id); + } + Ok(bitmap) + } + + fn manifest_from_overwrite_transaction( + previous: &Manifest, + schema: lance_core::datatypes::Schema, + fragments: &[Fragment], + ) -> Manifest { + let mut next_fragment_id = 0; + let mut fragments = fragments + .iter() + .cloned() + .map(|mut fragment| { + if fragment.id == 0 { + fragment.id = next_fragment_id; + next_fragment_id += 1; + } + fragment + }) + .collect::>(); + fragments.sort_by_key(|fragment| fragment.id); + Manifest::new_from_previous(previous, schema, Arc::new(fragments)) + } + + async fn build_manifest_indices( + dataset: &Dataset, + manifest: &Manifest, + index_data: ManifestIndexAccumulator, + index_uuids: [Uuid; 3], + ) -> Result> { + let fragment_bitmap = Self::manifest_fragment_bitmap(manifest)?; 
+ let schema = &manifest.schema; + let ManifestIndexAccumulator { + object_ids, + object_types, + base_objects_values, + base_objects_row_ids, + .. + } = index_data; + let [object_id_uuid, object_type_uuid, base_objects_uuid] = index_uuids; + let registry = IndexPluginRegistry::with_default_plugins(); + + let dataset_version = manifest.version; + let object_id_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_ID_INDEX_NAME, + column_name: "object_id", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::BTree), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_id_index_stream(object_ids), + }, + &fragment_bitmap, + dataset_version, + object_id_uuid, + ); + let object_type_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_TYPE_INDEX_NAME, + column_name: "object_type", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_type_index_stream(object_types), + }, + &fragment_bitmap, + dataset_version, + object_type_uuid, + ); + let base_objects_index_fut = Self::build_manifest_index( + dataset, + registry, + schema, + ManifestIndexBuildInput { + index_name: BASE_OBJECTS_INDEX_NAME, + column_name: "base_objects", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::LabelList), + field: Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + stream: Self::base_objects_index_stream(base_objects_values, base_objects_row_ids), + }, + &fragment_bitmap, + dataset_version, + base_objects_uuid, + ); + + let (object_id_index, object_type_index, base_objects_index) = futures::join!( + object_id_index_fut, + object_type_index_fut, + base_objects_index_fut + ); + + Ok(vec![ + object_id_index?, + object_type_index?, + 
base_objects_index?, + ]) + } + + async fn build_manifest_index( + dataset: &Dataset, + registry: Arc, + lance_schema: &lance_core::datatypes::Schema, + input: ManifestIndexBuildInput, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + index_uuid: Uuid, + ) -> Result { + let trained_index = + Self::train_manifest_index(dataset, registry, input, index_uuid).await?; + Self::manifest_index_metadata( + lance_schema, + fragment_bitmap, + dataset_version, + trained_index, + ) + } + + /// Get the manifest schema fn manifest_schema() -> Arc { Arc::new(ArrowSchema::new(vec![ // Set unenforced primary key on object_id for bloom filter conflict detection @@ -783,6 +1418,627 @@ impl ManifestNamespace { }) } + fn required_string_value<'a>( + array: &'a StringArray, + row: usize, + column_name: &str, + ) -> Result<&'a str> { + if array.is_null(row) { + return Err(NamespaceError::Internal { + message: format!("Manifest column '{}' has null at row {}", column_name, row), + } + .into()); + } + Ok(array.value(row)) + } + + fn optional_string_value(array: &StringArray, row: usize) -> Option { + (!array.is_null(row)).then(|| array.value(row).to_string()) + } + + fn base_objects_column_values(batch: &RecordBatch) -> Result>>> { + let Some(column) = batch.column_by_name("base_objects") else { + return Ok(vec![None; batch.num_rows()]); + }; + let array = column.as_any().downcast_ref::().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Column 'base_objects' is not a list array: {:?}", + column.data_type() + ), + }) + })?; + + let mut values = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + if array.is_null(row) { + values.push(None); + continue; + } + let row_values = array.value(row); + let row_values = row_values + .as_any() + .downcast_ref::() + .ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Column 'base_objects' values are not strings".to_string(), + }) + })?; + let mut 
objects = Vec::with_capacity(row_values.len()); + for value_index in 0..row_values.len() { + if row_values.is_null(value_index) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest column 'base_objects' has null item at row {} item {}", + row, value_index + ), + } + .into()); + } + objects.push(row_values.value(value_index).to_string()); + } + values.push(Some(objects)); + } + Ok(values) + } + + async fn manifest_projected_stream(dataset: &Dataset) -> Result { + let mut scanner = dataset.scan(); + scanner + .project(&[ + "object_id", + "object_type", + "location", + "metadata", + "base_objects", + ]) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project manifest columns: {:?}", e), + }) + })?; + let stream = scanner.try_into_stream().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest stream: {:?}", e), + }) + })?; + let schema = stream.schema(); + let stream = stream.map_err(|err| DataFusionError::External(Box::new(err))); + Ok(Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + ))) + } + + fn manifest_rewrite_commit_retries(&self) -> u32 { + self.commit_retries + .unwrap_or(DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES) + } + + fn lock_manifest_rewrite_shared( + shared: &Arc>>, + ) -> Result>> { + shared.lock().map_err(|_| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite state mutex was poisoned".to_string(), + }) + }) + } + + fn set_manifest_rewrite_error( + shared: &Arc>>, + err: LanceError, + ) { + match shared.lock() { + Ok(mut guard) => { + guard.error = Some(err); + } + Err(poisoned) => { + let mut guard = poisoned.into_inner(); + guard.error = Some(err); + } + } + } + + fn take_manifest_rewrite_error( + shared: &Arc>>, + ) -> Result> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + Ok(guard.error.take()) + } + + fn 
process_manifest_rewrite_batch( + batch: RecordBatch, + shared: &Arc>>, + ) -> Result> { + let object_ids = Self::get_string_column(&batch, "object_id")?; + let object_types = Self::get_string_column(&batch, "object_type")?; + let locations = Self::get_string_column(&batch, "location")?; + let metadatas = Self::get_string_column(&batch, "metadata")?; + let base_objects = Self::base_objects_column_values(&batch)?; + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + for (row, base_objects) in base_objects.into_iter().enumerate().take(batch.num_rows()) { + let row_value = ManifestRowValue { + object_id: Self::required_string_value(object_ids, row, "object_id")?.to_string(), + object_type: ObjectType::parse(Self::required_string_value( + object_types, + row, + "object_type", + )?)?, + location: Self::optional_string_value(locations, row), + metadata: Self::optional_string_value(metadatas, row), + base_objects, + }; + guard + .mutation + .process_existing_row(row_value, &mut output, &mut index_data)?; + } + guard.index_data = Some(index_data); + if output.is_empty() { + return Ok(None); + } + Ok(Some(output.finish()?)) + } + + fn finish_manifest_rewrite_stream( + shared: &Arc>>, + ) -> Result> { + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + guard.mutation.append_rows(&mut output, &mut index_data)?; + let result = guard.mutation.finish(); + let force_empty_batch = index_data.row_count == 0; + guard.result = Some(result); + guard.index_data = 
Some(index_data); + if output.is_empty() && !force_empty_batch { + Ok(None) + } else { + Ok(Some(output.finish()?)) + } + } + + fn manifest_rewrite_output_stream( + source: SendableRecordBatchStream, + shared: Arc>>, + ) -> SendableRecordBatchStream { + enum Phase { + Source, + Finish, + Done, + } + + let schema = Self::manifest_schema(); + let stream = stream::unfold( + (source, shared, Phase::Source), + |(mut source, shared, mut phase)| async move { + loop { + match phase { + Phase::Source => match source.next().await { + Some(Ok(batch)) => { + match Self::process_manifest_rewrite_batch(batch, &shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } + } + Some(Err(err)) => { + return Some((Err(err), (source, shared, Phase::Done))); + } + None => phase = Phase::Finish, + }, + Phase::Finish => { + phase = Phase::Done; + match Self::finish_manifest_rewrite_stream(&shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } + } + Phase::Done => return None, + } + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + )) + } + + fn take_manifest_rewrite_result( + shared: &Arc>>, + ) -> Result<(CopyOnWriteMutation, ManifestIndexAccumulator)> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let result = guard.result.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite 
stream did not finish".to_string(), + }) + })?; + let index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + Ok((result, index_data)) + } + + /// Delete the staged (uncommitted) data files and index directories for a rewrite. + /// Only call this once the rewrite is known *not* to have landed (a put-if-not-exists + /// conflict, or an ambiguous error whose target version does not reference our data + /// file) — otherwise it would orphan files a committed manifest still references. + async fn cleanup_staged_manifest_files( + &self, + object_store: &ObjectStore, + data_files: &HashSet, + index_uuids: &[Uuid], + ) { + let data_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + for path in data_files { + let data_path = data_dir.clone().join(path.as_str()); + if let Err(err) = object_store.delete(&data_path).await { + log::warn!( + "Failed to clean up uncommitted manifest rewrite data file '{}': {}", + data_path, + err + ); + } + } + self.cleanup_uncommitted_manifest_index_dirs(object_store, index_uuids.iter().copied()) + .await; + } + + async fn cleanup_uncommitted_manifest_index_dirs( + &self, + object_store: &ObjectStore, + index_uuids: impl IntoIterator, + ) { + for index_uuid in index_uuids { + let index_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR) + .join(index_uuid.to_string()); + if let Err(err) = object_store.remove_dir_all(index_dir.clone()).await + && !matches!(err, LanceError::NotFound { .. }) + { + log::warn!( + "Failed to clean up uncommitted manifest rewrite index directory '{}': {}", + index_dir, + err + ); + } + } + } + + /// Resolve the commit handler for the `__manifest` dataset's storage backend. 
+ async fn manifest_commit_handler(&self) -> Result> { + commit_handler_from_url(&self.root, &None) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to resolve manifest commit handler: {:?}", e), + }) + }) + } + + /// Directly write the rewritten `__manifest` as a new version using the storage + /// backend's atomic put-if-not-exists. The overwrite transaction is embedded inline + /// (no separate transaction file) and the commit handler writes the version hint. + async fn commit_manifest_overwrite( + &self, + dataset: &Dataset, + commit_handler: &dyn CommitHandler, + manifest: &mut Manifest, + indices: Option>, + transaction: Transaction, + ) -> std::result::Result<(), CommitError> { + apply_feature_flags(manifest, false, false).map_err(CommitError::from)?; + let timestamp_nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + manifest.set_timestamp(timestamp_nanos); + manifest.update_max_fragment_id(); + + // Commit through the dataset's own object store, not `self.object_store`: for + // stores like `memory://` the namespace and the dataset can hold different + // instances, and a commit written to the wrong one is invisible to reads. + let object_store = dataset + .object_store(None) + .await + .map_err(CommitError::from)?; + let base_path = self.base_path.clone().join(MANIFEST_TABLE_NAME); + let naming_scheme = dataset.manifest_location().naming_scheme; + commit_handler + .commit( + manifest, + indices, + &base_path, + &object_store, + write_manifest_file_to_path, + naming_scheme, + Some((&transaction).into()), + ) + .await + .map(|_location| ()) + } + + /// After an ambiguous commit error, determine whether our overwrite actually landed at + /// `target_version`. 
A network failure can leave the manifest committed even though the + /// client observed an error; in that case the committed version references one of our + /// staged data files, and deleting them would corrupt the catalog. + async fn manifest_commit_landed( + &self, + dataset: &Dataset, + target_version: u64, + data_files: &HashSet, + ) -> bool { + let Ok(committed) = dataset.checkout_version(target_version).await else { + return false; + }; + committed.manifest().fragments.iter().any(|fragment| { + fragment + .files + .iter() + .any(|file| data_files.contains(file.path.as_str())) + }) + } + + /// Resolve a storage commit conflict against the latest committed catalog state. + /// Returns `Some(output)` when the mutation's intent is already satisfied (no retry + /// needed), `Ok(None)` to retry the rewrite, or an error for a terminal conflict. + async fn resolve_manifest_conflict( + &self, + resolution: &ConflictResolution, + ) -> Result> { + match resolution { + ConflictResolution::Retry => Ok(None), + ConflictResolution::FailIfExists(object_ids) => { + for object_id in object_ids { + if self.manifest_contains_object(object_id).await? { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + object_id + ), + } + .into()); + } + } + Ok(None) + } + ConflictResolution::SucceedIfAbsent { object_id, output } => { + if self.manifest_contains_object(object_id).await? { + Ok(None) + } else { + Ok(Some(output.clone())) + } + } + } + } + + /// Validate that this build can write the current `__manifest` before a + /// mutating operation performs any side effect (e.g. writing table data), so + /// a refused write leaves nothing orphaned behind. The eventual + /// `rewrite_manifest` commit re-checks `ensure_writable` on each retry, so a + /// concurrent upgrade in between is still caught. 
+ async fn ensure_manifest_writable(&self) -> Result<()> { + let dataset_guard = self.manifest_dataset.get().await?; + ensure_writable(dataset_guard.metadata()) + } + + async fn rewrite_manifest( + &self, + operation: &str, + mut make_mutation: F, + ) -> Result + where + M: ManifestStreamMutation + 'static, + F: FnMut() -> M, + { + let _mutation_guard = self.manifest_mutation_lock.lock().await; + let max_retries = self.manifest_rewrite_commit_retries(); + let mut retries = 0; + let build_indices = self.inline_optimization_enabled; + let commit_handler = self.manifest_commit_handler().await?; + + loop { + let dataset_guard = self.manifest_dataset.get_refreshed().await?; + let dataset = Arc::new(dataset_guard.clone()); + drop(dataset_guard); + // Refuse to mutate a manifest written with a writer feature flag this + // build does not understand. + ensure_writable(dataset.metadata())?; + // Staged files, indices, the commit, and cleanup must all use the dataset's + // own object store (see `commit_manifest_overwrite`). + let object_store = dataset.object_store(None).await?; + + let source = Self::manifest_projected_stream(&dataset).await?; + let resolution = make_mutation().conflict_resolution(); + let shared = Arc::new(StdMutex::new(ManifestRewriteShared::new(make_mutation()))); + let output_stream = Self::manifest_rewrite_output_stream(source, shared.clone()); + // Pin both limits so the overwrite never splits into multiple fragments: the + // replacement indices map each row to address `(0 << 32) | offset`, valid only + // for a single fragment with id 0. The row count is bounded below u32::MAX by + // `ManifestIndexAccumulator::next_row_id`. 
+ let write_params = WriteParams { + mode: WriteMode::Overwrite, + session: self.session.clone(), + max_rows_per_file: u32::MAX as usize, + max_bytes_per_file: usize::MAX, + skip_auto_cleanup: true, + ..WriteParams::default() + }; + + let transaction = match InsertBuilder::new(dataset.clone()) + .with_params(&write_params) + .execute_uncommitted_stream(output_stream) + .await + { + Ok(transaction) => transaction, + Err(err) => { + if let Some(stream_err) = Self::take_manifest_rewrite_error(&shared)? { + return Err(stream_err); + } + return Err(convert_lance_commit_error(&err, operation, None)); + } + }; + + let (mutation, index_data) = Self::take_manifest_rewrite_result(&shared)?; + + let Operation::Overwrite { + fragments, schema, .. + } = &transaction.operation + else { + return Err(NamespaceError::Internal { + message: "Manifest rewrite transaction is not an overwrite".to_string(), + } + .into()); + }; + // Unique data files this attempt staged. Used to clean up orphans and to + // attribute an ambiguous commit error back to us. 
+ let staged_data_files = fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .filter(|file| file.base_id.is_none()) + .map(|file| file.path.clone()) + .collect::>(); + + if !mutation.has_changes { + self.cleanup_staged_manifest_files(&object_store, &staged_data_files, &[]) + .await; + return Ok(mutation.result); + } + + let mut manifest = Self::manifest_from_overwrite_transaction( + dataset.manifest(), + schema.clone(), + fragments, + ); + let target_version = manifest.version; + + let index_uuids = [Uuid::new_v4(), Uuid::new_v4(), Uuid::new_v4()]; + let indices = if build_indices { + match Self::build_manifest_indices(&dataset, &manifest, index_data, index_uuids) + .await + { + Ok(indices) => Some(indices), + Err(err) => { + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + &index_uuids, + ) + .await; + return Err(err); + } + } + } else { + None + }; + let staged_index_uuids: &[Uuid] = if build_indices { &index_uuids } else { &[] }; + + let commit_result = self + .commit_manifest_overwrite( + &dataset, + commit_handler.as_ref(), + &mut manifest, + indices, + transaction, + ) + .await; + + match commit_result { + Ok(()) => { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + Err(err) => { + // The put may have landed even though the client saw an error (lost + // ack). Verify before deleting anything so we never orphan files that a + // committed manifest still references. + if self + .manifest_commit_landed(&dataset, target_version, &staged_data_files) + .await + { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + staged_index_uuids, + ) + .await; + match err { + CommitError::CommitConflict => { + if let Some(output) = + self.resolve_manifest_conflict(&resolution).await? 
+ { + return Ok(output); + } + if retries >= max_retries { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "{}: still conflicting after {} retries", + operation, max_retries + ), + } + .into()); + } + retries += 1; + tokio::time::sleep(std::time::Duration::from_millis( + 10 * u64::from(retries), + )) + .await; + } + CommitError::OtherError(err) => { + return Err(convert_lance_commit_error(&err, operation, None)); + } + } + } + } + } + } + /// Check if the manifest contains an object with the given ID async fn manifest_contains_object(&self, object_id: &str) -> Result { let escaped_id = object_id.replace('\'', "''"); @@ -999,7 +2255,6 @@ impl ManifestNamespace { /// Insert one or more entries into the manifest table with metadata and base_objects. /// /// This is the unified entry point for both single and batch inserts. - /// Uses a single MergeInsert operation to insert all entries at once. /// If any entry already exists (matching object_id), the entire batch fails. 
pub async fn insert_into_manifest_with_metadata( &self, @@ -1029,181 +2284,55 @@ impl ManifestNamespace { return Ok(()); } - let schema = Self::manifest_schema(); - - let mut object_ids = Vec::with_capacity(entries.len()); - let mut object_types = Vec::with_capacity(entries.len()); - let mut locations: Vec> = Vec::with_capacity(entries.len()); - let mut metadatas: Vec> = Vec::with_capacity(entries.len()); + self.rewrite_manifest("Failed to overwrite manifest", || { + UpsertManifestMutation::new(entries.clone(), base_objects.clone(), when_matched.clone()) + }) + .await + } - let string_builder = StringBuilder::new(); - let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( - "object_id", - DataType::Utf8, - true, - ))); + /// Delete an entry from the manifest table + pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { + let object_id = object_id.to_string(); + self.rewrite_manifest("Failed to delete from manifest", || DeleteObjectMutation { + object_id: object_id.clone(), + deleted: false, + }) + .await + } - for (i, entry) in entries.iter().enumerate() { - object_ids.push(entry.object_id.as_str()); - object_types.push(entry.object_type.as_str()); - locations.push(entry.location.clone()); - metadatas.push(entry.metadata.clone()); - - // Only the first entry gets the base_objects (for single-entry inserts - // with base_objects like view creation); batch entries use null. - if i == 0 { - match &base_objects { - Some(objects) => { - for obj in objects { - list_builder.values().append_value(obj); - } - list_builder.append(true); - } - None => { - list_builder.append_null(); - } - } - } else { - list_builder.append_null(); + /// Register a table in the manifest without creating the physical table (internal helper for migration) + pub async fn register_table(&self, name: &str, location: String) -> Result<()> { + let object_id = Self::build_object_id(&[], name); + if self.manifest_contains_object(&object_id).await? 
{ + return Err(NamespaceError::Internal { + message: format!("Table '{}' already exists", name), } + .into()); } - let base_objects_array = list_builder.finish(); - - let location_array: Arc = Arc::new(StringArray::from( - locations.iter().map(|l| l.as_deref()).collect::>(), - )); - - let metadata_array: Arc = Arc::new(StringArray::from( - metadatas.iter().map(|m| m.as_deref()).collect::>(), - )); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(object_ids)), - Arc::new(StringArray::from(object_types.to_vec())), - location_array, - metadata_array, - Arc::new(base_objects_array), - ], - ) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create manifest entries: {:?}", e), - }) - })?; - - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - - // Use MergeInsert so callers can choose fail-on-existing inserts or metadata upserts. - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset_arc = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before merge insert - - let mut merge_builder = - MergeInsertBuilder::try_new(dataset_arc, vec!["object_id".to_string()]).map_err( - |e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create merge builder: {:?}", e), - }) - }, - )?; - merge_builder.when_matched(when_matched); - merge_builder.when_not_matched(WhenNotMatched::InsertAll); - // Use conflict_retries to handle cross-process races on manifest mutations. - merge_builder.conflict_retries(5); - // TODO: after BTREE index creation on object_id, has_scalar_index=true causes - // MergeInsert to use V1 path which lacks bloom filters for conflict detection. This - // results in (Some, None) filter mismatch when rebasing against V2 operations. - // Setting use_index=false ensures all operations consistently use V2 path. 
- merge_builder.use_index(false); - if let Some(retries) = self.commit_retries { - merge_builder.commit_retries(retries); - } - - let (new_dataset_arc, _merge_stats) = merge_builder - .try_build() - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to build merge: {:?}", e), - }) - })? - .execute_reader(Box::new(reader)) + self.insert_into_manifest(object_id, ObjectType::Table, Some(location)) .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to execute merge insert into manifest", None) - })?; - - let new_dataset = Arc::try_unwrap(new_dataset_arc).unwrap_or_else(|arc| (*arc).clone()); - self.manifest_dataset.set_latest(new_dataset).await; - - // Run inline optimization after write - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(()) } - /// Delete an entry from the manifest table - pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { - let predicate = format!("object_id = '{}'", object_id); - - // Get dataset and use DeleteBuilder with configured retries - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before delete - - let new_dataset = DeleteBuilder::new(dataset, &predicate) - .execute() - .await - .map_err(|e| convert_lance_commit_error(&e, "Failed to delete", None))?; - - // Update the wrapper with the new dataset - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; - - // Run inline optimization after delete - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + /// Validate that all levels of a namespace path exist + async fn 
validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { + for i in 1..=namespace_path.len() { + let partial_path = &namespace_path[..i]; + let object_id = partial_path.join(DELIMITER); + if !self.manifest_contains_object(&object_id).await? { + return Err(NamespaceError::NamespaceNotFound { + message: format!("parent namespace '{}'", object_id), + } + .into()); + } } - Ok(()) } - /// Query the manifest for all versions of a table, sorted by version. - /// - /// Returns a list of (version, metadata_json_string) tuples where metadata_json_string - /// contains the full metadata JSON stored in the manifest (manifest_path, manifest_size, - /// e_tag, naming_scheme). - /// - /// **Known limitation**: All matching rows are loaded into memory, sorted in Rust, - /// and then truncated. For tables with a very large number of versions this may be - /// expensive. Pushing sort/limit into the scan is not yet supported by Lance. - pub async fn query_table_versions( - &self, - object_id: &str, - descending: bool, - limit: Option, - ) -> Result> { + /// Query the manifest for a namespace with the given object ID + async fn query_manifest_for_namespace(&self, object_id: &str) -> Result> { let escaped_id = object_id.replace('\'', "''"); - // table_version object_ids are formatted as "{object_id}${zero_padded_version}" - let filter = format!( - "object_type = 'table_version' AND starts_with(object_id, '{}{}')", - escaped_id, DELIMITER - ); + let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id); let mut scanner = self.manifest_scanner().await?; scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { @@ -1217,200 +2346,285 @@ impl ManifestNamespace { })?; let batches = Self::execute_scanner(scanner).await?; - let mut versions: Vec<(i64, String)> = Vec::new(); + let mut found_result: Option = None; + let mut total_rows = 0; + for batch in batches { if batch.num_rows() == 0 { continue; } - let 
object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - for i in 0..batch.num_rows() { - let oid = object_id_array.value(i); - // Parse version from object_id - if let Some(version) = Self::parse_version_from_object_id(oid) { - let metadata_str = metadata_array.value(i).to_string(); - versions.push((version, metadata_str)); + + total_rows += batch.num_rows(); + if total_rows > 1 { + return Err(NamespaceError::Internal { + message: format!( + "Expected exactly 1 namespace with id '{}', found {}", + object_id, total_rows + ), } + .into()); } - } - if descending { - versions.sort_by(|a, b| b.0.cmp(&a.0)); - } else { - versions.sort_by(|a, b| a.0.cmp(&b.0)); - } + let object_id_array = Self::get_string_column(&batch, "object_id")?; + let metadata_array = Self::get_string_column(&batch, "metadata")?; + + let object_id_str = object_id_array.value(0); + let metadata = if !metadata_array.is_null(0) { + let metadata_str = metadata_array.value(0); + match serde_json::from_str::>(metadata_str) { + Ok(map) => Some(map), + Err(e) => { + return Err(NamespaceError::Internal { + message: format!( + "Failed to deserialize metadata for namespace '{}': {}", + object_id, e + ), + } + .into()); + } + } + } else { + None + }; - if let Some(limit) = limit { - versions.truncate(limit as usize); + let (namespace, name) = Self::parse_object_id(object_id_str); + found_result = Some(NamespaceInfo { + namespace, + name, + metadata, + }); } - Ok(versions) + Ok(found_result) } - /// Query the manifest for a specific version of a table. - /// - /// Returns the full metadata JSON string if found, which contains - /// manifest_path, manifest_size, e_tag, and naming_scheme. + /// Create or load the manifest dataset, ensuring it has the latest schema setup. 
/// - pub async fn query_table_version( - &self, - object_id: &str, - version: i64, - ) -> Result> { - let version_object_id = Self::build_version_object_id(object_id, version); - self.query_table_version_by_object_id(&version_object_id) - .await - } + /// This function will: + /// 1. Try to load an existing manifest table + /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata) + /// 3. If it doesn't exist, create a new manifest table with the current schema + async fn ensure_manifest_table_up_to_date( + root: &str, + storage_options: &Option>, + session: Option>, + ) -> Result { + let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME); + log::debug!("Attempting to load manifest from {}", manifest_path); + let store_options = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let read_params = ReadParams { + session: session.clone(), + store_options: Some(store_options.clone()), + ..Default::default() + }; + let dataset_result = DatasetBuilder::from_uri(&manifest_path) + .with_read_params(read_params) + .load() + .await; + if let Ok(mut dataset) = dataset_result { + // Reject a manifest written with a reader feature flag this build + // does not understand before touching it. + ensure_readable(dataset.metadata())?; - /// Query a specific table version by its exact object_id. 
- async fn query_table_version_by_object_id( - &self, - version_object_id: &str, - ) -> Result> { - let escaped_id = version_object_id.replace('\'', "''"); - let filter = format!( - "object_id = '{}' AND object_type = 'table_version'", - escaped_id - ); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; + // Check if the object_id field has primary key metadata, migrate if not + let needs_pk_migration = dataset + .schema() + .field("object_id") + .map(|f| { + !f.metadata + .contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + }) + .unwrap_or(false); - for batch in batches { - if batch.num_rows() == 0 { - continue; + if needs_pk_migration { + // This legacy migration writes to the manifest, so confirm this + // build is allowed to write the current format first. + ensure_writable(dataset.metadata())?; + log::info!("Migrating __manifest table to add primary key metadata on object_id"); + dataset + .update_field_metadata() + .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")]) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to find object_id field for migration: {:?}", + e + ), + }) + })? + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to migrate primary key metadata: {:?}", e), + }) + })?; } - let metadata_array = Self::get_string_column(&batch, "metadata")?; - return Ok(Some(metadata_array.value(0).to_string())); - } - - Ok(None) - } - - /// Delete table version entries from the manifest for a given table and version ranges. 
- /// - /// Each range is (start_version, end_version) inclusive. Deletes all matching - /// `object_type = 'table_version'` entries whose object_id matches - /// `{object_id}${zero_padded_version}`. - /// - /// Builds a single filter expression covering all version ranges and executes - /// one bulk delete operation instead of deleting versions one at a time. - pub async fn delete_table_versions( - &self, - object_id: &str, - ranges: &[(i64, i64)], - ) -> Result { - if ranges.is_empty() { - return Ok(0); - } - // Collect all object_ids to delete (both new zero-padded and legacy formats) - let mut object_id_conditions: Vec = Vec::new(); - for (start, end) in ranges { - for version in *start..=*end { - let oid = Self::build_version_object_id(object_id, version); - let escaped = oid.replace('\'', "''"); - object_id_conditions.push(format!("'{}'", escaped)); - } - } + Ok(DatasetConsistencyWrapper::new(dataset)) + } else { + log::info!("Creating new manifest table at {}", manifest_path); + let schema = Self::manifest_schema(); + let empty_batch = RecordBatch::new_empty(schema.clone()); + let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); - if object_id_conditions.is_empty() { - return Ok(0); - } + let store_params = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let write_params = WriteParams { + session: session.clone(), + store_params: Some(store_params), + ..Default::default() + }; - // First, count how many entries exist so we can report the deleted count - let in_list = object_id_conditions.join(", "); - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); + let dataset = + Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await; - let mut scanner = self.manifest_scanner().await?; - 
scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "location"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { - return Ok(0); + // Handle race condition where another process created the manifest concurrently + match dataset { + Ok(dataset) => { + log::info!( + "Successfully created manifest table at {}, version={}, uri={}", + manifest_path, + dataset.version().version, + dataset.uri() + ); + Ok(DatasetConsistencyWrapper::new(dataset)) + } + Err(ref e) + if matches!( + e, + LanceError::DatasetAlreadyExists { .. } + | LanceError::CommitConflict { .. } + | LanceError::IncompatibleTransaction { .. } + | LanceError::RetryableCommitConflict { .. 
} + ) => + { + // Another process created the manifest concurrently, try to load it + log::info!( + "Manifest table was created by another process, loading it: {}", + manifest_path + ); + let recovery_store_options = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let recovery_read_params = ReadParams { + session, + store_options: Some(recovery_store_options), + ..Default::default() + }; + let dataset = DatasetBuilder::from_uri(&manifest_path) + .with_read_params(recovery_read_params) + .load() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to load manifest dataset after creation conflict: {}", + e + ), + }) + })?; + Ok(DatasetConsistencyWrapper::new(dataset)) + } + Err(e) => Err(lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest dataset: {:?}", e), + })), + } } + } - // Execute a single bulk delete with the combined filter - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to batch delete table versions", None) - })?; + /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit. + /// + /// Returns the next page token (last item in this page) if more results exist beyond the limit, + /// or `None` if this is the last page. 
+ fn apply_pagination( + names: &mut Vec, + page_token: Option, + limit: Option, + ) -> Option { + names.sort(); - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; + if let Some(start_after) = page_token { + if let Some(index) = names + .iter() + .position(|name| name.as_str() > start_after.as_str()) + { + names.drain(0..index); + } else { + names.clear(); + } + } - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + if let Some(limit) = limit + && limit >= 0 + { + let limit = limit as usize; + if names.len() > limit { + let next_page_token = if limit > 0 { + Some(names[limit - 1].clone()) + } else { + None + }; + names.truncate(limit); + return next_page_token; + } } - Ok(deleted_count) + None } +} - /// Atomically delete table version entries from the manifest by their object_ids. - /// - /// This method supports multi-table transactional deletion: all specified - /// object_ids (which may span multiple tables) are deleted in a single atomic - /// `DeleteBuilder` operation. Either all entries are removed or none are. - /// - /// Object IDs are formatted as `{table_id}${version}`. 
- pub async fn batch_delete_table_versions_by_object_ids( - &self, - object_ids: &[String], - ) -> Result { - if object_ids.is_empty() { - return Ok(0); - } +#[async_trait] +impl LanceNamespace for ManifestNamespace { + fn namespace_id(&self) -> String { + self.root.clone() + } - let in_list: String = object_ids - .iter() - .map(|oid| { - let escaped = oid.replace('\'', "''"); - format!("'{}'", escaped) + async fn list_tables(&self, request: ListTablesRequest) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), }) - .collect::>() - .join(", "); + })?; - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); + // Build filter to find tables in this namespace + let filter = if namespace_id.is_empty() { + // Root namespace: find tables without a namespace prefix + "object_type = 'table' AND NOT contains(object_id, '$')".to_string() + } else { + // Namespaced: find tables that start with namespace$ but have no additional $ + let prefix = namespace_id.join(DELIMITER); + format!( + "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", + prefix, + DELIMITER, + prefix.len() + 2 + ) + }; - // Count how many entries exist so we can report the deleted count let mut scanner = self.manifest_scanner().await?; scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { @@ -1422,576 +2636,420 @@ impl ManifestNamespace { message: format!("Failed to project: {:?}", e), }) })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { - return Ok(0); - } - - // Execute a single atomic bulk delete covering all tables - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - 
let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error( - &e, - "Failed to batch delete table versions across multiple tables", - None, - ) - })?; + let batches = Self::execute_scanner(scanner).await?; - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; + let mut table_entries = Vec::new(); + for batch in batches { + if batch.num_rows() == 0 { + continue; + } - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + let object_id_array = Self::get_string_column(&batch, "object_id")?; + let location_array = Self::get_string_column(&batch, "location")?; + for i in 0..batch.num_rows() { + let object_id = object_id_array.value(i); + let location = location_array.value(i); + let (_namespace, name) = Self::parse_object_id(object_id); + table_entries.push((name, location.to_string())); + } } - Ok(deleted_count) - } + let mut tables: Vec = if request.include_declared.unwrap_or(true) { + table_entries.into_iter().map(|(name, _)| name).collect() + } else { + let mut stream = futures::stream::iter(table_entries.into_iter().map( + |(name, location)| async move { + // `include_declared=false` is an explicit opt-in. We still pay one + // `_versions/` probe per table so declared-state is derived from actual + // manifests. This is linear in the total number of listed tables, and we do + // the probes with bounded concurrency before pagination. + if self.location_has_actual_manifests(&location).await? { + Ok::, Error>(Some(name)) + } else { + Ok::, Error>(None) + } + }, + )) + .buffered(DECLARED_FILTER_CONCURRENCY); - /// Set a property flag in the __manifest table's metadata key-value map. 
- /// - /// This uses `dataset.update_metadata()` to persist the flag in the - /// __manifest dataset's table metadata, rather than inserting a row. - /// If the property already exists with the same value, this is a no-op. - pub async fn set_property(&self, name: &str, value: &str) -> Result<()> { - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - if dataset_guard.metadata().get(name) == Some(&value.to_string()) { - return Ok(()); - } - drop(dataset_guard); + let mut filtered = Vec::new(); + while let Some(result) = stream.next().await { + if let Some(name) = result? { + filtered.push(name); + } + } + filtered + }; - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - dataset_guard - .update_metadata([(name, value)]) - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to set property '{}' in __manifest metadata: {}", - name, e - ), - }) - })?; - Ok(()) + let next_page_token = + Self::apply_pagination(&mut tables, request.page_token, request.limit); + let mut response = ListTablesResponse::new(tables); + response.page_token = next_page_token; + Ok(response) } - /// Check if a property flag exists in the __manifest table's metadata key-value map. - pub async fn has_property(&self, name: &str) -> Result { - let dataset_guard = self.manifest_dataset.get().await?; - Ok(dataset_guard.metadata().contains_key(name)) - } + async fn describe_table(&self, request: DescribeTableRequest) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), + }) + })?; - /// Parse metadata JSON into a `TableVersion`. - /// - /// Returns `None` if metadata is invalid or missing required fields. 
- fn parse_table_version(version: i64, metadata_str: &str) -> Option { - let meta: serde_json::Value = match serde_json::from_str(metadata_str) { - Ok(v) => v, - Err(e) => { - log::warn!( - "Skipping version {} due to invalid metadata JSON: {}", - version, - e - ); - return None; - } - }; - let manifest_path = match meta.get("manifest_path").and_then(|v| v.as_str()) { - Some(p) => p.to_string(), - None => { - log::warn!( - "Skipping version {} due to missing 'manifest_path' in metadata — \ - this may indicate data corruption", - version - ); - return None; + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } - }; - let manifest_size = meta.get("manifest_size").and_then(|v| v.as_i64()); - let e_tag = meta - .get("e_tag") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); - Some(TableVersion { - version, - manifest_path, - manifest_size, - e_tag, - timestamp_millis: None, - metadata: None, - }) - } + .into()); + } - /// List table versions from the __manifest table. - /// - /// Queries the manifest for all versions of the given table and returns - /// them as a `ListTableVersionsResponse`. 
- pub async fn list_table_versions( - &self, - table_id: &[String], - descending: bool, - limit: Option, - ) -> Result { let object_id = Self::str_object_id(table_id); - let manifest_versions = self - .query_table_versions(&object_id, descending, limit) - .await?; - - let table_versions: Vec = manifest_versions - .into_iter() - .filter_map(|(version, metadata_str)| Self::parse_table_version(version, &metadata_str)) - .collect(); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - Ok(ListTableVersionsResponse { - versions: table_versions, - page_token: None, - }) - } + // Extract table name and namespace from table_id + let table_name = table_id.last().cloned().unwrap_or_default(); + let namespace_id: Vec = if table_id.len() > 1 { + table_id[..table_id.len() - 1].to_vec() + } else { + vec![] + }; - /// Describe a specific table version from the __manifest table. - /// - /// Queries the manifest for a specific version and returns it as a - /// `DescribeTableVersionResponse`. Returns an error if the version is not found. - pub async fn describe_table_version( - &self, - table_id: &[String], - version: i64, - ) -> Result { - let object_id = Self::str_object_id(table_id); - if let Some(metadata_str) = self.query_table_version(&object_id, version).await? 
- && let Some(tv) = Self::parse_table_version(version, &metadata_str) - { - return Ok(DescribeTableVersionResponse { - version: Box::new(tv), - }); - } - Err(NamespaceError::TableVersionNotFound { - message: format!("version {} for table {:?}", version, table_id), - } - .into()) - } + let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); + let should_check_declared = + load_detailed_metadata || request.check_declared.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); - /// Register a table in the manifest without creating the physical table (internal helper for migration) - pub async fn register_table(&self, name: &str, location: String) -> Result<()> { - let object_id = Self::build_object_id(&[], name); - if self.manifest_contains_object(&object_id).await? { - return Err(NamespaceError::Internal { - message: format!("Table '{}' already exists", name), - } - .into()); - } + match table_info { + Some(info) => { + // Construct full URI from relative location + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; - self.insert_into_manifest(object_id, ObjectType::Table, Some(location)) - .await - } + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + let is_only_declared = if should_check_declared { + Some(!self.location_has_actual_manifests(&info.location).await?) + } else { + None + }; - /// Validate that all levels of a namespace path exist - async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { - for i in 1..=namespace_path.len() { - let partial_path = &namespace_path[..i]; - let object_id = partial_path.join(DELIMITER); - if !self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::NamespaceNotFound { - message: format!("parent namespace '{}'", object_id), + if !load_detailed_metadata { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + properties: info.metadata, + is_only_declared, + ..Default::default() + }); + } + + if is_only_declared == Some(true) { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + properties: info.metadata, + is_only_declared, + ..Default::default() + }); + } + + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + + match builder.load().await { + Ok(mut dataset) => { + // If a specific version is requested, checkout that version + if let Some(requested_version) = request.version { + dataset = dataset.checkout_version(requested_version as u64).await?; + } + + let version = dataset.version().version; + let lance_schema = dataset.schema(); + let arrow_schema: arrow_schema::Schema = lance_schema.into(); + let json_schema = arrow_schema_to_json(&arrow_schema)?; + + Ok(DescribeTableResponse { + table: Some(table_name.clone()), + namespace: Some(namespace_id.clone()), + version: Some(version as i64), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + schema: Some(Box::new(json_schema)), + storage_options, + properties: info.metadata.clone(), + is_only_declared, + ..Default::default() + }) + } + Err(err) => Err(NamespaceError::Internal { + message: format!( + "Table exists in manifest but failed to load dataset '{}': {}", + object_id, err + ), + } + .into()), } - .into()); } + None => 
Err(NamespaceError::TableNotFound { + message: Self::format_table_id(table_id), + } + .into()), } - Ok(()) } - /// Query the manifest for a namespace with the given object ID - async fn query_manifest_for_namespace(&self, object_id: &str) -> Result> { - let escaped_id = object_id.replace('\'', "''"); - let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), + async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), }) })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option = None; - let mut total_rows = 0; - for batch in batches { - if batch.num_rows() == 0 { - continue; + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } + .into()); + } - total_rows += batch.num_rows(); - if total_rows > 1 { - return Err(NamespaceError::Internal { - message: format!( - "Expected exactly 1 namespace with id '{}', found {}", - object_id, total_rows - ), - } - .into()); + let object_id = Self::str_object_id(table_id); + let exists = self.manifest_contains_object(&object_id).await?; + if exists { + Ok(()) + } else { + Err(NamespaceError::TableNotFound { + message: Self::format_table_id(table_id), } - - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - - let object_id_str = object_id_array.value(0); - let metadata = if 
!metadata_array.is_null(0) { - let metadata_str = metadata_array.value(0); - match serde_json::from_str::>(metadata_str) { - Ok(map) => Some(map), - Err(e) => { - return Err(NamespaceError::Internal { - message: format!( - "Failed to deserialize metadata for namespace '{}': {}", - object_id, e - ), - } - .into()); - } - } - } else { - None - }; - - let (namespace, name) = Self::parse_object_id(object_id_str); - found_result = Some(NamespaceInfo { - namespace, - name, - metadata, - }); + .into()) } - - Ok(found_result) } - /// Create or load the manifest dataset, ensuring it has the latest schema setup. - /// - /// This function will: - /// 1. Try to load an existing manifest table - /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata) - /// 3. If it doesn't exist, create a new manifest table with the current schema - /// 4. Persist feature flags (e.g., table_version_storage_enabled) if requested - async fn ensure_manifest_table_up_to_date( - root: &str, - storage_options: &Option>, - session: Option>, - table_version_storage_enabled: bool, - ) -> Result { - let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME); - log::debug!("Attempting to load manifest from {}", manifest_path); - let store_options = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let read_params = ReadParams { - session: session.clone(), - store_options: Some(store_options.clone()), - ..Default::default() - }; - let dataset_result = DatasetBuilder::from_uri(&manifest_path) - .with_read_params(read_params) - .load() - .await; - if let Ok(mut dataset) = dataset_result { - // Check if the object_id field has primary key metadata, migrate if not - let needs_pk_migration = dataset - .schema() - .field("object_id") - .map(|f| { - !f.metadata - 
.contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) - }) - .unwrap_or(false); + async fn create_table( + &self, + request: CreateTableRequest, + data: Bytes, + ) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), + }) + })?; - if needs_pk_migration { - log::info!("Migrating __manifest table to add primary key metadata on object_id"); - dataset - .update_field_metadata() - .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")]) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to find object_id field for migration: {:?}", - e - ), - }) - })? - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to migrate primary key metadata: {:?}", e), - }) - })?; + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } + .into()); + } - // Persist table_version_storage_enabled flag in __manifest so that once - // enabled, it becomes a permanent property of this namespace. - if table_version_storage_enabled { - let needs_flag = dataset - .metadata() - .get("table_version_storage_enabled") - .map(|v| v != "true") - .unwrap_or(true); + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - if needs_flag - && let Err(e) = dataset - .update_metadata([("table_version_storage_enabled", "true")]) - .await - { - log::warn!( - "Failed to persist table_version_storage_enabled flag in __manifest: {:?}", - e - ); - } - } + // Refuse before writing any table data if this build cannot write the + // manifest, so a refused create leaves no orphaned dataset behind. 
+ self.ensure_manifest_writable().await?; - Ok(DatasetConsistencyWrapper::new(dataset)) + let existing_table = self.query_manifest_for_table(&object_id).await?; + let existing_has_manifests = if let Some(existing_table) = &existing_table { + Some( + self.location_has_actual_manifests(&existing_table.location) + .await?, + ) } else { - log::info!("Creating new manifest table at {}", manifest_path); - let schema = Self::manifest_schema(); - let empty_batch = RecordBatch::new_empty(schema.clone()); - let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); + None + }; - let store_params = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let write_params = WriteParams { - session: session.clone(), - store_params: Some(store_params), - ..Default::default() - }; + if existing_has_manifests == Some(false) + && request + .properties + .as_ref() + .is_some_and(|properties| !properties.is_empty()) + { + return Err(NamespaceError::InvalidInput { + message: format!( + "create_table cannot set properties for already declared table '{}'", + object_id + ), + } + .into()); + } - let dataset = - Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await; + let create_mode = if existing_has_manifests == Some(false) { + CreateTableMode::Create + } else { + CreateTableMode::parse(request.mode.as_deref())? 
+ }; + let dir_name = if let Some(existing_table) = &existing_table { + existing_table.location.clone() + } else if namespace.is_empty() && self.dir_listing_enabled { + format!("{}.lance", table_name) + } else { + Self::generate_dir_name(&object_id) + }; + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + let overwriting_existing_table = + existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite; - // Handle race condition where another process created the manifest concurrently - match dataset { - Ok(dataset) => { - log::info!( - "Successfully created manifest table at {}, version={}, uri={}", - manifest_path, - dataset.version().version, - dataset.uri() - ); - Ok(DatasetConsistencyWrapper::new(dataset)) + if existing_has_manifests == Some(true) { + match create_mode { + CreateTableMode::Create => { + return Err(NamespaceError::TableAlreadyExists { + message: table_name.clone(), + } + .into()); } - Err(ref e) - if matches!( - e, - LanceError::DatasetAlreadyExists { .. } - | LanceError::CommitConflict { .. } - | LanceError::IncompatibleTransaction { .. } - | LanceError::RetryableCommitConflict { .. 
} - ) => - { - // Another process created the manifest concurrently, try to load it - log::info!( - "Manifest table was created by another process, loading it: {}", - manifest_path - ); - let recovery_store_options = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let recovery_read_params = ReadParams { - session, - store_options: Some(recovery_store_options), + CreateTableMode::ExistOk => { + let properties = existing_table + .as_ref() + .and_then(|table| table.metadata.clone()); + return Ok(CreateTableResponse { + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties, ..Default::default() - }; - let dataset = DatasetBuilder::from_uri(&manifest_path) - .with_read_params(recovery_read_params) - .load() - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to load manifest dataset after creation conflict: {}", - e - ), - }) - })?; - Ok(DatasetConsistencyWrapper::new(dataset)) + }); } - Err(e) => Err(lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create manifest dataset: {:?}", e), - })), - } - } - } - - /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit. - /// - /// Returns the next page token (last item in this page) if more results exist beyond the limit, - /// or `None` if this is the last page. 
- fn apply_pagination( - names: &mut Vec, - page_token: Option, - limit: Option, - ) -> Option { - names.sort(); - - if let Some(start_after) = page_token { - if let Some(index) = names - .iter() - .position(|name| name.as_str() > start_after.as_str()) - { - names.drain(0..index); - } else { - names.clear(); + CreateTableMode::Overwrite => {} } } - if let Some(limit) = limit - && limit >= 0 - { - let limit = limit as usize; - if names.len() > limit { - let next_page_token = if limit > 0 { - Some(names[limit - 1].clone()) - } else { - None - }; - names.truncate(limit); - return next_page_token; + // Validate that request_data is provided + if data.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Request data (Arrow IPC stream) is required for create_table".to_string(), } + .into()); } - None - } -} - -#[async_trait] -impl LanceNamespace for ManifestNamespace { - fn namespace_id(&self) -> String { - self.root.clone() - } - - async fn list_tables(&self, request: ListTablesRequest) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; - - // Build filter to find tables in this namespace - let filter = if namespace_id.is_empty() { - // Root namespace: find tables without a namespace prefix - "object_type = 'table' AND NOT contains(object_id, '$')".to_string() - } else { - // Namespaced: find tables that start with namespace$ but have no additional $ - let prefix = namespace_id.join(DELIMITER); - format!( - "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", - prefix, - DELIMITER, - prefix.len() + 2 - ) - }; - - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", 
"location"]).map_err(|e| { + // Write the data using Lance Dataset + let cursor = Cursor::new(data.to_vec()); + let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), + message: format!("Failed to read IPC stream: {:?}", e), }) })?; - let batches = Self::execute_scanner(scanner).await?; + let batches: Vec = stream_reader + .collect::, _>>() + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to collect batches: {:?}", e), + }) + })?; - let mut table_entries = Vec::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; + if batches.is_empty() { + return Err(NamespaceError::Internal { + message: "No data provided for table creation".to_string(), } + .into()); + } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let location_array = Self::get_string_column(&batch, "location")?; - for i in 0..batch.num_rows() { - let object_id = object_id_array.value(i); - let location = location_array.value(i); - let (_namespace, name) = Self::parse_object_id(object_id); - table_entries.push((name, location.to_string())); - } + let schema = batches[0].schema(); + let batch_results: Vec> = + batches.into_iter().map(Ok).collect(); + let reader = RecordBatchIterator::new(batch_results, schema); + + let mut write_storage_options = self.storage_options.clone().unwrap_or_default(); + if let Some(request_storage_options) = request.storage_options.as_ref() { + write_storage_options.extend(request_storage_options.clone()); } - let mut tables: Vec = if request.include_declared.unwrap_or(true) { - table_entries.into_iter().map(|(name, _)| name).collect() + let store_params = ObjectStoreParams { + storage_options_accessor: (!write_storage_options.is_empty()).then(|| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + write_storage_options, + ), + ) + }), + 
..Default::default() + }; + let write_params = WriteParams { + mode: create_mode.write_mode(), + session: self.session.clone(), + store_params: Some(store_params), + ..Default::default() + }; + let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to write dataset: {:?}", e), + }) + })?; + let version = dataset.version().version as i64; + + if overwriting_existing_table { + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + self.upsert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name), + metadata, + }], + None, + ) + .await?; + + Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: request.properties, + ..Default::default() + }) } else { - let mut stream = futures::stream::iter(table_entries.into_iter().map( - |(name, location)| async move { - // `include_declared=false` is an explicit opt-in. We still pay one - // `_versions/` probe per table so declared-state is derived from actual - // manifests. This is linear in the total number of listed tables, and we do - // the probes with bounded concurrency before pagination. - if self.location_has_actual_manifests(&location).await? 
{ - Ok::, Error>(Some(name)) - } else { - Ok::, Error>(None) - } - }, - )) - .buffered(DECLARED_FILTER_CONCURRENCY); + match existing_table { + Some(existing_table) => Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: existing_table.metadata, + ..Default::default() + }), + None => { + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + // Register in manifest (store dir_name, not full URI) + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name.clone()), + metadata, + }], + None, + ) + .await?; - let mut filtered = Vec::new(); - while let Some(result) = stream.next().await { - if let Some(name) = result? { - filtered.push(name); + Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: request.properties, + ..Default::default() + }) } } - filtered - }; - - let next_page_token = - Self::apply_pagination(&mut tables, request.page_token, request.limit); - let mut response = ListTablesResponse::new(tables); - response.page_token = next_page_token; - Ok(response) + } } - async fn describe_table(&self, request: DescribeTableRequest) -> Result { + async fn drop_table(&self, request: DropTableRequest) -> Result { let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { message: "Table ID is required".to_string(), @@ -2005,349 +3063,276 @@ impl LanceNamespace for ManifestNamespace { .into()); } - let object_id = Self::str_object_id(table_id); - let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - - // Extract table name and namespace from table_id - let table_name = table_id.last().cloned().unwrap_or_default(); - let namespace_id: Vec = if table_id.len() > 1 { - table_id[..table_id.len() - 1].to_vec() 
- } else { - vec![] - }; + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); - let should_check_declared = - load_detailed_metadata || request.check_declared.unwrap_or(false); - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); + // Query manifest for table location + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; match table_info { Some(info) => { - // Construct full URI from relative location - let table_uri = Self::construct_full_uri(&self.root, &info.location)?; - - let storage_options = if vend_credentials { - self.storage_options.clone() - } else { - None - }; - let is_only_declared = if should_check_declared { - Some(!self.location_has_actual_manifests(&info.location).await?) - } else { - None - }; - - if !load_detailed_metadata { - return Ok(DescribeTableResponse { - table: Some(table_name), - namespace: Some(namespace_id), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - storage_options, - properties: info.metadata, - is_only_declared, - ..Default::default() - }); - } - - if is_only_declared == Some(true) { - return Ok(DescribeTableResponse { - table: Some(table_name), - namespace: Some(namespace_id), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - storage_options, - properties: info.metadata, - is_only_declared, - ..Default::default() - }); - } - - let mut builder = DatasetBuilder::from_uri(&table_uri); - if let Some(opts) = &self.storage_options { - builder = builder.with_storage_options(opts.clone()); - } - if let Some(session) = &self.session { - builder = builder.with_session(session.clone()); - } - - match builder.load().await { - Ok(mut dataset) => { - // If a specific version is requested, checkout that version - if 
let Some(requested_version) = request.version { - dataset = dataset.checkout_version(requested_version as u64).await?; - } + // Delete from manifest first + self.delete_from_manifest(&object_id).boxed().await?; - let version = dataset.version().version; - let lance_schema = dataset.schema(); - let arrow_schema: arrow_schema::Schema = lance_schema.into(); - let json_schema = arrow_schema_to_json(&arrow_schema)?; + // Delete physical data directory using the dir_name from manifest + let table_path = self.base_path.clone().join(info.location.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; - Ok(DescribeTableResponse { - table: Some(table_name.clone()), - namespace: Some(namespace_id.clone()), - version: Some(version as i64), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - schema: Some(Box::new(json_schema)), - storage_options, - properties: info.metadata.clone(), - is_only_declared, - ..Default::default() + // Remove the table directory + self.object_store + .remove_dir_all(table_path) + .boxed() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to delete table directory: {:?}", e), }) - } - Err(err) => Err(NamespaceError::Internal { - message: format!( - "Table exists in manifest but failed to load dataset '{}': {}", - object_id, err - ), - } - .into()), - } + })?; + + Ok(DropTableResponse { + id: request.id.clone(), + location: Some(table_uri), + ..Default::default() + }) } None => Err(NamespaceError::TableNotFound { - message: Self::format_table_id(table_id), + message: table_name.to_string(), } .into()), } } - async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { - let table_id = request.id.as_ref().ok_or_else(|| { + async fn list_namespaces( + &self, + request: ListNamespacesRequest, + ) -> Result { + let parent_namespace = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID 
is required".to_string(), + message: "Namespace ID is required".to_string(), }) })?; - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), + // Build filter to find direct child namespaces + let filter = if parent_namespace.is_empty() { + // Root namespace: find all namespaces without a parent + "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string() + } else { + // Non-root: find namespaces that start with parent$ but have no additional $ + let prefix = parent_namespace.join(DELIMITER); + format!( + "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", + prefix, + DELIMITER, + prefix.len() + 2 + ) + }; + + let mut scanner = self.manifest_scanner().await?; + scanner.filter(&filter).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to filter: {:?}", e), + }) + })?; + scanner.project(&["object_id"]).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project: {:?}", e), + }) + })?; + + let batches = Self::execute_scanner(scanner).await?; + let mut namespaces = Vec::new(); + + for batch in batches { + if batch.num_rows() == 0 { + continue; + } + + let object_id_array = Self::get_string_column(&batch, "object_id")?; + for i in 0..batch.num_rows() { + let object_id = object_id_array.value(i); + let (_namespace, name) = Self::parse_object_id(object_id); + namespaces.push(name); } - .into()); } - let object_id = Self::str_object_id(table_id); - let exists = self.manifest_contains_object(&object_id).await?; - if exists { - Ok(()) - } else { - Err(NamespaceError::TableNotFound { - message: Self::format_table_id(table_id), + let next_page_token = + Self::apply_pagination(&mut namespaces, request.page_token, request.limit); + let mut response = ListNamespacesResponse::new(namespaces); + response.page_token = next_page_token; + Ok(response) + } + + 
async fn describe_namespace( + &self, + request: DescribeNamespaceRequest, + ) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; + + // Root namespace always exists + if namespace_id.is_empty() { + #[allow(clippy::needless_update)] + return Ok(DescribeNamespaceResponse { + properties: Some(HashMap::new()), + ..Default::default() + }); + } + + // Check if namespace exists in manifest + let object_id = namespace_id.join(DELIMITER); + let namespace_info = self.query_manifest_for_namespace(&object_id).await?; + + match namespace_info { + #[allow(clippy::needless_update)] + Some(info) => Ok(DescribeNamespaceResponse { + properties: info.metadata, + ..Default::default() + }), + None => Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), } - .into()) + .into()), } } - async fn create_table( + async fn create_namespace( &self, - request: CreateTableRequest, - data: Bytes, - ) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { + request: CreateNamespaceRequest, + ) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), + message: "Namespace ID is required".to_string(), }) })?; - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), + // Root namespace always exists and cannot be created + if namespace_id.is_empty() { + return Err(NamespaceError::NamespaceAlreadyExists { + message: "root namespace".to_string(), } .into()); } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - let existing_table = self.query_manifest_for_table(&object_id).await?; - let existing_has_manifests = if let Some(existing_table) = &existing_table { - Some( - 
self.location_has_actual_manifests(&existing_table.location) - .await?, - ) - } else { - None - }; + // Validate parent namespaces exist (but not the namespace being created) + if namespace_id.len() > 1 { + self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) + .await?; + } - if existing_has_manifests == Some(false) - && request - .properties - .as_ref() - .is_some_and(|properties| !properties.is_empty()) - { - return Err(NamespaceError::InvalidInput { - message: format!( - "create_table cannot set properties for already declared table '{}'", - object_id - ), + let object_id = namespace_id.join(DELIMITER); + if self.manifest_contains_object(&object_id).await? { + return Err(NamespaceError::NamespaceAlreadyExists { + message: object_id.to_string(), } .into()); } - let create_mode = if existing_has_manifests == Some(false) { - CreateTableMode::Create - } else { - CreateTableMode::parse(request.mode.as_deref())? - }; - let dir_name = if let Some(existing_table) = &existing_table { - existing_table.location.clone() - } else if namespace.is_empty() && self.dir_listing_enabled { - format!("{}.lance", table_name) - } else { - Self::generate_dir_name(&object_id) - }; - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - let overwriting_existing_table = - existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite; + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?; - if existing_has_manifests == Some(true) { - match create_mode { - CreateTableMode::Create => { - return Err(NamespaceError::TableAlreadyExists { - message: table_name.clone(), - } - .into()); - } - CreateTableMode::ExistOk => { - let properties = existing_table - .as_ref() - .and_then(|table| table.metadata.clone()); - return Ok(CreateTableResponse { - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties, - ..Default::default() - }); - } - CreateTableMode::Overwrite 
=> {} + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Namespace, + location: None, + metadata, + }], + None, + ) + .await?; + + Ok(CreateNamespaceResponse { + properties: request.properties, + ..Default::default() + }) + } + + async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; + + // Root namespace always exists and cannot be dropped + if namespace_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Root namespace cannot be dropped".to_string(), } + .into()); } - // Validate that request_data is provided - if data.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Request data (Arrow IPC stream) is required for create_table".to_string(), + let object_id = namespace_id.join(DELIMITER); + + // Check if namespace exists + if !self.manifest_contains_object(&object_id).boxed().await? 
{ + return Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), } .into()); } - // Write the data using Lance Dataset - let cursor = Cursor::new(data.to_vec()); - let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| { + // Check for child namespaces + let escaped_id = object_id.replace('\'', "''"); + let prefix = format!("{}{}", escaped_id, DELIMITER); + let filter = format!("starts_with(object_id, '{}')", prefix); + let mut scanner = self.manifest_scanner().boxed().await?; + scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to read IPC stream: {:?}", e), + message: format!("Failed to filter: {:?}", e), }) })?; - - let batches: Vec = stream_reader - .collect::, _>>() - .map_err(|e| { + scanner.project::<&str>(&[]).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to collect batches: {:?}", e), + message: format!("Failed to project: {:?}", e), }) })?; - - if batches.is_empty() { - return Err(NamespaceError::Internal { - message: "No data provided for table creation".to_string(), + scanner.with_row_id(); + let count = scanner.count_rows().boxed().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to count rows: {:?}", e), + }) + })?; + + if count > 0 { + return Err(NamespaceError::NamespaceNotEmpty { + message: format!("'{}' (contains {} child objects)", object_id, count), } .into()); } - let schema = batches[0].schema(); - let batch_results: Vec> = - batches.into_iter().map(Ok).collect(); - let reader = RecordBatchIterator::new(batch_results, schema); + self.delete_from_manifest(&object_id).boxed().await?; - let mut write_storage_options = self.storage_options.clone().unwrap_or_default(); - if let Some(request_storage_options) = request.storage_options.as_ref() { - write_storage_options.extend(request_storage_options.clone()); - } + Ok(DropNamespaceResponse::default()) + } - 
let store_params = ObjectStoreParams { - storage_options_accessor: (!write_storage_options.is_empty()).then(|| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - write_storage_options, - ), - ) - }), - ..Default::default() - }; - let write_params = WriteParams { - mode: create_mode.write_mode(), - session: self.session.clone(), - store_params: Some(store_params), - ..Default::default() - }; - let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to write dataset: {:?}", e), - }) - })?; - let version = dataset.version().version as i64; + async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; - if overwriting_existing_table { - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; - self.upsert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name), - metadata, - }], - None, - ) - .await?; + // Root namespace always exists + if namespace_id.is_empty() { + return Ok(()); + } - Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: request.properties, - ..Default::default() - }) + let object_id = namespace_id.join(DELIMITER); + if self.manifest_contains_object(&object_id).await? 
{ + Ok(()) } else { - match existing_table { - Some(existing_table) => Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: existing_table.metadata, - ..Default::default() - }), - None => { - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; - // Register in manifest (store dir_name, not full URI) - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name.clone()), - metadata, - }], - None, - ) - .await?; - - Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: request.properties, - ..Default::default() - }) - } + Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), } + .into()) } } - async fn drop_table(&self, request: DropTableRequest) -> Result { + async fn declare_table(&self, request: DeclareTableRequest) -> Result { let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { message: "Table ID is required".to_string(), @@ -2364,546 +3349,1046 @@ impl LanceNamespace for ManifestNamespace { let (namespace, table_name) = Self::split_object_id(table_id); let object_id = Self::build_object_id(&namespace, &table_name); - // Query manifest for table location - let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - - match table_info { - Some(info) => { - // Delete from manifest first - self.delete_from_manifest(&object_id).boxed().await?; - - // Delete physical data directory using the dir_name from manifest - let table_path = self.base_path.clone().join(info.location.as_str()); - let table_uri = Self::construct_full_uri(&self.root, &info.location)?; - - // Remove the table directory - self.object_store - .remove_dir_all(table_path) - .boxed() - .await - .map_err(|e| { - 
lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to delete table directory: {:?}", e), - }) - })?; - - Ok(DropTableResponse { - id: request.id.clone(), - location: Some(table_uri), - ..Default::default() - }) - } - None => Err(NamespaceError::TableNotFound { + // Check if table already exists in manifest + let existing = self.query_manifest_for_table(&object_id).await?; + if existing.is_some() { + return Err(NamespaceError::TableAlreadyExists { message: table_name.to_string(), } - .into()), + .into()); } - } - - async fn list_namespaces( - &self, - request: ListNamespacesRequest, - ) -> Result { - let parent_namespace = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; - // Build filter to find direct child namespaces - let filter = if parent_namespace.is_empty() { - // Root namespace: find all namespaces without a parent - "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string() + // Create table location path with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) } else { - // Non-root: find namespaces that start with parent$ but have no additional $ - let prefix = parent_namespace.join(DELIMITER); - format!( - "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", - prefix, - DELIMITER, - prefix.len() + 2 - ) + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) }; + let table_path = self.base_path.clone().join(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; 
- let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(NamespaceError::InvalidInput { + message: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ), + } + .into()); + } + } - let batches = Self::execute_scanner(scanner).await?; - let mut namespaces = Vec::new(); + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.clone().join(".lance-reserved"); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ), + }) + })? 
+ .shutdown() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ), + }) + })?; - let object_id_array = Self::get_string_column(&batch, "object_id")?; - for i in 0..batch.num_rows() { - let object_id = object_id_array.value(i); - let (_namespace, name) = Self::parse_object_id(object_id); - namespaces.push(name); - } - } + let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name), + metadata, + }], + None, + ) + .await?; + + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); - let next_page_token = - Self::apply_pagination(&mut namespaces, request.page_token, request.limit); - let mut response = ListNamespacesResponse::new(namespaces); - response.page_token = next_page_token; - Ok(response) + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + properties: request.properties, + ..Default::default() + }) } - async fn describe_namespace( - &self, - request: DescribeNamespaceRequest, - ) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { + async fn register_table(&self, request: RegisterTableRequest) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), + message: "Table ID is required".to_string(), }) })?; - // Root namespace always 
exists - if namespace_id.is_empty() { - #[allow(clippy::needless_update)] - return Ok(DescribeNamespaceResponse { - properties: Some(HashMap::new()), - ..Default::default() - }); + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), + } + .into()); } - // Check if namespace exists in manifest - let object_id = namespace_id.join(DELIMITER); - let namespace_info = self.query_manifest_for_namespace(&object_id).await?; + let location = request.location.clone(); - match namespace_info { - #[allow(clippy::needless_update)] - Some(info) => Ok(DescribeNamespaceResponse { - properties: info.metadata, - ..Default::default() - }), - None => Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), + // Validate that location is a relative path within the root directory + // We don't allow absolute URIs or paths that escape the root + if location.contains("://") { + return Err(NamespaceError::InvalidInput { + message: format!( + "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}", + location + ), } - .into()), + .into()); } - } - async fn create_namespace( - &self, - request: CreateNamespaceRequest, - ) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + if location.starts_with('/') { + return Err(NamespaceError::InvalidInput { + message: format!( + "Absolute paths are not allowed for register_table. 
Location must be a relative path within the root directory: {}", + location + ), + } + .into()); + } - // Root namespace always exists and cannot be created - if namespace_id.is_empty() { - return Err(NamespaceError::NamespaceAlreadyExists { - message: "root namespace".to_string(), + // Check for path traversal attempts + if location.contains("..") { + return Err(NamespaceError::InvalidInput { + message: format!( + "Path traversal is not allowed. Location must be a relative path within the root directory: {}", + location + ), } .into()); } - // Validate parent namespaces exist (but not the namespace being created) - if namespace_id.len() > 1 { - self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) - .await?; + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Validate that parent namespaces exist (if not root) + if !namespace.is_empty() { + self.validate_namespace_levels_exist(&namespace).await?; } - let object_id = namespace_id.join(DELIMITER); + // Check if table already exists if self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::NamespaceAlreadyExists { + return Err(NamespaceError::TableAlreadyExists { message: object_id.to_string(), } .into()); } - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?; - - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Namespace, - location: None, - metadata, - }], - None, - ) - .await?; + // Register the table with its location in the manifest + self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) + .await?; - Ok(CreateNamespaceResponse { - properties: request.properties, + Ok(RegisterTableResponse { + location: Some(location), ..Default::default() }) } - async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { + async fn deregister_table( + &self, + request: DeregisterTableRequest, + ) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), + message: "Table ID is required".to_string(), }) })?; - // Root namespace always exists and cannot be dropped - if namespace_id.is_empty() { + if table_id.is_empty() { return Err(NamespaceError::InvalidInput { - message: "Root namespace cannot be dropped".to_string(), + message: "Table ID cannot be empty".to_string(), } .into()); } - let object_id = namespace_id.join(DELIMITER); + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - // Check if namespace exists - if !self.manifest_contains_object(&object_id).boxed().await? 
{ - return Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), + // Get table info before deleting + let table_info = self.query_manifest_for_table(&object_id).await?; + + let table_uri = match table_info { + Some(info) => { + // Delete from manifest only (leave physical data intact) + self.delete_from_manifest(&object_id).boxed().await?; + Self::construct_full_uri(&self.root, &info.location)? } - .into()); - } + None => { + return Err(NamespaceError::TableNotFound { + message: object_id.to_string(), + } + .into()); + } + }; - // Check for child namespaces - let escaped_id = object_id.replace('\'', "''"); - let prefix = format!("{}{}", escaped_id, DELIMITER); - let filter = format!("starts_with(object_id, '{}')", prefix); - let mut scanner = self.manifest_scanner().boxed().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project::<&str>(&[]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - scanner.with_row_id(); - let count = scanner.count_rows().boxed().await.map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to count rows: {:?}", e), - }) - })?; + Ok(DeregisterTableResponse { + id: request.id.clone(), + location: Some(table_uri), + ..Default::default() + }) + } +} - if count > 0 { - return Err(NamespaceError::NamespaceNotEmpty { - message: format!("'{}' (contains {} child objects)", object_id, count), - } - .into()); - } +#[cfg(test)] +mod tests { + use super::{ + BASE_OBJECTS_INDEX_NAME, ConflictResolution, CopyOnWriteMutation, DeleteObjectMutation, + LANCE_DATA_DIR, LANCE_INDICES_DIR, MANIFEST_TABLE_NAME, ManifestBatchBuilder, + ManifestEntry, ManifestIndexAccumulator, ManifestNamespace, ManifestOutputRow, + ManifestRowValue, ManifestStreamMutation, OBJECT_ID_INDEX_NAME, OBJECT_TYPE_INDEX_NAME, + 
ObjectType, + }; + use crate::DirectoryNamespaceBuilder; + use arrow::datatypes::DataType; + use bytes::Bytes; + use futures::StreamExt; + use lance::index::DatasetIndexExt; + use lance_core::utils::tempfile::TempStdDir; + use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; + use lance_namespace::LanceNamespace; + use lance_namespace::models::{ + CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, + ListTablesRequest, TableExistsRequest, + }; + use lance_table::format::Fragment; + use rstest::rstest; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; - self.delete_from_manifest(&object_id).boxed().await?; + async fn create_manifest_namespace( + root: &str, + inline_optimization_enabled: bool, + ) -> ManifestNamespace { + create_manifest_namespace_with_retries(root, inline_optimization_enabled, None).await + } - Ok(DropNamespaceResponse::default()) + async fn create_manifest_namespace_with_retries( + root: &str, + inline_optimization_enabled: bool, + commit_retries: Option, + ) -> ManifestNamespace { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + root, + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + ManifestNamespace::from_directory( + root.to_string(), + None, + None, + object_store, + base_path, + true, + inline_optimization_enabled, + commit_retries, + ) + .await + .unwrap() } - async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), + struct CommitConflictAfterRewriteMutation { + root: String, + conflict_object_id: String, + } + + impl ManifestStreamMutation for CommitConflictAfterRewriteMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, 
+ index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: "attempted_table", + object_type: ObjectType::Table, + location: Some("attempted_table.lance"), + metadata: None, + base_objects: None, + }, + ) + } + + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let object_id = self.conflict_object_id.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some("conflicting_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + }); }) - })?; + .join() + .unwrap(); + CopyOnWriteMutation::updated(()) + } + } + + /// A delete mutation that, during staging, has a concurrent writer delete the same + /// object and commit first, so our own commit hits a conflict while the object is + /// already gone — exercising `ConflictResolution::SucceedIfAbsent`. 
+ struct ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation, + root: String, + target: String, + } + + impl ManifestStreamMutation for ConcurrentDeleteBeforeCommitMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.process_existing_row(row, output, index_data) + } - // Root namespace always exists - if namespace_id.is_empty() { - return Ok(()); + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.append_rows(output, index_data) } - let object_id = namespace_id.join(DELIMITER); - if self.manifest_contains_object(&object_id).await? { - Ok(()) - } else { - Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let target = self.target.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer.delete_from_manifest(&target).await.unwrap(); + }); + }) + .join() + .unwrap(); + self.inner.finish() + } + + fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::SucceedIfAbsent { + object_id: self.target.clone(), + output: (), } - .into()) } } - async fn declare_table(&self, request: DeclareTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), - }) - })?; - - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), + async fn manifest_base_objects( + manifest_ns: &ManifestNamespace, + ) -> HashMap>> { + let mut scanner = 
manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id", "base_objects"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let mut rows = HashMap::new(); + for batch in batches { + let object_ids = ManifestNamespace::get_string_column(&batch, "object_id").unwrap(); + let base_objects = ManifestNamespace::base_objects_column_values(&batch).unwrap(); + for (row, value) in base_objects.into_iter().enumerate() { + rows.insert(object_ids.value(row).to_string(), value); } - .into()); } + rows + } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); + async fn manifest_data_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let data_dir = manifest_ns + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + let mut stream = manifest_ns.object_store.read_dir_all(&data_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); + } + paths + } - // Check if table already exists in manifest - let existing = self.query_manifest_for_table(&object_id).await?; - if existing.is_some() { - return Err(NamespaceError::TableAlreadyExists { - message: table_name.to_string(), - } - .into()); + async fn manifest_index_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let index_dir = manifest_ns + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR); + let mut stream = manifest_ns.object_store.read_dir_all(&index_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); } + paths + } - // Create table location path with hash-based naming - // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance - // Otherwise, use hash-based naming: {hash}_{object_id} - 
let dir_name = if namespace.is_empty() && self.dir_listing_enabled { - // Root table with directory listing enabled: use {table_name}.lance - format!("{}.lance", table_name) - } else { - // Child namespace table or dir listing disabled: use hash-based naming - Self::generate_dir_name(&object_id) - }; - let table_path = self.base_path.clone().join(dir_name.as_str()); - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + fn create_test_ipc_data() -> Vec { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::writer::StreamWriter; + use arrow::record_batch::RecordBatch; + use std::sync::Arc; - // Validate location if provided - if let Some(req_location) = &request.location { - let req_location = req_location.trim_end_matches('/'); - if req_location != table_uri { - return Err(NamespaceError::InvalidInput { - message: format!( - "Cannot declare table {} at location {}, must be at location {}", - table_name, req_location, table_uri - ), - } - .into()); - } + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); } + buffer + } - // Create the .lance-reserved file to mark the table as existing - let reserved_file_path = table_path.clone().join(".lance-reserved"); + /// Open the `__manifest` dataset directly and set a table-metadata key, + /// simulating a future Lance client that persisted a feature flag. 
+ async fn set_manifest_table_metadata(temp_path: &str, key: &str, value: &str) { + use lance::dataset::builder::DatasetBuilder; + let mut ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() + .await + .unwrap(); + ds.update_metadata([(key, value)]).await.unwrap(); + } - self.object_store - .create(&reserved_file_path) + async fn create_namespace_with_one_table(temp_path: &str) { + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ), - }) - })? - .shutdown() + .unwrap(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t1".to_string()]); + ns.create_table(create_request, Bytes::from(create_test_ipc_data())) .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ), - }) - })?; + .unwrap(); + } - let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + /// This is a forward-compatibility checker only: it must not set any feature + /// flag, so existing clients keep treating the manifest as compatible. 
+ #[tokio::test] + async fn test_manifest_has_no_feature_flags_by_default() { + use lance::dataset::builder::DatasetBuilder; + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; - // Add entry to manifest marking this as a declared table (store dir_name, not full path) - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name), - metadata, - }], - None, + let ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() + .await + .unwrap(); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY) + ); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY) + ); + } + + /// An unknown reader feature flag must block opening the catalog with a clear + /// "please upgrade" error rather than silently degrading to directory listing. 
+ #[tokio::test] + async fn test_unknown_reader_flag_blocks_access() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY, + "1", ) - .await?; + .await; - log::info!( - "Declared table '{}' in manifest at {}", - table_name, - table_uri + let err = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect_err("opening a manifest with an unknown reader flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" ); + } - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); - let storage_options = if vend_credentials { - self.storage_options.clone() - } else { - None - }; + /// An unknown writer feature flag must still allow reads but block writes. + #[tokio::test] + async fn test_unknown_writer_flag_blocks_writes_but_allows_reads() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY, + "1", + ) + .await; - Ok(DeclareTableResponse { - location: Some(table_uri), - storage_options, - properties: request.properties, - ..Default::default() - }) + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect("reads should still be allowed with only a writer flag set"); + let mut list_request = ListTablesRequest::new(); + list_request.id = Some(vec![]); + assert_eq!(ns.list_tables(list_request).await.unwrap().tables.len(), 1); + + // A refused write must not leave an orphaned table dataset behind. 
+ let entries_before = dir_entry_names(temp_path); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t2".to_string()]); + let err = ns + .create_table(create_request, Bytes::from(create_test_ipc_data())) + .await + .expect_err("writing through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + assert_eq!( + entries_before, + dir_entry_names(temp_path), + "a refused create_table must not create an orphaned table directory" + ); + + // Mutations that go straight through rewrite_manifest (no early + // create_table check) must also be refused: an insert (create_namespace) + // and a delete (drop_table). This proves the writer check is enforced at + // the single copy-on-write chokepoint, not just on the create_table path. + let mut create_ns = CreateNamespaceRequest::new(); + create_ns.id = Some(vec!["ns1".to_string()]); + let err = ns + .create_namespace(create_ns) + .await + .expect_err("create_namespace through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(vec!["t1".to_string()]); + let err = ns + .drop_table(drop_request) + .await + .expect_err("drop_table through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + } + + fn dir_entry_names(path: &str) -> std::collections::BTreeSet { + std::fs::read_dir(path) + .unwrap() + .map(|e| e.unwrap().file_name().to_string_lossy().into_owned()) + .collect() + } + + #[tokio::test] + async fn test_manifest_rewrite_preserves_utf8_metadata_and_base_objects() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + 
manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "view".to_string(), + object_type: ObjectType::Table, + location: Some("view.lance".to_string()), + metadata: Some(r#"{"kind":"view"}"#.to_string()), + }], + Some(vec!["base_a".to_string(), "base_b".to_string()]), + ) + .await + .unwrap(); + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "other".to_string(), + object_type: ObjectType::Namespace, + location: None, + metadata: Some(r#"{"kind":"namespace"}"#.to_string()), + }], + None, + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let metadata_field = dataset_guard.schema().field("metadata").unwrap(); + assert_eq!(metadata_field.data_type(), DataType::Utf8); + drop(dataset_guard); + + let base_objects = manifest_base_objects(&manifest_ns).await; + assert_eq!( + base_objects.get("view").cloned().unwrap(), + Some(vec!["base_a".to_string(), "base_b".to_string()]) + ); + assert_eq!(base_objects.get("other").cloned().unwrap(), None); + } + + #[tokio::test] + async fn test_manifest_rewrite_replacement_indices_are_versioned() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + Some(vec!["base".to_string()]), + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + 
assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); + assert!(!index.fragment_bitmap.as_ref().unwrap().is_empty()); + } + } + + #[tokio::test] + async fn test_manifest_rewrite_empty_manifest_keeps_replacement_indices_valid() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + manifest_ns.delete_from_manifest("table").await.unwrap(); + + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id"]).unwrap(); + let rows = ManifestNamespace::execute_scanner(scanner) + .await + .unwrap() + .into_iter() + .map(|batch| batch.num_rows()) + .sum::(); + assert_eq!(rows, 0); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); + } + } + + #[tokio::test] + async fn test_manifest_rewrite_fragment_bitmap_uses_overwrite_fragment_ids() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let fragments = 
vec![Fragment::new(0), Fragment::new(0), Fragment::new(7)]; + + let manifest = ManifestNamespace::manifest_from_overwrite_transaction( + dataset_guard.manifest(), + dataset_guard.manifest().schema.clone(), + &fragments, + ); + + let fragment_ids = manifest + .fragments + .iter() + .map(|fragment| fragment.id) + .collect::>(); + assert_eq!(fragment_ids, vec![0, 1, 7]); + assert_eq!( + ManifestNamespace::manifest_fragment_bitmap(&manifest) + .unwrap() + .into_iter() + .collect::>(), + vec![0, 1, 7] + ); } - async fn register_table(&self, request: RegisterTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), - }) - })?; + #[tokio::test] + async fn test_manifest_noop_delete_uses_latest_snapshot() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let stale_ns = create_manifest_namespace(temp_path, false).await; + let writer_ns = create_manifest_namespace(temp_path, false).await; - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), - } - .into()); - } + writer_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "late_table".to_string(), + object_type: ObjectType::Table, + location: Some("late_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - let location = request.location.clone(); + stale_ns.delete_from_manifest("late_table").await.unwrap(); - // Validate that location is a relative path within the root directory - // We don't allow absolute URIs or paths that escape the root - if location.contains("://") { - return Err(NamespaceError::InvalidInput { - message: format!( - "Absolute URIs are not allowed for register_table. 
Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + let check_ns = create_manifest_namespace(temp_path, false).await; + assert!( + !check_ns + .manifest_contains_object("late_table") + .await + .unwrap() + ); + } - if location.starts_with('/') { - return Err(NamespaceError::InvalidInput { - message: format!( - "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + #[tokio::test] + async fn test_manifest_noop_delete_cleans_uncommitted_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; - // Check for path traversal attempts - if location.contains("..") { - return Err(NamespaceError::InvalidInput { - message: format!( - "Path traversal is not allowed. Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); + let before = manifest_data_paths(&manifest_ns).await; + assert!(!before.is_empty()); - // Validate that parent namespaces exist (if not root) - if !namespace.is_empty() { - self.validate_namespace_levels_exist(&namespace).await?; - } + manifest_ns + .delete_from_manifest("missing_table") + .await + .unwrap(); - // Check if table already exists - if self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::TableAlreadyExists { - message: object_id.to_string(), - } - .into()); - } + let after = manifest_data_paths(&manifest_ns).await; + assert_eq!(after, before); + } - // Register the table with its location in the manifest - self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) - .await?; + #[tokio::test] + async fn test_manifest_final_commit_failure_cleans_uncommitted_rewrite_files() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, true, Some(0)).await; - Ok(RegisterTableResponse { - location: Some(location), - ..Default::default() - }) - } + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - async fn deregister_table( - &self, - request: DeregisterTableRequest, - ) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), + let before_data_paths = manifest_data_paths(&manifest_ns).await; + let before_index_paths = manifest_index_paths(&manifest_ns).await; + + let result = manifest_ns + .rewrite_manifest("Failed to test manifest cleanup", || { + CommitConflictAfterRewriteMutation { + root: temp_path.to_string(), + conflict_object_id: "conflicting_table".to_string(), + } }) - })?; + .await; + assert!(result.is_err()); - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), - } - .into()); - } + let after_data_paths = manifest_data_paths(&manifest_ns).await; + assert!(before_data_paths.is_subset(&after_data_paths)); + assert_eq!(after_data_paths.len(), before_data_paths.len() + 1); + assert_eq!(manifest_index_paths(&manifest_ns).await, 
before_index_paths); + assert!( + manifest_ns + .manifest_contains_object("conflicting_table") + .await + .unwrap() + ); + assert!( + !manifest_ns + .manifest_contains_object("attempted_table") + .await + .unwrap() + ); + } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); + #[tokio::test] + async fn test_manifest_commit_visible_on_memory_store() { + // Regression: the commit must use the same object store the manifest dataset reads + // from. On `memory://` the namespace store and the dataset store can be different + // in-memory instances, so a commit written to the wrong one is invisible to reads + // (manifests as stale version -> endless conflict / "not found"). + let manifest_ns = create_manifest_namespace("memory://test_commit_visible", false).await; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + // A second sequential commit must not falsely conflict. 
+ manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table2".to_string(), + object_type: ObjectType::Table, + location: Some("table2.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!( + manifest_ns + .manifest_contains_object("table2") + .await + .unwrap() + ); + } - // Get table info before deleting - let table_info = self.query_manifest_for_table(&object_id).await?; + #[tokio::test] + async fn test_manifest_commit_uses_inline_transaction() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; - let table_uri = match table_info { - Some(info) => { - // Delete from manifest only (leave physical data intact) - self.delete_from_manifest(&object_id).boxed().await?; - Self::construct_full_uri(&self.root, &info.location)? - } - None => { - return Err(NamespaceError::TableNotFound { - message: object_id.to_string(), - } - .into()); - } - }; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - Ok(DeregisterTableResponse { - id: request.id.clone(), - location: Some(table_uri), - ..Default::default() - }) + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let manifest = dataset_guard.manifest(); + // The overwrite transaction is embedded inline in the manifest, never written as a + // separate _transactions/*.txn file. 
+ assert!(manifest.transaction_section.is_some()); + assert!(manifest.transaction_file.is_none()); } -} -#[cfg(test)] -mod tests { - use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; - use bytes::Bytes; - use lance_core::utils::tempfile::TempStdDir; - use lance_namespace::LanceNamespace; - use lance_namespace::models::{ - CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, - ListTablesRequest, TableExistsRequest, - }; - use rstest::rstest; + #[tokio::test] + async fn test_manifest_commit_landed_attributes_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; - fn create_test_ipc_data() -> Vec { - use arrow::array::{Int32Array, StringArray}; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::ipc::writer::StreamWriter; - use arrow::record_batch::RecordBatch; - use std::sync::Arc; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); + let dataset = Arc::new(manifest_ns.manifest_dataset.get().await.unwrap().clone()); + let version = dataset.manifest().version; + let our_files = dataset + .manifest() + .fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .map(|file| file.path.clone()) + .collect::>(); + assert!(!our_files.is_empty()); - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(StringArray::from(vec!["a", "b", "c"])), - ], - ) - .unwrap(); + // The committed version references our data file => attributed to us (a lost-ack + // commit must be treated as success, not cleaned up). 
+ assert!( + manifest_ns + .manifest_commit_landed(&dataset, version, &our_files) + .await + ); + // A different file set is not attributed to us. + let other = HashSet::from(["missing.lance".to_string()]); + assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version, &other) + .await + ); + // A version that does not exist did not land. + assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version + 100, &our_files) + .await + ); + } - let mut buffer = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - } - buffer + #[tokio::test] + async fn test_manifest_delete_conflict_with_concurrent_delete_succeeds() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, false, Some(0)).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + + // A concurrent writer deletes "table" and commits first, so our own delete commit + // conflicts while "table" is already gone. Native resolution treats the goal as + // achieved and succeeds instead of erroring or retrying forever. 
+ let result = manifest_ns + .rewrite_manifest("Failed to delete from manifest", || { + ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation { + object_id: "table".to_string(), + deleted: false, + }, + root: temp_path.to_string(), + target: "table".to_string(), + } + }) + .await; + + assert!(result.is_ok(), "delete should succeed: {result:?}"); + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); } #[rstest] @@ -3939,9 +5424,9 @@ mod tests { /// Test that concurrent create_table calls for the same table name don't /// create duplicate entries in the manifest. Uses two independent /// ManifestNamespace instances pointing at the same directory to simulate - /// two separate OS processes racing on table creation. The conflict_retries - /// setting on the MergeInsert ensures the second operation properly detects - /// the duplicate via WhenMatched::Fail after retrying against the latest data. + /// two separate OS processes racing on table creation. Copy-on-write rewrite + /// retries ensure the second operation detects the duplicate after retrying + /// against the latest data. #[tokio::test] async fn test_concurrent_create_table_no_duplicates() { let temp_dir = TempStdDir::default(); diff --git a/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs new file mode 100644 index 00000000000..d0849ceda4f --- /dev/null +++ b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Reader/writer feature flags for the directory-catalog `__manifest` dataset. +//! +//! Forward-compatibility infrastructure for the `__manifest` Lance dataset, +//! analogous to the Lance table format's `reader_feature_flags` / +//! `writer_feature_flags` but describing the *catalog manifest* format (schema +//! 
and semantics) rather than the underlying Lance file format. The flags are +//! persisted in the `__manifest` dataset's `table_metadata` map. +//! +//! Each manifest feature owns one bit in a `u64` bitmask. A build may read a +//! `__manifest` only if it understands every set reader-flag bit, and may write +//! it only if it understands every set writer-flag bit; otherwise it fails fast +//! with a clear "please upgrade" error instead of silently misreading data. The +//! set of bits a build understands is `READER_KNOWN_FLAGS` / `WRITER_KNOWN_FLAGS`. +//! +//! This is the mechanism only: no manifest feature is defined yet, so the known +//! masks are `0` and nothing is ever set — every current manifest reads and +//! writes unchanged. The first format change that needs forward-compatibility +//! protection adds its bit to the known masks and stamps it on write; from then +//! on, builds without that bit refuse the new format rather than misreading it. +//! Manifests written before this mechanism carry no flag keys, which parse as +//! `0` and stay compatible with every build. + +use std::collections::HashMap; + +use lance_core::{Error, Result}; +use lance_namespace::error::NamespaceError; + +/// `table_metadata` key holding the reader feature-flag bitmask (decimal `u64`). +pub const READER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.reader_feature_flags"; +/// `table_metadata` key holding the writer feature-flag bitmask (decimal `u64`). +pub const WRITER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.writer_feature_flags"; + +/// Reader feature-flag bits this build understands. No manifest feature is +/// defined yet, so this build understands none and refuses any non-zero reader +/// flag. A future format change adds its bit here. +const READER_KNOWN_FLAGS: u64 = 0; +/// Writer feature-flag bits this build understands. 
+const WRITER_KNOWN_FLAGS: u64 = 0; + +/// Whether this build can read a `__manifest` whose persisted reader feature +/// flags are `reader_flags` — i.e. it understands every set bit. +pub fn can_read_manifest(reader_flags: u64) -> bool { + (reader_flags & !READER_KNOWN_FLAGS) == 0 +} + +/// Whether this build can write a `__manifest` whose persisted writer feature +/// flags are `writer_flags` — i.e. it understands every set bit. +pub fn can_write_manifest(writer_flags: u64) -> bool { + (writer_flags & !WRITER_KNOWN_FLAGS) == 0 +} + +fn parse_flags(table_metadata: &HashMap, key: &str) -> Result { + match table_metadata.get(key) { + None => Ok(0), + Some(raw) => raw.parse::().map_err(|e| { + Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset has an unparsable feature-flag value '{raw}' for \ + '{key}': {e}. This likely means it was written by a newer, incompatible \ + version of Lance; please upgrade Lance to use this catalog." + ), + }) + }), + } +} + +/// Reader feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). +pub fn reader_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, READER_FEATURE_FLAGS_KEY) +} + +/// Writer feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). +pub fn writer_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, WRITER_FEATURE_FLAGS_KEY) +} + +/// Validate that this build can READ the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. +pub fn ensure_readable(table_metadata: &HashMap) -> Result<()> { + let flags = reader_flags(table_metadata)?; + if !can_read_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with reader feature flags {flags}, which this \ + version of Lance does not understand (known reader flags: {READER_KNOWN_FLAGS}). 
\ + Please upgrade Lance to read this catalog." + ), + })); + } + Ok(()) +} + +/// Validate that this build can WRITE the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. +pub fn ensure_writable(table_metadata: &HashMap) -> Result<()> { + let flags = writer_flags(table_metadata)?; + if !can_write_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with writer feature flags {flags}, which this \ + version of Lance does not understand (known writer flags: {WRITER_KNOWN_FLAGS}). \ + Please upgrade Lance to modify this catalog." + ), + })); + } + Ok(()) +} + +/// Whether `err` indicates the `__manifest` is in a format this build cannot +/// handle — i.e. it carries an unknown reader/writer feature flag, surfaced by +/// [`ensure_readable`] / [`ensure_writable`] as a [`NamespaceError::Unsupported`]. +/// +/// Catalog initialization uses this to refuse opening such a manifest rather +/// than silently degrading to a directory-listing view that ignores it. The +/// `__manifest` open path raises no other `Unsupported` error, so matching the +/// code is sufficient and avoids brittle message matching. +pub fn is_incompatible_manifest_error(err: &Error) -> bool { + matches!( + err, + Error::Namespace { source, .. } + if source + .downcast_ref::() + .is_some_and(|e| matches!(e, NamespaceError::Unsupported { .. 
})) + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn meta(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + #[test] + fn unflagged_is_compatible() { + assert!(can_read_manifest(0)); + assert!(can_write_manifest(0)); + let empty = HashMap::new(); + assert!(ensure_readable(&empty).is_ok()); + assert!(ensure_writable(&empty).is_ok()); + assert_eq!(reader_flags(&empty).unwrap(), 0); + assert_eq!(writer_flags(&empty).unwrap(), 0); + // Explicit zeroes are also compatible. + let zeroed = meta(&[ + (READER_FEATURE_FLAGS_KEY, "0"), + (WRITER_FEATURE_FLAGS_KEY, "0"), + ]); + assert!(ensure_readable(&zeroed).is_ok()); + assert!(ensure_writable(&zeroed).is_ok()); + } + + #[test] + fn any_unknown_flag_is_refused() { + // This build understands no feature flags, so any non-zero bit is refused. + assert!(!can_read_manifest(1)); + assert!(!can_write_manifest(1)); + assert!(!can_read_manifest(1 << 30)); + assert!(!can_write_manifest(1 << 63)); + + let reader = meta(&[(READER_FEATURE_FLAGS_KEY, "1")]); + let err = ensure_readable(&reader).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + // A reader flag does not block writers that the writer mask allows. 
+ assert!(ensure_writable(&reader).is_ok()); + + let writer = meta(&[(WRITER_FEATURE_FLAGS_KEY, "2")]); + let err = ensure_writable(&writer).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + } + + #[test] + fn unparsable_value_is_refused() { + let m = meta(&[(READER_FEATURE_FLAGS_KEY, "not-a-number")]); + assert!(reader_flags(&m).is_err()); + assert!(ensure_readable(&m).is_err()); + } + + #[test] + fn unrelated_error_is_not_an_incompatibility() { + let other = Error::from(NamespaceError::TableNotFound { + message: "x".to_string(), + }); + assert!(!is_incompatible_manifest_error(&other)); + } +} diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 7324ab0bb0e..44ebd866810 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -1527,8 +1527,7 @@ mod tests { } /// Like [`Self::new`], with managed versioning (table version - /// tracking through the `__manifest` catalog) enabled on the - /// backend. + /// tracking) enabled on the backend. 
async fn new_managed() -> Self { Self::build(true).await } @@ -1540,9 +1539,7 @@ mod tests { // Create DirectoryNamespace backend with manifest enabled let mut builder = DirectoryNamespaceBuilder::new(&temp_path).manifest_enabled(true); if managed_versioning { - builder = builder - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true); + builder = builder.table_version_tracking_enabled(true); } let backend = builder.build().await.unwrap(); let backend = Arc::new(backend); diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index ecacc118074..0fea6498fc9 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -13,7 +13,7 @@ use itertools::Itertools; use lance_core::deepsize::DeepSizeOf; use roaring::{MultiOps, RoaringBitmap, RoaringTreemap}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::utils::address::RowAddress; use lance_core::{Error, Result}; @@ -697,12 +697,17 @@ impl RowAddrTreeMap { } impl CacheCodecImpl for RowAddrTreeMap { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - self.serialize_into(writer) + const TYPE_ID: &'static str = "lance.RowAddrTreeMap"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + // A roaring bitmap has its own stable, portable serialization; it is + // the whole body, so write it raw rather than length-prefixed. 
+ self.serialize_into(w.raw_writer()) } - fn deserialize(data: &bytes::Bytes) -> Result { - Self::deserialize_from(data.as_ref()) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + Self::deserialize_from(r.body().as_ref()) } } diff --git a/rust/lance-table/src/format/index.rs b/rust/lance-table/src/format/index.rs index 33ee464fe76..f603536a3eb 100644 --- a/rust/lance-table/src/format/index.rs +++ b/rust/lance-table/src/format/index.rs @@ -15,6 +15,7 @@ use roaring::RoaringBitmap; use uuid::Uuid; use super::pb; +use lance_core::cache::{CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; /// Metadata about a single file within an index segment. @@ -235,24 +236,26 @@ impl From<&IndexMetadata> for pb::IndexMetadata { /// orphan rule prevents `impl CacheCodecImpl for Vec`. type ArcAny = Arc; +/// Stable type identifier for the `Vec` cache entry. +const INDEX_METADATA_TYPE_ID: &str = "lance.table.IndexMetadataList"; +/// Body schema version written by this build. +const INDEX_METADATA_VERSION: u32 = 1; + fn serialize_index_metadata( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> { - use prost::Message; let vec = any .downcast_ref::>() .expect("index_metadata_codec: wrong type (this is a bug in the cache layer)"); let section = pb::IndexSection { indices: vec.iter().map(pb::IndexMetadata::from).collect(), }; - writer.write_all(§ion.encode_to_vec())?; - Ok(()) + writer.write_header(§ion) } -fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result { - use prost::Message; - let section = pb::IndexSection::decode(data.as_ref())?; +fn deserialize_index_metadata(reader: &mut CacheEntryReader<'_>) -> lance_core::Result { + let section: pb::IndexSection = reader.read_header()?; let indices: Vec = section .indices .into_iter() @@ -262,7 +265,12 @@ fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result } pub fn index_metadata_codec() -> 
lance_core::cache::CacheCodec { - lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata) + lance_core::cache::CacheCodec::new( + INDEX_METADATA_TYPE_ID, + INDEX_METADATA_VERSION, + serialize_index_metadata, + deserialize_index_metadata, + ) } /// List all files in an index directory with their sizes. @@ -348,7 +356,8 @@ mod tests { let bytes = store.get(&key).unwrap(); let recovered = codec .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .unwrap(); + .hit() + .expect("entry should decode as a hit"); let recovered = recovered .downcast::>() .expect("downcast should succeed"); diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index 3784e84a785..e1a4086730b 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -798,6 +798,26 @@ pub trait CommitHandler: Debug + Send + Sync { default_resolve_version(base_path, version, object_store).await } + /// Check whether an attached manifest version exists without loading it. + /// + /// The default implementation probes the deterministic manifest path for + /// the given naming scheme. Commit handlers with an external source of + /// truth should override this method. + async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. }) => Ok(false), + Err(e) => Err(e.into()), + } + } + /// List detached manifest locations. /// /// Returns a stream of detached manifest locations in arbitrary order. 
diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs index 75993ca8d1f..a6c9bbaa90d 100644 --- a/rust/lance-table/src/io/commit/external_manifest.rs +++ b/rust/lance-table/src/io/commit/external_manifest.rs @@ -456,6 +456,31 @@ impl CommitHandler for ExternalManifestCommitHandler { .await } + async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + match self + .external_manifest_store + .get_manifest_location(base_path.as_ref(), version) + .await + { + Ok(_) => Ok(true), + Err(Error::NotFound { .. }) => { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. }) => Ok(false), + Err(e) => Err(e.into()), + } + } + Err(e) => Err(e), + } + } + async fn commit( &self, manifest: &mut Manifest, diff --git a/rust/lance-tokenizer/Cargo.toml b/rust/lance-tokenizer/Cargo.toml index 5edfe4a9f16..e1006cd93c7 100644 --- a/rust/lance-tokenizer/Cargo.toml +++ b/rust/lance-tokenizer/Cargo.toml @@ -17,6 +17,7 @@ jieba-rs = { workspace = true, optional = true } lindera = { workspace = true, optional = true } rust-stemmers = "1.2.0" serde = { workspace = true, features = ["derive"] } +stop-words = { version = "0.10.0", default-features = false, features = ["iso", "nltk"] } unicode-normalization = "0.1.25" [features] diff --git a/rust/lance-tokenizer/src/stop_word_filter.rs b/rust/lance-tokenizer/src/stop_word_filter.rs index 0c49330a619..2acf0b3dbd5 100644 --- a/rust/lance-tokenizer/src/stop_word_filter.rs +++ b/rust/lance-tokenizer/src/stop_word_filter.rs @@ -12,6 +12,34 @@ use std::sync::Arc; use crate::{Language, Token, TokenFilter, TokenStream, Tokenizer}; +fn all_stop_words() -> impl Iterator { + [ + stop_words::get("ar"), + stopwords::DANISH, + stopwords::DUTCH, + stopwords::ENGLISH, + 
stopwords::FINNISH, + stopwords::FRENCH, + stopwords::GERMAN, + stop_words::get("el"), + stopwords::HUNGARIAN, + stopwords::ITALIAN, + stopwords::NORWEGIAN, + stopwords::PORTUGUESE, + stop_words::get("ro"), + stopwords::RUSSIAN, + stopwords::SPANISH, + stopwords::SWEDISH, + stop_words::get("ta"), + stop_words::get("tr"), + stop_words::get("zh"), + stop_words::get("ja"), + stop_words::get("ko"), + ] + .into_iter() + .flat_map(|words| words.iter().copied()) +} + #[derive(Clone)] pub struct StopWordFilter { words: Arc>, @@ -20,28 +48,32 @@ pub struct StopWordFilter { impl StopWordFilter { pub fn new(language: Language) -> Option { let words = match language { + Language::Arabic => stop_words::get("ar"), Language::Danish => stopwords::DANISH, Language::Dutch => stopwords::DUTCH, - Language::English => &[ - "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", - "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", - "there", "these", "they", "this", "to", "was", "will", "with", - ], + Language::English => stopwords::ENGLISH, Language::Finnish => stopwords::FINNISH, Language::French => stopwords::FRENCH, Language::German => stopwords::GERMAN, + Language::Greek => stop_words::get("el"), Language::Hungarian => stopwords::HUNGARIAN, Language::Italian => stopwords::ITALIAN, Language::Norwegian => stopwords::NORWEGIAN, Language::Portuguese => stopwords::PORTUGUESE, + Language::Romanian => stop_words::get("ro"), Language::Russian => stopwords::RUSSIAN, Language::Spanish => stopwords::SPANISH, Language::Swedish => stopwords::SWEDISH, - _ => return None, + Language::Tamil => stop_words::get("ta"), + Language::Turkish => stop_words::get("tr"), }; Some(Self::remove(words.iter().map(|word| (*word).to_owned()))) } + pub fn all() -> Self { + Self::remove(all_stop_words().map(str::to_owned)) + } + pub fn remove>(words: W) -> Self { Self { words: Arc::new(words.into_iter().collect()), @@ -49,6 +81,42 @@ impl StopWordFilter { } 
} +#[cfg(test)] +mod tests { + use super::all_stop_words; + use crate::StopWordFilter; + use std::collections::HashSet; + + #[test] + fn test_external_stop_word_lists_are_available() { + let words = all_stop_words().collect::>(); + for word in ["إلى", "και", "acesta", "அவர்", "ama", "的", "ある", "그리고"] + { + assert!( + words.contains(word), + "built-in stop words should contain {word}" + ); + } + } + + #[test] + fn test_language_stop_word_lists_are_available() { + for (language, word) in [ + (crate::Language::Arabic, "إلى"), + (crate::Language::Greek, "και"), + (crate::Language::Romanian, "acesta"), + (crate::Language::Tamil, "அவர்"), + (crate::Language::Turkish, "ama"), + ] { + let filter = StopWordFilter::new(language).unwrap(); + assert!( + filter.words.contains(word), + "{language:?} should contain {word}" + ); + } + } +} + impl TokenFilter for StopWordFilter { type Tokenizer = StopWordFilterWrapper; diff --git a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs index 2ac3f4a28aa..227556ba527 100644 --- a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs +++ b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs @@ -37,6 +37,12 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +pub const ENGLISH: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with", +]; + pub const DANISH: &[&str] = &[ "og", "i", "jeg", "det", "at", "en", "den", "til", "er", "som", "på", "de", "med", "han", "af", "for", "ikke", "der", "var", "mig", "sig", "men", "et", "har", "om", "vi", "min", "havde", diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 74e6faf5c07..6586c928de7 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -175,6 +175,10 @@ required-features = ["cli"] name = "scalar_index" harness = false +[[bench]] +name = "regex_ngram" +harness = false + [[bench]] name = "merge_insert" harness = false @@ -296,5 +300,9 @@ harness = false name = "concurrent_append" harness = false +[[bench]] +name = "hamming" +harness = false + [lints] workspace = true diff --git a/rust/lance/benches/hamming.rs b/rust/lance/benches/hamming.rs new file mode 100644 index 00000000000..7e926a795db --- /dev/null +++ b/rust/lance/benches/hamming.rs @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark for hamming distance clustering. +//! +//! This benchmark tests the pairwise hamming distance computation and clustering +//! performance at various scales. +//! +//! Run with: cargo bench -p lance --bench hamming +//! +//! Environment variables: +//! - DATASET_URI: Path to a dataset with a hash column (optional, generates random if not set) +//! - HASH_COLUMN: Name of the hash column (default: "hash") +//! - SAMPLE_SIZE: Number of rows to sample (default: 10000) +//! 
- THRESHOLD: Hamming distance threshold (default: 10)
+
+#![allow(clippy::print_stdout)]
+
+use std::env;
+use std::sync::Arc;
+use std::time::Instant;
+
+use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array};
+use arrow_schema::{DataType, Field, FieldRef, Schema};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use lance_arrow::FixedSizeListArrayExt;
+use rand::Rng;
+
+use lance::index::vector::hamming::{
+    hamming_clustering_for_sample, hamming_clustering_from_hashes,
+};
+use lance::{Dataset, dataset::WriteParams};
+use lance_linalg::distance::pairwise_hamming_distance_parallel;
+
+#[cfg(target_os = "linux")]
+use lance_testing::pprof::{Output, PProfProfiler};
+
+/// Generate `n` random 64-bit hashes.
+fn generate_random_hashes(n: usize) -> Vec<u64> {
+    let mut rng = rand::rng();
+    (0..n).map(|_| rng.random()).collect()
+}
+
+/// Build a single-column (`hash`) batch holding `num_rows` random 8-byte hashes.
+fn generate_hash_batch(num_rows: usize) -> RecordBatch {
+    let mut rng = rand::rng();
+
+    // Generate random bytes for the hashes (8 bytes per hash)
+    let bytes: Vec<u8> = (0..num_rows * 8).map(|_| rng.random()).collect();
+    let values = UInt8Array::from(bytes);
+
+    let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "hash",
+        DataType::FixedSizeList(FieldRef::new(Field::new("item", DataType::UInt8, true)), 8),
+        false,
+    )]));
+
+    RecordBatch::try_new(schema, vec![Arc::new(hash_array)]).unwrap()
+}
+
+/// Create a test dataset with random hashes.
+async fn create_hash_dataset(path: &std::path::Path, num_rows: usize) {
+    let batch = generate_hash_batch(num_rows);
+    let schema = batch.schema();
+
+    let write_params = WriteParams {
+        max_rows_per_file: num_rows,
+        max_rows_per_group: 10_000,
+        ..Default::default()
+    };
+
+    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+    Dataset::write(reader, path.to_str().unwrap(), Some(write_params))
+        .await
+        .unwrap();
+}
+
+/// Benchmark pure pairwise hamming computation (no I/O).
+fn bench_pairwise_compute(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hamming_pairwise_compute");
+
+    for size in [1_000, 5_000, 10_000, 20_000] {
+        let hashes = generate_random_hashes(size);
+        let total_pairs = (size as u64) * (size as u64 - 1) / 2;
+
+        group.throughput(Throughput::Elements(total_pairs));
+        group.bench_with_input(BenchmarkId::new("parallel", size), &hashes, |b, hashes| {
+            // Return the result (no trailing `;`) so that Criterion passes it
+            // through `black_box` and the computation cannot be optimized away.
+            b.iter(|| pairwise_hamming_distance_parallel(hashes, None, Some(10)));
+        });
+    }
+
+    group.finish();
+}
+
+/// Benchmark full clustering pipeline (compute + cluster).
+fn bench_cluster_hashes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hamming_cluster");
+
+    for size in [1_000, 5_000, 10_000] {
+        let hashes = generate_random_hashes(size);
+
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline", size),
+            &hashes,
+            |b, hashes| {
+                // Return the result (no trailing `;`) so that Criterion passes it
+                // through `black_box` instead of benchmarking a discarded call.
+                b.iter(|| hamming_clustering_from_hashes(hashes, None, 10));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmark with dataset I/O (if DATASET_URI is set).
+fn bench_dataset_cluster(c: &mut Criterion) {
+    let rt = tokio::runtime::Runtime::new().unwrap();
+
+    // Read the benchmark knobs from the environment; unset or unparsable values use the defaults
+    let dataset_uri = env::var("DATASET_URI").ok();
+    let hash_column = env::var("HASH_COLUMN").unwrap_or_else(|_| "hash".to_string());
+    let sample_size: usize = env::var("SAMPLE_SIZE")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(10_000);
+    let threshold: u32 = env::var("THRESHOLD")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(10);
+
+    let mut group = c.benchmark_group("hamming_dataset");
+
+    if let Some(uri) = dataset_uri {
+        // Use external dataset
+        println!("Using external dataset: {}", uri);
+        println!(
+            "Column: {}, Sample: {}, Threshold: {}",
+            hash_column, sample_size, threshold
+        );
+
+        let dataset = rt.block_on(async { Dataset::open(&uri).await.unwrap() });
+
+        group.bench_function(format!("external_sample_{}", sample_size), |b| {
+            b.to_async(&rt).iter(|| async {
+                hamming_clustering_for_sample(&dataset, &hash_column, Some(sample_size), threshold)
+                    .await
+                    .unwrap()
+            });
+        });
+    } else {
+        // No DATASET_URI: generate a temporary `hash` dataset; THRESHOLD still applies
+        let temp_dir = tempfile::tempdir().unwrap();
+        let uri = temp_dir.path().join("bench_hashes.lance");
+
+        rt.block_on(async {
+            create_hash_dataset(&uri, 100_000).await;
+        });
+
+        let dataset = rt.block_on(async { Dataset::open(uri.to_str().unwrap()).await.unwrap() });
+
+        for sample in [1_000, 5_000, 10_000] {
+            group.bench_function(format!("generated_sample_{}", sample), |b| {
+                let ds = dataset.clone();
+                b.to_async(&rt).iter(|| {
+                    let ds = ds.clone();
+                    async move {
+                        hamming_clustering_for_sample(&ds, "hash", Some(sample), threshold)
+                            .await
+                            .unwrap()
+                    }
+                });
+            });
+        }
+    }
+
+    group.finish();
+}
+
+/// Quick standalone benchmark that prints results (for quick testing).
+#[allow(dead_code)] +fn run_quick_bench() { + println!("=== Hamming Distance Clustering Benchmark ===\n"); + + let sizes = [1_000, 5_000, 10_000, 20_000]; + + for &size in &sizes { + let hashes = generate_random_hashes(size); + let total_pairs = (size as u64) * (size as u64 - 1) / 2; + + println!("Size: {} rows, {} pairs", size, total_pairs); + let start = Instant::now(); + let reader = hamming_clustering_from_hashes(&hashes, None, 10); + // Consume the reader to count clusters + let cluster_count: usize = reader.map(|b| b.unwrap().num_rows()).sum(); + let elapsed = start.elapsed(); + + let pairs_per_sec = total_pairs as f64 / elapsed.as_secs_f64(); + println!( + " Total time: {:?} ({:.2}M pairs/sec)", + elapsed, + pairs_per_sec / 1_000_000.0 + ); + println!(" Total clusters: {}", cluster_count); + println!(); + } +} + +#[cfg(target_os = "linux")] +criterion_group! { + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_pairwise_compute, bench_cluster_hashes, bench_dataset_cluster +} + +#[cfg(not(target_os = "linux"))] +criterion_group!( + benches, + bench_pairwise_compute, + bench_cluster_hashes, + bench_dataset_cluster +); + +criterion_main!(benches); diff --git a/rust/lance/benches/mem_wal/write/mem_wal_write.rs b/rust/lance/benches/mem_wal/write/mem_wal_write.rs index 24f3a0d7c8f..9a5fc71ab17 100644 --- a/rust/lance/benches/mem_wal/write/mem_wal_write.rs +++ b/rust/lance/benches/mem_wal/write/mem_wal_write.rs @@ -649,8 +649,10 @@ fn bench_lance_memwal_write(c: &mut Criterion) { backpressure_log_interval: default_config .backpressure_log_interval, stats_log_interval: default_config.stats_log_interval, + frozen_memtable_grace: default_config.frozen_memtable_grace, enable_memtable, hnsw_params: default_config.hnsw_params, + warmer: None, }; // Get writer through Dataset API (index configs loaded automatically) diff --git a/rust/lance/benches/regex_ngram.rs 
b/rust/lance/benches/regex_ngram.rs new file mode 100644 index 00000000000..76f597ad9cb --- /dev/null +++ b/rust/lance/benches/regex_ngram.rs @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark: regex predicate scans over an ngram-indexed string column. +//! +//! Each query is a `regexp_match(doc, '...')` filter against a dataset that has +//! an NGram index on `doc`. The query set spans a selective AND pattern, an +//! alternation, a plain literal (rewritten to an infix LIKE before it reaches +//! the index), and a deliberately non-accelerable pattern (`a.b`, which yields +//! no trigram) that serves as a regression guard. +//! +//! On `main` none of these use the index (regex falls through to a full scan + +//! recheck); with the ngram-regex acceleration the index prunes candidates for +//! the first three while `a.b` stays a full scan. Capture a baseline on `main` +//! with `--save-baseline before_7130`, then compare after the change with +//! `--baseline before_7130`. + +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +use arrow::array::AsArray; +use arrow_array::{RecordBatch, RecordBatchIterator, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::index::DatasetIndexExt; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{RowCount, array}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +const TOTAL: usize = 200_000; + +/// Build the `doc` column: random sentences with rare markers injected into a +/// small fraction of rows so the regex queries have controlled selectivity. +/// The markers (`zqxwvu`, `needlexyz`, `qwerasdf`) are unlikely to appear in +/// the generated English-word sentences. 
+fn build_docs() -> StringArray { + let mut sentence_gen = array::random_sentence(1, 30, false); + let base = sentence_gen + .generate_default(RowCount::from(TOTAL as u64)) + .unwrap(); + let base = base.as_string::(); + let docs = (0..TOTAL).map(|i| { + let sentence = base.value(i); + if i % 200 == 0 { + // ~0.5% of rows match `zqxwvu.*needlexyz` and `zqxwvu`. + format!("{sentence} zqxwvu needlexyz") + } else if i % 211 == 0 { + // A second marker for the alternation query. + format!("{sentence} qwerasdf") + } else { + sentence.to_string() + } + }); + StringArray::from_iter_values(docs) +} + +async fn build_dataset(tempdir: &TempStrDir) -> Arc { + let schema = Arc::new(Schema::new(vec![Field::new("doc", DataType::Utf8, false)])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(build_docs())]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let mut dataset = Dataset::write(reader, tempdir.as_str(), None) + .await + .unwrap(); + dataset + .create_index( + &["doc"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Arc::new(dataset) +} + +async fn scan_filter(dataset: &Dataset, filter: &str) -> usize { + let mut scanner = dataset.scan(); + scanner.filter(filter).unwrap(); + let stream = scanner.try_into_stream().await.unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + batches.iter().map(|b| b.num_rows()).sum() +} + +fn bench_regex_ngram(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let tempdir = TempStrDir::default(); + let dataset = rt.block_on(build_dataset(&tempdir)); + + let queries = [ + ("selective_and", "regexp_match(doc, 'zqxwvu.*needlexyz')"), + ( + "alternation", + "regexp_match(doc, '(zqxwvu|qwerasdf|needlexyz)')", + ), + ("plain_literal", "regexp_match(doc, 'zqxwvu')"), + ("non_accelerable_a_dot_b", "regexp_match(doc, 'a.b')"), + ]; + + let mut group = c.benchmark_group("regex_ngram"); + group + .sample_size(10) + 
.measurement_time(Duration::from_secs(15)); + for (name, filter) in queries { + group.bench_function(name, |b| { + b.iter(|| black_box(rt.block_on(scan_filter(&dataset, filter)))); + }); + } + group.finish(); +} + +#[cfg(target_os = "linux")] +criterion_group!( + name = benches; + config = Criterion::default() + .significance_level(0.1) + .sample_size(10) + .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_regex_ngram); + +#[cfg(not(target_os = "linux"))] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_regex_ngram); + +criterion_main!(benches); diff --git a/rust/lance/src/blob.rs b/rust/lance/src/blob.rs index 322bf67a04c..58df42b5cd3 100644 --- a/rust/lance/src/blob.rs +++ b/rust/lance/src/blob.rs @@ -7,12 +7,16 @@ //! tagged with `ARROW:extension:name = "lance.blob.v2"`. This module offers a //! type-safe builder to construct that struct without manually wiring metadata +use std::num::NonZeroUsize; use std::sync::Arc; use arrow_array::{ArrayRef, StructArray, builder::LargeBinaryBuilder, builder::StringBuilder}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{DataType, Field}; -use lance_arrow::{ARROW_EXT_NAME_KEY, BLOB_V2_EXT_NAME}; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, +}; use crate::{Error, Result}; @@ -21,9 +25,71 @@ use crate::{Error, Result}; /// Blob v2 expects a column shaped as `Struct` and /// tagged with `ARROW:extension:name = "lance.blob.v2"`. pub fn blob_field(name: &str, nullable: bool) -> Field { - let metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] + blob_field_with_options(name, nullable, BlobFieldOptions::default()) +} + +/// Options for constructing a blob v2 field. 
+#[derive(Clone, Debug, Default)]
+pub struct BlobFieldOptions {
+    /// Maximum payload size to keep inline in the data file before using packed blob storage.
+    pub inline_size_threshold: Option<usize>,
+    /// Maximum payload size to store in packed blob storage before using dedicated blob storage.
+    ///
+    /// A zero threshold is invalid because dedicated blob storage is selected when
+    /// the payload size is greater than this value.
+    pub dedicated_size_threshold: Option<NonZeroUsize>,
+}
+
+impl BlobFieldOptions {
+    /// Set the maximum payload size to keep inline in the data file.
+    pub fn with_inline_size_threshold(mut self, threshold: usize) -> Self {
+        self.inline_size_threshold = Some(threshold);
+        self
+    }
+
+    /// Set the maximum payload size to store in packed blob storage (must be non-zero).
+    pub fn with_dedicated_size_threshold(mut self, threshold: NonZeroUsize) -> Self {
+        self.dedicated_size_threshold = Some(threshold);
+        self
+    }
+}
+
+/// Construct the Arrow field for a blob v2 column with storage layout options.
+///
+/// Blob v2 expects a column shaped as `Struct` and
+/// tagged with `ARROW:extension:name = "lance.blob.v2"`.
+/// +/// ``` +/// # use lance::{BlobFieldOptions, blob_field_with_options}; +/// let field = blob_field_with_options( +/// "blob", +/// true, +/// BlobFieldOptions::default().with_inline_size_threshold(16 * 1024), +/// ); +/// assert_eq!( +/// field +/// .metadata() +/// .get("lance-encoding:blob-inline-size-threshold") +/// .map(String::as_str), +/// Some("16384"), +/// ); +/// ``` +pub fn blob_field_with_options(name: &str, nullable: bool, options: BlobFieldOptions) -> Field { + let mut metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] .into_iter() - .collect(); + .collect::>(); + if let Some(threshold) = options.inline_size_threshold { + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.to_string(), + ); + } + if let Some(threshold) = options.dedicated_size_threshold { + metadata.insert( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.get().to_string(), + ); + } Field::new( name, DataType::Struct( @@ -142,6 +208,8 @@ impl BlobArrayBuilder { #[cfg(test)] mod tests { + use std::num::NonZeroUsize; + use super::*; use arrow_array::Array; use arrow_array::cast::AsArray; @@ -156,6 +224,31 @@ mod tests { ); } + #[test] + fn test_field_metadata_with_options() { + let field = blob_field_with_options( + "blob", + true, + BlobFieldOptions::default() + .with_inline_size_threshold(16 * 1024) + .with_dedicated_size_threshold(NonZeroUsize::new(2 * 1024 * 1024).unwrap()), + ); + assert_eq!( + field + .metadata() + .get(BLOB_INLINE_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "16384" + ); + assert_eq!( + field + .metadata() + .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "2097152" + ); + } + #[test] fn test_builder_basic() { let mut b = BlobArrayBuilder::new(4); diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index c9cc356aaa6..3e0d77704da 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -24,8 +24,7 @@ use lance_core::datatypes::{OnMissing, 
OnTypeMismatch, Projectable, Projection}; use lance_core::traits::DatasetTakeRows; use lance_core::utils::address::RowAddress; use lance_core::utils::tracing::{ - DATASET_CLEANING_EVENT, DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, - TRACE_DATASET_EVENTS, + DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, TRACE_DATASET_EVENTS, }; use lance_datafusion::projection::ProjectionPlan; use lance_file::datatypes::populate_schema_dictionary; @@ -104,7 +103,7 @@ use self::scanner::{DatasetRecordBatchStream, Scanner}; use self::transaction::{Operation, Transaction, TransactionBuilder, UpdateMapEntry}; use self::write::{cleanup_data_fragments, write_fragments_internal}; use crate::dataset::branch_location::BranchLocation; -use crate::dataset::cleanup::{CleanupPolicy, CleanupPolicyBuilder}; +use crate::dataset::cleanup::{CleanupOperation, CleanupPolicy, CleanupPolicyBuilder}; use crate::dataset::refs::{BranchContents, BranchIdentifier, Branches, Tags}; use crate::dataset::sql::SqlQueryBuilder; use crate::datatypes::Schema; @@ -514,7 +513,10 @@ impl Dataset { let transaction = Transaction::new(version_number, clone_op, None); let builder = CommitBuilder::new(WriteDestination::Uri(branch_location.uri.as_str())) - .with_store_params(store_params.unwrap_or_default()) + // Fall back to the dataset's own store params + .with_store_params( + store_params.unwrap_or(self.store_params.as_deref().cloned().unwrap_or_default()), + ) .with_object_store(Arc::new(self.object_store.as_ref().clone())) .with_commit_handler(self.commit_handler.clone()) .with_storage_format(self.manifest.data_storage_format.lance_file_version()?); @@ -1283,8 +1285,15 @@ impl Dataset { &self, policy: CleanupPolicy, ) -> BoxFuture<'_, Result> { - info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.uri); - cleanup::cleanup_old_versions(self, policy).boxed() + async move { self.cleanup(policy).execute().await }.boxed() + } + + /// Creates a cleanup operation for this dataset. 
+ /// + /// The returned operation can be explained without deleting files, or + /// executed to re-evaluate the current dataset state and remove files. + pub fn cleanup(&self, policy: CleanupPolicy) -> CleanupOperation<'_> { + CleanupOperation::new(self, policy) } #[allow(clippy::too_many_arguments)] @@ -2232,6 +2241,39 @@ impl Dataset { .version) } + /// Return whether the dataset has a newer committed version. + pub async fn is_stale(&self) -> Result { + let latest_version = self.latest_version_id().await?; + Ok(latest_version != self.manifest.version) + } + + /// Return whether the immediate attached successor manifest exists. + /// + /// This is a fast contiguous-history probe. It does not resolve the latest + /// version and may return `false` if intermediate manifests have been + /// removed. Callers that need a general freshness check should use + /// [`Self::is_stale`]. + #[doc(hidden)] + pub async fn has_successor_version(&self) -> Result { + let Some(next_version) = self.manifest.version.checked_add(1) else { + return Ok(false); + }; + if lance_table::format::is_detached_version(next_version) { + return Ok(false); + } + + let exists = self + .commit_handler + .version_exists( + &self.base, + next_version, + self.object_store.inner.as_ref(), + self.manifest_location.naming_scheme, + ) + .await?; + Ok(exists) + } + pub fn count_fragments(&self) -> usize { self.manifest.fragments.len() } diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs index f2c243367ce..8cdde543e4e 100644 --- a/rust/lance/src/dataset/blob.rs +++ b/rust/lance/src/dataset/blob.rs @@ -12,14 +12,18 @@ use std::{ use arrow::array::AsArray; use arrow::datatypes::{UInt8Type, UInt32Type, UInt64Type}; -use arrow_array::Array; use arrow_array::RecordBatch; use arrow_array::builder::{LargeBinaryBuilder, PrimitiveBuilder, StringBuilder}; -use arrow_schema::DataType as ArrowDataType; +use arrow_array::{Array, ArrayRef}; +use arrow_schema::{DataType as ArrowDataType, Field as 
ArrowField};
 use bytes::Bytes;
+use futures::future::BoxFuture;
 use futures::stream::BoxStream;
 use futures::{FutureExt, StreamExt, TryStreamExt, stream};
-use lance_arrow::{BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, FieldExt};
+use lance_arrow::{
+    BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_INLINE_SIZE_THRESHOLD_META_KEY, FieldExt,
+    r#struct::StructArrayExt,
+};
 use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry};
 use lance_io::scheduler::{FileScheduler, ScanScheduler, SchedulerConfig};
 use object_store::path::Path;
@@ -40,6 +44,58 @@ use lance_io::utils::CachedFileSize;
 const INLINE_MAX: usize = 64 * 1024; // 64KB inline cutoff
 const DEDICATED_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB dedicated cutoff
 const PACK_FILE_MAX_SIZE: usize = 1024 * 1024 * 1024; // 1GiB per .pack sidecar
+
+pub(super) fn blob_inline_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+) -> Result<usize> {
+    blob_threshold_from_metadata(
+        metadata,
+        field_name,
+        BLOB_INLINE_SIZE_THRESHOLD_META_KEY,
+        INLINE_MAX,
+        true,
+    )
+}
+
+pub(super) fn blob_dedicated_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+) -> Result<usize> {
+    blob_threshold_from_metadata(
+        metadata,
+        field_name,
+        BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+        DEDICATED_THRESHOLD,
+        false,
+    )
+}
+
+fn blob_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+    key: &str,
+    default_value: usize,
+    allow_zero: bool,
+) -> Result<usize> {
+    let Some(value) = metadata.get(key) else {
+        return Ok(default_value);
+    };
+    let threshold = value.parse::<usize>().map_err(|_| {
+        Error::invalid_input(format!(
+            "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \
+             expected a non-negative integer that fits in usize"
+        ))
+    })?;
+    if !allow_zero && threshold == 0 {
+        return Err(Error::invalid_input(format!(
+            "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \
+             expected a positive integer"
+        )));
+    }
+    Ok(threshold)
+}
+
#[derive(Clone, Debug, PartialEq, Eq)] pub(super) struct ResolvedExternalBase { pub base_id: u32, @@ -205,9 +261,7 @@ pub struct BlobPreprocessor { data_file_key: String, local_counter: u32, pack_writer: PackWriter, - blob_v2_cols: Vec, - dedicated_thresholds: Vec, - writer_metadata: Vec>, + field_processors: Vec, external_base_resolver: Option>, allow_external_blob_outside_bases: bool, external_blob_mode: ExternalBlobMode, @@ -232,6 +286,64 @@ enum BlobWriteSource<'a> { External(&'a ExternalBlobSource), } +#[derive(Clone, Debug)] +struct BlobPreprocessField { + kind: BlobPreprocessFieldKind, +} + +#[derive(Clone, Debug)] +enum BlobPreprocessFieldKind { + BlobV2 { + inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: HashMap, + }, + Struct { + children: Vec, + }, + Passthrough, +} + +impl BlobPreprocessField { + fn new(field: &ArrowField) -> Result { + if field.is_blob_v2() { + return Ok(Self { + kind: BlobPreprocessFieldKind::BlobV2 { + inline_threshold: blob_inline_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + dedicated_threshold: blob_dedicated_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + writer_metadata: field.metadata().clone(), + }, + }); + } + + if let ArrowDataType::Struct(children) = field.data_type() { + let children = children + .iter() + .map(|child| Self::new(child.as_ref())) + .collect::>>()?; + if children.iter().any(|child| child.requires_preprocessing()) { + return Ok(Self { + kind: BlobPreprocessFieldKind::Struct { children }, + }); + } + } + + Ok(Self { + kind: BlobPreprocessFieldKind::Passthrough, + }) + } + + fn requires_preprocessing(&self) -> bool { + !matches!(self.kind, BlobPreprocessFieldKind::Passthrough) + } +} + impl ExternalBlobSource { /// Return the logical payload size after applying any external slice. 
fn size(&self) -> u64 { @@ -313,7 +425,7 @@ impl BlobPreprocessor { source_store_registry: Arc, source_store_params: ObjectStoreParams, pack_file_size_threshold: Option, - ) -> Self { + ) -> Result { let mut pack_writer = PackWriter::new( object_store.clone(), data_dir.clone(), @@ -323,32 +435,25 @@ impl BlobPreprocessor { pack_writer.max_pack_size = max_bytes; } let arrow_schema = arrow_schema::Schema::from(schema); - let fields = arrow_schema.fields(); - let blob_v2_cols = fields.iter().map(|field| field.is_blob_v2()).collect(); - let dedicated_thresholds = fields - .iter() - .map(|field| dedicated_threshold_from_metadata(field.as_ref())) - .collect(); - let writer_metadata = fields + let field_processors = arrow_schema + .fields() .iter() - .map(|field| field.metadata().clone()) - .collect(); - Self { + .map(|field| BlobPreprocessField::new(field.as_ref())) + .collect::>>()?; + Ok(Self { object_store, data_dir, data_file_key, // Start at 1 to avoid a potential all-zero blob_id value. 
local_counter: 1, pack_writer, - blob_v2_cols, - dedicated_thresholds, - writer_metadata, + field_processors, external_base_resolver, allow_external_blob_outside_bases, external_blob_mode, source_store_registry, source_store_params, - } + }) } fn next_blob_id(&mut self) -> u32 { @@ -443,7 +548,7 @@ impl BlobPreprocessor { } pub(crate) async fn preprocess_batch(&mut self, batch: &RecordBatch) -> Result { - let expected_columns = self.blob_v2_cols.len(); + let expected_columns = self.field_processors.len(); if batch.num_columns() != expected_columns { return Err(Error::invalid_input(format!( "Unexpected number of columns: expected {}, got {}", @@ -454,245 +559,340 @@ impl BlobPreprocessor { let batch_schema = batch.schema(); let batch_fields = batch_schema.fields(); + let field_processors = self.field_processors.clone(); let mut new_columns = Vec::with_capacity(batch.num_columns()); let mut new_fields = Vec::with_capacity(batch.num_columns()); - for idx in 0..batch.num_columns() { - let array = batch.column(idx); - let field = &batch_fields[idx]; - if !self.blob_v2_cols[idx] { - new_columns.push(array.clone()); - new_fields.push(field.clone()); + for ((processor, array), field) in field_processors + .iter() + .zip(batch.columns().iter()) + .zip(batch_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(processor, array.clone(), field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( + new_fields + .iter() + .map(|f| f.as_ref().clone()) + .collect::>(), + batch_schema.metadata().clone(), + )); + + RecordBatch::try_new(new_schema, new_columns) + .map_err(|e| Error::invalid_input(e.to_string())) + } + + fn preprocess_field<'a>( + &'a mut self, + processor: &'a BlobPreprocessField, + array: ArrayRef, + field: &'a Arc, + ) -> BoxFuture<'a, Result<(ArrayRef, Arc)>> { + async move { + match &processor.kind { + BlobPreprocessFieldKind::Passthrough => 
Ok((array, field.clone())), + BlobPreprocessFieldKind::BlobV2 { + inline_threshold, + dedicated_threshold, + writer_metadata, + } => { + self.preprocess_blob_array( + array, + field.as_ref(), + *inline_threshold, + *dedicated_threshold, + writer_metadata, + ) + .await + } + BlobPreprocessFieldKind::Struct { children } => { + self.preprocess_struct_array(array, field.as_ref(), children) + .await + } + } + } + .boxed() + } + + async fn preprocess_struct_array( + &mut self, + array: ArrayRef, + field: &ArrowField, + children: &[BlobPreprocessField], + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Struct field was not a struct array"))?; + if struct_arr.num_columns() != children.len() { + return Err(Error::invalid_input(format!( + "Struct field '{}' expected {} children, got {}", + field.name(), + children.len(), + struct_arr.num_columns() + ))); + } + + let struct_arr = struct_arr.normalize_slicing()?; + let parent_nulls = struct_arr.nulls().cloned(); + let pushed_down = struct_arr.pushdown_nulls()?; + let child_fields = pushed_down.fields().clone(); + let child_columns = pushed_down.columns().to_vec(); + + let mut new_columns = Vec::with_capacity(children.len()); + let mut new_fields = Vec::with_capacity(children.len()); + for ((child_processor, child_array), child_field) in children + .iter() + .zip(child_columns.into_iter()) + .zip(child_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(child_processor, child_array, child_field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let struct_array = + StructArray::try_new(new_fields.clone().into(), new_columns, parent_nulls)?; + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(new_fields.into()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + Ok((Arc::new(struct_array), field)) + } + + async fn preprocess_blob_array( + 
&mut self, + array: ArrayRef, + field: &ArrowField, + inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: &HashMap, + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; + + let data_col = struct_arr + .column_by_name("data") + .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? + .as_binary::(); + let uri_col = struct_arr + .column_by_name("uri") + .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? + .as_string::(); + let position_col = struct_arr + .column_by_name("position") + .map(|col| col.as_primitive::()); + let size_col = struct_arr + .column_by_name("size") + .map(|col| col.as_primitive::()); + + let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); + let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); + let mut blob_id_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut blob_size_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut position_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + + let struct_nulls = struct_arr.nulls(); + + for i in 0..struct_arr.len() { + if struct_arr.is_null(i) { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); continue; } - let struct_arr = array - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; - - let data_col = struct_arr - .column_by_name("data") - .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? 
- .as_binary::(); - let uri_col = struct_arr - .column_by_name("uri") - .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? - .as_string::(); - let position_col = struct_arr - .column_by_name("position") - .map(|col| col.as_primitive::()); - let size_col = struct_arr - .column_by_name("size") - .map(|col| col.as_primitive::()); - - let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); - let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); - let mut blob_id_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut blob_size_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut position_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - - let struct_nulls = struct_arr.nulls(); - - for i in 0..struct_arr.len() { - if struct_arr.is_null(i) { - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); - kind_builder.append_null(); - position_builder.append_null(); - continue; - } + let has_data = !data_col.is_null(i); + let has_uri = !uri_col.is_null(i); + let has_position = position_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let has_size = size_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let data_len = if has_data { data_col.value(i).len() } else { 0 }; - let has_data = !data_col.is_null(i); - let has_uri = !uri_col.is_null(i); - let has_position = position_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let has_size = size_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let data_len = if has_data { data_col.value(i).len() } else { 0 }; - - let dedicated_threshold = self.dedicated_thresholds[idx]; - if has_data && data_len > dedicated_threshold { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, 
BlobWriteSource::Bytes(data_col.value(i))) - .await?; - - kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_null(); - continue; - } + if has_data && data_len > dedicated_threshold { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_data && data_len > INLINE_MAX { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::Bytes(data_col.value(i))) - .await?; + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_null(); + continue; + } - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_value(position); - continue; - } + if has_data && data_len > inline_threshold { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_uri { - let uri_val = uri_col.value(i); - if self.external_blob_mode == ExternalBlobMode::Ingest { - let position = if has_position { - Some( - position_col - .as_ref() - .expect("position column must exist") - .value(i), - ) - } else { - None - }; - let size = if has_size { - Some(size_col.as_ref().expect("size column must exist").value(i)) - } else { - None - }; - let source = self.open_external_source(uri_val, position, size).await?; - let data_len = source.size(); - - if data_len > dedicated_threshold as u64 { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, BlobWriteSource::External(&source)) - .await?; - - 
kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_null(); - continue; - } + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_value(position); + continue; + } - if data_len > INLINE_MAX as u64 { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::External(&source)) - .await?; - - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_value(position); - continue; - } + if has_uri { + let uri_val = uri_col.value(i); + if self.external_blob_mode == ExternalBlobMode::Ingest { + let position = if has_position { + Some( + position_col + .as_ref() + .expect("position column must exist") + .value(i), + ) + } else { + None + }; + let size = if has_size { + Some(size_col.as_ref().expect("size column must exist").value(i)) + } else { + None + }; + let source = self.open_external_source(uri_val, position, size).await?; + let data_len = source.size(); - let data = source.read_all().await?; + if data_len > dedicated_threshold as u64 { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::External(&source)) + .await?; - kind_builder.append_value(BlobKind::Inline as u8); - data_builder.append_value(data.as_ref()); + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len); position_builder.append_null(); 
continue; } - let (external_base_id, external_uri_or_path) = - self.resolve_external_reference(uri_val).await?; - kind_builder.append_value(BlobKind::External as u8); - data_builder.append_null(); - uri_builder.append_value(external_uri_or_path); - blob_id_builder.append_value(external_base_id); - if has_position && has_size { - let position = position_col - .as_ref() - .expect("position column must exist") - .value(i); - let size = size_col.as_ref().expect("size column must exist").value(i); - blob_size_builder.append_value(size); + if data_len > inline_threshold as u64 { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::External(&source)) + .await?; + + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len); position_builder.append_value(position); - } else { - blob_size_builder.append_null(); - position_builder.append_null(); + continue; } - continue; - } - if has_data { + let data = source.read_all().await?; + kind_builder.append_value(BlobKind::Inline as u8); - let value = data_col.value(i); - data_builder.append_value(value); + data_builder.append_value(data.as_ref()); uri_builder.append_null(); blob_id_builder.append_null(); blob_size_builder.append_null(); position_builder.append_null(); + continue; + } + + let (external_base_id, external_uri_or_path) = + self.resolve_external_reference(uri_val).await?; + kind_builder.append_value(BlobKind::External as u8); + data_builder.append_null(); + uri_builder.append_value(external_uri_or_path); + blob_id_builder.append_value(external_base_id); + if has_position && has_size { + let position = position_col + .as_ref() + .expect("position column must exist") + .value(i); + let size = size_col.as_ref().expect("size column must exist").value(i); + blob_size_builder.append_value(size); + position_builder.append_value(position); } else { - 
data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_null(); blob_size_builder.append_null(); - kind_builder.append_null(); position_builder.append_null(); } + continue; } - let child_fields = vec![ - arrow_schema::Field::new("kind", ArrowDataType::UInt8, true), - arrow_schema::Field::new("data", ArrowDataType::LargeBinary, true), - arrow_schema::Field::new("uri", ArrowDataType::Utf8, true), - arrow_schema::Field::new("blob_id", ArrowDataType::UInt32, true), - arrow_schema::Field::new("blob_size", ArrowDataType::UInt64, true), - arrow_schema::Field::new("position", ArrowDataType::UInt64, true), - ]; - - let struct_array = arrow_array::StructArray::try_new( - child_fields.clone().into(), - vec![ - Arc::new(kind_builder.finish()), - Arc::new(data_builder.finish()), - Arc::new(uri_builder.finish()), - Arc::new(blob_id_builder.finish()), - Arc::new(blob_size_builder.finish()), - Arc::new(position_builder.finish()), - ], - struct_nulls.cloned(), - )?; - - new_columns.push(Arc::new(struct_array)); - new_fields.push(Arc::new( - arrow_schema::Field::new( - field.name(), - ArrowDataType::Struct(child_fields.into()), - field.is_nullable(), - ) - .with_metadata(self.writer_metadata[idx].clone()), - )); + if has_data { + kind_builder.append_value(BlobKind::Inline as u8); + let value = data_col.value(i); + data_builder.append_value(value); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + position_builder.append_null(); + } else { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); + } } - let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( - new_fields - .iter() - .map(|f| f.as_ref().clone()) - .collect::>(), - batch_schema.metadata().clone(), - )); + let child_fields = vec![ + ArrowField::new("kind", ArrowDataType::UInt8, true), + 
ArrowField::new("data", ArrowDataType::LargeBinary, true), + ArrowField::new("uri", ArrowDataType::Utf8, true), + ArrowField::new("blob_id", ArrowDataType::UInt32, true), + ArrowField::new("blob_size", ArrowDataType::UInt64, true), + ArrowField::new("position", ArrowDataType::UInt64, true), + ]; - RecordBatch::try_new(new_schema, new_columns) - .map_err(|e| Error::invalid_input(e.to_string())) + let struct_array = StructArray::try_new( + child_fields.clone().into(), + vec![ + Arc::new(kind_builder.finish()), + Arc::new(data_builder.finish()), + Arc::new(uri_builder.finish()), + Arc::new(blob_id_builder.finish()), + Arc::new(blob_size_builder.finish()), + Arc::new(position_builder.finish()), + ], + struct_nulls.cloned(), + )?; + + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(child_fields.into()), + field.is_nullable(), + ) + .with_metadata(writer_metadata.clone()), + ); + Ok((Arc::new(struct_array), field)) } pub(crate) async fn finish(&mut self) -> Result<()> { @@ -700,16 +900,6 @@ impl BlobPreprocessor { } } -fn dedicated_threshold_from_metadata(field: &arrow_schema::Field) -> usize { - field - .metadata() - .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) - .and_then(|value| value.parse::().ok()) - .filter(|value| *value > 0) - .and_then(|value| usize::try_from(value).ok()) - .unwrap_or(DEDICATED_THRESHOLD) -} - pub async fn preprocess_blob_batches( batches: &[RecordBatch], pre: &mut BlobPreprocessor, @@ -2103,7 +2293,7 @@ mod tests { }; use arrow_array::RecordBatch; use arrow_array::{ - ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, + Array, ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -2111,7 +2301,8 @@ mod tests { use chrono::Utc; use futures::{StreamExt, TryStreamExt, future::try_join_all}; use lance_arrow::{ - ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, 
BLOB_V2_EXT_NAME, DataTypeExt, + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, }; use lance_core::datatypes::BlobKind; use lance_io::object_store::{ @@ -2142,7 +2333,7 @@ mod tests { use crate::{ Dataset, blob::{BlobArrayBuilder, blob_field}, - dataset::{ExternalBlobMode, WriteParams}, + dataset::{ExternalBlobMode, WriteMode, WriteParams}, utils::test::TestDatasetGenerator, }; @@ -2158,6 +2349,32 @@ mod tests { expected: Vec, } + fn nested_blob_v2_batch(blob_array: ArrayRef) -> (Arc, RecordBatch) { + let blob_field = blob_field("blob", true); + let info_fields = vec![Field::new("name", DataType::Utf8, false), blob_field]; + let info_array: ArrayRef = Arc::new( + StructArray::try_new( + info_fields.clone().into(), + vec![ + Arc::new(StringArray::from_iter_values( + (0..blob_array.len()).map(|idx| format!("name-{idx}")), + )) as ArrayRef, + blob_array, + ], + None, + ) + .unwrap(), + ); + + let schema = Arc::new(Schema::new(vec![Field::new( + "info", + DataType::Struct(info_fields.into()), + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![info_array]).unwrap(); + (schema, batch) + } + #[cfg(feature = "azure")] fn azure_store_params(account_name: &str) -> ObjectStoreParams { ObjectStoreParams { @@ -3045,6 +3262,114 @@ mod tests { assert_eq!(second.as_ref(), b"world"); } + #[tokio::test] + async fn test_write_and_take_nested_blob_v2() { + let test_dir = TempStrDir::default(); + let packed_payload = vec![0x4A; super::INLINE_MAX + 1024]; + + let mut blob_builder = BlobArrayBuilder::new(3); + blob_builder.push_bytes(b"hello").unwrap(); + blob_builder.push_bytes(&packed_payload).unwrap(); + blob_builder.push_null().unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( 
+ reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let info_batch = dataset + .scan() + .project(&["info"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let blob_desc = info_batch + .column(0) + .as_struct() + .column_by_name("blob") + .unwrap() + .as_struct(); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0), + BlobKind::Inline as u8 + ); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(1), + BlobKind::Packed as u8 + ); + + let blobs = dataset + .take_blobs_by_indices(&[0, 1], "info.blob") + .await + .unwrap(); + assert_eq!(blobs.len(), 2); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), b"hello"); + assert_eq!( + blobs[1].read().await.unwrap().as_ref(), + packed_payload.as_slice() + ); + + let null_blobs = dataset + .take_blobs_by_indices(&[2], "info.blob") + .await + .unwrap(); + assert!(null_blobs.is_empty()); + } + + #[tokio::test] + async fn test_nested_blob_v2_requires_v2_2() { + let test_dir = TempStrDir::default(); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(b"hello").unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let result = Dataset::write( + reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_1), + ..Default::default() + }), + ) + .await; + + assert!( + result.is_err(), + "Nested blob v2 should be rejected for file version 2.1" + ); + assert!( + result + .unwrap_err() + .to_string() + .contains("Blob v2 requires file version >= 2.2") + ); + } + #[tokio::test] async fn test_blob_file_read_empty_range_returns_empty_bytes() { let store = reject_empty_range_store(); @@ -3621,6 +3946,50 @@ mod tests { 
assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); } + #[tokio::test] + async fn test_blob_v2_external_ingest_respects_inline_threshold() { + let dataset_dir = TempDir::default(); + let external_dir = TempDir::default(); + let external_path = external_dir.std_path().join("external.bin"); + let payload = vec![0x5A; 2048]; + std::fs::write(&external_path, &payload).unwrap(); + let external_uri = format!("file://{}", external_path.display()); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_uri(external_uri).unwrap(); + let blob_array: arrow_array::ArrayRef = blob_builder.finish().unwrap(); + + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + let batch = RecordBatch::try_new(schema.clone(), vec![blob_array]).unwrap(); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + external_blob_mode: ExternalBlobMode::Ingest, + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let blobs = dataset.take_blobs_by_indices(&[0], "blob").await.unwrap(); + assert_eq!(blobs.len(), 1); + assert_eq!(blobs[0].kind(), BlobKind::Packed); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); + } + #[tokio::test] async fn test_blob_v2_external_ingest_dedicated() { let dataset_dir = TempDir::default(); @@ -3713,7 +4082,10 @@ mod tests { ); } - async fn preprocess_kind_with_schema_metadata(metadata_value: &str, data_len: usize) -> u8 { + async fn try_preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> Result { let (object_store, base_path) = 
ObjectStore::from_uri_and_params( Arc::new(ObjectStoreRegistry::default()), "memory://blob_preprocessor", @@ -3726,10 +4098,9 @@ mod tests { let mut field = blob_field("blob", true); let mut metadata = field.metadata().clone(); - metadata.insert( - BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), - metadata_value.to_string(), - ); + for (key, value) in metadata_entries { + metadata.insert(key.to_string(), value); + } field = field.with_metadata(metadata); let writer_arrow_schema = Schema::new(vec![field.clone()]); @@ -3746,7 +4117,7 @@ mod tests { Arc::new(ObjectStoreRegistry::default()), ObjectStoreParams::default(), None, - ); + )?; let mut blob_builder = BlobArrayBuilder::new(1); blob_builder.push_bytes(vec![0u8; data_len]).unwrap(); @@ -3757,36 +4128,442 @@ mod tests { let batch_schema = Arc::new(Schema::new(vec![field_without_metadata])); let batch = RecordBatch::try_new(batch_schema, vec![blob_array]).unwrap(); - let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let out = preprocessor.preprocess_batch(&batch).await?; let struct_arr = out .column(0) .as_any() .downcast_ref::() .unwrap(); - struct_arr + Ok(struct_arr .column_by_name("kind") .unwrap() .as_primitive::() - .value(0) + .value(0)) + } + + async fn preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> u8 { + try_preprocess_kind_with_blob_metadata(metadata_entries, data_len) + .await + .unwrap() } #[tokio::test] - async fn test_blob_v2_dedicated_threshold_ignores_non_positive_metadata() { - let kind = preprocess_kind_with_schema_metadata("0", 256 * 1024).await; - assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + async fn test_blob_v2_dedicated_threshold_rejects_non_positive_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "0".to_string())], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("expected a positive 
integer")); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_rejects_invalid_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + "not-a-number".to_string(), + )], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); + } + + #[tokio::test] + async fn test_blob_v2_write_rejects_invalid_inline_threshold_metadata() { + let dataset_dir = TempDir::default(); + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "not-a-number".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(vec![0u8; 256]).unwrap(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(blob_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let result = Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("write with invalid blob threshold metadata should fail"); + }; + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_smaller_metadata() { - let kind = preprocess_kind_with_schema_metadata("131072", 256 * 1024).await; + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "131072".to_string())], + 256 * 1024, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_larger_metadata() { - let kind = - 
preprocess_kind_with_schema_metadata("8388608", super::DEDICATED_THRESHOLD + 1024) - .await; + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + "8388608".to_string(), + )], + super::DEDICATED_THRESHOLD + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_smaller_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 2048, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_larger_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + (super::INLINE_MAX + 8192).to_string(), + )], + super::INLINE_MAX + 4096, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_dedicated_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "2048".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "1024".to_string()), + ], + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_does_not_override_dedicated_threshold() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "8192".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "4096".to_string()), + ], + 6144, + ) + .await; + assert_eq!(kind, 
lance_core::datatypes::BlobKind::Dedicated as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_is_per_column() { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "memory://blob_preprocessor", + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + let object_store = object_store.as_ref().clone(); + let data_dir = base_path.clone().join("data"); + + let mut inline_field = blob_field("inline_blob", true); + let mut inline_metadata = inline_field.metadata().clone(); + inline_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "4096".to_string(), + ); + inline_field = inline_field.with_metadata(inline_metadata); + + let mut packed_field = blob_field("packed_blob", true); + let mut packed_metadata = packed_field.metadata().clone(); + packed_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + packed_field = packed_field.with_metadata(packed_metadata); + + let writer_arrow_schema = Schema::new(vec![inline_field.clone(), packed_field.clone()]); + let writer_schema = lance_core::datatypes::Schema::try_from(&writer_arrow_schema).unwrap(); + + let mut preprocessor = super::BlobPreprocessor::new( + object_store.clone(), + data_dir, + "data_file_key".to_string(), + &writer_schema, + None, + false, + ExternalBlobMode::Reference, + Arc::new(ObjectStoreRegistry::default()), + ObjectStoreParams::default(), + None, + ) + .unwrap(); + + let mut inline_builder = BlobArrayBuilder::new(1); + inline_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let inline_array: arrow_array::ArrayRef = inline_builder.finish().unwrap(); + + let mut packed_builder = BlobArrayBuilder::new(1); + packed_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let packed_array: arrow_array::ArrayRef = packed_builder.finish().unwrap(); + + let batch_schema = Arc::new(Schema::new(vec![ + Field::new( + "inline_blob", + inline_field.data_type().clone(), + 
inline_field.is_nullable(), + ), + Field::new( + "packed_blob", + packed_field.data_type().clone(), + packed_field.is_nullable(), + ), + ])); + let batch = RecordBatch::try_new(batch_schema, vec![inline_array, packed_array]).unwrap(); + + let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let inline_kind = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + let packed_kind = out + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + + assert_eq!(inline_kind, lance_core::datatypes::BlobKind::Inline as u8); + assert_eq!(packed_kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_explicit_inline_threshold_mismatch() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let 
append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with explicit blob threshold mismatch should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_threshold_mismatch_with_non_blob_input_extension() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + ARROW_EXT_NAME_KEY.to_string(), + "some.other.extension".to_string(), + ); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + 
append_builder.push_bytes(payload).unwrap(); + let append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with ignored blob threshold metadata should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_accepts_explicit_default_inline_threshold() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + super::INLINE_MAX.to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let append_batch = 
RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let dataset = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.count_rows(None).await.unwrap(), 2); + } } diff --git a/rust/lance/src/dataset/branch_location.rs b/rust/lance/src/dataset/branch_location.rs index 3a1185c8cf8..7ebce36ec86 100644 --- a/rust/lance/src/dataset/branch_location.rs +++ b/rust/lance/src/dataset/branch_location.rs @@ -31,14 +31,20 @@ impl BranchLocation { } fn get_root_path(path_str: &str, branch_name: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // the branch suffix sits on the path part, before the query. + let (path_part, query) = match path_str.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (path_str, None), + }; let branch_suffix = format!("{}/{}", BRANCH_DIR, branch_name); let branch_suffix = branch_suffix.as_str(); - let root_path_str = path_str + let root_path_str = path_part .strip_suffix(branch_suffix) .or_else(|| { if cfg!(windows) { let windows_suffix = branch_suffix.replace('/', "\\"); - path_str.strip_suffix(&windows_suffix) + path_part.strip_suffix(&windows_suffix) } else { None } @@ -59,7 +65,10 @@ impl BranchLocation { root_path_str, path_str, ))); }; - Ok(root_path_str) + Ok(match query { + Some(query) => format!("{}?{}", root_path_str, query), + None => root_path_str, + }) } /// The branch a location under `root` targets: the inverse of @@ -132,13 +141,23 @@ impl BranchLocation { } fn join_str(base: &str, segment: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // path segments must be appended before it. 
+ let (path_part, query) = match base.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (base, None), + }; let normalized_segment = segment.trim_start_matches('/'); - let is_base_dir = base.ends_with("/"); - if is_base_dir { - Ok(format!("{}{}", base, normalized_segment)) + let is_base_dir = path_part.ends_with("/"); + let joined = if is_base_dir { + format!("{}{}", path_part, normalized_segment) } else { - Ok(format!("{}/{}", base, normalized_segment)) - } + format!("{}/{}", path_part, normalized_segment) + }; + Ok(match query { + Some(query) => format!("{}?{}", joined, query), + None => joined, + }) } } @@ -255,6 +274,30 @@ mod tests { assert!(fs::create_dir_all(std::path::Path::new(new_location.uri.as_str())).is_ok()); } + #[test] + fn test_branch_location_with_query_uri() { + // Uris like `s3+ddb://...?ddbTableName=t` carry the commit handler + // config in the query string; branch path segments must be inserted + // before it and the query must survive the round trip. 
+ let location = BranchLocation { + path: Path::parse("bucket/table.lance").unwrap(), + uri: "s3+ddb://bucket/table.lance?ddbTableName=t".to_string(), + branch: None, + }; + let dev = location.find_branch(Some("dev")).unwrap(); + assert_eq!( + dev.uri, + "s3+ddb://bucket/table.lance/tree/dev?ddbTableName=t" + ); + assert_eq!(dev.path.as_ref(), "bucket/table.lance/tree/dev"); + assert_eq!(dev.branch.as_deref(), Some("dev")); + + let main = dev.find_main().unwrap(); + assert_eq!(main.uri, "s3+ddb://bucket/table.lance?ddbTableName=t"); + assert_eq!(main.path.as_ref(), "bucket/table.lance"); + assert_eq!(main.branch, None); + } + #[test] fn test_branch_of() { let derive = |root: &str, location: &str| BranchLocation::branch_of(root, location); diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs index b3ca60cfa0f..65928038cea 100644 --- a/rust/lance/src/dataset/cleanup.rs +++ b/rust/lance/src/dataset/cleanup.rs @@ -46,7 +46,8 @@ use lance_core::{ Error, Result, utils::tracing::{ AUDIT_MODE_DELETE, AUDIT_MODE_DELETE_UNVERIFIED, AUDIT_TYPE_DATA, AUDIT_TYPE_DELETION, - AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT, + AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, DATASET_CLEANING_EVENT, TRACE_DATASET_EVENTS, + TRACE_FILE_AUDIT, }, }; use lance_table::{ @@ -78,7 +79,7 @@ struct ReferencedFiles { index_uuids: HashSet, } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct RemovalStats { pub bytes_removed: u64, pub old_versions: u64, @@ -88,12 +89,194 @@ pub struct RemovalStats { pub deletion_files_removed: u64, } -#[derive(Clone, Copy, Debug)] -enum RemovedFileType { +/// A read-only explanation of what a cleanup operation would remove. +/// +/// This is an explanation, not a deletion plan. Calling +/// [`CleanupOperation::execute`] re-evaluates the current dataset and reference +/// state before deleting files. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupExplanation { + /// Dataset version observed when the explanation was produced. + pub read_version: u64, + /// Aggregate statistics for files that would be removed. + pub stats: RemovalStats, + /// Candidate files that would be removed, capped by `candidate_file_limit`. + pub candidate_files: Vec, + /// True if more candidate files were found than are included. + pub candidate_files_truncated: bool, + /// Maximum number of candidate files included in this explanation. + pub candidate_file_limit: usize, + /// Referenced child branches and whether cleanup would cascade into them. + pub referenced_branches: Vec, + /// Non-fatal warnings about the explanation. + pub warnings: Vec, +} + +/// A file that cleanup identified as removable. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupCandidateFile { + /// Dataset-relative or storage path for the candidate file. + pub path: String, + /// Kind of file identified by cleanup. + pub kind: CleanupFileKind, + /// True if the file is removable only because it aged past the unverified + /// retention threshold or `delete_unverified` is enabled. + pub unverified: bool, + /// Candidate file size in bytes. + pub size_bytes: u64, +} + +/// A branch that references the current branch lineage. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupReferencedBranch { + /// Branch name. + pub name: String, + /// Version of the current lineage referenced by this branch. + pub referenced_version: u64, + /// True if this branch would be cleaned when cascading cleanup is enabled. + pub cleanup_candidate: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CleanupFileKind { + Manifest, Data, Transaction, Index, Deletion, + /// A leftover `_versions/.tmp` manifest from a failed transaction. These + /// are deleted but excluded from per-kind `RemovalStats` counts and audit + /// logs to match the long-standing cleanup behavior. 
Their bytes + /// are still included in `bytes_removed`. + TemporaryManifest, +} + +impl CleanupCandidateFile { + fn from_cleanup_file(file: &CleanupFile) -> Self { + Self { + path: file.path.to_string(), + kind: file.kind, + unverified: file.unverified, + size_bytes: file.size_bytes, + } + } +} + +fn cleanup_file( + path: Path, + kind: CleanupFileKind, + unverified: bool, + size_bytes: u64, +) -> Option { + Some(CleanupFile { + path, + kind, + unverified, + size_bytes, + }) +} + +#[derive(Clone, Debug)] +struct CleanupFile { + path: Path, + kind: CleanupFileKind, + /// True when the file was kept on disk past its referenced lifetime + /// because we could not verify it was safe to remove (e.g. produced by an + /// unfinished commit) and is being deleted only because it has aged past + /// the unverified-retention threshold or `delete_unverified` is set. + unverified: bool, + size_bytes: u64, +} + +impl RemovalStats { + fn record_file(&mut self, file: &CleanupFile) { + self.bytes_removed += file.size_bytes; + match file.kind { + CleanupFileKind::Manifest => self.old_versions += 1, + CleanupFileKind::Data => self.data_files_removed += 1, + CleanupFileKind::Transaction => self.transaction_files_removed += 1, + CleanupFileKind::Index => self.index_files_removed += 1, + CleanupFileKind::Deletion => self.deletion_files_removed += 1, + CleanupFileKind::TemporaryManifest => {} + } + } + + fn merge(&mut self, other: &Self) { + self.bytes_removed += other.bytes_removed; + self.old_versions += other.old_versions; + self.data_files_removed += other.data_files_removed; + self.transaction_files_removed += other.transaction_files_removed; + self.index_files_removed += other.index_files_removed; + self.deletion_files_removed += other.deletion_files_removed; + } +} + +#[derive(Debug, Default)] +struct CleanupRunResult { + stats: RemovalStats, + removed_manifests: HashSet, + candidate_files: Vec, + candidate_files_truncated: bool, + referenced_branches: Vec, +} + +impl 
CleanupRunResult { + fn record_file( + &mut self, + file: &CleanupFile, + candidate_file_limit: Option, + track_removed_manifests: bool, + ) { + self.stats.record_file(file); + if track_removed_manifests && matches!(file.kind, CleanupFileKind::Manifest) { + self.removed_manifests.insert(file.path.clone()); + } + if let Some(limit) = candidate_file_limit { + if self.candidate_files.len() < limit { + self.candidate_files + .push(CleanupCandidateFile::from_cleanup_file(file)); + } else { + self.candidate_files_truncated = true; + } + } + } + + fn merge(&mut self, other: Self, candidate_file_limit: Option) { + self.stats.merge(&other.stats); + self.removed_manifests.extend(other.removed_manifests); + self.referenced_branches.extend(other.referenced_branches); + if let Some(limit) = candidate_file_limit { + for file in other.candidate_files { + if self.candidate_files.len() < limit { + self.candidate_files.push(file); + } else { + self.candidate_files_truncated = true; + } + } + self.candidate_files_truncated |= other.candidate_files_truncated; + } + } +} + +#[derive(Clone, Copy, Debug)] +enum CleanupAction { + Execute, + Explain { max_candidate_files: usize }, +} + +impl CleanupAction { + fn deletes_files(self) -> bool { + matches!(self, Self::Execute) + } + + fn candidate_file_limit(self) -> Option { + match self { + Self::Execute => None, + Self::Explain { + max_candidate_files, + } => Some(max_candidate_files), + } + } } fn remove_prefix(path: &Path, prefix: &Path) -> Path { @@ -108,6 +291,11 @@ fn remove_prefix(path: &Path, prefix: &Path) -> Path { struct CleanupTask<'a> { dataset: &'a Dataset, policy: CleanupPolicy, + action: CleanupAction, + read_version: u64, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, } /// Information about the dataset that we learn by inspecting all of the manifests @@ -131,21 +319,131 @@ struct CleanupInspection { const UNVERIFIED_THRESHOLD_DAYS: i64 = 7; const 
S3_DELETE_STREAM_BATCH_SIZE: u64 = 1_000; const AZURE_DELETE_STREAM_BATCH_SIZE: u64 = 256; +const DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES: usize = 1_000; + +/// Builder-style cleanup operation. +/// +/// Call [`Self::explain`] for a read-only explanation of what cleanup would +/// remove, or [`Self::execute`] to re-evaluate the current dataset state and +/// delete files. +pub struct CleanupOperation<'a> { + dataset: &'a Dataset, + policy: CleanupPolicy, + max_candidate_files: usize, +} + +impl<'a> CleanupOperation<'a> { + pub(crate) fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { + Self { + dataset, + policy, + max_candidate_files: DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES, + } + } + + /// Set the maximum number of candidate files included in explanations. + /// + /// The aggregate [`RemovalStats`] in [`CleanupExplanation`] still include + /// all files that would be removed. + pub fn with_max_candidate_files(mut self, max_candidate_files: usize) -> Self { + self.max_candidate_files = max_candidate_files; + self + } + + /// Explain what cleanup would remove without deleting files. + pub async fn explain(&self) -> Result { + let cleanup = CleanupTask::new( + self.dataset, + self.policy.clone(), + CleanupAction::Explain { + max_candidate_files: self.max_candidate_files, + }, + ); + let read_version = cleanup.read_version; + let result = cleanup.run().await?; + let warnings = if result.candidate_files_truncated { + vec![format!( + "candidate_files truncated to {} entries", + self.max_candidate_files + )] + } else { + Vec::new() + }; + Ok(CleanupExplanation { + read_version, + stats: result.stats, + candidate_files: result.candidate_files, + candidate_files_truncated: result.candidate_files_truncated, + candidate_file_limit: self.max_candidate_files, + referenced_branches: result.referenced_branches, + warnings, + }) + } + + /// Execute cleanup by re-evaluating the current dataset state. 
+ pub async fn execute(&self) -> Result { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.dataset.uri); + let cleanup = CleanupTask::new(self.dataset, self.policy.clone(), CleanupAction::Execute); + Ok(cleanup.run().await?.stats) + } +} impl<'a> CleanupTask<'a> { - fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { - Self { dataset, policy } + fn new(dataset: &'a Dataset, policy: CleanupPolicy, action: CleanupAction) -> Self { + let track_removed_manifests = policy.clean_referenced_branches; + let include_referenced_branches = action.candidate_file_limit().is_some(); + Self::new_with_ignored_manifests( + dataset, + policy, + action, + HashSet::new(), + track_removed_manifests, + include_referenced_branches, + ) + } + + fn new_with_ignored_manifests( + dataset: &'a Dataset, + policy: CleanupPolicy, + action: CleanupAction, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, + ) -> Self { + Self { + dataset, + policy, + action, + read_version: dataset.version().version, + ignored_manifests, + track_removed_manifests, + include_referenced_branches, + } } - async fn run(self) -> Result { - let mut final_stats = RemovalStats::default(); + async fn run(self) -> Result { + let mut final_result = CleanupRunResult::default(); + let candidate_file_limit = self.action.candidate_file_limit(); // First check if we need to clean referenced branches // For cases that referenced branches never clean and the current cleanup cannot clean anything // This must happen before cleaning the current branch if the setting is enabled. 
let referenced_branches: Vec<(String, u64)> = self.find_referenced_branches().await?; + if self.include_referenced_branches { + final_result.referenced_branches = referenced_branches + .iter() + .map(|(name, referenced_version)| CleanupReferencedBranch { + name: name.clone(), + referenced_version: *referenced_version, + cleanup_candidate: self.policy.clean_referenced_branches, + }) + .collect(); + } if self.policy.clean_referenced_branches { - self.clean_referenced_branches(&referenced_branches).await?; + final_result.merge( + self.clean_referenced_branches(&referenced_branches).await?, + candidate_file_limit, + ); } // we process all manifest files in parallel to figure @@ -179,19 +477,21 @@ impl<'a> CleanupTask<'a> { } if !referenced_branches.is_empty() { + let ignored_manifests: HashSet<_> = final_result + .removed_manifests + .union(&self.ignored_manifests) + .cloned() + .collect(); inspection = self - .retain_branch_lineage_files(inspection, &referenced_branches) + .retain_branch_lineage_files(inspection, &referenced_branches, &ignored_manifests) .await? 
}; - let stats = self.delete_unreferenced_files(inspection).await?; - final_stats.bytes_removed += stats.bytes_removed; - final_stats.old_versions += stats.old_versions; - final_stats.data_files_removed += stats.data_files_removed; - final_stats.transaction_files_removed += stats.transaction_files_removed; - final_stats.index_files_removed += stats.index_files_removed; - final_stats.deletion_files_removed += stats.deletion_files_removed; - Ok(final_stats) + final_result.merge( + self.delete_unreferenced_files(inspection).await?, + candidate_file_limit, + ); + Ok(final_result) } #[instrument(level = "debug", skip_all)] @@ -203,6 +503,7 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&self.dataset.base, &self.dataset.object_store, false) + .try_filter(|location| future::ready(!self.ignored_manifests.contains(&location.path))) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_manifest_file(location, &inspection, tagged_versions) }) @@ -224,12 +525,10 @@ impl<'a> CleanupTask<'a> { let manifest = read_manifest(&self.dataset.object_store, &location.path, location.size).await?; - let dataset_version = self.dataset.version().version; - // Don't delete the latest version, even if it is old. Don't delete tagged versions, // regardless of age. Don't delete manifests if their version is newer than the dataset // version. These are either in-progress or newly added since we started. 
- let is_latest = dataset_version <= manifest.version; + let is_latest = self.read_version <= manifest.version; let is_tagged = tagged_versions.contains(&manifest.version); let in_working_set = is_latest || !self.policy.should_clean(&manifest) || is_tagged; let indexes = @@ -319,8 +618,10 @@ impl<'a> CleanupTask<'a> { async fn delete_unreferenced_files( &self, inspection: CleanupInspection, - ) -> Result { - let removal_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let cleanup_result = Mutex::new(CleanupRunResult::default()); + let deletes_files = self.action.deletes_files(); + let candidate_file_limit = self.action.candidate_file_limit(); let verification_threshold = utc_now() - TimeDelta::try_days(UNVERIFIED_THRESHOLD_DAYS).expect("TimeDelta::try_days"); @@ -335,9 +636,8 @@ impl<'a> CleanupTask<'a> { ) }; // Build stream for a managed subtree - let build_listing_stream = |dir: Path, file_type: Option| { + let build_listing_stream = |dir: Path| { let inspection_ref = &inspection; - let removal_stats_ref = &removal_stats; self.dataset .object_store .read_dir_all(&dir, inspection.earliest_retained_manifest_time) @@ -356,118 +656,133 @@ impl<'a> CleanupTask<'a> { // delete it if we can verify it is part of an old version. 
let maybe_in_progress = !self.policy.delete_unverified && obj_meta.last_modified >= verification_threshold; - let path_to_remove = self.path_if_not_referenced( - obj_meta.location, + let file_to_remove = self.cleanup_file_if_not_referenced( + obj_meta, maybe_in_progress, inspection_ref, ); - if matches!(path_to_remove, Ok(Some(..))) { - let mut stats = removal_stats_ref.lock().unwrap(); - stats.bytes_removed += obj_meta.size; - if let Some(file_type) = file_type { - match file_type { - RemovedFileType::Data => stats.data_files_removed += 1, - RemovedFileType::Transaction => { - stats.transaction_files_removed += 1 - } - RemovedFileType::Index => stats.index_files_removed += 1, - RemovedFileType::Deletion => stats.deletion_files_removed += 1, - } - } - } - future::ready(path_to_remove) + future::ready(file_to_remove) }) .boxed() }; // Restrict scanning to Lance-managed subtrees for safety and performance. let streams = vec![ - build_listing_stream(self.dataset.versions_dir(), None), - build_listing_stream( - self.dataset.transactions_dir(), - Some(RemovedFileType::Transaction), - ), - build_listing_stream(self.dataset.data_dir(), Some(RemovedFileType::Data)), - build_listing_stream(self.dataset.indices_dir(), Some(RemovedFileType::Index)), - build_listing_stream( - self.dataset.deletions_dir(), - Some(RemovedFileType::Deletion), - ), + build_listing_stream(self.dataset.versions_dir()), + build_listing_stream(self.dataset.transactions_dir()), + build_listing_stream(self.dataset.data_dir()), + build_listing_stream(self.dataset.indices_dir()), + build_listing_stream(self.dataset.deletions_dir()), ]; - let unreferenced_paths = stream::iter(streams).flatten().boxed(); + let unreferenced_files = stream::iter(streams).flatten().boxed(); let old_manifests = inspection.old_manifests.clone(); - let num_old_manifests = old_manifests.len(); - - // Ideally this collect shouldn't be needed here but it seems necessary - // to avoid https://github.com/rust-lang/rust/issues/102211 - 
let manifest_bytes_removed = stream::iter(old_manifests.keys()) - .map(|path| self.dataset.object_store.size(path)) - .collect::>() - .await; - let manifest_bytes_removed = stream::iter(manifest_bytes_removed) - .buffer_unordered(self.dataset.object_store.io_parallelism()) - .try_fold(0, |acc, size| async move { Ok(acc + (size)) }) - .await; - - let old_manifests_stream = stream::iter(old_manifests.into_keys()) - .map(|path| { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = path.as_ref()); - Ok(path) + let manifest_files = stream::iter(old_manifests) + .map(|(path, _version)| async move { + let size_bytes = self.dataset.object_store.size(&path).await?; + Ok::(CleanupFile { + path, + kind: CleanupFileKind::Manifest, + unverified: false, + size_bytes, + }) }) + .buffer_unordered(self.dataset.object_store.io_parallelism()) .boxed(); - let all_paths_to_remove = - stream::iter(vec![unreferenced_paths, old_manifests_stream]).flatten(); - - let paths_to_delete: BoxStream> = if let Some(rate) = - self.policy.delete_rate_limit - { - let duration = calculate_duration(self.dataset.object_store.scheme().to_string(), rate); - let mut ticker = interval(duration); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); - IntervalStream::new(ticker) - .zip(all_paths_to_remove) - .map(|(_, path)| path) - .boxed() - } else { - all_paths_to_remove.boxed() - }; - let delete_fut = self - .dataset - .object_store - .remove_stream(paths_to_delete) - .try_for_each(|_| future::ready(Ok(()))); + let all_files = stream::iter(vec![unreferenced_files, manifest_files]).flatten(); + let all_paths_to_remove = all_files.map(|file| { + let file = file?; + if deletes_files { + let mode = if file.unverified { + AUDIT_MODE_DELETE_UNVERIFIED + } else { + AUDIT_MODE_DELETE + }; + let path_str = file.path.as_ref(); + match file.kind { + CleanupFileKind::Manifest => { + info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, 
path = path_str); + } + CleanupFileKind::Data => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DATA, path = path_str); + } + CleanupFileKind::Deletion => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DELETION, path = path_str); + } + CleanupFileKind::Index => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_INDEX, path = path_str); + } + CleanupFileKind::Transaction | CleanupFileKind::TemporaryManifest => {} + } + } + cleanup_result + .lock() + .unwrap() + .record_file(&file, candidate_file_limit, self.track_removed_manifests); + Ok(file.path) + }); + + if deletes_files { + let paths_to_delete: BoxStream> = + if let Some(rate) = self.policy.delete_rate_limit { + let duration = + calculate_duration(self.dataset.object_store.scheme().to_string(), rate); + let mut ticker = interval(duration); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + IntervalStream::new(ticker) + .zip(all_paths_to_remove) + .map(|(_, path)| path) + .boxed() + } else { + all_paths_to_remove.boxed() + }; - delete_fut.await?; + self.dataset + .object_store + .remove_stream(paths_to_delete) + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } else { + // Drain the stream to populate stats, but do not call remove_stream. 
+ all_paths_to_remove + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } - let mut removal_stats = removal_stats.into_inner().unwrap(); - removal_stats.old_versions = num_old_manifests as u64; - removal_stats.bytes_removed += manifest_bytes_removed?; + let cleanup_result = cleanup_result.into_inner().unwrap(); let span = Span::current(); - span.record("bytes_removed", removal_stats.bytes_removed); - span.record("data_files_removed", removal_stats.data_files_removed); + span.record("bytes_removed", cleanup_result.stats.bytes_removed); + span.record( + "data_files_removed", + cleanup_result.stats.data_files_removed, + ); span.record( "transaction_files_removed", - removal_stats.transaction_files_removed, + cleanup_result.stats.transaction_files_removed, + ); + span.record( + "index_files_removed", + cleanup_result.stats.index_files_removed, ); - span.record("index_files_removed", removal_stats.index_files_removed); span.record( "deletion_files_removed", - removal_stats.deletion_files_removed, + cleanup_result.stats.deletion_files_removed, ); - Ok(removal_stats) + Ok(cleanup_result) } - fn path_if_not_referenced( + fn cleanup_file_if_not_referenced( &self, - path: Path, + obj_meta: ObjectMeta, maybe_in_progress: bool, inspection: &CleanupInspection, - ) -> Result> { + ) -> Result> { + let path = obj_meta.location; let relative_path = remove_prefix(&path, &self.dataset.base); + let size_bytes = obj_meta.size; if relative_path.as_ref().starts_with("_versions/.tmp") { // This is a temporary manifest file. 
// @@ -476,7 +791,12 @@ impl<'a> CleanupTask<'a> { if maybe_in_progress { return Ok(None); } else { - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::TemporaryManifest, + true, + size_bytes, + )); } } if relative_path.as_ref().starts_with("_indices") { @@ -490,15 +810,18 @@ impl<'a> CleanupTask<'a> { { return Ok(None); } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file(path, CleanupFileKind::Index, true, size_bytes)); } else if inspection .verified_files .index_uuids .contains(uuid.as_ref()) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::Index, + false, + size_bytes, + )); } } else { return Ok(None); @@ -514,15 +837,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -587,15 +908,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&parent_data_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - 
Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -613,15 +932,23 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + true, + size_bytes, + )) } else if inspection .verified_files .delete_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + false, + size_bytes, + )) } else { Ok(None) } @@ -640,7 +967,14 @@ impl<'a> CleanupTask<'a> { } else if !maybe_in_progress || inspection.verified_files.tx_paths.contains(&relative_path) { - Ok(Some(path)) + let unverified = + !inspection.verified_files.tx_paths.contains(&relative_path); + Ok(cleanup_file( + path, + CleanupFileKind::Transaction, + unverified, + size_bytes, + )) } else { Ok(None) } @@ -709,8 +1043,8 @@ impl<'a> CleanupTask<'a> { async fn clean_referenced_branches( &self, referenced_branches: &[(String, u64)], - ) -> Result { - let final_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let final_result = Mutex::new(CleanupRunResult::default()); // Group branches by their lineage identifier (BranchIdentifier). 
// Branches with the same identifier share a lineage and must be cleaned sequentially @@ -722,30 +1056,32 @@ impl<'a> CleanupTask<'a> { .or_insert_with(Vec::new) .push(branch.clone()); } + let action = self.action; + let candidate_file_limit = self.action.candidate_file_limit(); let tasks: Vec<_> = branches_chains .values() .map(|branch_chain| { - let final_stats = &final_stats; + let final_result = &final_result; async move { for branch in branch_chain { let branch_dataset = self .dataset .checkout_version((branch.as_str(), None)) .await?; - if let Some(stats) = cleanup_cascade_branch( + let ignored_manifests = + final_result.lock().unwrap().removed_manifests.clone(); + if let Some(result) = cleanup_cascade_branch_run( &branch_dataset, branch_dataset.manifest.as_ref(), + action, + ignored_manifests, ) .await? { - let mut stats_guard = final_stats.lock().unwrap(); - stats_guard.bytes_removed += stats.bytes_removed; - stats_guard.old_versions += stats.old_versions; - stats_guard.data_files_removed += stats.data_files_removed; - stats_guard.transaction_files_removed += - stats.transaction_files_removed; - stats_guard.index_files_removed += stats.index_files_removed; - stats_guard.deletion_files_removed += stats.deletion_files_removed; + final_result + .lock() + .unwrap() + .merge(result, candidate_file_limit); } } Ok::<(), Error>(()) @@ -753,7 +1089,7 @@ impl<'a> CleanupTask<'a> { }) .collect(); try_join_all(tasks).await?; - Ok(final_stats.into_inner().unwrap()) + Ok(final_result.into_inner().unwrap()) } // Retain manifests containing files referenced by descendant branches. 
@@ -762,6 +1098,7 @@ impl<'a> CleanupTask<'a> { &self, inspection: CleanupInspection, referenced_branches: &[(String, u64)], + removed_branch_manifests: &HashSet, ) -> Result { let inspection = Mutex::new(inspection); for (branch, root_version_number) in referenced_branches { @@ -772,6 +1109,9 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&branch_location.path, &self.dataset.object_store, false) + .try_filter(|location| { + future::ready(!removed_branch_manifests.contains(&location.path)) + }) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_branch_referenced_manifests( location, @@ -1020,8 +1360,7 @@ pub async fn cleanup_old_versions( dataset: &Dataset, policy: CleanupPolicy, ) -> Result { - let cleanup = CleanupTask::new(dataset, policy); - cleanup.run().await + CleanupOperation::new(dataset, policy).execute().await } /// If the dataset config has `lance.auto_cleanup` parameters set, @@ -1048,11 +1387,35 @@ pub async fn cleanup_cascade_branch( dataset: &Dataset, manifest: &Manifest, ) -> Result> { + Ok( + cleanup_cascade_branch_run(dataset, manifest, CleanupAction::Execute, HashSet::new()) + .await? 
+ .map(|result| result.stats), + ) +} + +async fn cleanup_cascade_branch_run( + dataset: &Dataset, + manifest: &Manifest, + action: CleanupAction, + ignored_manifests: HashSet, +) -> Result> { let policy = build_cleanup_policy(dataset, manifest).await?; if let Some(mut policy) = policy { policy.clean_referenced_branches = false; policy.error_if_tagged_old_versions = false; - Ok(Some(dataset.cleanup_with_policy(policy).await?)) + if action.deletes_files() { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&dataset.uri); + } + let cleanup = CleanupTask::new_with_ignored_manifests( + dataset, + policy, + action, + ignored_manifests, + true, + false, + ); + Ok(Some(cleanup.run().await?)) } else { Ok(None) } @@ -1443,6 +1806,14 @@ mod tests { cleanup_old_versions(&db, policy).await } + async fn explain_cleanup_with_policy( + &self, + policy: CleanupPolicy, + ) -> Result { + let db = self.open().await?; + db.cleanup(policy).explain().await + } + async fn run_cleanup_with_override( &self, before: DateTime, @@ -1670,6 +2041,51 @@ mod tests { assert_gt!(after_count.num_tx_files, 0); } + #[tokio::test] + async fn explain_cleanup_does_not_delete_files() { + let fixture = MockDatasetFixture::try_new().unwrap(); + fixture.create_some_data().await.unwrap(); + MockClock::set_system_time(TimeDelta::try_seconds(1).unwrap().to_std().unwrap()); + fixture.overwrite_some_data().await.unwrap(); + + let before_count = fixture.count_files().await.unwrap(); + let policy = CleanupPolicyBuilder::default() + .before_timestamp(utc_now()) + .build(); + + let explanation = fixture + .explain_cleanup_with_policy(policy.clone()) + .await + .unwrap(); + let after_preview_count = fixture.count_files().await.unwrap(); + + // Files are not actually removed when explaining cleanup. 
+ assert_eq!(before_count, after_preview_count); + assert_eq!(explanation.read_version, 2); + assert_eq!(explanation.stats.old_versions, 1); + assert_eq!(explanation.stats.data_files_removed, 1); + assert_eq!(explanation.stats.transaction_files_removed, 1); + assert_gt!(explanation.stats.bytes_removed, 0); + assert!(!explanation.candidate_files.is_empty()); + assert!(!explanation.candidate_files_truncated); + + // Running cleanup with the same policy should remove the same files the + // explanation reported for this unchanged dataset. + let removed = fixture.run_cleanup_with_policy(policy).await.unwrap(); + let after_cleanup_count = fixture.count_files().await.unwrap(); + + assert_eq!( + removed.bytes_removed, + before_count.num_bytes - after_cleanup_count.num_bytes + ); + assert_eq!(removed.old_versions, explanation.stats.old_versions); + assert_eq!( + removed.data_files_removed, + explanation.stats.data_files_removed + ); + assert_eq!(removed.bytes_removed, explanation.stats.bytes_removed); + } + #[tokio::test] async fn cleanup_blob_v2_sidecar_files() { let fixture = MockDatasetFixture::try_new().unwrap(); @@ -3073,6 +3489,17 @@ mod tests { self.run_cleanup_inner(policy).await } + async fn explain_cleanup_with_referenced_branches(&mut self) -> Result { + let policy = CleanupPolicyBuilder::default() + .error_if_tagged_old_versions(false) + .clean_referenced_branches(true) + .retain_n_versions(&self.dataset, 1) + .await? 
+ .build(); + self.dataset.checkout_latest().await?; + self.dataset.cleanup(policy).explain().await + } + async fn run_cleanup_inner(&mut self, policy: CleanupPolicy) -> Result { let pre_count = self.count_data().await?; self.dataset.checkout_latest().await?; @@ -3653,6 +4080,74 @@ mod tests { setup.assert_unchanged(&["branch4"]).await; } + #[tokio::test] + async fn explain_cleanup_with_referenced_branches_matches_cleanup() { + let mut setup = build_lineage_datasets().await.unwrap(); + + setup.enable_auto_cleanup().await.unwrap(); + setup.main.write_data().await.unwrap(); + setup.main.compact().await.unwrap(); + setup.branch4.compact().await.unwrap(); + setup.branch1.write_data().await.unwrap(); + setup.branch1.compact().await.unwrap(); + setup.branch2.write_data().await.unwrap(); + setup.branch2.compact().await.unwrap(); + setup.branch3.write_data().await.unwrap(); + setup.branch3.compact().await.unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + let main_counts_before = setup.main.counts; + let branch1_counts_before = setup.branch1.counts; + let branch2_counts_before = setup.branch2.counts; + let branch3_counts_before = setup.branch3.counts; + let branch4_counts_before = setup.branch4.counts; + + let explanation = setup + .main + .explain_cleanup_with_referenced_branches() + .await + .unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts, main_counts_before); + assert_eq!(setup.branch1.counts, branch1_counts_before); + assert_eq!(setup.branch2.counts, branch2_counts_before); + assert_eq!(setup.branch3.counts, branch3_counts_before); + assert_eq!(setup.branch4.counts, branch4_counts_before); + + let 
removed = setup + .main + .run_cleanup_with_referenced_branches() + .await + .unwrap(); + + assert!(!explanation.referenced_branches.is_empty()); + assert!( + explanation + .referenced_branches + .iter() + .any(|branch| branch.cleanup_candidate) + ); + assert_eq!(explanation.stats, removed); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts.num_manifest_files, 1); + assert_eq!(setup.branch1.counts.num_manifest_files, 1); + assert_eq!(setup.branch2.counts.num_manifest_files, 1); + assert_eq!(setup.branch3.counts.num_manifest_files, 1); + assert_eq!(setup.branch4.counts.num_manifest_files, 1); + } + #[tokio::test] async fn auto_clean_referenced_branches_with_tags() { let mut setup = build_lineage_datasets().await.unwrap(); diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 11851e8846e..eb165e5f612 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1792,7 +1792,7 @@ impl FileFragment { read_columns: Option>, batch_size: Option, ) -> Result<(Fragment, Schema)> { - let (fragments, schema) = schema_evolution::add_columns_to_fragments( + let (fragments, schema, _) = schema_evolution::add_columns_to_fragments( self.dataset.as_ref(), transforms, read_columns, diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index 4fbefcd4725..ceebe456bbf 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -243,4 +243,198 @@ mod tests { Err(Error::RetryableCommitConflict { .. }) )); } + + /// With more than one index on the table, remapping every index must catch + /// all of them up so the reuse index can be trimmed. 
+ /// + /// Regression: `remap_column_index` used to decide whether to remap an + /// index's data from the presence of the old fragments in its fragment + /// bitmap. But `load_indices` coverage-remaps the bitmap onto the new + /// fragments in memory, and remapping the *first* index commits a manifest + /// that persists that cleaned bitmap for the others — so remapping the + /// remaining indexes became a silent no-op (their data was never remapped + /// and their `dataset_version` never advanced), and the reuse index could + /// never be trimmed. + #[tokio::test] + async fn test_cleanup_frag_reuse_index_multiple_indices() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .col("j", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + for col in ["i", "j"] { + dataset + .create_index( + &[col], + IndexType::Scalar, + Some(format!("{col}_idx")), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + } + + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 1); + + for col in ["i", "j"] { + remapping::remap_column_index(&mut dataset, &[col], Some(format!("{col}_idx"))) + .await + .unwrap(); + } + + // Every index must now be caught up (data remapped, version advanced). 
+ let indices = dataset.load_indices().await.unwrap(); + for col in ["i", "j"] { + let index = indices + .iter() + .find(|idx| idx.name == format!("{col}_idx")) + .unwrap(); + assert!( + is_index_remap_caught_up(&frag_reuse_details.versions[0], index).unwrap(), + "index {col}_idx was not caught up after remap" + ); + } + + // ... so the reuse index trims down to zero versions. + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 0); + + // Data correctness, not just version bookkeeping: with the reuse index + // trimmed there is no auto-remap safety net, so each index must resolve + // to LIVE rows. An index whose data was not actually remapped (e.g. one + // whose bitmap was coverage-remapped by a sibling's commit before its + // own data remap) points at compacted-away fragments and errors on take. + use futures::TryStreamExt; + for col in ["i", "j"] { + let rows: usize = dataset + .scan() + .filter(&format!("{col} >= 2000 AND {col} < 3000")) + .unwrap() + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum(); + assert_eq!( + rows, 1000, + "index {col}_idx must resolve to live rows after remap+trim" + ); + } + } + + /// When the reuse index has accumulated several versions, a single remap + /// must compose them and rebuild + commit the index exactly ONCE, not once + /// per version. 
+ #[tokio::test] + async fn test_remap_index_batches_multiple_reuse_versions() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(8), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("i_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Accumulate multiple reuse versions: each round deletes a prefix, which + // shrinks fragments below target and forces another deferred compaction. + let options = CompactionOptions { + target_rows_per_fragment: 4_000, + defer_index_remap: true, + ..Default::default() + }; + for round in 0..4 { + dataset + .delete(&format!("i < {}", 1_000 * (round + 1))) + .await + .unwrap(); + compact_files(&mut dataset, options.clone(), None) + .await + .unwrap(); + } + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let num_versions = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(); + assert!( + num_versions >= 2, + "test needs multiple reuse versions to exercise batching, got {num_versions}" + ); + + // A single remap must commit exactly once, regardless of version count. + let version_before = dataset.manifest.version; + remapping::remap_column_index(&mut dataset, &["i"], Some("i_idx".into())) + .await + .unwrap(); + let commits = dataset.manifest.version - version_before; + assert_eq!( + commits, 1, + "batched remap must commit once, not once per reuse version ({num_versions})" + ); + + // ... and the reuse index then trims to zero. 
+ cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + assert_eq!( + load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(), + 0 + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/api.rs b/rust/lance/src/dataset/mem_wal/api.rs index b67f6434c9c..79184c13ec8 100644 --- a/rust/lance/src/dataset/mem_wal/api.rs +++ b/rust/lance/src/dataset/mem_wal/api.rs @@ -26,7 +26,7 @@ use crate::index::mem_wal::{load_mem_wal_index_details, new_mem_wal_index_meta}; use super::ShardWriterConfig; use super::scanner::flushed_cache::open_flushed_dataset; -use super::scanner::{FlushedMemTableCache, ShardSnapshot}; +use super::scanner::{DatasetCache, ShardSnapshot}; use super::write::MemIndexConfig; use super::write::ShardWriter; @@ -500,7 +500,7 @@ pub trait DatasetMemWalExt { async fn prewarm_mem_wal( &self, _snapshots: &[ShardSnapshot], - _cache: Option<&Arc>, + _cache: Option<&Arc>, ) -> Result<()> { Ok(()) } @@ -586,7 +586,7 @@ impl DatasetMemWalExt for Dataset { async fn prewarm_mem_wal( &self, snapshots: &[ShardSnapshot], - cache: Option<&Arc>, + cache: Option<&Arc>, ) -> Result<()> { let session = self.session(); // Resolve flushed paths exactly as the LSM collector does, so the @@ -601,7 +601,8 @@ impl DatasetMemWalExt for Dataset { snapshot.flushed_generations.iter().map(move |flushed| { let path = format!("{}/_mem_wal/{}/{}", base_path, shard_id, flushed.path); async move { - let dataset = open_flushed_dataset(&path, Some(session), cache).await?; + let dataset = + open_flushed_dataset(&path, Some(session), cache, None).await?; prewarm_all_indexes(&dataset).await } }) @@ -762,6 +763,7 @@ async fn load_vector_index_config( #[cfg(test)] mod tests { + use super::super::scanner::FlushedMemTableCache; use super::*; use arrow_array::{Int32Array, RecordBatch, 
RecordBatchIterator}; @@ -831,7 +833,7 @@ mod tests { .with_current_generation(2) .with_flushed_generation(1, folder.to_string()); - let cache = Arc::new(FlushedMemTableCache::new(4)); + let cache: Arc = Arc::new(FlushedMemTableCache::new(4)); base.prewarm_mem_wal(std::slice::from_ref(&snapshot), Some(&cache)) .await .expect("prewarm must open the generation and warm its index"); diff --git a/rust/lance/src/dataset/mem_wal/index.rs b/rust/lance/src/dataset/mem_wal/index.rs index 116ea6c60ce..208971f7be6 100644 --- a/rust/lance/src/dataset/mem_wal/index.rs +++ b/rust/lance/src/dataset/mem_wal/index.rs @@ -18,10 +18,14 @@ mod arena_skiplist; mod btree; mod fts; mod hnsw; +mod pk_key; use std::collections::HashMap; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use datafusion::common::ScalarValue; + use super::memtable::batch_store::StoredBatch; use arrow_array::RecordBatch; use lance_core::datatypes::Schema as LanceSchema; @@ -44,6 +48,32 @@ pub type RowPosition = u64; pub use btree::{BTreeIndexConfig, BTreeMemIndex}; pub use fts::{FtsIndexConfig, FtsMemIndex, FtsQueryExpr, SearchOptions}; pub use hnsw::{HnswIndexConfig, HnswMemIndex}; +pub use pk_key::encode_pk_tuple; + +use pk_key::encode_pk_batch; + +/// Synthetic column the composite PK index is keyed on: the order-preserving +/// encoded tuple (see [`encode_pk_tuple`]), stored as `Binary` so a +/// [`BTreeMemIndex`]'s byte backend indexes it directly. +const PK_KEY_COLUMN: &str = "__pk_key__"; + +/// The memtable's primary-key index, used to answer "newest visible version of +/// this key" for dedup. Single-column PKs reuse the column's compact typed +/// [`BTreeMemIndex`] (no second copy); composite PKs key a `BTreeMemIndex` on +/// the order-preserving encoded tuple ([`encode_pk_tuple`]) instead. Either way +/// the lookup is a single seek on one `BTreeMemIndex`. +enum PkIndex { + /// Arity 1: aliases a `btree_indexes` entry, so the insert loop maintains it. 
+ Single(Arc), + /// Arity >= 2: a `BTreeMemIndex` over the encoded-tuple `Binary` key, + /// maintained explicitly in the insert paths (the original batch lacks the + /// synthetic key column). `columns` are the PK columns in order, resolved + /// against each batch's schema at insert time. + Composite { + index: Arc, + columns: Vec, + }, +} // ============================================================================ // Index Store @@ -195,12 +225,17 @@ impl MemIndexConfig { /// therefore safe for scanners to read. Scanners snapshot this at plan /// construction time so every plan keys on a stable MVCC cursor. pub struct IndexStore { - /// BTree indexes keyed by index name. - btree_indexes: HashMap, + /// BTree indexes keyed by index name. `Arc` so the primary-key BTrees can be + /// shared into [`Self::pk_btrees`] without a second copy or a second insert. + btree_indexes: HashMap>, /// HNSW vector indexes keyed by index name. hnsw_indexes: HashMap, /// FTS indexes keyed by index name. fts_indexes: HashMap, + /// The primary-key index (single-column or composite), or `None` without a + /// primary key. Queried via [`Self::pk_newest_visible`] (see + /// [`Self::enable_pk_index`]). + pk_index: Option, /// Maximum batch position that is durable in the WAL and therefore /// visible to scanners. Advanced unconditionally after a WAL append /// succeeds; not gated on whether any indexes are configured. @@ -213,6 +248,7 @@ impl Default for IndexStore { btree_indexes: HashMap::new(), hnsw_indexes: HashMap::new(), fts_indexes: HashMap::new(), + pk_index: None, max_visible_batch_position: AtomicUsize::new(0), } } @@ -230,6 +266,16 @@ impl std::fmt::Debug for IndexStore { &self.hnsw_indexes.keys().collect::>(), ) .field("fts_indexes", &self.fts_indexes.keys().collect::>()) + .field( + "pk_index", + &match &self.pk_index { + None => "none".to_string(), + Some(PkIndex::Single(b)) => format!("single({})", b.column_name()), + Some(PkIndex::Composite { columns, .. 
}) => { + format!("composite({})", columns.join(", ")) + } + }, + ) .field( "max_visible_batch_position", &self.max_visible_batch_position.load(Ordering::Acquire), @@ -264,7 +310,7 @@ impl IndexStore { for config in configs { match config { MemIndexConfig::BTree(c) => { - let index = BTreeMemIndex::new(c.field_id, c.column.clone()); + let index = Arc::new(BTreeMemIndex::new(c.field_id, c.column.clone())); registry.btree_indexes.insert(c.name.clone(), index); } MemIndexConfig::Hnsw(c) => { @@ -293,7 +339,7 @@ impl IndexStore { /// the production memtable path goes through [`Self::from_configs`]. pub fn add_btree(&mut self, name: String, field_id: i32, column: String) { self.btree_indexes - .insert(name, BTreeMemIndex::new(field_id, column)); + .insert(name, Arc::new(BTreeMemIndex::new(field_id, column))); } /// Add an HNSW vector index with default build parameters. @@ -362,6 +408,158 @@ impl IndexStore { .insert(name, FtsMemIndex::with_params(field_id, column, params)); } + /// Maintain a primary-key index so the memtable can answer "newest visible + /// version of this key" (see [`Self::pk_newest_visible`]). + /// + /// Single-column PKs reuse an existing BTree on the field, else auto-create + /// one under a `__pk__*` name so the normal insert loop maintains it (no + /// second copy). Composite (arity >= 2) PKs key a `BTreeMemIndex` on the + /// order-preserving encoded tuple (synthetic `PK_KEY_COLUMN`), maintained + /// explicitly in the insert paths. Call once at construction, after + /// [`Self::from_configs`] and before any inserts; a no-op when `pk_columns` + /// is empty. 
+ pub fn enable_pk_index(&mut self, pk_columns: &[(String, i32)]) { + self.pk_index = match pk_columns { + [] => None, + [(column, field_id)] => { + let btree = match self + .btree_indexes + .values() + .find(|b| b.field_id() == *field_id) + { + Some(existing) => existing.clone(), + None => { + let btree = Arc::new(BTreeMemIndex::new(*field_id, column.clone())); + self.btree_indexes + .insert(format!("__pk__{column}"), btree.clone()); + btree + } + }; + Some(PkIndex::Single(btree)) + } + multi => Some(PkIndex::Composite { + // Synthetic field id (-1): the composite index is held directly, + // never resolved by field id. + index: Arc::new(BTreeMemIndex::new(-1, PK_KEY_COLUMN.to_string())), + columns: multi.iter().map(|(c, _)| c.clone()).collect(), + }), + }; + } + + /// Whether the memtable has a primary-key index. + pub fn has_pk_index(&self) -> bool { + self.pk_index.is_some() + } + + /// Sorted `(value, row_id)` training batches for the flushed on-disk PK + /// BTree (the sidecar dedup index). Single-column emits the typed PK value; + /// composite emits the order-preserving `Binary` encoded tuple. Empty when + /// there is no primary key. Row positions line up 1:1 with the forward- + /// written data file, so they are the flushed row ids directly. + pub fn pk_training_batches(&self, batch_size: usize) -> Result> { + match &self.pk_index { + None => Ok(Vec::new()), + Some(PkIndex::Single(btree)) => btree.to_training_batches(batch_size), + Some(PkIndex::Composite { index, .. }) => index.to_training_batches(batch_size), + } + } + + /// Resolve the PK columns' positions in `batch` (composite insert helper). 
+ fn pk_batch_indices(batch: &RecordBatch, columns: &[String]) -> Result> { + columns + .iter() + .map(|c| { + batch + .schema() + .column_with_name(c) + .map(|(i, _)| i) + .ok_or_else(|| { + Error::invalid_input(format!("PK column '{c}' not found in batch")) + }) + }) + .collect() + } + + /// Maintain the composite PK index for `batch` (no-op for single/no PK): + /// encode the PK columns into the synthetic `PK_KEY_COLUMN` `Binary` column + /// and feed that to the keyed `BTreeMemIndex`. + fn insert_composite_pk(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { + if let Some(PkIndex::Composite { index, columns }) = &self.pk_index { + let pk_indices = Self::pk_batch_indices(batch, columns)?; + let encoded = encode_pk_batch(batch, &pk_indices)?; + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + PK_KEY_COLUMN, + arrow_schema::DataType::Binary, + false, + )])); + let key_batch = RecordBatch::try_new(schema, vec![Arc::new(encoded)]) + .map_err(|e| Error::invalid_input(e.to_string()))?; + index.insert(&key_batch, row_offset)?; + } + Ok(()) + } + + /// The newest row position of the primary-key tuple `values` (in PK order) + /// visible at `max_visible_row`, or `None`. A single seek either way: + /// single-column probes the typed BTree; composite probes the encoded-tuple + /// index. Collision-free, since `position` is the row identity. + pub fn pk_newest_visible( + &self, + values: &[ScalarValue], + max_visible_row: RowPosition, + ) -> Option { + match &self.pk_index { + None => None, + Some(PkIndex::Single(btree)) => btree.get_newest_visible(&values[0], max_visible_row), + Some(PkIndex::Composite { index, .. }) => { + // An unsupported PK type would have failed at insert, so the + // index can't hold a tuple this fails to encode. The probe key is + // the same `Binary`-encoded tuple the insert path indexed. 
+ let key = encode_pk_tuple(values).ok()?; + index.get_newest_visible(&ScalarValue::Binary(Some(key)), max_visible_row) + } + } + } + + /// Whether `position` is the newest visible row of `values` — the recency + /// check the active index-search arms apply to drop predicate-crossing + /// stale hits. Callers gate on [`Self::has_pk_index`] first, since this is + /// `false` (drop) when the memtable has no primary-key index. + pub fn pk_is_newest( + &self, + values: &[ScalarValue], + position: RowPosition, + max_visible_row: RowPosition, + ) -> bool { + self.pk_newest_visible(values, max_visible_row) == Some(position) + } + + /// Whether `key` has any version visible at `max_visible_row` — the + /// cross-source block-list's existence query, snapshot-bounded so a + /// not-yet-visible write can't shadow an older visible copy. + /// + /// `key` is already in the index's key space: the typed PK value for a + /// single-column key, the `Binary`-encoded tuple for a composite one (built + /// by `block_list::on_disk_pk_key`, the same key the flushed on-disk index is + /// probed with). Both arities forward it straight to the keyed BTree. + pub fn pk_contains_key(&self, key: &ScalarValue, max_visible_row: RowPosition) -> bool { + match &self.pk_index { + None => false, + Some(PkIndex::Single(btree)) | Some(PkIndex::Composite { index: btree, .. }) => { + btree.get_newest_visible(key, max_visible_row).is_some() + } + } + } + + /// Whether the primary-key index holds no rows (or doesn't exist). + pub fn pk_is_empty(&self) -> bool { + match &self.pk_index { + None => true, + Some(PkIndex::Single(btree)) => btree.is_empty(), + Some(PkIndex::Composite { index, .. }) => index.is_empty(), + } + } + /// Insert a batch into all indexes. 
pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { self.insert_with_batch_position(batch, row_offset, None) @@ -384,6 +582,9 @@ impl IndexStore { for index in self.fts_indexes.values() { index.insert(batch, row_offset)?; } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + self.insert_composite_pk(batch, row_offset)?; // Update global watermark after all indexes have been updated if let Some(bp) = batch_position { @@ -440,6 +641,12 @@ impl IndexStore { } } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -552,6 +759,14 @@ impl IndexStore { .map(|(name, _idx_type, duration)| (name.to_string(), duration)) .collect(); + // Single-column PK aliases a `btree_indexes` entry — its thread above + // already maintained it (and joined). A composite PK has its own + // index; maintain it here before the watermark advances so the + // visible prefix is fully indexed. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -562,7 +777,7 @@ impl IndexStore { /// Get a BTree index by name. pub fn get_btree(&self, name: &str) -> Option<&BTreeMemIndex> { - self.btree_indexes.get(name) + self.btree_indexes.get(name).map(Arc::as_ref) } /// Get an HNSW vector index by name. 
@@ -583,6 +798,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.field_id() == field_id) + .map(Arc::as_ref) } /// Get an HNSW vector index by field ID. @@ -607,6 +823,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.column_name() == column) + .map(Arc::as_ref) } /// Get an HNSW vector index by column name. @@ -694,6 +911,73 @@ mod tests { .unwrap() } + /// Single-column `id` batch for primary-key lookup tests. + fn id_batch(ids: &[i32]) -> RecordBatch { + RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(ids.to_vec()))], + ) + .unwrap() + } + + #[test] + fn pk_newest_visible_single_column() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0)]); + // id=1 at positions 0 and 2 (an update), id=2 at position 1. + store.insert(&id_batch(&[1, 2]), 0).unwrap(); + store.insert(&id_batch(&[1]), 2).unwrap(); + + let one = [ScalarValue::Int32(Some(1))]; + // Watermark above the update sees the newest position; below it, the older. + assert_eq!(store.pk_newest_visible(&one, 5), Some(2)); + assert_eq!(store.pk_newest_visible(&one, 1), Some(0)); + assert!(store.pk_is_newest(&one, 2, 5)); + assert!(!store.pk_is_newest(&one, 0, 5)); + // Absent key (probed by the typed value, as the block-list does). + assert!(!store.pk_contains_key(&ScalarValue::Int32(Some(9)), 5)); + } + + #[test] + fn pk_newest_visible_composite_seeks_encoded_tuple() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0), ("name".to_string(), 1)]); + // Rows: (1,"a")@0, (1,"b")@1, (1,"a")@2 — an update of (1,"a"). 
+ let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 1, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a"])), + ], + ) + .unwrap(); + store.insert(&batch, 0).unwrap(); + + let tuple_1a = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + let tuple_1b = [ScalarValue::Int32(Some(1)), ScalarValue::from("b")]; + // (1,"a")'s newest visible row is its re-write at position 2. + assert_eq!(store.pk_newest_visible(&tuple_1a, 5), Some(2)); + assert!(store.pk_is_newest(&tuple_1a, 2, 5)); + assert!(!store.pk_is_newest(&tuple_1a, 0, 5)); + // (1,"b") only exists at position 1. + assert_eq!(store.pk_newest_visible(&tuple_1b, 5), Some(1)); + // Watermark below the re-write: the older (1,"a")@0 is the newest visible. + assert_eq!(store.pk_newest_visible(&tuple_1a, 1), Some(0)); + // An absent tuple (probed by its Binary-encoded key, as the block-list + // does). + let tuple_2a = [ScalarValue::Int32(Some(2)), ScalarValue::from("a")]; + let key_2a = ScalarValue::Binary(Some(encode_pk_tuple(&tuple_2a).unwrap())); + assert!(!store.pk_contains_key(&key_2a, 5)); + } + #[test] fn test_index_registry() { let schema = create_test_schema(); diff --git a/rust/lance/src/dataset/mem_wal/index/pk_key.rs b/rust/lance/src/dataset/mem_wal/index/pk_key.rs new file mode 100644 index 00000000000..b31fe42c995 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/index/pk_key.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Composite primary-key encoding for MemWAL dedup. +//! +//! A multi-column primary key is reduced to a single order-preserving byte +//! string ([`encode_pk_tuple`]) so the whole tuple is one comparable key: +//! lexicographic byte order equals tuple order, and distinct tuples never +//! collide. 
Encoded as a `Binary` value, the tuple is indexed directly by a +//! [`super::BTreeMemIndex`] (its byte backend) — both in memory and, after +//! flush, as the on-disk BTree's `Binary` value column — so a probe builds +//! `ScalarValue::Binary(key)` and every layer agrees. +//! +//! Single-column primary keys do **not** use this — they key the typed +//! `BTreeMemIndex` on the column value directly. + +use arrow_array::{BinaryArray, RecordBatch}; +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +/// Sign-flip a signed integer to an order-preserving unsigned key (matches the +/// fixed-int BTree backend). Big-endian bytes of the result sort like the value. +#[inline] +fn encode_signed(v: i64) -> u64 { + (v as u64) ^ (1u64 << 63) +} + +/// Append an order-preserving encoding of one non-null byte string: each `0x00` +/// is escaped to `0x00 0xFF`, then a `0x00 0x00` terminator is appended. The +/// terminator sorts before any escaped content, so a prefix orders before its +/// extensions and no value can forge a column boundary. +fn encode_bytes(out: &mut Vec, bytes: &[u8]) { + for &b in bytes { + out.push(b); + if b == 0x00 { + out.push(0xFF); + } + } + out.extend_from_slice(&[0x00, 0x00]); +} + +/// Append the order-preserving encoding of a single PK column value. A leading +/// tag (`0x00` null / `0x01` non-null) makes nulls sort first and keeps the +/// per-column encoding self-delimiting (fixed-width for ints, terminated for +/// bytes), so concatenating columns stays injective and order-preserving. +fn encode_value(out: &mut Vec, value: &ScalarValue) -> Result<()> { + if value.is_null() { + out.push(0x00); + return Ok(()); + } + out.push(0x01); + macro_rules! 
be_signed { + ($v:expr) => { + out.extend_from_slice(&encode_signed($v as i64).to_be_bytes()) + }; + } + match value { + ScalarValue::Int8(Some(v)) => be_signed!(*v), + ScalarValue::Int16(Some(v)) => be_signed!(*v), + ScalarValue::Int32(Some(v)) => be_signed!(*v), + ScalarValue::Int64(Some(v)) => be_signed!(*v), + ScalarValue::Date32(Some(v)) => be_signed!(*v), + ScalarValue::Date64(Some(v)) => be_signed!(*v), + ScalarValue::UInt8(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt16(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt32(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt64(Some(v)) => out.extend_from_slice(&v.to_be_bytes()), + ScalarValue::Boolean(Some(b)) => out.push(*b as u8), + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + encode_bytes(out, s.as_bytes()) + } + ScalarValue::Binary(Some(b)) + | ScalarValue::LargeBinary(Some(b)) + | ScalarValue::FixedSizeBinary(_, Some(b)) => encode_bytes(out, b), + other => { + return Err(Error::invalid_input(format!( + "Unsupported primary-key column type for composite key: {other:?}" + ))); + } + } + Ok(()) +} + +/// Encode a PK tuple (values in PK column order) to one order-preserving key. +pub fn encode_pk_tuple(values: &[ScalarValue]) -> Result> { + let mut out = Vec::with_capacity(values.len() * 9); + for value in values { + encode_value(&mut out, value)?; + } + Ok(out) +} + +/// Encode row `row` of `batch`'s PK columns (at `pk_indices`) to one key. 
+fn encode_pk_row(batch: &RecordBatch, pk_indices: &[usize], row: usize) -> Result> { + let mut out = Vec::with_capacity(pk_indices.len() * 9); + for &col in pk_indices { + let value = ScalarValue::try_from_array(batch.column(col), row)?; + encode_value(&mut out, &value)?; + } + Ok(out) +} + +/// Encode every row of `batch`'s PK columns (at `pk_indices`) into a `Binary` +/// column of order-preserving composite keys — the form a [`super::BTreeMemIndex`] +/// indexes directly (its byte backend), so the composite PK reuses the same +/// index as a single-column one. +pub fn encode_pk_batch(batch: &RecordBatch, pk_indices: &[usize]) -> Result { + let mut keys: Vec> = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keys.push(encode_pk_row(batch, pk_indices, row)?); + } + Ok(BinaryArray::from_iter_values(keys.iter())) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn tuple(a: i32, b: &str) -> Vec { + vec![ScalarValue::Int32(Some(a)), ScalarValue::from(b)] + } + + #[test] + fn encoding_is_order_preserving_and_injective() { + // Sorting tuples by their encoding must match tuple order, and distinct + // tuples must produce distinct bytes. + let tuples = [ + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + tuple(-1, "z"), + ]; + let mut encoded: Vec<(Vec, &Vec)> = tuples + .iter() + .map(|t| (encode_pk_tuple(t).unwrap(), t)) + .collect(); + encoded.sort_by(|x, y| x.0.cmp(&y.0)); + let order: Vec<_> = encoded.iter().map(|(_, t)| (*t).clone()).collect(); + // -1 < 1 < 2; within id=1, "a" < "ab" < "b". + assert_eq!( + order, + vec![ + tuple(-1, "z"), + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + ] + ); + // Injective: 5 distinct tuples → 5 distinct keys. 
+ let mut keys: Vec> = tuples.iter().map(|t| encode_pk_tuple(t).unwrap()).collect(); + keys.sort(); + keys.dedup(); + assert_eq!(keys.len(), 5); + } + + #[test] + fn null_sorts_first_and_is_distinct() { + let null_a = vec![ScalarValue::Int32(None), ScalarValue::from("a")]; + let one_a = tuple(1, "a"); + assert!(encode_pk_tuple(&null_a).unwrap() < encode_pk_tuple(&one_a).unwrap()); + assert_ne!( + encode_pk_tuple(&null_a).unwrap(), + encode_pk_tuple(&one_a).unwrap() + ); + } + + #[test] + fn prefix_safety_with_embedded_zero() { + // A string containing 0x00 must not collide with or sort incorrectly + // against a shorter one (escaping + terminator). + let with_zero = vec![ScalarValue::Binary(Some(vec![0x00]))]; + let empty = vec![ScalarValue::Binary(Some(vec![]))]; + assert!(encode_pk_tuple(&empty).unwrap() < encode_pk_tuple(&with_zero).unwrap()); + } + + #[test] + fn encode_pk_batch_matches_per_tuple_encoding() { + // Each row of the encoded `Binary` column equals `encode_pk_tuple` of + // that row's PK values — so the column a BTreeMemIndex indexes is exactly + // what a probe builds. + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![2, 1])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let encoded = encode_pk_batch(&batch, &[0, 1]).unwrap(); + assert_eq!(encoded.value(0), encode_pk_tuple(&tuple(2, "a")).unwrap()); + assert_eq!(encoded.value(1), encode_pk_tuple(&tuple(1, "b")).unwrap()); + // (1,"b") encodes below (2,"a"). 
+ assert!(encoded.value(1) < encoded.value(0)); + } +} diff --git a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs index f4d4d797acc..054d9b1630e 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs @@ -615,6 +615,22 @@ impl BatchStore { (0..end).collect() } + /// The inclusive maximum visible *row* position at `max_visible_batch_position`, + /// or `None` when no rows are visible. The visible batches are the committed + /// prefix `[0, last_visible_idx]`; each batch carries its cumulative + /// `row_offset`, so this is the end of the last visible batch minus one. + /// Used to bound MVCC seeks against the maintained PK-position index. + pub fn max_visible_row(&self, max_visible_batch_position: usize) -> Option { + let len = self.committed_len.load(Ordering::Acquire); + if len == 0 { + return None; + } + let last_visible_idx = max_visible_batch_position.min(len - 1); + let last = self.get(last_visible_idx)?; + let visible_end = last.row_offset + last.num_rows as u64; // exclusive + visible_end.checked_sub(1) + } + /// Check if a specific batch is visible at a given visibility position. #[inline] pub fn is_batch_visible( @@ -910,6 +926,37 @@ mod tests { assert!(!store.is_batch_visible(3, 10)); } + #[test] + fn test_max_visible_row() { + // (1) Empty store: no rows are visible at any position. + let store = BatchStore::with_capacity(10); + assert_eq!(store.max_visible_row(0), None); + assert_eq!(store.max_visible_row(100), None); + + // Three batches → rows [0,10) [10,30) [30,60); row_offsets 0, 10, 30. + store.append(create_test_batch(10)).unwrap(); // position 0 + store.append(create_test_batch(20)).unwrap(); // position 1 + store.append(create_test_batch(30)).unwrap(); // position 2 + + // (2) A position within range yields the inclusive end of that prefix. 
+ assert_eq!(store.max_visible_row(0), Some(9)); // batch 0: 0..10 + assert_eq!(store.max_visible_row(1), Some(29)); // batch 1: 10..30 + assert_eq!(store.max_visible_row(2), Some(59)); // batch 2: 30..60 + + // (3) A position beyond the committed range clamps to the last batch, + // i.e. the inclusive max over all rows. + assert_eq!(store.max_visible_row(100), Some(59)); + + // (4) An empty leading batch contributes no rows: at its own position + // the inclusive end underflows to None, while a later non-empty batch + // is reported correctly. + let store = BatchStore::with_capacity(10); + store.append(create_test_batch(0)).unwrap(); // position 0: rows [0,0) + store.append(create_test_batch(5)).unwrap(); // position 1: rows [0,5) + assert_eq!(store.max_visible_row(0), None); // empty prefix → no rows + assert_eq!(store.max_visible_row(1), Some(4)); // through batch 1 + } + #[test] fn test_recommended_capacity() { // 64MB memtable, 64KB avg batch = 1024 batches * 1.2 = ~1228 diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index c4794d4c8f3..ebcc06cab44 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -18,7 +18,7 @@ use lance_io::object_store::ObjectStore; use lance_table::format::IndexMetadata; use lance_table::io::commit::write_manifest_file_to_path; use lance_table::io::deletion::write_deletion_file; -use log::info; +use log::{info, warn}; use object_store::ObjectStoreExt; use object_store::path::Path; use roaring::RoaringBitmap; @@ -29,6 +29,7 @@ use super::super::index::MemIndexConfig; use super::super::memtable::MemTable; use crate::Dataset; use crate::dataset::mem_wal::manifest::ShardManifestStore; +use crate::dataset::mem_wal::scanner::GenerationWarmer; use crate::dataset::mem_wal::scanner::exec::{compute_pk_hash, validate_pk_types}; use crate::dataset::mem_wal::util::{flushed_memtable_path, generate_random_hash}; @@ -68,6 
+69,9 @@ pub struct MemTableFlusher { base_uri: String, shard_id: Uuid, manifest_store: Arc, + /// When present, each new generation is warmed before it is committed, so + /// the first query sees zero cold reads. `None` => no warming. + warmer: Option>, } impl MemTableFlusher { @@ -84,6 +88,26 @@ impl MemTableFlusher { base_uri: base_uri.into(), shard_id, manifest_store, + warmer: None, + } + } + + /// Attach the warmer fired pre-commit for each new generation. + pub fn with_warmer(mut self, warmer: Option>) -> Self { + self.warmer = warmer; + self + } + + /// Warm a just-written generation before it is committed. Best-effort: a + /// failure is logged and the flush proceeds — warming is never a commit + /// gate. No-op without a warmer. `uri` must be the resolved reader path + /// (`path_to_uri(gen_path)`) so warmed entries key-match later queries. + async fn warm_generation(&self, uri: &str) { + let Some(warmer) = &self.warmer else { + return; + }; + if let Err(e) = warmer.warm(uri).await { + warn!("pre-commit warm failed for generation {uri}; committing cold: {e}"); } } @@ -178,6 +202,16 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Write the standalone primary-key dedup sidecar. A primary key needs + // no secondary index, so this is required on the plain-flush path too — + // the LSM scanner opens it to dedup the generation. (`flush_with_indexes` + // writes it on the indexed path.) No-op when the memtable has no PK. + self.create_pk_index(&gen_path, memtable.indexes()).await?; + + // Warm before commit (zero cold window); no-op without a warmer. + let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -449,6 +483,10 @@ impl MemTableFlusher { all_indexes.extend(fts_indexes); } + // Write the standalone primary-key dedup index (sidecar, not a manifest + // index — the block-list opens it directly by path). 
+ self.create_pk_index(&gen_path, memtable.indexes()).await?; + // Write a single manifest that records the fragments, the // within-generation deletion vector, and all indexes, overwriting the // data-only v1 manifest created by Dataset::write. @@ -459,6 +497,10 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Warm before commit (zero cold window); no-op without a warmer. + let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -543,6 +585,49 @@ impl MemTableFlusher { Ok(created_indexes) } + /// Write the standalone primary-key dedup index for this generation. + /// + /// Unlike user indexes, this is a **sidecar**: it is not registered in the + /// manifest. The block-list opens it directly by path + /// ([`pk_index_path`]) and probes it with `Equals`. Single-column primary + /// keys index the typed value; composite keys index the order-preserving + /// `Binary` encoded tuple (see [`super::super::index::encode_pk_tuple`]). + /// Row positions line up 1:1 with the forward-written data file, so they are + /// the flushed row ids directly. No-op without a primary-key index. 
+ async fn create_pk_index( + &self, + gen_path: &Path, + mem_indexes: Option<&super::super::index::IndexStore>, + ) -> Result<()> { + use datafusion::physical_plan::SendableRecordBatchStream; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_index::scalar::btree::train_btree_index; + use lance_index::scalar::lance_format::LanceIndexStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let Some(registry) = mem_indexes else { + return Ok(()); + }; + let batches = registry.pk_training_batches(8192)?; + if batches.is_empty() { + return Ok(()); + } + + let schema = batches[0].schema(); + let store = LanceIndexStore::new( + self.object_store.clone(), + pk_index_path(gen_path), + Arc::new(LanceCache::no_cache()), + ); + let stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(batches.into_iter().map(Ok)), + )); + train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) + } + /// Create FTS (Full-Text Search) indexes from in-memory data (uncommitted). /// /// Writes the FTS index files and returns index metadata without committing. @@ -965,21 +1050,30 @@ impl MemTableFlusher { } } -/// Message to trigger flush of a frozen memtable to Lance storage. -pub struct TriggerMemTableFlush { - /// The frozen memtable to flush. - pub memtable: Arc, - /// Optional channel to notify when flush completes. - pub done: Option>>, +/// Message driving the background memtable-flush task. +pub enum TriggerMemTableFlush { + /// Flush a frozen memtable to Lance storage. + Flush { + /// The frozen memtable to flush. + memtable: Arc, + /// Optional channel to notify when flush completes. + done: Option>>, + }, + /// Periodic tick: evict frozen memtables whose post-flush grace has elapsed. 
+ SweepExpired, } impl std::fmt::Debug for TriggerMemTableFlush { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TriggerMemTableFlush") - .field("memtable_gen", &self.memtable.generation()) - .field("memtable_rows", &self.memtable.row_count()) - .field("has_done", &self.done.is_some()) - .finish() + match self { + Self::Flush { memtable, done } => f + .debug_struct("TriggerMemTableFlush::Flush") + .field("memtable_gen", &memtable.generation()) + .field("memtable_rows", &memtable.row_count()) + .field("has_done", &done.is_some()) + .finish(), + Self::SweepExpired => f.write_str("TriggerMemTableFlush::SweepExpired"), + } } } @@ -1139,6 +1233,79 @@ mod tests { assert_eq!(updated_manifest.flushed_generations.len(), 1); } + /// A `GenerationWarmer` that counts calls and optionally fails. + #[derive(Debug)] + struct CountingWarmer { + calls: Arc, + fail: bool, + } + + #[async_trait::async_trait] + impl GenerationWarmer for CountingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + if self.fail { + Err(Error::io("simulated warm failure".to_string())) + } else { + Ok(()) + } + } + } + + /// Warming is a best-effort optimization, never a commit gate: a warmer that + /// errors pre-commit must still let the flush commit the generation. The + /// warm fires exactly once on the pre-commit path. 
+ #[tokio::test] + async fn test_flusher_commits_when_warm_fails() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + let schema = create_test_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![]).unwrap(); + let frag_id = memtable + .insert(create_test_batch(&schema, 10)) + .await + .unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let calls = Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let warmer: Arc = Arc::new(CountingWarmer { + calls: calls.clone(), + fail: true, + }); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path, + base_uri, + shard_id, + manifest_store.clone(), + ) + .with_warmer(Some(warmer)); + // Flush must succeed despite the warmer erroring. + let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + assert_eq!(result.generation.generation, 1); + assert_eq!( + calls.load(std::sync::atomic::Ordering::SeqCst), + 1, + "pre-commit warm fires exactly once" + ); + let updated = manifest_store.read_latest().await.unwrap().unwrap(); + assert_eq!( + updated.flushed_generations.len(), + 1, + "generation still committed after a failed warm" + ); + } + /// Flushing a generation with within-generation duplicate PKs writes a /// deletion vector so the flushed dataset exposes newest-per-PK on scan. #[tokio::test] @@ -1227,6 +1394,202 @@ mod tests { assert_eq!(rows.get(&3), Some(&"c2".to_string())); } + /// Flushing a memtable with a primary-key index writes a standalone sidecar + /// BTree at `{gen}/_pk_index` that the block-list can reopen by path and + /// probe by value — including for a within-gen-superseded PK (existence, + /// not visibility). 
+ #[tokio::test] + async fn flushed_pk_index_sidecar_is_probeable() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + // id=1 updated in-gen (a -> a2); id=2 unique. + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a2"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + let result = flusher + .flush_with_indexes(&memtable, epoch, &[], 1) + .await + .unwrap(); + + // Reopen the sidecar directly by path (the block-list's route). 
+ let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + // Both PKs present (id=1 even though its first version was superseded); + // an absent PK is not. + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + + /// Regression: production dispatches a PK-only flush (a primary key, no + /// secondary index) to `flush`, not `flush_with_indexes`. `flush` must still + /// write the PK dedup sidecar, otherwise cross-generation dedup fails with + /// `page_lookup.lance not found`. 
+ #[tokio::test] + async fn plain_flush_writes_pk_sidecar() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + // The plain-flush path — what the writer dispatches to with no indexes. 
+ let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + /// Covers `finalize_generation` writing both a deletion vector *and* /// indexes into the same manifest — the deletion-only and index-only /// paths are exercised by sibling tests. diff --git a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs index 2c5192e28a1..17fa9c76a65 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs @@ -366,6 +366,14 @@ impl MemTableScanner { self } + /// The `max_visible_batch_position` snapshot this scanner latched at + /// construction. A downstream recency filter must key on this same snapshot + /// (not a fresh read of the IndexStore watermark, which a concurrent append + /// could have advanced) so it stays consistent with the rows the search saw. 
+ pub fn max_visible_batch_position(&self) -> usize { + self.max_visible_batch_position + } + /// Include the _rowaddr column in output. /// /// Same value as _rowid but named for compatibility with LSM scanner. diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs index b1766f8525f..fe14bd82dd8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner.rs @@ -43,12 +43,15 @@ mod point_lookup; mod projection; mod vector_search; +pub use block_list::write_pk_sidecar; pub use builder::LsmScanner; pub use collector::{ ActiveMemTableRef, InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector, }; -pub use data_source::{FlushedGeneration, LsmDataSource, LsmGeneration, ShardSnapshot}; -pub use flushed_cache::FlushedMemTableCache; +pub use data_source::{ + FlushedGeneration, FreshTierWatermark, LsmDataSource, LsmGeneration, ShardSnapshot, +}; +pub use flushed_cache::{DatasetCache, FlushedMemTableCache, GenerationWarmer}; pub use fts_search::{LsmFtsSearchPlanner, SCORE_COLUMN}; pub use point_lookup::LsmPointLookupPlanner; pub use projection::DISTANCE_COLUMN; diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index 684fde48da1..69d16930888 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -3,37 +3,151 @@ //! Per-source block-list construction for LSM vector search. //! -//! A generation's membership is an `Arc>` of PK hashes -//! ([`compute_pk_hash`]), built once (immutable gens cached). Each source gets a -//! `Vec>>` of the newer generations' sets (`NEWER(G)`; base: all -//! of them) — referenced, never merged. The KNN drops candidates whose PK is in -//! any (see [`super::exec::PkHashFilterExec`]). +//! A generation's membership is a [`GenMembership`]: in-memory generations +//! 
(active / frozen) are probed by value against their maintained primary-key +//! index (no per-query set), while flushed generations are probed against their +//! standalone on-disk PK BTree (the sidecar written at flush, opened by path). +//! Probing is batched — [`GenMembership::contains_keys`] tests a whole batch of +//! keys per generation in one pass. Each source gets a `Vec` of +//! the newer generations (`NEWER(G)`; base: all of them); the KNN drops a +//! candidate whose PK any of them contains (see +//! [`super::exec::PkBlockFilterExec`]). //! -//! Cross-generation only: within-gen dups share a hash and fall to the global -//! dedup's `(generation, freshness)` tiebreaker. +//! Cross-generation only: within-gen dups collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. + +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::btree::BTreeIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::{ + IndexStore as ScalarIndexStore, SargableQuery, ScalarIndex, SearchResult, +}; +use uuid::Uuid; -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; +use super::data_source::{FreshTierWatermark, LsmDataSource, LsmGeneration}; +use super::flushed_cache::{DatasetCache, open_flushed_dataset}; +use crate::dataset::mem_wal::index::encode_pk_tuple; +use crate::dataset::mem_wal::util::PK_INDEX_DIR; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; +use crate::session::Session; -use arrow_array::RecordBatch; -use futures::TryStreamExt; -use lance_core::Result; +/// Default-plugin registry, used only to load the standalone PK BTree by its +/// `BTreeIndexDetails` type. Built once. 
+static PK_BTREE_REGISTRY: LazyLock> = + LazyLock::new(IndexPluginRegistry::with_default_plugins); + +/// One newer generation's PK membership, used to decide whether it shadows an +/// older source's row. +#[derive(Clone, Debug)] +pub enum GenMembership { + /// Probe the in-memory memtable's primary-key index, bounded to its visible + /// prefix (so a not-yet-visible write can't shadow an older visible copy). + InMemory { + index_store: Arc, + /// Inclusive visible row watermark; `None` when no rows are visible. + max_visible_row: Option, + }, + /// Probe the flushed generation's standalone on-disk PK BTree. + OnDisk(Arc), +} -use uuid::Uuid; +impl GenMembership { + /// Whether this generation visibly contains the primary `key` — the typed + /// value for a single-column PK, the encoded `Binary` tuple for a composite + /// one (built by [`on_disk_pk_key`]). The same key probes the in-memory + /// BTree and the flushed on-disk BTree, which now share a key space. + pub async fn contains(&self, key: &ScalarValue) -> Result { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))), + Self::OnDisk(index) => { + let result = index + .search(&SargableQuery::Equals(key.clone()), &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string()))?; + Ok(!search_is_empty(&result)) + } + } + } -use super::data_source::{LsmDataSource, LsmGeneration}; -use super::exec::{compute_pk_hash, resolve_pk_indices}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; -use crate::dataset::Dataset; -use crate::dataset::mem_wal::write::BatchStore; -use crate::session::Session; + /// Batched [`Self::contains`]: for each key in `keys`, whether this + /// generation visibly contains it, returned as a mask aligned to `keys`. + /// + /// One probe replaces N. 
The on-disk arm issues a single + /// [`BTreeIndex::contains_keys`] (no per-key `SearchResult` allocation); the + /// in-memory arm maps the sync, allocation-free PK lookup over the slice. + /// Keys are in the index's key space (see [`on_disk_pk_key`]). + pub async fn contains_keys(&self, keys: &[ScalarValue]) -> Result> { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(keys + .iter() + .map(|key| max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))) + .collect()), + Self::OnDisk(index) => { + // The flushed PK sidecar is always a BTree (built via + // `PK_BTREE_REGISTRY`); downcast to reach the batched probe. + let btree = index.as_any().downcast_ref::().ok_or_else(|| { + Error::io("flushed PK dedup index is not a BTree".to_string()) + })?; + btree + .contains_keys(keys, &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string())) + } + } + } + + /// Whether this generation has no (visible) membership — used to skip adding + /// an empty blocked set. A flushed generation always has rows (flush rejects + /// an empty memtable), so it is never empty. + fn is_empty(&self) -> bool { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => max_visible_row.is_none() || index_store.pk_is_empty(), + Self::OnDisk(_) => false, + } + } +} -/// Per-source blocked PK-hash sets, keyed by `(shard_id, generation)`. Each -/// value is the membership sets of the generations newer than that source. -pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>>>; +/// Whether a scalar search returned no rows (existence test for the block-list). +fn search_is_empty(result: &SearchResult) -> bool { + match result { + SearchResult::Exact(set) | SearchResult::AtMost(set) | SearchResult::AtLeast(set) => { + set.is_empty() + } + } +} -/// A shard's generations paired with their PK-hash membership, before sorting. 
-type ShardGenSets = HashMap>)>>; +/// The probe key for the on-disk PK BTree: a single-column PK indexes its typed +/// value directly; a composite PK indexes the order-preserving encoded tuple as +/// `Binary` (matching what flush wrote — see [`encode_pk_tuple`]). +pub fn on_disk_pk_key(values: &[ScalarValue]) -> Result { + match values { + [single] => Ok(single.clone()), + _ => Ok(ScalarValue::Binary(Some(encode_pk_tuple(values)?))), + } +} + +/// Per-source blocked memberships, keyed by `(shard_id, generation)`. Each value +/// is the memberships of the generations newer than that source. +pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>; + +/// A shard's generations paired with their membership, before sorting. +type ShardGenSets = HashMap>; /// Per-source `NEWER(G)`, keyed by `(shard_id, generation)`. Generations are /// per-shard, so a source is superseded only by strictly-newer generations of @@ -42,59 +156,64 @@ type ShardGenSets = HashMap>)>>; /// Only superseded sources get an entry; the newest of each shard never does. pub async fn compute_source_block_lists( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, + flushed_cache: Option<&Arc>, ) -> Result { - // Hash each non-base source's membership, grouped by shard (generations are + // Membership per non-base source, grouped by shard (generations are // per-shard, so supersession is within-shard only). let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; + // Flushed PK-BTree opens are cold S3 reads; overlap them with + // `try_join_all`. Order is irrelevant — gens are sorted per-shard below. + let mut flushed_loads = Vec::new(); for source in sources { match source { LsmDataSource::BaseTable { .. } => has_base = true, LsmDataSource::ActiveMemTable { batch_store, + index_store, shard_id, generation, .. 
} => { - let hashes = Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?); + let membership = in_memory_membership(batch_store, index_store); by_shard .entry(*shard_id) .or_default() - .push((*generation, hashes)); + .push((*generation, membership)); } LsmDataSource::FlushedMemTable { path, shard_id, generation, .. - } => { - // Cached by immutable path so repeated searches skip the PK scan. - let hashes = flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?; - by_shard - .entry(*shard_id) - .or_default() - .push((*generation, hashes)); - } + } => flushed_loads.push(async move { + let index = open_pk_index(path, session, flushed_cache).await?; + Ok::<_, Error>((*shard_id, *generation, GenMembership::OnDisk(index))) + }), } } + for (shard_id, generation, membership) in futures::future::try_join_all(flushed_loads).await? { + by_shard + .entry(shard_id) + .or_default() + .push((generation, membership)); + } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. - let mut base_blocked: Vec>> = Vec::new(); + let mut base_blocked: Vec = Vec::new(); for (shard, mut gens) in by_shard { // Newest-first: a gen's blocked list is its own shard's newer gens. 
gens.sort_by_key(|(generation, _)| std::cmp::Reverse(*generation)); - let mut newer: Vec>> = Vec::new(); - for (generation, hashes) in gens { + let mut newer: Vec = Vec::new(); + for (generation, membership) in gens { if !newer.is_empty() { blocked.insert((Some(shard), generation), newer.clone()); } - if !hashes.is_empty() { - base_blocked.push(hashes.clone()); - newer.push(hashes); + if !membership.is_empty() { + base_blocked.push(membership.clone()); + newer.push(membership); } } } @@ -104,260 +223,355 @@ pub async fn compute_source_block_lists( Ok(blocked) } -/// The fresh-tier block-list: one membership set per generation that shadows the -/// base table — active + frozen memtables (hashed now) and flushed generations -/// (from the cache). Same `Vec>>` shape the vector-search filter -/// consumes; a base/external reader can drop any row whose PK is in one of them. -/// The base source, if present, is skipped (it is what gets shadowed). +/// The fresh-tier block-list: one [`GenMembership`] per generation that shadows +/// the base table — active + frozen memtables (probed against their index) and +/// flushed generations (probed against their on-disk PK BTree). A base/external +/// reader can test any PK against these (via [`GenMembership::contains`]) to +/// decide whether the fresh tier shadows it. The base source, if present, is +/// skipped (it is what gets shadowed). +/// +/// When `watermarks` carries a watermark for a source's shard, membership is +/// bounded to it (see [`FreshTierWatermark`]): higher generations are excluded, +/// the active generation is bounded to its first `active_batch_count` batches, +/// and lower generations (frozen and flushed) are immutable and included whole. +/// A shard absent from `watermarks` (or `watermarks == None`) uses the live tier. 
pub async fn fresh_tier_block_list( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, -) -> Result>>> { - let mut sets = Vec::new(); + flushed_cache: Option<&Arc>, + watermarks: Option<&HashMap>, +) -> Result> { + // Membership per source, in source order (`None` = skipped). Flushed + // PK-BTree opens are cold S3 reads, so collect them tagged with their slot + // and overlap with `try_join_all` rather than opening one at a time. + let mut slots: Vec> = Vec::with_capacity(sources.len()); + let mut flushed_loads = Vec::new(); for source in sources { - let set = match source { - LsmDataSource::BaseTable { .. } => continue, - LsmDataSource::ActiveMemTable { batch_store, .. } => { - Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?) + match source { + LsmDataSource::BaseTable { .. } => slots.push(None), + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + shard_id, + generation, + .. + } => { + let membership = match watermarks.and_then(|m| m.get(shard_id)) { + None => Some(in_memory_membership(batch_store, index_store)), + Some(watermark) => { + let g = generation.as_u64(); + if g > watermark.active_generation { + // Rolled in after the snapshot; the arm never saw it. + None + } else if g == watermark.active_generation { + // Bound the active generation to the batches the arm saw. + Some(bounded_in_memory_membership( + batch_store, + index_store, + watermark.active_batch_count, + )) + } else { + // Lower (frozen) generations are immutable — include all. + Some(in_memory_membership(batch_store, index_store)) + } + } + }; + slots.push(membership); } - LsmDataSource::FlushedMemTable { path, .. } => { - flushed_pk_hashes(path, pk_columns, session, flushed_cache).await? + LsmDataSource::FlushedMemTable { + path, + shard_id, + generation, + .. + } => { + // A generation at or above the active one was flushed after the + // snapshot; exclude it. Lower generations are immutable. 
The + // `==` case is the active generation flushed between the two + // reads: excluding the flushed copy loses nothing, since its + // rows are already captured by the in-memory arm above (bounded + // to `active_batch_count`). + let flushed_after_snapshot = watermarks + .and_then(|m| m.get(shard_id)) + .is_some_and(|watermark| generation.as_u64() >= watermark.active_generation); + if flushed_after_snapshot { + slots.push(None); + } else { + let slot = slots.len(); + slots.push(None); + flushed_loads.push(async move { + let index = open_pk_index(path, session, flushed_cache).await?; + Ok::<_, Error>((slot, GenMembership::OnDisk(index))) + }); + } } - }; - if !set.is_empty() { - sets.push(set); } } - Ok(sets) + for (slot, membership) in futures::future::try_join_all(flushed_loads).await? { + slots[slot] = Some(membership); + } + Ok(slots + .into_iter() + .flatten() + .filter(|membership| !membership.is_empty()) + .collect()) } -/// Hash the PK membership of an in-memory memtable (active or frozen) from its -/// committed `BatchStore` rows. -pub fn pk_hashes_from_batch_store( - store: &BatchStore, - pk_columns: &[String], -) -> Result> { - let mut batches: Vec = Vec::with_capacity(store.len()); - for i in 0..store.len() { - if let Some(stored) = store.get(i) { - batches.push(stored.data.clone()); - } +/// Cross-source membership of an in-memory (active / frozen) memtable: a +/// snapshot-bounded probe of its maintained primary-key index. A memtable +/// without a primary-key index can't be probed, so it blocks nothing — the +/// production vector-search path always enables the index. 
+fn in_memory_membership( + batch_store: &Arc, + index_store: &Arc, +) -> GenMembership { + let max_visible_row = batch_store.max_visible_row(index_store.max_visible_batch_position()); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - pk_hashes_from_batches(&batches, pk_columns) } -/// Hash every row's primary key across `batches` into a membership set. -fn pk_hashes_from_batches(batches: &[RecordBatch], pk_columns: &[String]) -> Result> { - let mut pk_hashes = HashSet::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row_idx in 0..batch.num_rows() { - pk_hashes.insert(compute_pk_hash(batch, &pk_indices, row_idx)); - } +/// As-of variant of [`in_memory_membership`] for the active generation under a +/// watermark: bounds visibility to the first `batch_count` batches — those a +/// prior scan observed before the memtable grew. A later append lands at a +/// higher row position and is excluded by the probe, so it can't shadow a base +/// row whose replacement the scan never delivered. `batch_count == 0` leaves the +/// membership empty. +fn bounded_in_memory_membership( + batch_store: &Arc, + index_store: &Arc, + batch_count: u64, +) -> GenMembership { + let max_visible_row = batch_count + .checked_sub(1) + .and_then(|last_batch| batch_store.max_visible_row(last_batch as usize)); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - Ok(pk_hashes) } -/// Build (or fetch the cached) PK-hash membership for one flushed generation. -/// Cached by immutable path (single-flight); the build scans the flushed -/// dataset's PK columns. -async fn flushed_pk_hashes( +/// A stable cache UUID for a non-manifest index identified only by its path. +///
`DSIndexCache::for_index` keys by `&Uuid`, but the flushed PK sidecar has no +/// manifest UUID — its identity is its immutable path. Derive a deterministic +/// UUID from the path so the cache namespace is per-path and stable across +/// probes (the `uuid` crate lacks the `v5` "name-based" feature here, so hash to +/// a `u128` instead). +fn path_cache_uuid(path: &str) -> Uuid { + use std::hash::{Hash, Hasher}; + let mut lo = std::collections::hash_map::DefaultHasher::new(); + path.hash(&mut lo); + let mut hi = std::collections::hash_map::DefaultHasher::new(); + // Seed the high half differently so it never equals the low half. + "lance/flushed-pk-index".hash(&mut hi); + path.hash(&mut hi); + Uuid::from_u128(((hi.finish() as u128) << 64) | lo.finish() as u128) +} + +/// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed +/// generation. Reuses the flushed dataset's (session-configured) object store +/// and **its index cache**, then loads the sidecar directly by path through the +/// BTree plugin — it is not a manifest index. The opened index and its pages +/// are cached in the session's index cache (keyed by the immutable flushed +/// path), so repeated probes reuse them with no separate cache path and no +/// upfront scan; concurrent first-opens may each load before the cache fills. +async fn open_pk_index( path: &str, - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, -) -> Result>> { - match flushed_cache { - Some(cache) => { - let build_cache = cache.clone(); - let build_path = path.to_string(); - let build_session = session.cloned(); - let build_pk = pk_columns.to_vec(); - cache - .get_or_build_pk_hashes( - path, - // `Box::pin` keeps this build future off the caller's future - // (avoids `clippy::large_futures`).
- Box::pin(async move { - let dataset = open_flushed_dataset( - &build_path, - build_session.as_ref(), - Some(&build_cache), - ) - .await?; - scan_pk_hashes(&dataset, &build_pk).await - }), - ) - .await - } - None => { - let dataset = open_flushed_dataset(path, session, None).await?; - Ok(Arc::new(scan_pk_hashes(&dataset, pk_columns).await?)) - } + flushed_cache: Option<&Arc>, +) -> Result> { + let dataset = open_flushed_dataset(path, session, flushed_cache, None).await?; + // Namespace the session index cache by the (immutable) flushed path so this + // sidecar's pages live alongside every other index instead of a bespoke + // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse. + let index_cache = dataset.index_cache.for_index(&path_cache_uuid(path), None); + let index_dir = dataset.base.clone().join(PK_INDEX_DIR); + let store: Arc = Arc::new(LanceIndexStore::new( + dataset.object_store.clone(), + index_dir, + Arc::new(index_cache.clone()), + )); + + let plugin = PK_BTREE_REGISTRY.get_plugin_by_name("BTree")?; + // Cache the opened index in the session cache (mirrors `open_scalar_index`). + if let Some(index) = plugin + .get_from_cache(store.clone(), None, &index_cache) + .await? + { + return Ok(index); } + let details = prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()) + .map_err(|e| Error::io(e.to_string()))?; + let index = plugin + .load_index(store, &details, None, &index_cache) + .await?; + plugin.put_in_cache(&index_cache, index.clone()).await?; + Ok(index) } -/// Scan a dataset's PK columns and fold them into a membership set, one batch -/// resident at a time (no full PK-column buffer). 
-async fn scan_pk_hashes(dataset: &Dataset, pk_columns: &[String]) -> Result> { - let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); - let mut scanner = dataset.scan(); - scanner.project(&pk_refs)?; - let mut stream = scanner.try_into_stream().await?; - let mut hashes = HashSet::new(); - while let Some(batch) = stream.try_next().await? { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(&batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row in 0..batch.num_rows() { - hashes.insert(compute_pk_hash(&batch, &pk_indices, row)); - } +/// Write a flushed generation's standalone PK sidecar at `{uri}/_pk_index` from +/// `batches`, mirroring what flush does in production. `pk_columns` are the +/// primary-key column names (field ids are synthesized by position — `insert` +/// resolves columns by name). A no-op when no batch carries the PK columns. +/// +/// Used by Rust scanner tests and by the Python test-support binding to stage +/// faithful flushed generations (a flushed dataset alone, with no sidecar, is +/// not a state production ever produces). 
+pub async fn write_pk_sidecar( + uri: &str, + batches: &[arrow_array::RecordBatch], + pk_columns: &[&str], +) -> Result<()> { + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_core::cache::LanceCache; + use lance_index::scalar::btree::train_btree_index; + use lance_io::object_store::ObjectStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let pk: Vec<(String, i32)> = pk_columns + .iter() + .enumerate() + .map(|(i, c)| (c.to_string(), i as i32)) + .collect(); + let mut index = IndexStore::new(); + index.enable_pk_index(&pk); + let mut offset = 0u64; + for batch in batches { + index.insert(batch, offset)?; + offset += batch.num_rows() as u64; + } + + let training = index.pk_training_batches(8192)?; + if training.is_empty() { + return Ok(()); } - Ok(hashes) + let schema = training[0].schema(); + let (object_store, base_path) = ObjectStore::from_uri(uri).await?; + let store = LanceIndexStore::new( + object_store, + pk_index_path(&base_path), + Arc::new(LanceCache::no_cache()), + ); + let stream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(training.into_iter().map(Ok)), + )); + // `train_btree_index` now returns the written index files; the sidecar + // writer only needs success/failure. 
+ train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) } #[cfg(test)] mod tests { use super::*; - use arrow_array::Int32Array; + use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; + use crate::dataset::mem_wal::write::IndexStore; + use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; use std::sync::Arc; + use uuid::Uuid; fn id_batch(ids: &[i32]) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() } - /// Hash a single Int32 `id` PK the way the planner does, so a test can probe - /// a returned blocked set by value. - fn hash_id(id: i32) -> u64 { - let batch = id_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - /// Whether `id`'s PK hash is blocked by any of a source's newer-gen sets. - fn blocks(sets: &[Arc>], id: i32) -> bool { - sets.iter().any(|s| s.contains(&hash_id(id))) - } - - #[test] - fn pk_hashes_collapse_within_gen_duplicates() { - // Two rows share pk=1 (a within-gen duplicate); pk=2 is unique. - let hashes = pk_hashes_from_batches(&[id_batch(&[1, 2, 1])], &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 2); // distinct pks: 1, 2 + /// An active/frozen memtable source whose PK index holds one row per id in + /// `ids` (positions 0..n), all committed and visible. 
+ fn active_source(shard: Uuid, generation: u64, ids: &[i32]) -> LsmDataSource { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = id_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + LsmDataSource::ActiveMemTable { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: id_batch(&[1]).schema(), + shard_id: shard, + generation: LsmGeneration::memtable(generation), + } } - #[test] - fn empty_batches_yield_empty_membership() { - let hashes = pk_hashes_from_batches(&[id_batch(&[])], &["id".to_string()]).unwrap(); - assert!(hashes.is_empty()); + /// Whether `id`'s PK is blocked by any of a source's newer-gen memberships. + async fn blocks(memberships: &[GenMembership], id: i32) -> bool { + let key = on_disk_pk_key(&[ScalarValue::Int32(Some(id))]).unwrap(); + for m in memberships { + if m.contains(&key).await.unwrap() { + return true; + } + } + false } #[test] - fn batch_store_membership_collapses_within_gen_dups() { - let store = BatchStore::with_capacity(8); - // Two single-row batches, both pk=1 (a within-gen update). - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[1])).unwrap(); - // A two-row batch: pk=2, pk=3. - store.append(id_batch(&[2, 3])).unwrap(); - - let hashes = pk_hashes_from_batch_store(&store, &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 3); // distinct pks: 1, 2, 3 + fn on_disk_key_is_typed_for_single_and_binary_for_composite() { + // Single-column → the typed value; composite → encoded Binary. 
+ let single = [ScalarValue::Int32(Some(7))]; + assert_eq!( + on_disk_pk_key(&single).unwrap(), + ScalarValue::Int32(Some(7)) + ); + let composite = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + assert!(matches!( + on_disk_pk_key(&composite).unwrap(), + ScalarValue::Binary(Some(_)) + )); } #[tokio::test] - async fn fresh_tier_block_list_one_set_per_in_memory_gen() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - + async fn fresh_tier_block_list_one_membership_per_in_memory_gen() { let shard = Uuid::new_v4(); - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; // Active gen 2: pk=1,2. Frozen gen 1: pk=3. - let sources = vec![mk(&[1, 2], 2), mk(&[3], 1)]; + let sources = vec![ + active_source(shard, 2, &[1, 2]), + active_source(shard, 1, &[3]), + ]; - let sets = fresh_tier_block_list(&sources, &["id".to_string()], None, None) + let memberships = fresh_tier_block_list(&sources, None, None, None) .await .unwrap(); - // One set per generation; together they cover pk=1,2,3 (not 4). - assert_eq!(sets.len(), 2); + // One membership per generation; together they cover pk=1,2,3 (not 4). 
+ assert_eq!(memberships.len(), 2); for id in [1, 2, 3] { - assert!(blocks(&sets, id)); + assert!(blocks(&memberships, id).await); } - assert!(!blocks(&sets, 4)); + assert!(!blocks(&memberships, 4).await); } #[tokio::test] async fn block_lists_suppress_stale_across_in_memory_gens() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - let shard = Uuid::new_v4(); - let mk = |batches: &[&[i32]], generation: u64| { - let store = BatchStore::with_capacity(8); - for ids in batches { - store.append(id_batch(ids)).unwrap(); - } - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Frozen gen 1: stale pk=1. - // Active gen 2: pk=1 re-written, pk=2 new. - let sources = vec![mk(&[&[1]], 1), mk(&[&[1], &[2]], 2)]; + // Frozen gen 1: stale pk=1. Active gen 2: pk=1 re-written, pk=2 new. + let sources = vec![ + active_source(shard, 1, &[1]), + active_source(shard, 2, &[1, 2]), + ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // The newer active write supersedes the frozen copy: gen 1 is blocked on // pk=1, so its KNN drops pk=1. - assert!(blocks(&blocked[&(Some(shard), g1)], 1)); + assert!(blocks(&blocked[&(Some(shard), g1)], 1).await); // The active (newest) generation is superseded by nothing — no entry. 
assert!(!blocked.contains_key(&(Some(shard), g2))); } #[tokio::test] async fn block_lists_suppress_stale_base_row() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; use crate::dataset::{Dataset, WriteParams}; use arrow_array::RecordBatchIterator; - use uuid::Uuid; // Base (gen 0): pk=1 (stale), pk=3 (live). let base_batch = id_batch(&[1, 3]); @@ -372,89 +586,239 @@ mod tests { ); // Active gen 1: pk=1 re-written, pk=2 new. - let store = BatchStore::with_capacity(8); - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[2])).unwrap(); - let sources = vec![ LsmDataSource::BaseTable { dataset: base }, - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema, - shard_id: Uuid::new_v4(), - generation: LsmGeneration::memtable(1), - }, + active_source(Uuid::new_v4(), 1, &[1, 2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); // Base is blocked by every newer gen: pk=1 (re-written in gen 1) is - // blocked, pk=3 (base-only) is not. End-to-end drop: vector_search specs. + // blocked, pk=3 (base-only) is not. let base_blocked = blocked .get(&(None, LsmGeneration::BASE_TABLE)) .expect("base has a blocked set"); - assert!(blocks(base_blocked, 1)); - assert!(!blocks(base_blocked, 3)); + assert!(blocks(base_blocked, 1).await); + assert!(!blocks(base_blocked, 3).await); } #[tokio::test] async fn block_lists_are_keyed_per_shard() { // Regression: generations are per-shard, so a source must only be blocked - // by newer generations of its OWN shard. A generation-only key would - // cross-block same-generation sources from different shards. 
- use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - - let mk = |shard: Uuid, ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). - // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). + // by newer generations of its OWN shard. let a = Uuid::new_v4(); let b = Uuid::new_v4(); + // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). + // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). let sources = vec![ - mk(a, &[1], 1), - mk(a, &[1], 2), - mk(b, &[2], 1), - mk(b, &[2], 2), + active_source(a, 1, &[1]), + active_source(a, 2, &[1]), + active_source(b, 1, &[2]), + active_source(b, 2, &[2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // Each shard's gen 1 is blocked by its OWN gen 2 only. - assert!(blocks(&blocked[&(Some(a), g1)], 1)); - assert!(!blocks(&blocked[&(Some(a), g1)], 2)); - assert!(blocks(&blocked[&(Some(b), g1)], 2)); - assert!(!blocks(&blocked[&(Some(b), g1)], 1)); + assert!(blocks(&blocked[&(Some(a), g1)], 1).await); + assert!(!blocks(&blocked[&(Some(a), g1)], 2).await); + assert!(blocks(&blocked[&(Some(b), g1)], 2).await); + assert!(!blocks(&blocked[&(Some(b), g1)], 1).await); // The newest generation of each shard is superseded by nothing. 
assert!(!blocked.contains_key(&(Some(a), g2))); assert!(!blocked.contains_key(&(Some(b), g2))); } + + #[tokio::test] + async fn index_membership_is_snapshot_bounded() { + // The index-sourced membership only counts a PK whose version is visible + // at the source's watermark, so a newer generation's not-yet-visible + // write can't shadow an older generation's visible copy. + let shard = Uuid::new_v4(); + let schema = id_batch(&[1]).schema(); + + // Older frozen gen 1: pk=1. + let g1 = active_source(shard, 1, &[1]); + + // Newer active gen 2: pk=99 visible at position 0, then pk=1 written at + // position 1 but with the watermark left at batch 0 (so pk=1 is in the + // index yet not visible) — the concurrent-write race. + let g2_store = BatchStore::with_capacity(8); + let mut g2_index = IndexStore::new(); + g2_index.enable_pk_index(&[("id".to_string(), 0)]); + let b0 = id_batch(&[99]); + let (bp0, off0, _) = g2_store.append(b0.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b0, off0, Some(bp0)) // advances watermark to 0 + .unwrap(); + let b1 = id_batch(&[1]); + let (_, off1, _) = g2_store.append(b1.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b1, off1, None) // index updated, watermark unchanged + .unwrap(); + let g2 = LsmDataSource::ActiveMemTable { + batch_store: Arc::new(g2_store), + index_store: Arc::new(g2_index), + schema, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }; + + let blocked = Box::pin(compute_source_block_lists(&[g1, g2], None, None)) + .await + .unwrap(); + + let g1_block = &blocked[&(Some(shard), LsmGeneration::memtable(1))]; + // pk=99 is visible in gen 2 → it blocks gen 1's pk=99. + assert!(blocks(g1_block, 99).await); + // pk=1's only gen-2 copy is not yet visible → it must NOT shadow gen 1. 
+ assert!( + !blocks(g1_block, 1).await, + "a not-yet-visible newer write must not shadow an older visible copy" + ); + } + + /// A fresh-tier watermark bounds the active generation to the first + /// `active_batch_count` batches — those the arm observed before the memtable + /// grew. A later append is invisible, so a base row is never dropped without + /// the arm having delivered its replacement. + #[tokio::test] + async fn fresh_tier_watermark_bounds_active_memtable_by_batch_count() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // Three single-row batches: pk=1 at batch 0, pk=2 at batch 1, pk=3 at + // batch 2 (appended after the arm). + let sources = vec![active_source(shard, 1, &[1, 2, 3])]; + + // Watermark at 2 batches of gen 1: pk=1,2 are members; pk=3 (batch 2) is not. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: 2, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); + assert!(blocks(&sets, 2).await); + assert!(!blocks(&sets, 3).await); + + // No watermark → live tier: all three are members. + let sets = fresh_tier_block_list(&sources, None, None, None) + .await + .unwrap(); + for id in [1, 2, 3] { + assert!(blocks(&sets, id).await); + } + } + + /// A generation above the active one rolled in after the snapshot and is + /// excluded whole; a lower one is immutable (frozen) and included whole + /// regardless of the active batch count. + #[tokio::test] + async fn fresh_tier_watermark_excludes_newer_gen_includes_lower_gen() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // gen 3 newer (after snapshot), gen 2 == active (bounded to 1 batch), + // gen 1 lower/immutable (whole). 
Each id is its own batch. + let sources = vec![ + active_source(shard, 3, &[100]), + active_source(shard, 2, &[20, 21]), + active_source(shard, 1, &[1, 2]), + ]; + + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: 1, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); // gen 1, whole + assert!(blocks(&sets, 2).await); // gen 1, whole + assert!(blocks(&sets, 20).await); // gen 2, batch 0 + assert!(!blocks(&sets, 21).await); // gen 2, batch 1 — past the watermark + assert!(!blocks(&sets, 100).await); // gen 3 — after the snapshot + } + + /// A flushed generation at or above the active generation was produced by a + /// flush after the snapshot and is excluded; one strictly below it is + /// immutable and included. + #[tokio::test] + async fn fresh_tier_watermark_excludes_flushed_at_or_above_active() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use crate::dataset::{Dataset, WriteParams}; + use arrow_array::RecordBatchIterator; + use std::collections::HashMap; + + // A flushed generation 2 holding pk=5, staged as a flushed dataset with + // its standalone PK sidecar (what the on-disk membership probes). + let flushed_batch = id_batch(&[5]); + let schema = flushed_batch.schema(); + let tmp = tempfile::tempdir().unwrap(); + let path = format!("{}/gen2", tmp.path().to_str().unwrap()); + let reader = RecordBatchIterator::new(vec![Ok(flushed_batch.clone())], schema.clone()); + Dataset::write(reader, &path, Some(WriteParams::default())) + .await + .unwrap(); + write_pk_sidecar(&path, &[flushed_batch], &["id"]) + .await + .unwrap(); + + let shard = Uuid::new_v4(); + let sources = vec![LsmDataSource::FlushedMemTable { + path, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }]; + + // active_generation 2 (gen 2 flushed at/after the snapshot): excluded. 
+ let at: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&at)) + .await + .unwrap(); + assert!(!blocks(&sets, 5).await); + + // active_generation 3 (gen 2 strictly below, immutable): included. + let above: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 3, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&above)) + .await + .unwrap(); + assert!(blocks(&sets, 5).await); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index ade4164d485..a006257493b 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -20,8 +20,8 @@ use lance_core::{Error, Result, is_system_column}; use uuid::Uuid; use super::collector::{InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector}; -use super::data_source::ShardSnapshot; -use super::flushed_cache::FlushedMemTableCache; +use super::data_source::{FreshTierWatermark, ShardSnapshot}; +use super::flushed_cache::{DatasetCache, GenerationWarmer}; use super::planner::LsmScanPlanner; use super::point_lookup::LsmPointLookupPlanner; use crate::dataset::Dataset; @@ -124,7 +124,12 @@ pub struct LsmScanner { session: Option>, /// Cache of opened flushed-generation datasets. When set, repeated /// queries against the same generation skip the manifest read entirely. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, + /// Over-fetch multiple for block-listed sources in search plans + /// (see [`super::LsmFtsSearchPlanner::with_overfetch_factor`]). 
+ overfetch_factor: Option, } impl LsmScanner { @@ -160,6 +165,8 @@ impl LsmScanner { pk_columns, session, flushed_cache: None, + warmer: None, + overfetch_factor: None, } } @@ -198,6 +205,8 @@ impl LsmScanner { pk_columns, session: None, flushed_cache: None, + warmer: None, + overfetch_factor: None, } } @@ -246,13 +255,29 @@ impl LsmScanner { /// /// With a cache, repeated queries against the same generation become a /// pure `Arc::clone` with no manifest read or object-store I/O. The cache - /// is owned and sized by the caller (see [`FlushedMemTableCache`]); not - /// set by default, so behavior is unchanged unless opted in. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + /// is owned and sized by the caller (any [`DatasetCache`] impl, e.g. + /// [`FlushedMemTableCache`](super::FlushedMemTableCache)); not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. Not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + + /// Set the over-fetch multiple block-listed sources use in search plans + /// so they still yield `k` live rows after cross-generation dedup. + /// Threaded into [`super::LsmFtsSearchPlanner`]; clamped to `>= 1.0`. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = Some(factor); + self + } + /// Project specific columns. /// /// If not called, all columns from the base schema are included. 
@@ -354,6 +379,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } let plan = planner .plan_point_lookup(&keys, self.projection.as_deref()) .await?; @@ -370,6 +398,12 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_scan( @@ -405,6 +439,12 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_search(column, query, k, self.projection.as_deref()) .await @@ -454,24 +494,65 @@ impl LsmScanner { /// the primary-key columns; the returned `Vec` is aligned with its /// rows. Hashing matches the scanner's internal dedup, so the caller never /// hashes PKs itself. Flushed membership comes from the injected - /// [`FlushedMemTableCache`] when one is set. + /// [`DatasetCache`] when one is set. pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { + self.contains_pks_at(pks, None).await + } + + /// As-of variant of [`Self::contains_pks`]. Membership is evaluated against + /// a per-shard watermark on the fresh tier, supplied via `watermarks` (see + /// [`FreshTierWatermark`]), matching the tier a prior scan observed and + /// avoiding the two-snapshot skew that would drop a base row with no + /// delivered replacement. `None` evaluates against the live tier. 
+ pub async fn contains_pks_at( + &self, + pks: &RecordBatch, + watermarks: Option<&HashMap>, + ) -> Result> { let sources = self.build_collector().collect()?; - let sets = super::block_list::fresh_tier_block_list( + let memberships = super::block_list::fresh_tier_block_list( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), + watermarks, ) .await?; let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns) .map_err(|e| Error::invalid_input(e.to_string()))?; - Ok((0..pks.num_rows()) + // One key per row, in the index key space (typed value, or encoded + // `Binary` tuple for a composite PK). + let keys: Vec = (0..pks.num_rows()) .map(|row| { - let hash = super::exec::compute_pk_hash(pks, &pk_indices, row); - sets.iter().any(|set| set.contains(&hash)) + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) + .collect::>() + .map_err(|e| Error::invalid_input(e.to_string()))?; + super::block_list::on_disk_pk_key(&values) }) - .collect()) + .collect::>()?; + + // A row is contained if any generation contains its key. Probe each + // generation once (batched), narrowing to still-unfound rows. + let mut contained = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in &memberships { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + contained[row] = true; + } else { + next_live.push(row); + } + } + live = next_live; + } + Ok(contained) } /// Build the data source collector. 
@@ -572,35 +653,42 @@ mod tests { assert_eq!(memtable_ref.generation, 10); } - #[tokio::test] - async fn contains_pks_reports_fresh_tier_membership() { - use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; - use arrow_array::Int32Array; + /// Single-column `id: Int32` schema used by the PK-membership tests. + fn pk_schema() -> SchemaRef { use arrow_schema::{DataType, Field, Schema}; + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])) + } - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - let id_batch = |ids: &[i32]| { - RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from(ids.to_vec()))], - ) - .unwrap() - }; - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - InMemoryMemTableRef { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: schema.clone(), - generation, - } - }; + /// A `RecordBatch` of `id` values against [`pk_schema`]. + fn id_pk_batch(ids: &[i32]) -> RecordBatch { + use arrow_array::Int32Array; + RecordBatch::try_new(pk_schema(), vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An active/frozen memtable holding `ids` at `generation`, with a single + /// batch and a maintained primary-key index on `id`. 
+ fn mk_pk_memtable(ids: &[i32], generation: u64) -> InMemoryMemTableRef { + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + let store = BatchStore::with_capacity(8); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + let b = id_pk_batch(ids); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + InMemoryMemTableRef { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: pk_schema(), + generation, + } + } + #[tokio::test] + async fn contains_pks_reports_fresh_tier_membership() { // Fresh-tier only: active gen 2 (pk=1,2) + frozen gen 1 (pk=3). let shard = Uuid::new_v4(); let scanner = LsmScanner::without_base_table( - schema.clone(), + pk_schema(), "memory://t", vec![], vec!["id".to_string()], @@ -608,16 +696,68 @@ mod tests { .with_in_memory_memtables( shard, InMemoryMemTables { - active: mk(&[1, 2], 2), - frozen: vec![mk(&[3], 1)], + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3], 1)], }, ); // pk=1 (active), pk=4 (absent), pk=3 (frozen). - let result = scanner.contains_pks(&id_batch(&[1, 4, 3])).await.unwrap(); + let result = scanner + .contains_pks(&id_pk_batch(&[1, 4, 3])) + .await + .unwrap(); assert_eq!(result, vec![true, false, true]); } + /// `contains_pks_at` probes each generation once over the still-unfound + /// rows, so a multi-PK batch spanning several generations resolves to the + /// right per-row mask — and a watermark bounds which generations count. + #[tokio::test] + async fn contains_pks_at_batched_probe_respects_watermark() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + + // active gen 2 (pk=1,2) + frozen gen 1 (pk=3,4). 
+ let shard = Uuid::new_v4(); + let scanner = LsmScanner::without_base_table( + pk_schema(), + "memory://t", + vec![], + vec!["id".to_string()], + ) + .with_in_memory_memtables( + shard, + InMemoryMemTables { + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3, 4], 1)], + }, + ); + + // Duplicate and out-of-order keys exercise the live-row narrowing: each + // generation only re-probes the rows earlier generations didn't claim. + let probe = id_pk_batch(&[4, 1, 9, 3, 2, 1]); + + // watermark=None → live tier: every PK present in either generation. + let live = scanner.contains_pks_at(&probe, None).await.unwrap(); + assert_eq!(live, vec![true, true, false, true, true, true]); + + // watermark at gen 1 → active gen 2 rolled in after the snapshot and is + // excluded; only the frozen gen 1 keys (3,4) remain members. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let bounded = scanner + .contains_pks_at(&probe, Some(&watermarks)) + .await + .unwrap(); + assert_eq!(bounded, vec![true, false, false, true, false, false]); + } + /// One active memtable with a maintained BTree on `id`, all rows visible. fn mk_indexed_memtable(schema: &SchemaRef, ids: &[i32], names: &[&str]) -> InMemoryMemTableRef { use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; diff --git a/rust/lance/src/dataset/mem_wal/scanner/collector.rs b/rust/lance/src/dataset/mem_wal/scanner/collector.rs index 2db4b4f277d..6645f159b12 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/collector.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/collector.rs @@ -229,6 +229,19 @@ impl LsmDataSourceCollector { .collect() } + /// True when `generation` for `shard_id` is still pinned in memory as a + /// frozen memtable. 
During the post-flush grace window a generation is both + /// committed to the manifest (a flushed source) and held in memory (an + /// in-memory source); it must be served only from memory — which preserves + /// the per-batch boundaries the flushed dataset has lost, so as-of reads + /// stay snapshot-bounded — and its on-disk copy skipped to avoid scanning + /// the generation twice. See `ShardWriterConfig::frozen_memtable_grace`. + fn flushed_gen_pinned_in_memory(&self, shard_id: &Uuid, generation: u64) -> bool { + self.in_memory_memtables + .get(shard_id) + .is_some_and(|mems| mems.frozen.iter().any(|f| f.generation == generation)) + } + /// Collect all data sources. /// /// Returns sources in a consistent order: @@ -246,6 +259,9 @@ impl LsmDataSourceCollector { for snapshot in &self.shard_snapshots { for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -284,6 +300,9 @@ impl LsmDataSourceCollector { } for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -443,4 +462,53 @@ mod tests { 3 ); } + + /// During the post-flush grace window a generation is both committed to the + /// manifest (a flushed source) and still pinned in memory (a frozen + /// source). The collector must emit it once, from memory — so as-of reads + /// keep batch-resolved membership — and skip the on-disk copy. Flushed + /// generations NOT pinned in memory are still emitted from disk. 
+ #[test] + fn test_collect_suppresses_flushed_gen_pinned_in_memory() { + let shard = Uuid::new_v4(); + // Manifest lists gens 1 and 2 as flushed; gen 2 is still pinned in + // memory (just flushed, within grace), gen 1 has been swept. + let snapshot = ShardSnapshot { + shard_id: shard, + spec_id: 0, + current_generation: 3, + flushed_generations: vec![ + FlushedGeneration { + generation: 1, + path: "gen_1".to_string(), + }, + FlushedGeneration { + generation: 2, + path: "gen_2".to_string(), + }, + ], + }; + let mems = InMemoryMemTables { + active: memtable_ref(3), + frozen: vec![memtable_ref(2)], + }; + let collector = LsmDataSourceCollector::without_base_table("/tmp/x", vec![snapshot]) + .with_in_memory_memtables(shard, mems); + + let sources = collector.collect().unwrap(); + // gen 1: on-disk (not pinned). gen 2: in-memory only (pinned, disk + // copy suppressed). gen 3: active. No duplicate gen 2. + let flushed: Vec = sources + .iter() + .filter(|s| !s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + let in_memory: Vec = sources + .iter() + .filter(|s| s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + assert_eq!(flushed, vec![1], "only the unpinned flushed gen from disk"); + assert_eq!(in_memory, vec![2, 3], "pinned gen 2 served from memory"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs index 1a6207f27e3..0d5f3fdc925 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs @@ -11,6 +11,29 @@ use uuid::Uuid; use crate::dataset::Dataset; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; +/// A watermark marking how far into one shard's fresh tier a prior scan +/// observed, so membership can be evaluated as of that point (see +/// [`super::builder::LsmScanner::contains_pks_at`]). 
+/// +/// Only the active memtable grows between two reads (appended batches, and a new +/// generation when it rolls); everything at a lower generation — frozen and +/// flushed — is immutable and was fully observed. The watermark includes lower +/// generations whole, the active generation up to `active_batch_count` batches, +/// and excludes higher generations (which appeared after it). It uses only the +/// batch count and generation — both always available, unlike per-batch WAL +/// positions, which the write path does not track. The bound only excludes rows +/// the scan did not observe, so a stale watermark under-counts (a tolerable +/// stale read) rather than dropping a row with no replacement. +#[derive(Debug, Clone, Copy)] +pub struct FreshTierWatermark { + /// Active generation the scan observed. Higher generations are excluded; + /// lower ones are immutable and included whole. + pub active_generation: u64, + /// Active-memtable batch count at snapshot time. Within the active + /// generation, only batches at index `< active_batch_count` were observed. + pub active_batch_count: u64, +} + /// Generation number in LSM tree. /// /// The base table has generation 0. MemTables have positive integers diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec.rs b/rust/lance/src/dataset/mem_wal/scanner/exec.rs index 88fd617dc0a..115cffccc81 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec.rs @@ -9,22 +9,22 @@ //! - [`MemtableGenTagExec`]: Wraps a scan to add `_memtable_gen` column //! - [`BloomFilterGuardExec`]: Guards child execution with bloom filter check //! - [`CoalesceFirstExec`]: Returns first non-empty result with short-circuit -//! - [`WithinSourceDedupExec`]: Deduplicates rows with the same PK from a single source -//! - [`PkHashFilterExec`]: Drops rows whose PK hash was superseded by a newer generation (the cross-generation block-list) +//! 
- [`PkBlockFilterExec`]: Drops rows whose PK was superseded by a newer generation (the cross-generation block-list) +//! - [`NewestPkFilterExec`]: Drops active-memtable hits that aren't the newest visible version of their PK (the within-source recency filter) mod bloom_guard; mod coalesce_first; mod generation_tag; +mod newest_pk_filter; mod pk; -mod pk_hash_filter; -mod within_source_dedup; +mod pk_block_filter; pub use bloom_guard::{BloomFilterGuardExec, compute_pk_hash_from_scalars}; pub use coalesce_first::CoalesceFirstExec; pub use generation_tag::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec}; +pub use newest_pk_filter::NewestPkFilterExec; pub use pk::{ ROW_ADDRESS_COLUMN, compute_pk_hash, is_supported_pk_type, resolve_pk_indices, validate_pk_types, }; -pub use pk_hash_filter::PkHashFilterExec; -pub use within_source_dedup::{DedupDirection, WithinSourceDedupExec}; +pub use pk_block_filter::PkBlockFilterExec; diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs new file mode 100644 index 00000000000..e1495cb0bb1 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop predicate-crossing stale rows from an active-memtable index search. +//! +//! The active memtable's HNSW / inverted index are append-only, so an updated +//! row's old entries stay live. When an update moves a row out of the query's +//! match set, the fresh version isn't in the index result, so a result-set +//! dedup (keep-newest among the returned rows) has nothing to suppress the +//! stale version against — and it leaks. +//! +//! This node closes that hole with a predicate-independent recency check: for +//! each hit it asks the memtable's maintained primary-key index +//! ([`IndexStore::pk_is_newest`]) whether the hit's own row position is the +//! 
newest version of its primary key visible at the query's `max_visible` +//! watermark, and keeps the hit **iff so**. A stale hit (some +//! newer version exists) is dropped even when that newer version never appears +//! in the result. This is exactly the seek point-lookup already does; the index +//! search arms simply didn't do it. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::{Stream, StreamExt}; + +use super::pk::resolve_pk_indices; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + +/// Keeps only the index hits that are the newest visible version of their PK. +/// +/// The input must expose all `pk_columns` and the `row_id_column` (`UInt64`, +/// the BatchStore row position). The output schema is unchanged. +pub struct NewestPkFilterExec { + input: Arc, + pk_columns: Vec, + row_id_column: String, + /// Holds the maintained primary-key index, queried per hit via + /// [`IndexStore::pk_is_newest`]. + index_store: Arc, + /// Resolves the `max_visible` row watermark from the visible batch prefix. + batch_store: Arc, + /// The MVCC batch-position snapshot the index search latched. Captured once + /// at plan time and shared with the search so the recency check keys on the + /// same snapshot the hits came from. 
+ max_visible_batch_position: usize, + properties: Arc, +} + +impl fmt::Debug for NewestPkFilterExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // `BatchStore` / `IndexStore` aren't `Debug`; show only the knobs. + f.debug_struct("NewestPkFilterExec") + .field("pk_columns", &self.pk_columns) + .field("row_id_column", &self.row_id_column) + .field( + "max_visible_batch_position", + &self.max_visible_batch_position, + ) + .finish() + } +} + +impl NewestPkFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + row_id_column: impl Into, + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + row_id_column: row_id_column.into(), + index_store, + batch_store, + max_visible_batch_position, + properties, + } + } + + /// The inclusive max visible row position for this snapshot, or `None` when + /// no rows are visible. 
+ fn max_visible_row(&self) -> Option { + self.batch_store + .max_visible_row(self.max_visible_batch_position) + } +} + +impl DisplayAs for NewestPkFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "NewestPkFilterExec: pk=[{}], row_id={}, max_visible_batch={}", + self.pk_columns.join(", "), + self.row_id_column, + self.max_visible_batch_position, + ) + } + } + } +} + +impl ExecutionPlan for NewestPkFilterExec { + fn name(&self) -> &str { + "NewestPkFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "NewestPkFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.row_id_column.clone(), + self.index_store.clone(), + self.batch_store.clone(), + self.max_visible_batch_position, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(NewestPkFilterStream { + input: input_stream, + pk_columns: self.pk_columns.clone(), + row_id_column: self.row_id_column.clone(), + index_store: self.index_store.clone(), + max_visible_row: self.max_visible_row(), + schema: self.schema(), + })) + } +} + +struct NewestPkFilterStream { + input: SendableRecordBatchStream, + pk_columns: Vec, + row_id_column: String, + index_store: Arc, + /// Inclusive watermark snapshot; `None` when no rows are visible. 
+ max_visible_row: Option, + schema: SchemaRef, +} + +impl NewestPkFilterStream { + fn filter_batch(&self, batch: RecordBatch) -> DFResult { + // No primary-key index (memtable without a primary key), no visible + // rows, or an empty batch: nothing to dedup against, so pass it through. + if !self.index_store.has_pk_index() { + return Ok(batch); + } + let Some(max_visible_row) = self.max_visible_row else { + return Ok(batch); + }; + if batch.num_rows() == 0 { + return Ok(batch); + } + + let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; + let row_ids = batch + .column_by_name(&self.row_id_column) + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' not found in NewestPkFilterExec input", + self.row_id_column + )) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' is not UInt64", + self.row_id_column + )) + })?; + + let mut keep = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + // A null row position can't be ordered; keep it rather than guess + // (callers always project a real position here). + if row_ids.is_null(row) { + keep.push(true); + continue; + } + let position = row_ids.value(row); + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + // Keep iff this hit is the newest visible version of its PK. 
+ keep.push( + self.index_store + .pk_is_newest(&values, position, max_visible_row), + ); + } + filter_record_batch(&batch, &BooleanArray::from(keep)) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } +} + +impl Stream for NewestPkFilterStream { + type Item = DFResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.filter_batch(batch))), + other => other, + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for NewestPkFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + /// Single-column `id` PK batch, one per append so a caller can control + /// row-level visibility via `max_visible_batch_position`. + fn id_batch(id: i32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![id]))]).unwrap() + } + + /// Index-search "hits": `(id, _rowid)` pairs the filter evaluates. + fn hits(rows: &[(i32, u64)]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new(lance_core::ROW_ID, DataType::UInt64, true), + ])); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + let rowids: Vec = rows.iter().map(|(_, p)| *p).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(UInt64Array::from(rowids)), + ], + ) + .unwrap() + } + + /// Build an active memtable whose PK index + BatchStore hold one row per + /// `id` in `appended` (positions 0..n), all committed. 
+ fn active(appended: &[i32]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in appended { + let b = id_batch(id); + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + (Arc::new(index), batch_store) + } + + async fn run( + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + hits_batch: RecordBatch, + ) -> Vec<(i32, u64)> { + let input = + TestMemoryExec::try_new_exec(&[vec![hits_batch.clone()]], hits_batch.schema(), None) + .unwrap(); + let exec = NewestPkFilterExec::new( + input, + vec!["id".to_string()], + lance_core::ROW_ID, + index_store, + batch_store, + max_visible_batch_position, + ); + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + let mut rows = Vec::new(); + for b in &out { + let ids = b.column(0).as_any().downcast_ref::().unwrap(); + let pos = b.column(1).as_any().downcast_ref::().unwrap(); + for i in 0..b.num_rows() { + rows.push((ids.value(i), pos.value(i))); + } + } + rows + } + + #[tokio::test] + async fn keeps_only_the_newest_visible_position_per_pk() { + // id=1 written at positions 0 and 2 (an update), id=2 at position 1; all + // visible. A stale hit (id=1 @ 0) is dropped; the newest (id=1 @ 2) and + // the unrelated id=2 survive — even though all three were "returned" by + // the index search. + let (index, store) = active(&[1, 2, 1]); + let rows = run(index, store, 2, hits(&[(1, 0), (2, 1), (1, 2)])).await; + assert_eq!(rows, vec![(2, 1), (1, 2)]); + } + + #[tokio::test] + async fn does_not_vanish_a_visible_row_under_a_newer_invisible_write() { + // The store/index hold id=1 at positions 0 and 2, but the query latched + // `max_visible_batch_position = 0` (only position 0 visible) — i.e. 
the + // update at position 2 was committed *after* this query's snapshot. The + // visible older row (id=1 @ 0) must be KEPT (its newest *visible* version + // is itself), not dropped because of the not-yet-visible position 2. + let (index, store) = active(&[1, 2, 1]); + let kept = run(index.clone(), store.clone(), 0, hits(&[(1, 0)])).await; + assert_eq!(kept, vec![(1, 0)], "visible row must not vanish"); + + // And the not-yet-visible position is itself dropped (outside snapshot). + let dropped = run(index, store, 0, hits(&[(1, 2)])).await; + assert!( + dropped.is_empty(), + "row beyond the snapshot must be dropped" + ); + } + + #[tokio::test] + async fn passes_through_when_no_pk_index() { + // A memtable without a primary-key index can't be deduped here, so the + // filter is a pass-through rather than dropping everything. + let batch_store = Arc::new(BatchStore::with_capacity(16)); + batch_store.append(id_batch(1)).unwrap(); + let index = Arc::new(IndexStore::new()); // no enable_pk_index + let rows = run(index, batch_store, 0, hits(&[(1, 0), (1, 9)])).await; + assert_eq!(rows, vec![(1, 0), (1, 9)]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs index 523dd30bf82..0707eb5e8dd 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs @@ -4,7 +4,7 @@ //! Shared primary-key helpers for the LSM scanner execution nodes. //! //! Centralizes PK column resolution and per-row hashing so that every -//! consumer (e.g. [`super::WithinSourceDedupExec`], [`super::PkHashFilterExec`]) +//! consumer (e.g. [`super::PkBlockFilterExec`], [`super::NewestPkFilterExec`]) //! resolves and hashes a primary key the same way. The row hash is kept //! consistent with the variants supported by [`super::compute_pk_hash_from_scalars`] //! so a single PK produces the same hash regardless of which exec consumes it. 
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs new file mode 100644 index 00000000000..c5b8f959d26 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop superseded rows from a per-source result by primary-key membership. +//! +//! Drops a row when any newer generation's membership ([`GenMembership`]) +//! contains its primary key — in-memory generations probe their PK index by +//! value, flushed generations probe their on-disk PK BTree. Each generation is +//! probed once per batch (see the perf note below). Used both as the KNN +//! post-filter (vector search, with over-fetch) and the cross-generation scan +//! filter (`k = 0`). +//! +//! Cross-generation only: within-gen duplicates collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. +//! +//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns +//! when a source had >= k candidates but < k survived (over-fetch too small). +//! +//! Perf note: each generation is probed once per batch via +//! [`GenMembership::contains_keys`] — a batched existence check over the +//! batch's keys — not once per row. The on-disk arm issues a single +//! `BTreeIndex::contains_keys` (one page pass, no per-key `SearchResult` +//! allocation); the in-memory arm maps a sync PK lookup over the keys. Probes +//! are not disk-bound in steady state: the opened index and its (small, +//! memtable-sized) pages are held by the injected `FlushedMemTableCache` / +//! `LanceCache`, so after the first touch every probe is memory-resident. +//! Already-blocked rows are dropped from the key set before probing older +//! generations, preserving the per-row short-circuit. 
+ +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::future::BoxFuture; +use futures::{FutureExt, Stream, StreamExt}; +use tracing::warn; + +use super::super::block_list::{GenMembership, on_disk_pk_key}; +use super::pk::resolve_pk_indices; + +/// Filters out rows whose PK is contained in any newer generation's membership. +#[derive(Debug)] +pub struct PkBlockFilterExec { + input: Arc, + pk_columns: Vec, + /// Newer generations' membership; a row is blocked if any contains its PK. + blocked: Vec, + /// Target neighbor count, used only to warn on a per-source under-fetch. + k: usize, + properties: Arc, +} + +impl PkBlockFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + blocked: Vec, + k: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. 
+ let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + blocked, + k, + properties, + } + } +} + +impl DisplayAs for PkBlockFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "PkBlockFilterExec: pk_cols=[{}], gens={}", + self.pk_columns.join(", "), + self.blocked.len(), + ) + } + } + } +} + +impl ExecutionPlan for PkBlockFilterExec { + fn name(&self) -> &str { + "PkBlockFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "PkBlockFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.blocked.clone(), + self.k, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(PkBlockFilterStream { + input: input_stream, + config: Arc::new(FilterConfig { + pk_columns: self.pk_columns.clone(), + blocked: self.blocked.clone(), + }), + k: self.k, + schema: self.schema(), + pending: None, + input_seen: 0, + kept: 0, + warned: false, + })) + } +} + +/// Immutable per-stream filter config. Shared into each batch's `'static` async +/// future by a single `Arc` clone, rather than deep-cloning the PK columns and +/// memberships per batch. 
+struct FilterConfig { + pk_columns: Vec, + blocked: Vec, +} + +struct PkBlockFilterStream { + input: SendableRecordBatchStream, + config: Arc, + k: usize, + schema: SchemaRef, + /// The in-flight filter for the batch currently being processed (the probe + /// is async, so a batch is filtered off-poll and resumed here). + pending: Option>>, + input_seen: usize, + kept: usize, + warned: bool, +} + +/// Keep only the rows no newer-gen membership contains. Async because flushed +/// generations are probed against their on-disk PK BTree. +async fn filter_batch(batch: RecordBatch, config: Arc) -> DFResult { + let FilterConfig { + pk_columns, + blocked, + } = config.as_ref(); + if blocked.is_empty() || batch.num_rows() == 0 { + return Ok(batch); + } + let pk_indices = resolve_pk_indices(&batch, pk_columns)?; + let to_df = |e: lance_core::Error| DataFusionError::Execution(e.to_string()); + + // One key per row, in the index key space. + let keys: Vec = (0..batch.num_rows()) + .map(|row| { + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + on_disk_pk_key(&values).map_err(to_df) + }) + .collect::>()?; + + // A row is dropped if any newer generation contains its key. Probe each + // generation once (batched) rather than once per row, narrowing to the + // still-live rows so an already-blocked row isn't re-probed against older + // generations. 
+ let mut blocked_row = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in blocked { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await.map_err(to_df)?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + blocked_row[row] = true; + } else { + next_live.push(row); + } + } + live = next_live; + } + + let keep = BooleanArray::from_iter(blocked_row.into_iter().map(|b| Some(!b))); + filter_record_batch(&batch, &keep).map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) +} + +impl Stream for PkBlockFilterStream { + type Item = DFResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + // Drive an in-flight filter to completion before pulling more input. + if let Some(fut) = this.pending.as_mut() { + return match fut.as_mut().poll(cx) { + Poll::Ready(Ok(out)) => { + this.pending = None; + this.kept += out.num_rows(); + Poll::Ready(Some(Ok(out))) + } + Poll::Ready(Err(e)) => { + this.pending = None; + Poll::Ready(Some(Err(e))) + } + Poll::Pending => Poll::Pending, + }; + } + + match this.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + this.input_seen += batch.num_rows(); + this.pending = Some(filter_batch(batch, this.config.clone()).boxed()); + // Loop to poll the just-created future. + } + Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))), + Poll::Ready(None) => { + // >= k candidates in, < k out: over-fetch missed superseded rows. + if !this.warned && this.input_seen >= this.k && this.kept < this.k { + warn!( + k = this.k, + fetched = this.input_seen, + kept = this.kept, + "LSM vector search: < k live rows survived the PK post-filter; \ + raise the over-fetch factor or use a true KNN prefilter." 
+ ); + this.warned = true; + } + return Poll::Ready(None); + } + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for PkBlockFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + fn int_batch(ids: &[i32]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An in-memory membership whose PK index holds `ids` (positions 0..n). + fn membership(ids: &[i32]) -> GenMembership { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = int_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + let max_visible_row = store.max_visible_row(index.max_visible_batch_position()); + GenMembership::InMemory { + index_store: Arc::new(index), + max_visible_row, + } + } + + async fn run(exec: PkBlockFilterExec) -> Vec { + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + out.iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect() + } + + #[tokio::test] + async fn drops_rows_blocked_by_a_newer_generation() { + let b = int_batch(&[10, 20, 30]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = + PkBlockFilterExec::new(input, 
vec!["id".to_string()], vec![membership(&[20])], 1); + assert_eq!(run(exec).await, vec![10, 30]); + } + + #[tokio::test] + async fn blocks_a_pk_present_in_any_generation() { + // Two newer-gen memberships: a row is dropped if either contains its PK. + let b = int_batch(&[10, 20, 30]); + let blocked = vec![membership(&[10]), membership(&[30])]; + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], blocked, 1); + assert_eq!(run(exec).await, vec![20]); + } + + #[tokio::test] + async fn empty_blocked_keeps_all_rows() { + let b = int_batch(&[1, 2, 3]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); + assert_eq!(run(exec).await, vec![1, 2, 3]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs deleted file mode 100644 index ee473047d01..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs +++ /dev/null @@ -1,350 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Drop superseded rows from a per-source KNN result by primary-key hash. -//! -//! Drops a row when its PK hash ([`super::compute_pk_hash`]) is in any `blocked` -//! set — the newer generations' membership (`Arc`, shared, never merged; -//! base table: all generations). Only the KNN output is hashed. -//! -//! Cross-generation only: within-gen duplicates share a hash, so the global -//! dedup's `(generation, freshness)` tiebreaker collapses those instead. -//! -//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns -//! when a source had >= k candidates but < k survived (over-fetch too small). 
- -use std::any::Any; -use std::collections::HashSet; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow::compute::filter_record_batch; -use arrow_array::{BooleanArray, RecordBatch}; -use arrow_schema::SchemaRef; -use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt}; -use tracing::warn; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Filters out rows whose PK hash is in any set of `blocked`. -#[derive(Debug)] -pub struct PkHashFilterExec { - input: Arc, - pk_columns: Vec, - /// Newer generations' membership; a row is blocked if any set holds its hash. - blocked: Vec>>, - /// Target neighbor count, used only to warn on a per-source under-fetch. - k: usize, - properties: Arc, -} - -impl PkHashFilterExec { - pub fn new( - input: Arc, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - ) -> Self { - // A filter preserves the input schema and partitioning. 
- let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(input.schema()), - input.output_partitioning().clone(), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - blocked, - k, - properties, - } - } -} - -impl DisplayAs for PkHashFilterExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - let total: usize = self.blocked.iter().map(|s| s.len()).sum(); - write!( - f, - "PkHashFilterExec: pk_cols=[{}], gens={}, blocked={}", - self.pk_columns.join(", "), - self.blocked.len(), - total, - ) - } - } - } -} - -impl ExecutionPlan for PkHashFilterExec { - fn name(&self) -> &str { - "PkHashFilterExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input.schema() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(DataFusionError::Internal( - "PkHashFilterExec requires exactly one child".to_string(), - )); - } - Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.blocked.clone(), - self.k, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(PkHashFilterStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - blocked: self.blocked.clone(), - k: self.k, - schema: self.schema(), - input_seen: 0, - kept: 0, - warned: false, - })) - } -} - -struct PkHashFilterStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - schema: SchemaRef, - input_seen: usize, - kept: usize, - warned: bool, -} - -impl PkHashFilterStream { - fn filter_batch(&self, batch: 
RecordBatch) -> DFResult { - if self.blocked.is_empty() || batch.num_rows() == 0 { - return Ok(batch); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let keep: BooleanArray = (0..batch.num_rows()) - .map(|row| { - let hash = compute_pk_hash(&batch, &pk_indices, row); - !self.blocked.iter().any(|set| set.contains(&hash)) - }) - .collect(); - filter_record_batch(&batch, &keep) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for PkHashFilterStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match self.input.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(batch))) => { - self.input_seen += batch.num_rows(); - match self.filter_batch(batch) { - Ok(out) => { - self.kept += out.num_rows(); - Poll::Ready(Some(Ok(out))) - } - Err(e) => Poll::Ready(Some(Err(e))), - } - } - Poll::Ready(None) => { - // >= k candidates in, < k out: the over-fetch missed superseded rows. - if !self.warned && self.input_seen >= self.k && self.kept < self.k { - warn!( - k = self.k, - fetched = self.input_seen, - kept = self.kept, - "LSM vector search: < k live rows survived the PK-hash post-filter; \ - raise the over-fetch factor or use a true KNN prefilter." - ); - self.warned = true; - } - Poll::Ready(None) - } - other => other, - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for PkHashFilterStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - /// Hash a single-column Int32 PK value the way the exec does, so a test can - /// build blocked sets from values rather than hand-computed hashes. 
- fn hash_int_pk(id: i32) -> u64 { - let batch = int_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - fn int_batch(ids: &[i32]) -> RecordBatch { - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() - } - - fn blocked(ids: &[i32]) -> Vec>> { - vec![Arc::new(ids.iter().map(|&id| hash_int_pk(id)).collect())] - } - - async fn run(exec: PkHashFilterExec) -> Vec { - let ctx = SessionContext::new(); - let out: Vec = exec - .execute(0, ctx.task_ctx()) - .unwrap() - .try_collect() - .await - .unwrap(); - out.iter() - .flat_map(|b| { - b.column_by_name("id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .values() - .to_vec() - }) - .collect() - } - - #[tokio::test] - async fn drops_rows_with_blocked_pk_hash() { - let b = int_batch(&[10, 20, 30]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], blocked(&[20]), 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn blocks_a_pk_present_in_any_generation_set() { - // Two newer-gen sets: a row is dropped if either contains its PK. 
- let b = int_batch(&[10, 20, 30]); - let sets = vec![ - Arc::new(HashSet::from([hash_int_pk(10)])), - Arc::new(HashSet::from([hash_int_pk(30)])), - ]; - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], sets, 1); - assert_eq!(run(exec).await, vec![20]); - } - - #[tokio::test] - async fn empty_blocked_keeps_all_rows() { - let b = int_batch(&[1, 2, 3]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); - assert_eq!(run(exec).await, vec![1, 2, 3]); - } - - #[tokio::test] - async fn null_pk_is_hashed_consistently_and_blockable() { - // A null PK hashes deterministically (compute_pk_hash hashes is_null), - // so a superseded null-key row can be dropped like any other. - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)])); - let with_null = |ids: Vec>| { - RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap() - }; - let pk = vec!["id".to_string()]; - let null_row = with_null(vec![None]); - let pk_indices = resolve_pk_indices(&null_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &null_row, - &pk_indices, - 0, - )]))]; - - // Rows: 10, NULL, 30 — only the NULL-key row is dropped. - let b = with_null(vec![Some(10), None, Some(30)]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn composite_pk_hash_matches_block_set() { - // Composite PK (id, name): block the (2, "b") tuple only. 
- let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); - let mk = |ids: &[i32], names: &[&str]| { - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - ], - ) - .unwrap() - }; - let pk = vec!["id".to_string(), "name".to_string()]; - let one_row = mk(&[2], &["b"]); - let pk_indices = resolve_pk_indices(&one_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &one_row, - &pk_indices, - 0, - )]))]; - - // (1,"a") and (2,"a") survive; only the exact (2,"b") tuple is dropped. - let b = mk(&[1, 2, 2], &["a", "a", "b"]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![1, 2]); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs deleted file mode 100644 index be5dae6a668..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs +++ /dev/null @@ -1,432 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! WithinSourceDedupExec - Deduplicates rows with the same primary key from a -//! single LSM source, keeping the newest insert. -//! -//! In MemWAL/LSM mode the same primary key can be written multiple times into -//! the same memtable. The active memtable stores rows in insert order (larger -//! `_rowaddr` = newer), while flushed memtables are reverse-written so that -//! within a flushed file the smallest `_rowid` is the newest insert (see -//! `memtable/flush.rs:152` and `hnsw/storage.rs:307`). Point lookup uses this -//! node to collapse such duplicates *within a single source* so that the -//! 
downstream `CoalesceFirstExec` / `LIMIT` sees at most one row per primary -//! key per source. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow_array::{Array, RecordBatch, UInt64Array}; -use arrow_schema::SchemaRef; -use datafusion::error::Result as DFResult; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt, ready}; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Among rows that share a primary key, which row-address extreme identifies -/// the newest insert to keep. The kept row is always the freshest; only the -/// row address (`_rowaddr`/`_rowid`) used to find it differs by source. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DedupDirection { - /// Keep the row with the largest row-address value (active memtable: larger - /// `_rowaddr` = inserted later). - KeepMaxRowAddr, - /// Keep the row with the smallest row-address value (flushed memtable under - /// reverse-write: smaller `_rowid` = inserted later). - KeepMinRowAddr, -} - -/// Deduplicates rows from a single source by primary key, keeping the row -/// whose `row_addr_column` value wins per [`DedupDirection`]. -/// -/// # Required columns -/// -/// The input must expose: -/// - All `pk_columns` -/// - `row_addr_column` of `UInt64` type -/// -/// The output schema is unchanged from the input. Callers that need to hide -/// the row-address column from downstream consumers should compose this node -/// with `project_to_canonical` or `null_columns`. -/// -/// # Performance -/// -/// Memory: `O(unique primary keys in input)`. 
For point lookup the input is -/// already filtered to a single primary key so the map holds at most one -/// entry. -#[derive(Debug)] -pub struct WithinSourceDedupExec { - input: Arc, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - properties: Arc, -} - -impl WithinSourceDedupExec { - pub fn new( - input: Arc, - pk_columns: Vec, - row_addr_column: impl Into, - direction: DedupDirection, - ) -> Self { - let schema = input.schema(); - let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema.clone()), - Partitioning::UnknownPartitioning(1), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - row_addr_column: row_addr_column.into(), - direction, - schema, - properties, - } - } - - pub fn pk_columns(&self) -> &[String] { - &self.pk_columns - } - - pub fn row_addr_column(&self) -> &str { - &self.row_addr_column - } - - pub fn direction(&self) -> DedupDirection { - self.direction - } -} - -impl DisplayAs for WithinSourceDedupExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - write!( - f, - "WithinSourceDedupExec: pk=[{}], row_addr={}, direction={:?}", - self.pk_columns.join(", "), - self.row_addr_column, - self.direction, - ) - } - } - } -} - -impl ExecutionPlan for WithinSourceDedupExec { - fn name(&self) -> &str { - "WithinSourceDedupExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(datafusion::error::DataFusionError::Internal( - "WithinSourceDedupExec requires exactly one child".to_string(), - )); - } - 
Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.row_addr_column.clone(), - self.direction, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(WithinSourceDedupStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - row_addr_column: self.row_addr_column.clone(), - direction: self.direction, - schema: self.schema.clone(), - winners: HashMap::new(), - emitted: false, - })) - } -} - -/// One winning row, materialized as a single-row `RecordBatch` so we don't -/// have to keep the source batch alive after we've picked the winner. -struct Winner { - batch: RecordBatch, - row_addr: u64, -} - -struct WithinSourceDedupStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - winners: HashMap, - emitted: bool, -} - -impl WithinSourceDedupStream { - fn consume_batch(&mut self, batch: RecordBatch) -> DFResult<()> { - if batch.num_rows() == 0 { - return Ok(()); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let row_addr_array = batch - .column_by_name(&self.row_addr_column) - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' not found in batch", - self.row_addr_column - )) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' is not UInt64", - self.row_addr_column - )) - })?; - - for row_idx in 0..batch.num_rows() { - if row_addr_array.is_null(row_idx) { - // A NULL row address can't be ordered against a real one. Skip - // rather than guess — callers should always project a real - // row-address column for dedup-eligible sources. 
- continue; - } - let row_addr = row_addr_array.value(row_idx); - let pk_hash = compute_pk_hash(&batch, &pk_indices, row_idx); - - let take_row = match self.winners.get(&pk_hash) { - None => true, - Some(existing) => match self.direction { - DedupDirection::KeepMaxRowAddr => row_addr > existing.row_addr, - DedupDirection::KeepMinRowAddr => row_addr < existing.row_addr, - }, - }; - - if take_row { - let single = batch.slice(row_idx, 1); - self.winners.insert( - pk_hash, - Winner { - batch: single, - row_addr, - }, - ); - } - } - Ok(()) - } - - fn finalize(&mut self) -> DFResult { - if self.winners.is_empty() { - return Ok(RecordBatch::new_empty(self.schema.clone())); - } - let batches: Vec = self.winners.drain().map(|(_, w)| w.batch).collect(); - let batch_refs: Vec<&RecordBatch> = batches.iter().collect(); - arrow_select::concat::concat_batches(&self.schema, batch_refs) - .map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for WithinSourceDedupStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - loop { - if self.emitted { - return Poll::Ready(None); - } - match ready!(self.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - if let Err(e) = self.consume_batch(batch) { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - } - Some(Err(e)) => { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - None => { - self.emitted = true; - return Poll::Ready(Some(self.finalize())); - } - } - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for WithinSourceDedupStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Float32Array, Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - fn create_test_schema() -> 
SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("_distance", DataType::Float32, true), - Field::new("_row_addr", DataType::UInt64, true), - ])) - } - - fn batch(ids: &[i32], names: &[&str], distances: &[f32], row_addr: &[u64]) -> RecordBatch { - let schema = create_test_schema(); - RecordBatch::try_new( - schema, - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - Arc::new(Float32Array::from(distances.to_vec())), - Arc::new(UInt64Array::from(row_addr.to_vec())), - ], - ) - .unwrap() - } - - async fn run(batches: Vec, direction: DedupDirection) -> Vec { - let schema = create_test_schema(); - let input = TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap(); - let exec = - WithinSourceDedupExec::new(input, vec!["id".to_string()], "_row_addr", direction); - let ctx = SessionContext::new(); - let stream = exec.execute(0, ctx.task_ctx()).unwrap(); - stream.try_collect().await.unwrap() - } - - fn extract(batches: &[RecordBatch]) -> Vec<(i32, String, u64)> { - let mut out = Vec::new(); - for b in batches { - let ids = b.column(0).as_any().downcast_ref::().unwrap(); - let names = b.column(1).as_any().downcast_ref::().unwrap(); - let addr = b.column(3).as_any().downcast_ref::().unwrap(); - for i in 0..b.num_rows() { - out.push((ids.value(i), names.value(i).to_string(), addr.value(i))); - } - } - out.sort_by_key(|(id, _, _)| *id); - out - } - - #[tokio::test] - async fn keep_max_picks_largest_row_addr() { - // Active-memtable case: same pk inserted twice; newer = larger _rowaddr. 
- let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[10, 99, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 99)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn keep_min_picks_smallest_row_addr() { - // Flushed-memtable case under reverse-write: newer = smaller _rowid. - let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[99, 10, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMinRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 10)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn dedup_across_batches() { - let b1 = batch(&[1, 2], &["a", "b"], &[0.1, 0.2], &[1, 1]); - let b2 = batch(&[1, 3], &["a_new", "c"], &[0.5, 0.4], &[7, 1]); - let out = run(vec![b1, b2], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 3); - assert_eq!(rows[0], (1, "a_new".to_string(), 7)); - assert_eq!(rows[1], (2, "b".to_string(), 1)); - assert_eq!(rows[2], (3, "c".to_string(), 1)); - } - - #[tokio::test] - async fn empty_input() { - let out = run(vec![], DedupDirection::KeepMaxRowAddr).await; - let total: usize = out.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 0); - } - - #[tokio::test] - async fn null_row_addr_skipped() { - // Rows with NULL row address can't be ordered — they're dropped so they - // don't accidentally become winners against real values. 
- let schema = create_test_schema(); - let b = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 1])), - Arc::new(StringArray::from(vec!["nulladdr", "real"])), - Arc::new(Float32Array::from(vec![0.1, 0.2])), - Arc::new(UInt64Array::from(vec![None, Some(5)])), - ], - ) - .unwrap(); - let out = run(vec![b], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0], (1, "real".to_string(), 5)); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs index 39abf7e8c71..7a5280bedb8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs @@ -22,6 +22,7 @@ use std::collections::HashSet; use std::sync::Arc; +use async_trait::async_trait; use lance_core::{Error, Result}; use crate::dataset::{Dataset, DatasetBuilder}; @@ -41,12 +42,10 @@ use crate::session::Session; pub struct FlushedMemTableCache { // `moka`'s async cache gives a bounded size plus single-flight // `try_get_with`, so concurrent first-queries on a just-flushed - // generation open the dataset exactly once. + // generation open the dataset exactly once. The opened dataset carries the + // session index cache, which also backs each generation's standalone PK + // dedup index (see `block_list::open_pk_index`) — no separate cache path. inner: moka::future::Cache>, - // Per-generation set of PK hashes for the vector-search block-list, keyed by - // the same immutable flushed path. Built lazily on the first query that needs - // it (single-flight) so repeated searches skip re-scanning the PK column. - pk_hashes: moka::future::Cache>>, } impl FlushedMemTableCache { @@ -63,10 +62,6 @@ impl FlushedMemTableCache { // into at build time. 
.support_invalidation_closures() .build(), - pk_hashes: moka::future::Cache::builder() - .max_capacity(max_entries) - .support_invalidation_closures() - .build(), } } @@ -96,21 +91,6 @@ impl FlushedMemTableCache { .map_err(|e: Arc| Error::cloned(e.to_string())) } - /// Get the cached set of PK hashes for `path`, building it (exactly once) on - /// a miss via `build`. The flushed path is immutable, so a cached set is - /// never stale; concurrent first-queries share one build via `moka`'s - /// single-flight `try_get_with`. - pub async fn get_or_build_pk_hashes( - &self, - path: &str, - build: impl std::future::Future>>, - ) -> Result>> { - self.pk_hashes - .try_get_with(path.to_string(), async move { build.await.map(Arc::new) }) - .await - .map_err(|e: Arc| Error::cloned(e.to_string())) - } - /// Drop cached entries whose path is not in `live_paths`. /// /// Called by the consumer after compaction retires generations. Purely a @@ -125,10 +105,6 @@ impl FlushedMemTableCache { let _ = self .inner .invalidate_entries_if(move |path, _| !live.contains(path)); - let live = live_paths.clone(); - let _ = self - .pk_hashes - .invalidate_entries_if(move |path, _| !live.contains(path)); } } @@ -140,29 +116,92 @@ impl std::fmt::Debug for FlushedMemTableCache { } } +/// Caching of opened flushed-generation datasets, keyed by immutable path. The +/// opened dataset carries the session index cache, which also backs each +/// generation's secondary indexes and its PK dedup sidecar (see +/// `block_list::open_pk_index`) — so a single `get_or_open` is the +/// whole caching surface. Implemented by [`FlushedMemTableCache`]; a +/// [`GenerationWarmer`] composes one to warm through it, and a consumer may +/// supply its own implementation. +#[async_trait] +pub trait DatasetCache: Send + Sync + std::fmt::Debug { + async fn get_or_open(&self, path: &str, session: Option>) -> Result>; + + /// Drop cached entries whose path is not in `live_paths`. 
Async so an + /// implementation can evict retired generations' index objects (e.g. + /// `Session::invalidate_index_prefix`) without a later breaking signature + /// change; [`FlushedMemTableCache`]'s own eviction is synchronous. + async fn retain_paths(&self, live_paths: &HashSet); +} + +#[async_trait] +impl DatasetCache for FlushedMemTableCache { + async fn get_or_open(&self, path: &str, session: Option>) -> Result> { + Self::get_or_open(self, path, session).await + } + + async fn retain_paths(&self, live_paths: &HashSet) { + Self::retain_paths(self, live_paths) + } +} + +/// Proactively warms a flushed generation into the shared caches: open the +/// dataset and pre-load its secondary indexes and PK dedup sidecar so the first +/// query sees no cold reads. This is the **seam** the flush and read paths fire +/// — lance defines it; the consumer (e.g. the WAL pod) implements it. `None` => +/// no warming, generations warm lazily on first read. +/// +/// Everything a warmer touches is keyed by the immutable generation `path` +/// (opened dataset, its secondary indexes, its PK dedup sidecar), so `path` is +/// the only input it needs. +/// +/// `warm` is fired fire-and-forget from every read path that opens a generation +/// (all four LSM planners) as well as pre-commit on flush, so the same path may +/// be warmed concurrently and repeatedly. Implementations **must be idempotent +/// and cheap when the path is already warm** (e.g. dedup in-flight and +/// completed paths) — a redundant call must not re-do work or fail. +#[async_trait] +pub trait GenerationWarmer: Send + Sync + std::fmt::Debug { + async fn warm(&self, path: &str) -> Result<()>; +} + /// Open a flushed-generation dataset, shared by all three LSM open sites /// (scan, point lookup, vector search). /// -/// - `cache` present: route through [`FlushedMemTableCache`] (single-flight, -/// shared `Arc`, manifest read amortized across queries). +/// - `cache` present: route through a [`DatasetCache`] (e.g. 
+/// [`FlushedMemTableCache`]: single-flight, shared `Arc`, manifest read +/// amortized across queries). /// - `cache` absent: cold open via [`DatasetBuilder`]. Passing `session` /// still reuses the shared index / metadata caches; `None`/`None` /// reproduces the original per-query cold-open behavior exactly. +/// - `warmer` present: fire a fire-and-forget warm-on-open backstop behind the +/// returned handle (the warmer dedups already-warm paths). `None` => no warming. pub async fn open_flushed_dataset( path: &str, session: Option<&Arc>, - cache: Option<&Arc>, + cache: Option<&Arc>, + warmer: Option<&Arc>, ) -> Result> { - match cache { - Some(cache) => cache.get_or_open(path, session.cloned()).await, + let dataset = match cache { + Some(cache) => cache.get_or_open(path, session.cloned()).await?, None => { let mut builder = DatasetBuilder::from_uri(path); if let Some(session) = session { builder = builder.with_session(session.clone()); } - Ok(Arc::new(builder.load().await?)) + Arc::new(builder.load().await?) } + }; + if let Some(warmer) = warmer { + let warmer = Arc::clone(warmer); + let path = path.to_string(); + tokio::spawn(async move { + if let Err(e) = warmer.warm(&path).await { + tracing::debug!(generation = %path, error = %e, "warm-on-open failed"); + } + }); } + Ok(dataset) } #[cfg(test)] @@ -250,34 +289,6 @@ mod tests { assert_eq!(cache.inner.entry_count(), 1, "exactly one entry cached"); } - #[tokio::test] - async fn pk_hashes_cached_reuses_first_build() { - // The PK-hash set is keyed by the immutable flushed path: a hit returns - // the first-built set and never runs the second build closure. - let cache = FlushedMemTableCache::new(8); - let path = "memory://shard/gen_1"; - let first = cache - .get_or_build_pk_hashes(path, async { Ok(HashSet::from([1u64, 2])) }) - .await - .unwrap(); - let second = cache - .get_or_build_pk_hashes(path, async { - // Different contents; must be ignored because the path is cached. 
- Ok(HashSet::from([9u64])) - }) - .await - .unwrap(); - assert!( - Arc::ptr_eq(&first, &second), - "a PK-hash cache hit must reuse the first-built set" - ); - assert_eq!( - second.len(), - 2, - "cached set keeps the first build's contents" - ); - } - #[tokio::test] async fn test_retain_paths_drops_unreferenced() { let temp_dir = tempfile::tempdir().unwrap(); @@ -310,8 +321,8 @@ mod tests { let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); write_dataset(&uri, &[7, 8, 9]).await; - let a = open_flushed_dataset(&uri, None, None).await.unwrap(); - let b = open_flushed_dataset(&uri, None, None).await.unwrap(); + let a = open_flushed_dataset(&uri, None, None, None).await.unwrap(); + let b = open_flushed_dataset(&uri, None, None, None).await.unwrap(); assert!( !Arc::ptr_eq(&a, &b), "no-cache path must cold-open each call" @@ -319,13 +330,57 @@ mod tests { assert_eq!(a.count_rows(None).await.unwrap(), 3); // With a cache, the second call is a shared clone. - let cache = Arc::new(FlushedMemTableCache::new(8)); - let c = open_flushed_dataset(&uri, None, Some(&cache)) + let cache: Arc = Arc::new(FlushedMemTableCache::new(8)); + let c = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); - let d = open_flushed_dataset(&uri, None, Some(&cache)) + let d = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); assert!(Arc::ptr_eq(&c, &d), "cached path must reuse the Arc"); } + + /// A warmer that records calls and signals each one. 
+ #[derive(Debug)] + struct NotifyingWarmer { + calls: Arc, + notify: Arc, + } + + #[async_trait] + impl GenerationWarmer for NotifyingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, Ordering::SeqCst); + self.notify.notify_one(); + Ok(()) + } + } + + #[tokio::test] + async fn test_open_flushed_dataset_fires_warm_on_open() { + // The warm-on-open backstop fires the warmer (fire-and-forget) when a + // generation is opened, so generations the flusher never warmed still + // get warmed lazily on first read. + let temp_dir = tempfile::tempdir().unwrap(); + let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); + write_dataset(&uri, &[1, 2, 3]).await; + + let calls = Arc::new(AtomicUsize::new(0)); + let notify = Arc::new(tokio::sync::Notify::new()); + let warmer: Arc = Arc::new(NotifyingWarmer { + calls: calls.clone(), + notify: notify.clone(), + }); + + let ds = open_flushed_dataset(&uri, None, None, Some(&warmer)) + .await + .unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 3); + + // The warm is spawned fire-and-forget; wait (bounded) for it to run. 
+ tokio::time::timeout(std::time::Duration::from_secs(5), notify.notified()) + .await + .expect("warm-on-open must fire"); + assert_eq!(calls.load(Ordering::SeqCst), 1, "warmer fired once on open"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index e3ef44d8b1a..e7c8d205d5d 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -44,7 +44,7 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::union::UnionExec; -use lance_core::{Error, ROW_ID, Result, is_system_column}; +use lance_core::{Error, Result, is_system_column}; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery; use tracing::instrument; @@ -52,8 +52,8 @@ use tracing::instrument; use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, PkHashFilterExec, WithinSourceDedupExec}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{NewestPkFilterExec, PkBlockFilterExec}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; use crate::session::Session; @@ -76,7 +76,9 @@ pub struct LsmFtsSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Over-fetch multiple for blocked sources (clamped to `>= 1.0`). 
overfetch_factor: f64, } @@ -94,6 +96,7 @@ impl LsmFtsSearchPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, overfetch_factor: DEFAULT_OVERFETCH_FACTOR, } } @@ -114,11 +117,17 @@ impl LsmFtsSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Build the FTS execution plan (local scoring). /// /// # Arguments @@ -154,51 +163,57 @@ impl LsmFtsSearchPlanner { return self.empty_plan(&target_schema); } - // Per-source PK-hash block sets for cross-generation dedup (NEWER(G) - // per shard; base = union of all gens). Query-type-agnostic — same - // call the vector planner makes. `Box::pin` keeps the future off + // Per-source PK block sets for cross-generation dedup (NEWER(G) per + // shard; base = union of all gens). Query-type-agnostic — same call the + // vector planner makes. `Box::pin` keeps the future off // `clippy::large_futures`. let block_lists = Box::pin(compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) .await?; let overfetch = self.overfetch_factor.max(1.0); - let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); - for source in &sources { - let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); - let blocked = block_lists.get(&(source.shard_id(), source.generation())); - // Over-fetch a blocked source so the post-filter still yields k live - // rows. The active arm returns all matches (no builder limit), so its - // within-source dedup needs no over-fetch hint. 
- let fetch_k = if blocked.is_some() { - ((k as f64) * overfetch).ceil() as usize - } else { - k - }; - - let plan = self - .build_source_plan(source, column, &query, fetch_k, projection, is_active) - .await?; + // Stage the per-source over-fetch decisions, then build every source + // plan concurrently — the builds are independent and a sequential loop + // was the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists.get(&(source.shard_id(), source.generation())); + // Over-fetch a blocked source so the post-filter still yields k live + // rows. The active arm returns all matches (no builder limit), so its + // within-source dedup needs no over-fetch hint. + let fetch_k = if blocked.is_some() { + ((k as f64) * overfetch).ceil() as usize + } else { + k + }; + (source, is_active, blocked, fetch_k) + }) + .collect(); + let built = + futures::future::try_join_all(arm_inputs.iter().map(|(source, _, _, fetch_k)| { + Box::pin(self.build_source_plan(source, column, &query, *fetch_k, projection)) + })) + .await?; + let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); + for ((_, is_active, blocked, _), plan) in arm_inputs.iter().zip(built) { + let is_active = *is_active; + let blocked = *blocked; // Dedup, mirroring LsmVectorSearchPlanner: - // * active: collapse duplicate-PK appends to the newest insert - // (larger _rowid = inserted later). The FTS index is append-only, - // so an in-memtable update leaves both versions searchable. + // * active: already wrapped in `NewestPkFilterExec` inside + // `build_source_plan` (drops predicate-crossing stale hits, which a + // result-set dedup can't catch). // * flushed/base: drop rows superseded by a newer generation via the // block-list (within-gen is handled by the flushed deletion vector). 
let deduped = if is_active { - Arc::new(WithinSourceDedupExec::new( - plan, - self.pk_columns.clone(), - ROW_ID, - DedupDirection::KeepMaxRowAddr, - )) as Arc + plan } else if let Some(set) = blocked { - Arc::new(PkHashFilterExec::new( + Arc::new(PkBlockFilterExec::new( plan, self.pk_columns.clone(), set.clone(), @@ -219,8 +234,11 @@ impl LsmFtsSearchPlanner { per_source_plans.into_iter().next().unwrap() } else { #[allow(deprecated)] - let union: Arc = Arc::new(UnionExec::new(per_source_plans)); - union + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, + // so each arm's per-arm CPU (posting decode, BM25) runs on its own + // task without an extra repartition. + Arc::new(UnionExec::new(per_source_plans)) }; let score_idx = merged.schema().index_of(SCORE_COLUMN).map_err(|_| { @@ -263,7 +281,6 @@ impl LsmFtsSearchPlanner { query: &FullTextSearchQuery, k: usize, projection: Option<&[String]>, - emit_row_id: bool, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -278,9 +295,13 @@ impl LsmFtsSearchPlanner { scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; @@ -301,11 +322,12 @@ impl LsmFtsSearchPlanner { MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone()); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); - // Emit `_rowid` (row position) so the planner can collapse - // duplicate-PK appends via WithinSourceDedupExec before the union. 
- if emit_row_id { - scanner.with_row_id(); - } + // Expose the row position so the recency filter can identify the + // newest visible version of each PK. The append-only inverted + // index keeps an updated row's old postings live, so a stale hit + // can match a query the fresh row no longer does; the filter + // drops it. `project_to_canonical` strips `_rowid` afterward. + scanner.with_row_id(); // `MemTableScanner::full_text_search` takes a raw match // string; richer query shapes (phrase/boolean/fuzzy) can // be plumbed through once the MemTable scanner accepts a @@ -324,7 +346,19 @@ impl LsmFtsSearchPlanner { // today; the per-partition Sort+fetch above bounds the // emitted rows. let _ = k; - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Drop predicate-crossing stale hits: keep a hit iff it is the + // newest visible version of its PK (collapses duplicate-PK + // appends too — supersedes the old WithinSourceDedupExec). + let filtered: Arc = Arc::new(NewestPkFilterExec::new( + plan, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store.clone(), + batch_store.clone(), + scanner.max_visible_batch_position(), + )); + Ok(filtered) } } } @@ -478,6 +512,7 @@ mod tests { // Active memtable with its own FTS index, containing a matching row. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); let active_batch = make_batch( &schema, @@ -646,6 +681,7 @@ mod tests { let schema = fts_schema(); let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); // First append (positions 0,1): id=1 is the stale version of the PK. 
@@ -725,4 +761,88 @@ mod tests { "dedup must keep the newest (max row-position) version" ); } + + #[tokio::test] + async fn active_stale_update_predicate_crossing_leaks() { + // A PK update that crosses out of the match set: pk=1 inserted as + // "alpha lance", then updated to "beta lance". The append-only inverted + // index keeps the old "alpha" posting live, so an "alpha" search still + // matches the STALE pk=1 row — and the fresh "beta lance" row isn't even + // a candidate, so a result-set dedup has nothing to suppress it against. + // `NewestPkFilterExec` drops it predicate-independently: pk=1's newest + // visible row is "beta lance", so the "alpha" hit is not the newest. + let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // Insert pk=1 ("alpha lance") and an unrelated live pk=2 ("alpha foo"). + let b1 = make_batch(&schema, &[1, 2], &["alpha lance", "alpha foo"]); + let (bp1, off1, _) = batch_store.append(b1.clone()).unwrap(); + indexes + .insert_with_batch_position(&b1, off1, Some(bp1)) + .unwrap(); + + // Update pk=1 → "beta lance" (no longer matches "alpha"). 
+ let b2 = make_batch(&schema, &[1], &["beta lance"]); + let (bp2, off2, _) = batch_store.append(b2.clone()).unwrap(); + indexes + .insert_with_batch_position(&b2, off2, Some(bp2)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("alpha".to_string()), + 10, + None, + ) + .await + .expect("planner should produce a plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut ids: Vec = Vec::new(); + for b in &batches { + let col = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + ids.push(col.value(i)); + } + } + + assert!( + !ids.contains(&1), + "stale pk=1 (now 'beta lance') leaked on an 'alpha' search; got ids={ids:?}" + ); + assert!( + ids.contains(&2), + "live pk=2 ('alpha foo') must still match 'alpha'; got ids={ids:?}" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index f3f15e2e680..f040428f342 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -15,8 +15,8 @@ use tracing::instrument; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkHashFilterExec, 
ROW_ADDRESS_COLUMN}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, }; @@ -33,7 +33,13 @@ pub struct LsmScanPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, + /// Over-fetch multiple for the per-source limit pushdown: block-listed + /// sources scan `(offset + limit) * factor` rows so cross-gen dedup drops + /// still leave enough live rows. Clamped to `>= 1.0`. + overfetch_factor: f64, } impl LsmScanPlanner { @@ -49,6 +55,8 @@ impl LsmScanPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, + overfetch_factor: 1.0, } } @@ -61,11 +69,24 @@ impl LsmScanPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// queries against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + + /// Set the over-fetch multiple for the per-source limit pushdown + /// (see the field docs). Clamped to `>= 1.0` at use. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = factor; + self + } + /// Create scan plan with deduplication. 
/// /// # Arguments @@ -82,7 +103,7 @@ impl LsmScanPlanner { /// Each source is independently newest-per-PK (active via the fused /// [`MemTableDedupScanExec`](super::super::memtable::scanner), flushed via /// its within-generation deletion vector) and a cross-generation block-list - /// ([`PkHashFilterExec`]) drops any PK superseded by a newer generation. + /// ([`PkBlockFilterExec`]) drops any PK superseded by a newer generation. /// Each PK therefore survives in exactly one source, so a plain /// `UnionExec` carries at most one row per PK — no cross-source dedup, /// sort, or merge needed. `_memtable_gen` / `_rowaddr` are output-only and @@ -119,7 +140,6 @@ impl LsmScanPlanner { // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -130,23 +150,59 @@ impl LsmScanPlanner { // cross-gen block-list, not from output ordering. let sources: Vec<_> = sources.into_iter().rev().collect(); + // Per-source limit pushdown: an unordered LIMIT needs only + // `offset + limit` live rows from EACH source to fill the global + // limit after dedup (any-N semantics), so cap every on-disk source + // instead of scanning whole generations and trimming above the + // union. Block-listed sources over-fetch by `overfetch_factor` so + // cross-gen dedup drops still leave `n_needed` live rows; the + // PkBlockFilter warns when that was not enough. The active memtable + // is in-memory and within-gen append duplicates are resolved by its + // own dedup, so it is never capped here. + let n_needed = limit.map(|l| l.saturating_add(offset.unwrap_or(0))); + let overfetch = self.overfetch_factor.max(1.0); + let mut source_plans = Vec::new(); for source in sources { let is_base = matches!(source, LsmDataSource::BaseTable { .. 
}); - let scan = self.build_source_scan(&source, projection, filter).await?; + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists + .get(&(source.shard_id(), source.generation())) + .cloned(); + let fetch = match (n_needed, is_active) { + (Some(n), false) => Some(if blocked.is_some() { + ((n as f64) * overfetch).ceil() as usize + } else { + n + }), + _ => None, + }; + let scan = self + .build_source_scan(&source, projection, filter, fetch) + .await?; // Drop cross-generation stale rows (PKs superseded by a newer gen). - // `k = 0`: there is no top-k, so the under-fetch warning never fires. - let scan = match block_lists.get(&(source.shard_id(), source.generation())) { - Some(set) => Arc::new(PkHashFilterExec::new( + // With a limit, `k = n_needed` arms the under-fetch warning; with + // no limit `k = 0` keeps it silent. + let scan = match blocked { + Some(set) => Arc::new(PkBlockFilterExec::new( scan, self.pk_columns.clone(), - set.clone(), - 0, + set, + n_needed.unwrap_or(0), )) as Arc, None => scan, }; + // Post-block-list cap: each source contributes at most `n_needed` + // live rows toward the global limit. + let scan: Arc = match n_needed { + Some(n) if !is_active => Arc::new( + datafusion::physical_plan::limit::LocalLimitExec::new(scan, n), + ), + _ => scan, + }; + // When `_rowaddr` is surfaced, NULL it for non-base arms: only base // values are meaningful (e.g. for `take_rows`); per-source addresses // collide with base IDs. @@ -229,6 +285,7 @@ impl LsmScanPlanner { source: &LsmDataSource, projection: Option<&[String]>, filter: Option<&Expr>, + fetch: Option, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -247,13 +304,22 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown (post-filter rows): bounds the + // physical scan instead of trimming above the union. 
+ if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = @@ -264,6 +330,12 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown: flushed generations are + // within-gen live (dedup-on-flush deletion vectors), so any + // `fetch` post-filter rows are valid contributions. + if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } @@ -413,13 +485,36 @@ mod integration_tests { .unwrap() } - /// Create a dataset at the given URI with the provided batches. + /// Create a dataset at the given URI with the provided batches. Also writes + /// the standalone PK sidecar (on `id`) so a flushed-generation source can be + /// probed by the block-list; harmless for a base table (never probed). 
async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + if has_id { + super::super::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset + } + + /// Build an in-memory memtable's `(batch_store, index_store)` with the PK + /// index enabled and populated (mirrors production — the block-list needs + /// the PK index to dedup in-memory generations). + fn pk_indexed(batches: &[RecordBatch]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(100)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for b in batches { + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(b, off, Some(bp)).unwrap(); + } + (batch_store, Arc::new(index)) } /// Setup a multi-level LSM structure with: @@ -470,10 +565,8 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Create active memtable - let batch_store = Arc::new(BatchStore::with_capacity(100)); - let index_store = Arc::new(IndexStore::new()); - let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); - let _ = batch_store.append(active_batch); + let (batch_store, index_store) = + pk_indexed(&[create_test_batch(&schema, &[5, 6, 7], "active")]); let active_memtable = InMemoryMemTables { active: InMemoryMemTableRef { @@ -515,18 +608,18 @@ mod integration_tests { // Verify the plan (gen DESC order: active -> gen2 -> gen1 -> base): // - plain UnionExec at top // - active arm: MemTableDedupScanExec (newest gen, not block-listed) - // - older 
arms: PkHashFilterExec (cross-gen block-list) -> LanceRead + // - older arms: PkBlockFilterExec (cross-gen block-list) -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... CoalescePartitionsExec UnionExec MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -549,9 +642,9 @@ mod integration_tests { // Verify the plan with `_memtable_gen` tags (gen DESC order): // - plain UnionExec at top - // - each arm: MemtableGenTagExec -> (PkHashFilterExec ->) data source + // - each arm: MemtableGenTagExec -> (PkBlockFilterExec ->) data source // - gen3 (active): MemtableGenTagExec -> MemTableDedupScanExec - // - gen2/gen1/base: MemtableGenTagExec -> PkHashFilterExec -> LanceRead + // - gen2/gen1/base: MemtableGenTagExec -> PkBlockFilterExec -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... @@ -560,13 +653,13 @@ mod integration_tests { MemtableGenTagExec: gen=gen3 MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -647,14 +740,14 @@ mod integration_tests { } // base/gen1/gen2 all hold PKs superseded by a newer generation, so each - // is wrapped in a `PkHashFilterExec`; the newest (active) arm is not. 
+ // is wrapped in a `PkBlockFilterExec`; the newest (active) arm is not. let plan = scanner.create_plan().await.unwrap(); let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "filtered-read plan must apply the cross-gen block-list, got:\n{}", plan_str ); @@ -730,21 +823,21 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Frozen gen3 (sealed, NOT in the manifest) and active gen4. - let frozen_store = Arc::new(BatchStore::with_capacity(100)); - let _ = frozen_store.append(create_test_batch(&schema, &[6, 7], "frozen")); + let (frozen_store, frozen_index) = + pk_indexed(&[create_test_batch(&schema, &[6, 7], "frozen")]); let frozen = InMemoryMemTableRef { batch_store: frozen_store, - index_store: Arc::new(IndexStore::new()), + index_store: frozen_index, schema: schema.clone(), generation: 3, }; - let active_store = Arc::new(BatchStore::with_capacity(100)); - let _ = active_store.append(create_test_batch(&schema, &[7, 8], "active")); + let (active_store, active_index) = + pk_indexed(&[create_test_batch(&schema, &[7, 8], "active")]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store: active_store, - index_store: Arc::new(IndexStore::new()), + index_store: active_index, schema: schema.clone(), generation: 4, }, @@ -969,12 +1062,12 @@ mod integration_tests { ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... 
+ PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1037,14 +1130,14 @@ mod integration_tests { MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1113,6 +1206,8 @@ mod integration_tests { let mut index_store = IndexStore::new(); // Add BTree index on id column (field_id=0) index_store.add_btree("id_idx".to_string(), 0, "id".to_string()); + // Reuse it as the PK index so the block-list can dedup this generation. + index_store.enable_pk_index(&[("id".to_string(), 0)]); let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); let _ = batch_store.append(active_batch.clone()); @@ -1177,7 +1272,7 @@ mod integration_tests { // 1. Verify overall structure assert!(plan_str.contains("UnionExec"), "Should have UnionExec"); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "older generations should be block-list filtered" ); assert!( @@ -1365,7 +1460,6 @@ mod integration_tests { // Active memtable: id=10 inserted ("keep") then updated to NULL within // the same generation; id=20 ("active_20") is a control that matches. 
- let batch_store = Arc::new(BatchStore::with_capacity(16)); let active_batch = RecordBatch::try_new( schema.clone(), vec![ @@ -1378,12 +1472,12 @@ mod integration_tests { ], ) .unwrap(); - batch_store.append(active_batch).unwrap(); + let (batch_store, index_store) = pk_indexed(&[active_batch]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store, - index_store: Arc::new(IndexStore::new()), + index_store, schema: schema.clone(), generation: 1, }, diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index d1353e72dcc..2da4b5cd9a6 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -9,11 +9,14 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_array::{Array, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{SchemaRef, SortOptions}; use datafusion::common::ScalarValue; use datafusion::execution::TaskContext; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::limit::GlobalLimitExec; +use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{Expr, SessionContext}; use futures::TryStreamExt; @@ -27,11 +30,8 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{ - BloomFilterGuardExec, CoalesceFirstExec, DedupDirection, WithinSourceDedupExec, - compute_pk_hash_from_scalars, -}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use 
super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, wants_row_address, wants_row_id, @@ -87,7 +87,9 @@ pub struct LsmPointLookupPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Precomputed canonical output schema for the no-projection case, so the /// hot `lookup(.., None)` path clones an `Arc` instead of rebuilding the /// schema on every call. @@ -120,6 +122,7 @@ impl LsmPointLookupPlanner { bloom_filters: std::collections::HashMap::new(), session: None, flushed_cache: None, + warmer: None, none_target, task_ctx: SessionContext::new().task_ctx(), } @@ -137,11 +140,17 @@ impl LsmPointLookupPlanner { /// front during scan setup via /// [`DatasetMemWalExt::prewarm_mem_wal`](crate::dataset::mem_wal::DatasetMemWalExt::prewarm_mem_wal) /// so the first gen-key lookup does not pay the dataset open. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Add a bloom filter for a generation. /// /// Bloom filters are optional but improve performance by skipping @@ -546,9 +555,13 @@ impl LsmPointLookupPlanner { scanner.create_plan().await? } LsmDataSource::FlushedMemTable { path, .. 
} => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; scanner.filter_expr(filter.clone()); @@ -573,19 +586,29 @@ impl LsmPointLookupPlanner { // multiple rows sharing the target primary key. scanner.with_row_id(); let raw = scanner.create_plan().await?; - // Within the active memtable, larger `_rowid` = newer - // insert. After dedup there is exactly one row per PK. - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - raw, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); + // The filter already restricts to the exact PK value, so the + // scan yields that key's insert history. Within the active + // memtable larger `_rowid` = newer insert, so sorting `_rowid` + // DESC and keeping the first row picks the newest version — one + // row per (value-exact) PK. + let rowid_idx = raw.schema().index_of(lance_core::ROW_ID)?; + let ordering = LexOrdering::new(vec![PhysicalSortExpr { + expr: Arc::new(Column::new(lance_core::ROW_ID, rowid_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }]) + .ok_or_else(|| { + lance_core::Error::internal("point-lookup: failed to build _rowid ordering") + })?; + let newest: Arc = + Arc::new(SortExec::new(ordering, raw).with_fetch(Some(1))); // Per-source `_rowid` would collide with the base table's; // NULL it before canonicalization (the value is internal to // this arm). project_to_canonical drops it entirely when // the user didn't request `_rowid` in the projection. - null_columns(deduped, &[lance_core::ROW_ID])? + null_columns(newest, &[lance_core::ROW_ID])? 
} }; project_to_canonical(scan, &target) @@ -642,10 +665,6 @@ fn probe_position( pk_column: &str, pk_value: &ScalarValue, ) -> Result { - let Some(btree) = index_store.get_btree_by_column(pk_column) else { - return Ok(ProbePos::NoIndex); - }; - // Visible batches are the committed prefix [0, last_visible_idx]; each // `StoredBatch` carries its cumulative `row_offset`, so visibility and the // position→batch mapping are O(1)/O(log) with no per-probe allocation. @@ -661,22 +680,37 @@ fn probe_position( if visible_end == 0 { return Ok(ProbePos::Miss); } + let max_visible_row = visible_end - 1; - // Newest visible position of the key — a single seek-and-stop on the - // ordered skiplist (largest key ≤ (value, max_visible_row)). No range - // collect, no allocation. - let Some(pos) = btree.get_newest_visible(pk_value, visible_end - 1) else { + // A single-column primary key always has a value-keyed BTree (reused or + // auto-created — see `IndexStore::enable_pk_index`): collision-free, so one + // seek yields the answer with no re-check. Absent only when the table has no + // PK index, where the caller falls back to the plan path. + let Some(btree) = index_store.get_btree_by_column(pk_column) else { + return Ok(ProbePos::NoIndex); + }; + let Some(pos) = btree.get_newest_visible(pk_value, max_visible_row) else { return Ok(ProbePos::Miss); }; + let (batch_idx, row) = resolve_position(batch_store, last_visible_idx, pos)?; + Ok(ProbePos::Found { batch_idx, row }) +} - // Binary-search the owning batch by `row_offset` (appended in order). +/// Map a global row `position` to its `(batch_idx, row_in_batch)` by binary +/// searching the visible batch prefix on cumulative `row_offset` (batches are +/// appended in order). 
+fn resolve_position( + batch_store: &BatchStore, + last_visible_idx: usize, + position: u64, +) -> Result<(usize, usize)> { let (mut lo, mut hi) = (0usize, last_visible_idx); while lo < hi { let mid = lo + (hi - lo).div_ceil(2); let off = batch_store.get(mid).map(|b| b.row_offset).ok_or_else(|| { lance_core::Error::internal("point-lookup: batch index out of range during search") })?; - if off <= pos { + if off <= position { lo = mid; } else { hi = mid - 1; @@ -685,10 +719,7 @@ fn probe_position( let stored = batch_store .get(lo) .ok_or_else(|| lance_core::Error::internal("point-lookup: resolved batch missing"))?; - Ok(ProbePos::Found { - batch_idx: lo, - row: (pos - stored.row_offset) as usize, - }) + Ok((lo, (position - stored.row_offset) as usize)) } /// Gather `rows` from `batch_store`'s batch `batch_idx` into the `target` @@ -1097,8 +1128,8 @@ mod tests { // Regression: same primary key inserted twice into one active // memtable must return the *newest* row. The bug was that // `FilterExec → LIMIT 1` over an insert-ordered scan returned the - // first (oldest) match. `WithinSourceDedupExec` collapses by PK, - // keeping the row with the largest `_rowid` (insert order). + // first (oldest) match. The plan-path active arm now sorts `_rowid` + // DESC and keeps the first row (largest `_rowid` = newest insert). 
use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use futures::TryStreamExt; @@ -1118,17 +1149,17 @@ mod tests { let b_old = create_test_batch(&schema, &[1], "old"); let b_new = create_test_batch(&schema, &[1], "new"); let b_other = create_test_batch(&schema, &[2], "two"); - let (_, _, bp_old) = batch_store.append(b_old.clone()).unwrap(); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); index_store - .insert_with_batch_position(&b_old, 0, Some(bp_old)) + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) .unwrap(); - let (_, _, bp_new) = batch_store.append(b_new.clone()).unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); index_store - .insert_with_batch_position(&b_new, 1, Some(bp_new)) + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) .unwrap(); - let (_, _, bp_other) = batch_store.append(b_other.clone()).unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); index_store - .insert_with_batch_position(&b_other, 2, Some(bp_other)) + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) .unwrap(); let index_store = Arc::new(index_store); @@ -1168,6 +1199,88 @@ mod tests { ); } + #[tokio::test] + async fn test_point_lookup_probes_auto_created_pk_btree() { + // No user `add_btree` on the PK column — only `enable_pk_index`, which + // auto-creates a BTree on the primary key (the production default). The + // fast probe must resolve the newest visible version through that + // collision-free BTree rather than falling back to the plan path. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + + let schema = create_pk_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + // No `add_btree` — `enable_pk_index` auto-creates the PK BTree. + index_store.enable_pk_index(&[("id".to_string(), 0)]); + + // pk=1 written twice (the newer second), plus an unrelated pk=2. + let b_old = create_test_batch(&schema, &[1], "old"); + let b_new = create_test_batch(&schema, &[1], "new"); + let b_other = create_test_batch(&schema, &[2], "two"); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) + .unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) + .unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + let planner = LsmPointLookupPlanner::new(collector, vec!["id".to_string()], schema); + + // `lookup` takes the fast probe path (single-column PK, no system cols). 
+ let hit = planner + .lookup(&[ScalarValue::Int32(Some(1))], None) + .await + .unwrap() + .expect("pk=1 must be found via the PK-position index probe"); + assert_eq!(hit.num_rows(), 1); + let name = hit + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + name.value(0), + "new_1", + "probe must return the newest version" + ); + + // An absent key resolves to None (no on-disk sources to consult). + assert!( + planner + .lookup(&[ScalarValue::Int32(Some(999))], None) + .await + .unwrap() + .is_none(), + "absent key must miss" + ); + } + #[tokio::test] async fn test_point_lookup_flushed_memtable_returns_newest_duplicate() { // Regression / invariant pin: when a flushed memtable contains two diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index b6b1f952b25..7f849f3d8bf 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -27,8 +27,7 @@ use crate::io::exec::TakeExec; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, WithinSourceDedupExec}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, wants_row_id, @@ -38,10 +37,12 @@ use crate::session::Session; /// Plans vector search queries over LSM data. /// /// Each source is independently newest-per-PK before the union — the active -/// memtable via an over-fetched KNN + within-source dedup, flushed generations -/// via their within-generation deletion vector — and the cross-generation -/// block-list ([`super::exec::PkHashFilterExec`]) drops any PK superseded by a -/// newer generation. 
So each PK reaches the union from exactly one source and a +/// memtable via an over-fetched KNN + a newest-per-PK recency filter +/// ([`super::exec::NewestPkFilterExec`], which drops a hit that isn't the newest +/// visible version of its PK), flushed generations via their within-generation +/// deletion vector — and the cross-generation block-list +/// ([`super::exec::PkBlockFilterExec`]) drops any PK superseded by a newer +/// generation. So each PK reaches the union from exactly one source and a /// distance-ordered merge yields the global top-k; no cross-source dedup is /// needed. /// @@ -54,15 +55,15 @@ use crate::session::Session; /// UnionExec /// ProjectionExec (canonical output schema) /// SortExec(_distance, fetch=k) -/// WithinSourceDedupExec: KeepMaxRowAddr (active) +/// NewestPkFilterExec: newest-per-PK recency (active) /// KNNExec: active memtable, fetch=ceil(k*overfetch) /// ProjectionExec (canonical output schema) /// ProjectionExec (null_columns _rowid) -/// PkHashFilterExec: block-list (flushed) +/// PkBlockFilterExec: block-list (flushed) /// KNNExec: flushed gen N, fetch=ceil(k*overfetch) (fast_search) /// … one per flushed gen … /// ProjectionExec (canonical output schema) -/// PkHashFilterExec: block-list (base) +/// PkBlockFilterExec: block-list (base) /// KNNExec: base table, k (fast_search)[.refine()?] /// ``` /// @@ -92,7 +93,9 @@ pub struct LsmVectorSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. 
+ warmer: Option>, } impl LsmVectorSearchPlanner { @@ -121,6 +124,7 @@ impl LsmVectorSearchPlanner { dataset: None, session: None, flushed_cache: None, + warmer: None, } } @@ -133,11 +137,17 @@ impl LsmVectorSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Set the base dataset for post-rerank take. /// /// After global PK dedup and sort, a `TakeExec` against this dataset @@ -168,7 +178,7 @@ impl LsmVectorSearchPlanner { /// the rows that filtering drops: /// /// - `factor < 1.0` (e.g. `0.0`): **stale filtering off.** The per-source - /// block-list / [`super::exec::PkHashFilterExec`] is not built or applied, + /// block-list / [`super::exec::PkBlockFilterExec`] is not built or applied, /// so rows superseded by a newer generation can surface. The global PK /// dedup still runs, so it still suppresses stale copies in the cases /// where both the stale and the fresh row reach it. @@ -210,11 +220,10 @@ impl LsmVectorSearchPlanner { // live candidates after the post-filter. let overfetch_factor = overfetch_factor.max(1.0); - // Per-source PK-hash block sets (`NEWER(G)`; base = union of all gens). + // Per-source PK block sets (`NEWER(G)`; base = union of all gens). // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -233,49 +242,83 @@ impl LsmVectorSearchPlanner { // `block_lists` is non-empty exactly when a newer generation exists. 
let refine_base = refine_base_table || !block_lists.is_empty(); + // Stage per-source over-fetch decisions, then build every KNN plan + // concurrently — the builds are independent and a sequential loop was + // the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let generation = source.generation(); + let is_base = matches!(source, LsmDataSource::BaseTable { .. }); + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + // Over-fetch when the post-source filter can drop candidates: a + // blocked source loses superseded rows; the active source's + // within-source dedup collapses duplicate-PK HNSW nodes. Block + // lookup is per shard — generations are per-shard. + let blocked = block_lists.get(&(source.shard_id(), generation)); + let fetch_k = if blocked.is_some() || is_active { + ((k as f64) * overfetch_factor).ceil() as usize + } else { + k + }; + (source, is_base, is_active, blocked, fetch_k) + }) + .collect(); + let built = futures::future::try_join_all(arm_inputs.iter().map( + |(source, is_base, _, _, fetch_k)| { + Box::pin(self.build_knn_plan( + source, + query_vector, + *fetch_k, + nprobes, + projection, + *is_base && refine_base, + )) + }, + )) + .await?; + let mut knn_plans = Vec::new(); - for source in &sources { - let generation = source.generation(); - let is_base = matches!(source, LsmDataSource::BaseTable { .. }); - let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); - // Over-fetch when the post-source filter can drop candidates: a - // blocked source loses superseded rows; the active source's - // within-source dedup collapses duplicate-PK HNSW nodes. Block - // lookup is per shard — generations are per-shard. 
- let blocked = block_lists.get(&(source.shard_id(), generation)); - let fetch_k = if blocked.is_some() || is_active { - ((k as f64) * overfetch_factor).ceil() as usize - } else { - k - }; - let knn = Box::pin(self.build_knn_plan( - source, - query_vector, - fetch_k, - nprobes, - projection, - is_base && refine_base, - )) - .await?; + // `build_knn_plan` returns each active arm's max-visible snapshot + // alongside its plan; the active arm's NewestPkFilterExec needs both it + // and `source` (for the batch/index stores), so neither is discarded. + for ((source, is_base, is_active, blocked, _), (knn, active_max_visible)) in + arm_inputs.iter().zip(built) + { + let is_base = *is_base; + let is_active = *is_active; + let blocked = *blocked; // Make each source independently newest-per-PK before the union: // * active: the append-only HNSW returns one node per inserted - // version, so collapse duplicate PKs to the newest insert - // (KeepMaxRowAddr on `_rowid`) and re-sort by distance. This - // stays probabilistic — a fresh version evicted from the - // over-fetched top-k still leaks. + // version *and* leaves stale versions of updated PKs live. The + // recency filter keeps only the hit that is the newest visible + // version of its PK (per the maintained MVCC PK-position index), + // closing the predicate-crossing stale read, then re-sort by + // distance. // * flushed/base: drop cross-gen superseded rows via the // block-list (within-gen is handled by the flushed DV). let knn = if is_active { - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - knn, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); - sort_by_distance(deduped, k)? + let (batch_store, index_store) = match source { + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. 
+ } => (batch_store.clone(), index_store.clone()), + _ => unreachable!("is_active implies ActiveMemTable"), + }; + let filtered: Arc = + Arc::new(super::exec::NewestPkFilterExec::new( + knn, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store, + batch_store, + active_max_visible.expect("active arm returns its max_visible snapshot"), + )); + sort_by_distance(filtered, k)? } else { match blocked { - Some(set) => Arc::new(super::exec::PkHashFilterExec::new( + Some(set) => Arc::new(super::exec::PkBlockFilterExec::new( knn, self.pk_columns.clone(), set.clone(), @@ -301,6 +344,10 @@ impl LsmVectorSearchPlanner { // No cross-source dedup needed (see struct doc): SortExec(per partition) // + SortPreservingMerge does the p-way distance-ordered top-k merge. #[allow(deprecated)] + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, so + // each arm's per-arm CPU (HNSW search, distance refine) runs on its own + // task without an extra repartition. let merged: Arc = Arc::new(UnionExec::new(knn_plans)); let distance_idx = merged.schema().index_of(DISTANCE_COLUMN).map_err(|_| { @@ -364,11 +411,15 @@ impl LsmVectorSearchPlanner { merged_sorted }; - // Under-fetch is warned per-source inside `PkHashFilterExec`. + // Under-fetch is warned per-source inside `PkBlockFilterExec`. Ok(result) } /// Build KNN plan for a single data source. + /// + /// Returns the plan and, for the active memtable, the `max_visible_batch_position` + /// snapshot its scanner latched — threaded into the recency filter so it keys + /// on the same snapshot the search saw (`None` for base / flushed sources). 
async fn build_knn_plan( &self, source: &LsmDataSource, @@ -377,7 +428,7 @@ impl LsmVectorSearchPlanner { nprobes: usize, projection: Option<&[String]>, refine: bool, - ) -> Result> { + ) -> Result<(Arc, Option)> { match source { LsmDataSource::BaseTable { dataset } => { let mut scanner = dataset.scan(); @@ -402,12 +453,16 @@ impl LsmVectorSearchPlanner { if refine { scanner.refine(1); } - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = build_scanner_projection(projection, &self.base_schema, &self.pk_columns); @@ -418,7 +473,7 @@ impl LsmVectorSearchPlanner { scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); scanner.fast_search(); - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::ActiveMemTable { batch_store, @@ -436,8 +491,8 @@ impl LsmVectorSearchPlanner { build_scanner_projection(projection, &self.base_schema, &self.pk_columns); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); // Expose `_rowid` (BatchStore row offset, monotonic with - // insert order) so [`WithinSourceDedupExec`] can collapse - // duplicate-PK rows to the newest insert. The value is + // insert order) so `NewestPkFilterExec` can compare each hit's + // position against the PK-position index. The value is // per-source and NULL'd before reaching the canonical merge. 
// (VectorIndexExec only plumbs `with_row_id`, not // `with_row_address`, but the two yield identical values @@ -447,7 +502,9 @@ impl LsmVectorSearchPlanner { scanner.nearest(&self.vector_column, query_arr, k); scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Capture the scanner's own latched snapshot for the recency filter. + Ok((plan, Some(scanner.max_visible_batch_position()))) } } } @@ -567,10 +624,19 @@ mod tests { async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + // Also write the standalone PK sidecar (on `id`) so a flushed-generation + // source can be probed by the block-list (harmless for a base table). + if has_id { + crate::dataset::mem_wal::scanner::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset } #[tokio::test] @@ -641,6 +707,7 @@ mod tests { // Active memtable with HNSW index over the "vector" column. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -759,6 +826,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -838,6 +906,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -951,6 +1020,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1007,8 +1077,7 @@ mod tests { plan_str ); assert!( - plan_str.contains("WithinSourceDedupExec") - && plan_str.contains("SortPreservingMergeExec"), + plan_str.contains("NewestPkFilterExec") && plan_str.contains("SortPreservingMergeExec"), "expected per-arm dedup + distance merge, got:\n{}", plan_str ); @@ -1091,6 +1160,7 @@ mod tests { // "right" vector close to the query, plus an unrelated pk=2. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1210,6 +1280,7 @@ mod tests { // Active memtable: id=3 with HNSW index. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1436,9 +1507,9 @@ mod tests { #[tokio::test] async fn test_vector_search_dedup_within_active_memtable() { // Regression: same PK inserted twice into one active memtable with - // *different* vectors. HNSW indexes each as a distinct node, so - // without WithinSourceDedupExec a KNN can return both candidates - // for the same PK and pollute top-k. The newer insert must win. + // *different* vectors. HNSW indexes each as a distinct node, so without + // the recency filter a KNN can return both candidates for the same PK + // and pollute top-k. The newer insert must win. use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use datafusion::prelude::SessionContext; @@ -1450,6 +1521,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1513,14 +1585,14 @@ mod tests { .await .unwrap(); - // The active arm collapses duplicate-PK HNSW nodes itself via - // WithinSourceDedupExec — there is no cross-source dedup fallback. + // The active arm collapses duplicate-PK HNSW nodes itself via the + // recency filter — there is no cross-source dedup fallback. 
let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("WithinSourceDedupExec"), + plan_str.contains("NewestPkFilterExec"), "active vector arm must self-dedup, got:\n{}", plan_str ); @@ -1549,10 +1621,120 @@ mod tests { ); } + #[tokio::test] + async fn test_vector_search_active_stale_update_out_of_neighborhood() { + // BUG REPRODUCTION (vector case: a PK update that moves out of the neighborhood). + // + // Within a *single* active memtable, pk=1 is first inserted ON the query + // (distance ~0), then updated to a FAR vector. The append-only HNSW keeps + // both nodes live. A result-set dedup only collapses duplicate PKs that + // are BOTH present in the over-fetched candidate set. + // + // Here the fresh (far) pk=1 is evicted from the candidate set — there are + // enough nearer filler rows that it ranks below the fetch cutoff — so the + // dedup never sees it and the STALE near pk=1 leaks as the nearest hit. + // This is the predicate-crossing hole: the row that *would* suppress the + // stale version isn't in the result set, so result-set dedup can't help. + // + // Desired (NewestPkFilterExec) behaviour: pk=1's newest row-position is + // the far one, computed predicate-independently over the whole memtable, + // so the stale near node is dropped and pk=1 must NOT surface at ~0. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use datafusion::prelude::SessionContext; + use futures::TryStreamExt; + + let schema = create_vector_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); + index_store.add_hnsw( + "vector_hnsw".to_string(), + 1, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + 64, + 8, + ); + + // First append: stale pk=1 ON the query, plus five filler rows strictly + // farther than pk=1 but far nearer than the eventual fresh pk=1. + let q = [0.1, 0.2, 0.3, 0.4]; + let stale_then_fillers = batch_rows( + &schema, + &[ + (1, q), + (10, [0.11, 0.21, 0.31, 0.41]), + (11, [0.13, 0.23, 0.33, 0.43]), + (12, [0.15, 0.25, 0.35, 0.45]), + (13, [0.17, 0.27, 0.37, 0.47]), + (14, [0.19, 0.29, 0.39, 0.49]), + ], + ); + let (bp0, off0, _) = batch_store.append(stale_then_fillers.clone()).unwrap(); + index_store + .insert_with_batch_position(&stale_then_fillers, off0, Some(bp0)) + .unwrap(); + + // Second append: the UPDATE — pk=1 moved far from the query. This is the + // newest version (largest row position) but it sits well outside top-k. 
+ let fresh_pk1 = batch_rows(&schema, &[(1, [9.0, 9.0, 9.0, 9.0])]); + let (bp1, off1, _) = batch_store.append(fresh_pk1.clone()).unwrap(); + index_store + .insert_with_batch_position(&fresh_pk1, off1, Some(bp1)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = uuid::Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmVectorSearchPlanner::new( + collector, + vec!["id".to_string()], + schema, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + ); + + // k=3, no over-fetch: the candidate set is {pk1@near, two nearest + // fillers}; fresh pk1@far ranks 7th and never enters the candidates. + let query = create_query_vector(); + let plan = planner + .plan_search(&query, 3, 1, None, false, 1.0) + .await + .unwrap(); + let ctx = SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let rows = collect_id_dist(&batches); + + assert!( + !rows.iter().any(|&(id, d)| id == 1 && d.abs() < 1e-3), + "stale near pk=1 leaked: its live vector is far from the query, so it \ + must not appear at distance ~0. results={:?}", + rows + ); + } + #[tokio::test] async fn test_vector_search_stale_read_when_fresh_falls_out_of_top_k() { // Regression for the cross-generation stale-read gap that the - // PkHashFilterExec block-list closes. + // PkBlockFilterExec block-list closes. // // Scenario: // * Base (gen 0): stale pk=1 sitting on the query (distance ~0). @@ -1587,6 +1769,7 @@ mod tests { // active arm surfaces pk=2 and drops fresh pk=1. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1783,6 +1966,7 @@ mod tests { // Active (gen 1): pk 1,2,3 re-inserted with a far vector (the fresh value). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1987,6 +2171,7 @@ mod tests { // Active: (1,1) re-inserted far (fresh) + an unrelated nearby (2,2). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id1".to_string(), 0), ("id2".to_string(), 1)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -2091,6 +2276,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, diff --git a/rust/lance/src/dataset/mem_wal/util.rs b/rust/lance/src/dataset/mem_wal/util.rs index d1413b84b2a..3f5090f6b40 100644 --- a/rust/lance/src/dataset/mem_wal/util.rs +++ b/rust/lance/src/dataset/mem_wal/util.rs @@ -169,6 +169,16 @@ pub fn flushed_memtable_path( shard_base_path(base_path, shard_id).join(format!("{}_gen_{}", random_hash, generation)) } +/// Subdirectory of a flushed generation holding its standalone primary-key +/// dedup index (a sidecar BTree, not registered in the manifest). Both the +/// flush writer and the block-list probe join this onto the generation path. +pub const PK_INDEX_DIR: &str = "_pk_index"; + +/// Path to a flushed generation's standalone primary-key dedup index. +pub fn pk_index_path(gen_path: &Path) -> Path { + gen_path.clone().join(PK_INDEX_DIR) +} + /// Generate an 8-character random hex string for flushed MemTable directories. 
pub fn generate_random_hash() -> String { let bytes: [u8; 4] = rand::random(); diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 441da920b57..491bb68aec5 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -47,6 +47,7 @@ pub use super::util::{WatchableOnceCell, WatchableOnceCellReader}; pub use super::wal::{WalEntry, WalEntryData, WalFlushResult, WalFlusher}; use super::memtable::flush::TriggerMemTableFlush; +use super::scanner::GenerationWarmer; use super::wal::{ TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, empty_flush_result, }; @@ -177,6 +178,21 @@ pub struct ShardWriterConfig { /// Default: 60 seconds pub stats_log_interval: Option, + /// How long a frozen memtable lingers in memory after its flush commits, + /// before it is evicted and served only from the on-disk flushed dataset. + /// + /// `Duration::ZERO` (the default) disables retention: evict on commit, no + /// sweep ticker. Correct for single-shot queries, which can't observe a + /// generation evicted mid-read. + /// + /// A non-zero value is required only for queries split across reads (e.g. + /// fresh tier and base table read separately, then deduped): the flushed + /// dataset loses the per-batch boundaries that bound as-of membership + /// (see [`crate::dataset::mem_wal::scanner::FreshTierWatermark`]), so a + /// generation evicted between a query's reads can serve a stale row. Set it + /// above the worst-case multi-part query latency, with margin. + pub frozen_memtable_grace: Duration, + /// Whether to maintain an in-memory MemTable on top of the WAL. /// /// When `true` (default), the writer maintains an in-memory `MemTable`, @@ -216,6 +232,11 @@ pub struct ShardWriterConfig { /// /// Default: empty. pub hnsw_params: HashMap, + + /// Optional warmer fired pre-commit for each new generation (zero cold reads + /// on first query). 
Wired to the flusher; supplied by the consumer (e.g. the + /// WAL pod). Default: `None`. + pub warmer: Option>, } impl Default for ShardWriterConfig { @@ -236,8 +257,10 @@ impl Default for ShardWriterConfig { async_index_buffer_rows: 10_000, async_index_interval: Duration::from_secs(1), stats_log_interval: Some(Duration::from_secs(60)), // 1 minute + frozen_memtable_grace: Duration::ZERO, enable_memtable: true, hnsw_params: HashMap::new(), + warmer: None, } } } @@ -335,6 +358,13 @@ impl ShardWriterConfig { self } + /// Set how long a flushed memtable lingers in memory before eviction. MUST + /// exceed the maximum query elapsed time — see `frozen_memtable_grace`. + pub fn with_frozen_memtable_grace(mut self, grace: Duration) -> Self { + self.frozen_memtable_grace = grace; + self + } + /// Toggle the in-memory MemTable layer. See `enable_memtable` for the /// full WAL-only-mode contract. Defaults to `true`. pub fn with_enable_memtable(mut self, enable: bool) -> Self { @@ -708,6 +738,15 @@ pub struct WriteResult { pub batch_positions: std::ops::Range, } +/// A sealed memtable kept queryable in memory. `flushed_at_ms` is `None` while +/// the generation is still awaiting (or retrying) its flush, and `Some(t)` once +/// the flush commits — after which it lingers for `frozen_memtable_grace` so +/// in-flight as-of reads keep batch-resolved membership, then is swept. +struct FrozenMemTable { + memtable: Arc, + flushed_at_ms: Option, +} + /// ShardWriter state shared across tasks. struct WriterState { memtable: MemTable, @@ -716,12 +755,13 @@ struct WriterState { frozen_memtable_bytes: usize, /// Flush watchers for frozen memtables (for backpressure). frozen_flush_watchers: VecDeque<(usize, DurabilityWatcher)>, - /// Sealed-but-undrained memtables, kept queryable so a concurrent reader - /// sees no hole between `freeze_memtable` and the flush task's manifest - /// commit. 
Pushed in `freeze_memtable`; removed by generation in - /// `flush_memtable` on commit success only (retained on failure until a - /// later flush or WAL replay on reopen). - frozen_memtables: VecDeque>, + /// Sealed memtables, kept queryable so a concurrent reader sees no hole + /// between `freeze_memtable` and the flush task's manifest commit, and for + /// `frozen_memtable_grace` beyond it so as-of reads stay batch-resolved. + /// Pushed in `freeze_memtable`; stamped `flushed_at_ms` by `flush_memtable` + /// on commit success only (retained un-stamped on failure until a later + /// flush or WAL replay on reopen); swept after the grace by `SweepExpired`. + frozen_memtables: VecDeque, /// Flag to prevent duplicate memtable flush requests. flush_requested: bool, /// Counter for WAL flush threshold crossings. @@ -846,6 +886,16 @@ async fn replay_memtable_from_wal( Ok(position) } +/// Pair each primary-key column name with its field id (both derived from the +/// schema's primary key, in the same order) for [`IndexStore::enable_pk_index`]. +fn pk_index_columns(pk_columns: &[String], pk_field_ids: &[i32]) -> Vec<(String, i32)> { + pk_columns + .iter() + .cloned() + .zip(pk_field_ids.iter().copied()) + .collect() +} + /// Shared state for writer operations. struct SharedWriterState { state: Arc>, @@ -855,6 +905,9 @@ struct SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + /// Primary-key column names, used to (re)enable the PK-position index on + /// each fresh active memtable created at freeze. 
+ pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -870,6 +923,7 @@ impl SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -882,6 +936,7 @@ impl SharedWriterState { config, schema, pk_field_ids, + pk_columns, max_memtable_batches, max_memtable_rows, index_configs, @@ -907,13 +962,17 @@ impl SharedWriterState { self.max_memtable_batches, )?; - if !self.index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Build an IndexStore when there are user indexes *or* a primary key: + // the PK dedup index (and its flushed on-disk sidecar) is required for + // cross-generation dedup even when no secondary index is configured. + if !self.index_configs.is_empty() || !self.pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( &self.index_configs, self.max_memtable_rows, self.max_memtable_batches, - )?); - new_memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&self.pk_columns, &self.pk_field_ids)); + new_memtable.set_indexes_arc(Arc::new(indexes)); } let mut old_memtable = std::mem::replace(&mut state.memtable, new_memtable); @@ -949,10 +1008,13 @@ impl SharedWriterState { let frozen_memtable = Arc::new(old_memtable); - // Keep this generation queryable until its manifest commit lands - // (dropped in `flush_memtable`, success only). Arc refcount, not a - // copy — the flush task holds it alive for the whole drain anyway. - state.frozen_memtables.push_back(frozen_memtable.clone()); + // Keep this generation queryable past its manifest commit (swept after + // the grace by `SweepExpired`). Arc refcount, not a copy — the flush + // task holds it alive for the whole drain anyway. 
+ state.frozen_memtables.push_back(FrozenMemTable { + memtable: frozen_memtable.clone(), + flushed_at_ms: None, + }); debug!( "Frozen memtable generation {}, pending_count = {}", @@ -960,7 +1022,7 @@ impl SharedWriterState { state.frozen_flush_watchers.len() ); - let _ = self.memtable_flush_tx.send(TriggerMemTableFlush { + let _ = self.memtable_flush_tx.send(TriggerMemTableFlush::Flush { memtable: frozen_memtable, done: None, }); @@ -1287,11 +1349,9 @@ impl ShardWriter { ) -> Result { // Create MemTable with primary key field IDs from schema let lance_schema = Schema::try_from(schema.as_ref())?; - let pk_field_ids: Vec = lance_schema - .unenforced_primary_key() - .iter() - .map(|f| f.id) - .collect(); + let pk_fields = lance_schema.unenforced_primary_key(); + let pk_field_ids: Vec = pk_fields.iter().map(|f| f.id).collect(); + let pk_columns: Vec = pk_fields.iter().map(|f| f.name.clone()).collect(); let mut memtable = MemTable::with_capacity( schema.clone(), manifest.current_generation, @@ -1300,14 +1360,18 @@ impl ShardWriter { config.max_memtable_batches, )?; - // Create indexes if configured and set them on the MemTable. - if !index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Create indexes if configured and set them on the MemTable. The + // PK-position index is enabled before any WAL replay below so replayed + // rows are recorded in it. A primary key alone (no secondary index) + // still needs the PK index so flush writes its on-disk dedup sidecar. 
+ if !index_configs.is_empty() || !pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( index_configs, config.max_memtable_rows, config.max_memtable_batches, - )?); - memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&pk_columns, &pk_field_ids)); + memtable.set_indexes_arc(Arc::new(indexes)); } // Replay any WAL entries written after the last successfully-flushed @@ -1357,13 +1421,10 @@ impl ShardWriter { let (memtable_flush_tx, memtable_flush_rx) = mpsc::unbounded_channel(); - let flusher = Arc::new(MemTableFlusher::new( - object_store, - base_path, - base_uri, - shard_id, - manifest_store, - )); + let flusher = Arc::new( + MemTableFlusher::new(object_store, base_path, base_uri, shard_id, manifest_store) + .with_warmer(config.warmer.clone()), + ); let backpressure = BackpressureController::new(config.clone()); @@ -1378,8 +1439,14 @@ impl ShardWriter { // Background MemTable flush handler — frozen memtable to Lance file. // It rebuilds the same secondary indexes on each flushed generation. - let memtable_handler = - MemTableFlushHandler::new(state.clone(), flusher, epoch, index_configs.to_vec(), stats); + let memtable_handler = MemTableFlushHandler::new( + state.clone(), + flusher, + epoch, + index_configs.to_vec(), + stats, + config.frozen_memtable_grace, + ); task_executor.add_handler( "memtable_flusher".to_string(), Box::new(memtable_handler), @@ -1395,6 +1462,7 @@ impl ShardWriter { config.clone(), schema.clone(), pk_field_ids, + pk_columns, config.max_memtable_batches, config.max_memtable_rows, index_configs.to_vec(), @@ -1789,7 +1857,7 @@ impl ShardWriter { frozen: state .frozen_memtables .iter() - .map(|m| in_memory_ref(m)) + .map(|m| in_memory_ref(&m.memtable)) .collect(), }) } @@ -2182,6 +2250,9 @@ struct MemTableFlushHandler { /// at all. index_configs: Vec, stats: SharedWriteStats, + /// How long a frozen memtable lingers in memory after its flush commits + /// before `SweepExpired` evicts it. 
See `ShardWriterConfig::frozen_memtable_grace`. + grace: Duration, } impl MemTableFlushHandler { @@ -2191,6 +2262,7 @@ impl MemTableFlushHandler { epoch: u64, index_configs: Vec, stats: SharedWriteStats, + grace: Duration, ) -> Self { Self { state, @@ -2198,22 +2270,51 @@ impl MemTableFlushHandler { epoch, index_configs, stats, + grace, } } + + /// Evict frozen memtables whose post-flush grace has elapsed. Un-stamped + /// (not-yet-flushed) entries are always kept. + async fn sweep_expired_frozen(&self) { + let now = now_millis(); + let grace_ms = self.grace.as_millis() as u64; + let mut state = self.state.write().await; + state + .frozen_memtables + .retain(|frozen| match frozen.flushed_at_ms { + Some(flushed_at) => now.saturating_sub(flushed_at) < grace_ms, + None => true, + }); + } } #[async_trait] impl MessageHandler for MemTableFlushHandler { - async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { - let TriggerMemTableFlush { memtable, done } = message; + fn tickers(&mut self) -> Vec<(Duration, MessageFactory)> { + // Zero grace evicts on commit, so no sweeper is needed. + if self.grace.is_zero() { + return vec![]; + } + // Sweep often enough that eviction lags the grace by at most ~1/3, so a + // generation lives no more than ~grace * 4/3 past its flush commit. 
+ let tick = (self.grace / 3).max(Duration::from_millis(100)); + vec![(tick, Box::new(|| TriggerMemTableFlush::SweepExpired))] + } - let result = self.flush_memtable(memtable).await; - if let Some(tx) = done { - // Send result through the channel - caller is waiting for it - let _ = tx.send(result); - } else { - // No done channel, propagate errors - result?; + async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { + match message { + TriggerMemTableFlush::Flush { memtable, done } => { + let result = self.flush_memtable(memtable).await; + if let Some(tx) = done { + // Send result through the channel - caller is waiting for it + let _ = tx.send(result); + } else { + // No done channel, propagate errors + result?; + } + } + TriggerMemTableFlush::SweepExpired => self.sweep_expired_frozen().await, } Ok(()) } @@ -2307,15 +2408,26 @@ impl MemTableFlushHandler { state.frozen_memtable_bytes = state.frozen_memtable_bytes.saturating_sub(memtable_size); } - // Drop the queryable handle ONLY on commit success. On failure - // keep it: rows must stay in the read union until a later flush - // or WAL replay, else a transient flush error reopens the hole. - // Keyed by generation, so non-FIFO completion is fine. + // Retire the frozen handle on commit success, keyed by generation + // (non-FIFO completion is fine). Zero grace evicts here; otherwise + // stamp the grace clock so it lingers for multi-part as-of reads + // until `SweepExpired`. On failure leave it un-stamped: rows stay in + // the read union until a later flush or WAL replay, else a transient + // error reopens the hole. 
if flush_result.is_ok() { let flushed_generation = memtable.generation(); - state - .frozen_memtables - .retain(|m| m.generation() != flushed_generation); + if self.grace.is_zero() { + state + .frozen_memtables + .retain(|frozen| frozen.memtable.generation() != flushed_generation); + } else { + let now = now_millis(); + for frozen in state.frozen_memtables.iter_mut() { + if frozen.memtable.generation() == flushed_generation { + frozen.flushed_at_ms = Some(now); + } + } + } } } @@ -4198,10 +4310,12 @@ mod tests { writer.close().await.unwrap(); } - /// On a successful flush commit the sealed generation is dropped from - /// the queryable set (no leak), and its rows land in the manifest. + /// On a successful flush commit the sealed generation's rows land in the + /// manifest immediately, but the in-memory handle is NOT dropped — it + /// lingers for `frozen_memtable_grace` (so in-flight as-of reads keep + /// batch-resolved membership), then is swept by the `SweepExpired` ticker. #[tokio::test] - async fn test_frozen_dropped_after_successful_flush() { + async fn test_frozen_retained_during_grace_then_swept() { let (store, base_path, base_uri, _temp_dir) = create_local_store().await; let schema = create_test_schema(); let config = ShardWriterConfig { @@ -4213,6 +4327,8 @@ mod tests { max_wal_flush_interval: None, max_memtable_size: 64 * 1024 * 1024, manifest_scan_batch_size: 2, + // Short grace so the sweep is observable without a slow test. + frozen_memtable_grace: Duration::from_secs(1), ..Default::default() }; let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) @@ -4227,13 +4343,66 @@ mod tests { writer.force_seal_active().await.unwrap(); writer.wait_for_flush_drain().await.unwrap(); + // Recorded in the manifest at commit time. 
+ let manifest = writer.manifest().await.unwrap().expect("manifest exists"); + assert!( + manifest + .flushed_generations + .iter() + .any(|g| g.generation == initial_gen), + "flushed generation must be recorded in the manifest" + ); + + // Still queryable in memory immediately after commit (within grace). + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert_eq!(refs.active.generation, initial_gen + 1); + assert!( + refs.frozen.iter().any(|f| f.generation == initial_gen), + "flushed generation must stay queryable during the grace window" + ); + + // After the grace elapses (plus a sweep tick) the handle is evicted. + tokio::time::sleep(Duration::from_millis(1_500)).await; let refs = writer.in_memory_memtable_refs().await.unwrap(); assert!( refs.frozen.is_empty(), - "frozen handle must be dropped once the flush commit lands" + "frozen handle must be swept once the grace elapses" ); - assert_eq!(refs.active.generation, initial_gen + 1); + writer.close().await.unwrap(); + } + + /// With zero grace (the default) a frozen handle is evicted synchronously on + /// flush commit — no sweep tick, no lingering window. 
+ #[tokio::test] + async fn test_frozen_evicted_immediately_with_zero_grace() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: false, + sync_indexed_write: false, + max_wal_buffer_size: 64 * 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + frozen_memtable_grace: Duration::ZERO, + ..Default::default() + }; + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let initial_gen = writer.memtable_stats().await.unwrap().generation; + writer + .put(vec![create_test_batch(&schema, 0, 10)]) + .await + .unwrap(); + writer.force_seal_active().await.unwrap(); + writer.wait_for_flush_drain().await.unwrap(); + + // Rows are durably in the manifest... let manifest = writer.manifest().await.unwrap().expect("manifest exists"); assert!( manifest @@ -4243,6 +4412,13 @@ mod tests { "flushed generation must be recorded in the manifest" ); + // ...and the in-memory handle is already gone, no sweep tick needed. + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert!( + refs.frozen.is_empty(), + "frozen handle must be evicted on commit when grace is zero" + ); + writer.close().await.unwrap(); } diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index d591e42cc73..87dda8e7e57 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -191,6 +191,13 @@ pub struct CompactionOptions { /// specified then the default (see /// [`crate::dataset::Scanner::batch_size`]) will be used. pub batch_size: Option, + /// The number of bytes to allow to queue up in the I/O buffer when scanning + /// the input fragments. If not specified then the default (see + /// [`crate::dataset::Scanner::io_buffer_size`]) will be used. 
+ /// + /// Increasing this can avoid a deadlock that occurs when a single batch of + /// data is larger than the I/O buffer size. + pub io_buffer_size: Option, /// Whether to defer remapping indices during compaction. If true, indices will /// not be remapped during this compaction operation. Instead, the fragment reuse index /// is updated and will be used to perform remapping later. @@ -237,6 +244,7 @@ impl Default for CompactionOptions { num_threads: None, max_bytes_per_file: None, batch_size: None, + io_buffer_size: None, defer_index_remap: false, compaction_mode: None, enable_binary_copy: false, @@ -264,6 +272,7 @@ impl CompactionOptions { /// - `lance.compaction.materialize_deletions_threshold` /// - `lance.compaction.defer_index_remap` /// - `lance.compaction.batch_size` + /// - `lance.compaction.io_buffer_size` /// - `lance.compaction.compaction_mode` /// - `lance.compaction.binary_copy_read_batch_bytes` /// - `lance.compaction.max_source_fragments` @@ -347,6 +356,14 @@ impl CompactionOptions { )) })?); } + "io_buffer_size" => { + self.io_buffer_size = Some(value.parse().map_err(|_| { + Error::invalid_input(format!( + "Invalid value for {}: '{}' (expected a non-negative integer)", + key, value + )) + })?); + } "compaction_mode" => { self.compaction_mode = Some(CompactionMode::try_from(value.as_str())?); } @@ -1194,6 +1211,8 @@ async fn transform_blob_v2_batch( /// and preserve insertion order. /// - `batch_size`: Optional batch size; if provided, set it on the scanner to control /// read batching. +/// - `io_buffer_size`: Optional I/O buffer size in bytes; if provided, set it on the +/// scanner to control how much data is queued during reads. /// - `with_frags`: Whether to scan only the specified old fragments and force /// in-order reading. 
/// - `capture_row_ids`: When index remapping is needed, include and capture the @@ -1209,6 +1228,7 @@ async fn prepare_reader( dataset: &Dataset, fragments: &[Fragment], batch_size: Option, + io_buffer_size: Option, with_frags: bool, capture_row_ids: bool, ) -> Result<( @@ -1234,6 +1254,9 @@ async fn prepare_reader( if let Some(bs) = batch_size { scanner.batch_size(bs); } + if let Some(io_buffer_size) = io_buffer_size { + scanner.io_buffer_size(io_buffer_size); + } if with_frags { scanner .with_fragments(fragments.to_vec()) @@ -1515,6 +1538,7 @@ async fn rewrite_files( dataset.as_ref(), &fragments, options.batch_size, + options.io_buffer_size, true, needs_remapping, ) @@ -2636,6 +2660,57 @@ mod tests { assert_eq!(scanned_data, data); } + #[rstest] + #[tokio::test] + async fn test_compact_with_io_buffer_size( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) { + // Compaction should succeed and produce correct results when an + // explicit io_buffer_size is provided via CompactionOptions. + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let data = sample_data(); + + // Create a table with 2 small fragments so there is something to compact. + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 5_000, + max_rows_per_group: 1_000, + data_storage_version: Some(data_storage_version), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let options = CompactionOptions { + // A generous buffer so the read does not deadlock on large batches. 
+ io_buffer_size: Some(256 * 1024 * 1024), + ..Default::default() + }; + let plan = plan_compaction(&dataset, &options).await.unwrap(); + assert_eq!(plan.tasks().len(), 1); + + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert_eq!(metrics.fragments_removed, 2); + assert_eq!(metrics.fragments_added, 1); + + // All rows are preserved after compaction. + let scanner = dataset.scan(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let scanned_data = concat_batches(&batches[0].schema(), &batches).unwrap(); + assert_eq!(scanned_data.num_rows(), data.num_rows()); + } + #[rstest] #[tokio::test] async fn test_compact_deletions( @@ -4232,6 +4307,133 @@ mod tests { assert_eq!(scanner.count_rows().await.unwrap(), count3); } + /// Deferred compaction that materializes deletions must not corrupt an + /// inverted (FTS) index read through the fragment-reuse index. The index's + /// posting lists reference doc_ids positionally; if the load-time remap + /// dropped the deleted rows it would renumber the doc_ids and desync the + /// posting lists (out-of-bounds `num_tokens`, wrong/stale row ids). The + /// tombstone-preserve-positions load path must keep results correct in the + /// FRI window and after the physical remap + trim. + #[tokio::test] + async fn test_read_inverted_index_with_defer_index_remap_and_deletions() { + // Enough surviving docs for several compressed posting-list blocks + // (BLOCK_SIZE = 128), split across several fragments so compaction has + // real work — but no larger. + const ROWS: i32 = 1200; + const DELETED: i32 = 400; + + // Every row contains "lance", so the term matches all live rows; `id` + // tells us exactly which rows survive. 
+ let ids = Int32Array::from_iter_values(0..ROWS); + let docs = LargeStringArray::from_iter_values((0..ROWS).map(|_| "lance apple orange")); + let batch = RecordBatch::try_new( + Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("doc", DataType::LargeUtf8, false), + ]) + .into(), + vec![Arc::new(ids) as ArrayRef, Arc::new(docs) as ArrayRef], + ) + .unwrap(); + let schema_ref = batch.schema(); + let stream = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema_ref); + let mut dataset = Dataset::write( + stream, + "memory://test/table", + Some(WriteParams { + max_rows_per_file: 200, // 6 fragments + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["doc"], + IndexType::Inverted, + Some("doc_idx".into()), + &InvertedIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Delete a prefix, then deferred-compact so the deletions are + // materialized into the fragment-reuse index the index is read through. + dataset.delete(&format!("id < {DELETED}")).await.unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!( + dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .is_some(), + "deferred compaction must leave a fragment-reuse index" + ); + + // FTS "lance" → sorted surviving ids. Projecting `id` forces a take, so + // a stale row address would error or return a wrong/dead row. 
+ async fn search_ids(dataset: &Dataset) -> Vec { + let mut scanner = dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("lance".to_owned())) + .unwrap(); + scanner.project::<&str>(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort_unstable(); + ids + } + + let expected = (DELETED..ROWS).collect::>(); + + // FRI window: index read through the reuse index. + let during = search_ids(&dataset).await; + assert_eq!( + during, expected, + "FRI-window FTS must return exactly the surviving rows (no resurrection, no loss, no stale rows)" + ); + + // Physical remap + trim: must still be correct. + remapping::remap_column_index(&mut dataset, &["doc"], Some("doc_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let after = search_ids(&dataset).await; + assert_eq!( + after, expected, + "FTS must stay correct after physical remap + fragment-reuse trim" + ); + } + #[tokio::test] async fn test_read_ngram_index_with_defer_index_remap() { // Generate random words using lance-datagen @@ -4615,6 +4817,668 @@ mod tests { ); } + #[tokio::test] + async fn test_read_ivf_rq_index_v3_with_defer_index_remap() { + use arrow_array::cast::AsArray; + use lance_index::vector::bq::RQBuildParams; + + let mut dataset = lance_datagen::gen_batch() + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(128)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + let stored: Vec> = { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in &batches { 
+ let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let values = vecs.value(i); + let values = values.as_primitive::(); + out.push(values.values().to_vec()); + } + } + out + }; + + let index_name = Some("vec_idx".into()); + dataset + .create_index( + &["vec"], + IndexType::Vector, + index_name.clone(), + &VectorIndexParams { + metric_type: DistanceType::L2, + stages: vec![ + StageParams::Ivf(IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + }), + StageParams::RQ(RQBuildParams::new(1)), + ], + version: crate::index::vector::IndexFileVersion::V3, + skip_transpose: false, + runtime_hints: Default::default(), + }, + false, + ) + .await + .unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let original_index = indices.iter().find(|idx| idx.name == "vec_idx").unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }; + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert!(metrics.fragments_removed > 0); + assert!(metrics.fragments_added > 0); + + let Some(current_index) = dataset.load_index_by_name("vec_idx").await.unwrap() else { + panic!("vec index must be available"); + }; + assert_eq!(current_index.uuid, original_index.uuid); + + let frag_reuse_present = dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME); + assert!( + frag_reuse_present, + "defer_index_remap must record a {} index", + FRAG_REUSE_INDEX_NAME + ); + + let sample_step = (stored.len() / 8).max(1); + let mut checked = 0; + for query in stored.iter().step_by(sample_step) { + let query_vec = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &query_vec, 5).unwrap(); + scanner.project(&["vec"]).unwrap().with_row_id(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + 
.try_collect::>() + .await + .unwrap(); + assert!(!batches.is_empty(), "query returned no batches"); + let top = &batches[0]; + assert!(top.num_rows() > 0, "query returned empty top batch"); + let top_vec = top["vec"].as_fixed_size_list().value(0); + let top_vec = top_vec.as_primitive::(); + assert_eq!( + top_vec.values(), + query.as_slice(), + "top-1 self-recall returned a different vector than the query" + ); + checked += 1; + } + assert!(checked > 0, "expected to check at least one stored vector"); + } + + /// Build an `id` + `vec` dataset, create the given IVF vector index, + /// optionally delete rows, then run deferred compaction (which materializes + /// the deletions into the fragment-reuse index) and assert that KNN over + /// surviving vectors during the FRI window (a) never returns a deleted row + /// and (b) stays consistent with the pre-compaction answer. + /// + /// The deletion path is the interesting one: materialized deletions drop + /// rows from the quantization storage at load time, which shifts storage + /// positions. Flat storage (FLAT/PQ/SQ/RQ) is scanned linearly so this is + /// fine, but the HNSW graph addresses storage positionally and is not + /// frag-reuse aware, so a desync would surface here as recall collapse or a + /// resurrected/again-deleted row. + /// Top-k `id`s for a KNN query against the `vec` column. 
+ async fn vector_knn_ids(dataset: &Dataset, query: &[f32], k: usize) -> Vec { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + let qa = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &qa, k).unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut ids = Vec::new(); + for b in &batches { + ids.extend(b["id"].as_primitive::().values().iter().copied()); + } + ids + } + + async fn check_vector_defer_compaction( + params: VectorIndexParams, + delete_predicate: Option<&str>, + k: usize, + min_overlap: usize, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + ¶ms, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + if let Some(pred) = delete_predicate { + dataset.delete(pred).await.unwrap(); + } + + // Collect surviving (id, vec) pairs and the set of surviving ids. 
+ let mut survivors: Vec<(i32, Vec)> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["id", "vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let ids = batch["id"].as_primitive::(); + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + let v = v.as_primitive::().values().to_vec(); + survivors.push((ids.value(i), v)); + } + } + } + assert!(!survivors.is_empty()); + let surviving_ids: std::collections::HashSet = + survivors.iter().map(|(id, _)| *id).collect(); + + // Sample queries from survivors and capture the pre-compaction answer. + let step = (survivors.len() / 16).max(1); + let queries: Vec<(i32, Vec)> = survivors.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for (_, q) in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Deferred compaction materializes the deletions into the frag-reuse index. + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + + // During the FRI window: no deleted rows, and stable vs the baseline. 
+ for (i, (_, q)) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + for id in &after { + assert!( + surviving_ids.contains(id), + "KNN returned id {id} that is not a surviving row (query #{i})" + ); + } + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "KNN top-{k} diverged after deferred compaction: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + + fn small_ivf() -> lance_index::vector::ivf::IvfBuildParams { + lance_index::vector::ivf::IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + } + } + + #[tokio::test] + async fn test_ivf_flat_defer_compaction_with_deletions() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + // Flat storage is scanned linearly; dropping deleted rows is exact. + check_vector_defer_compaction(params, Some("id < 1500"), 10, 10).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_defer_compaction_merge_only() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // No deletions: storage positions are stable, so the graph stays aligned. + check_vector_defer_compaction(params, None, 10, 9).await; + } + + // NOTE: IVF_HNSW_* under materialized deletions is a known gap (lance#3993, + // HNSW auto-remap not implemented) — the HNSW graph isn't realigned after the + // frag-reuse drop. Deferred remap is gated off for HNSW tables, so there is + // no lance-level reproducer here; the gate is tested in the data plane. + // Merge-only HNSW is covered (see the *_remap_and_trim tests). 
+ + #[tokio::test] + async fn test_ivf_pq_defer_compaction_with_deletions() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_sq_defer_compaction_with_deletions() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_rq_defer_compaction_with_deletions() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + /// Merge-only deferred compaction, then a PHYSICAL remap + FRI trim. Asserts + /// the index is rebuilt, the fragment-reuse index trims to zero versions, + /// and KNN stays consistent with the pre-compaction answer through both the + /// FRI window and the physical remap. (HNSW rebuilds its graph on physical + /// remap, so the overlap is recall-tolerant.) 
+ async fn check_vector_remap_and_trim( + params: VectorIndexParams, + k: usize, + window_overlap: usize, + post_remap_overlap: Option, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + ¶ms, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + // Sample queries from stored vectors + capture the pre-compaction answer. + let mut rows: Vec> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + rows.push(v.as_primitive::().values().to_vec()); + } + } + } + let step = (rows.len() / 16).max(1); + let queries: Vec> = rows.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for q in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Merge-only deferred compaction. 
+ let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + for (i, q) in queries.iter().enumerate() { + let window = vector_knn_ids(&dataset, q, k).await; + let overlap = window.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= window_overlap, + "FRI-window KNN diverged: overlap {overlap} < {window_overlap} (query #{i})" + ); + } + + // Physical remap + trim the fragment-reuse index. + remapping::remap_column_index(&mut dataset, &["vec"], Some("vec_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + + let remapped_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + assert_ne!( + remapped_uuid, original_uuid, + "index should have been physically remapped" + ); + if let Some(meta) = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + { + let versions = load_frag_reuse_index_details(&dataset, &meta) + .await + .unwrap() + .versions + .len(); + assert_eq!(versions, 0, "frag-reuse index must trim to zero versions"); + } + + for (i, q) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + // No stale/desynced addresses (a bad address fails the take above). + assert!( + !after.is_empty(), + "post-remap KNN returned no rows (query #{i})" + ); + // Physical remap rebuilds the HNSW graph, so recall is only compared + // for the exact (non-HNSW) types. 
+ if let Some(min_overlap) = post_remap_overlap { + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "post-remap KNN diverged: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + } + + #[tokio::test] + async fn test_ivf_flat_remap_and_trim() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + // Regression: PQ storage used to remap its codes through the frag-reuse + // index but keep the pre-remap `row_ids` field, so search returned stale + // (compacted-away) addresses and the take failed with "fragment ... does + // not exist" — even merge-only, and only observable when the query fetches + // row content (the existing `test_read_ivf_pq_index_v3_with_defer_index_remap` + // projects no columns, so it never takes and missed this). + #[tokio::test] + async fn test_ivf_pq_remap_and_trim() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_sq_remap_and_trim() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_rq_remap_and_trim() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, 
sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // Physical remap rebuilds the HNSW graph, so use a recall-tolerant overlap. + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_pq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, pq::PQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + // Scalar index correctness across deferred compaction WITH materialized + // deletions. The existing test_read_*_index_with_defer_index_remap tests are + // merge-only and project no columns (count-only), so they never take and + // never exercise the deletion drop path. These add an `id` column, delete a + // prefix, defer-compact, then run the indexed query *projecting id* (a take) + // and assert no deleted row is returned. Bitmap/BTree have no positional + // internal structure so the drop path is exact; the Inverted (FTS) index + // does (see its test below), and currently desyncs under deletions. 
+ + #[tokio::test] + async fn test_bitmap_index_defer_compaction_with_deletions() { + use arrow_array::cast::AsArray; + use arrow_array::types::Int32Type; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "category", + lance_datagen::array::cycle::(vec![1, 2, 3]), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("category_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + dataset.delete("id < 1500").await.unwrap(); + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + + let mut scanner = dataset.scan(); + scanner.filter("category = 3").unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut returned = 0; + for b in &batches { + for id in b["id"].as_primitive::().values() { + assert!( + *id >= 1500, + "bitmap returned deleted id {id} in the FRI window" + ); + returned += 1; + } + } + assert!(returned > 0, "expected surviving category=3 rows"); + } + + // NOTE: Inverted/FTS under materialized deletions is broken (BM25 scores + // via positional num_tokens[doc_id]; the frag-reuse drop shifts doc_id + // positions -> out-of-bounds). It is gated off defer in the data plane + // until fixed, so there is no lance-level reproducer here. Merge-only FTS + // is covered by test_read_inverted_index_with_defer_index_remap. 
+ #[tokio::test] async fn test_default_compaction_planner() { let test_dir = TempStrDir::default(); @@ -4683,6 +5547,10 @@ mod tests { "lance.compaction.batch_size".to_string(), "4096".to_string(), ), + ( + "lance.compaction.io_buffer_size".to_string(), + "1073741824".to_string(), + ), ( "lance.compaction.compaction_mode".to_string(), "try_binary_copy".to_string(), @@ -4701,6 +5569,7 @@ mod tests { assert!((opts.materialize_deletions_threshold - 0.25).abs() < f32::EPSILON); assert!(opts.defer_index_remap); assert_eq!(opts.batch_size, Some(4096)); + assert_eq!(opts.io_buffer_size, Some(1_073_741_824)); assert_eq!(opts.compaction_mode, Some(CompactionMode::TryBinaryCopy)); assert_eq!(opts.binary_copy_read_batch_bytes, Some(8_388_608)); } diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs index dab62bf6166..266ac977a69 100644 --- a/rust/lance/src/dataset/optimize/remapping.rs +++ b/rust/lance/src/dataset/optimize/remapping.rs @@ -220,25 +220,37 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { return Ok(()); } - // Sequentially apply the row addr maps from oldest to latest - let mut curr_index_id = *index_id; - for (i, row_id_map) in frag_reuse_index.row_id_maps.iter().enumerate() { - let version = &frag_reuse_index.details.versions[i]; - // load on-disk index metadata before auto-remap - let curr_index_meta = read_manifest_indexes( - &dataset.object_store, - &dataset.manifest_location, - &dataset.manifest, - ) - .await? - .into_iter() - .find(|idx| idx.uuid == curr_index_id) - .unwrap(); - - let maybe_index_bitmap = curr_index_meta.fragment_bitmap.clone(); - let (should_remap, bitmap_after_remap) = match maybe_index_bitmap { - Some(mut index_frag_bitmap) => { - let mut should_remap = false; + // Read the index's on-disk metadata once. 
Its stored row addresses are at + // this baseline; we compose all reuse versions into a single remap so the + // index file is rebuilt and committed exactly once, rather than once per + // version (the reuse index can accumulate many versions before remap runs). + let curr_index_meta = read_manifest_indexes( + &dataset.object_store, + &dataset.manifest_location, + &dataset.manifest, + ) + .await? + .into_iter() + .find(|idx| idx.uuid == *index_id) + .ok_or_else(|| { + Error::index(format!( + "index {index_id} not found in manifest; it may have been concurrently dropped" + )) + })?; + + // Compose the coverage (fragment bitmap) remap across every reuse version in + // one pass. Chaining is automatic: a version inserts its new fragments, + // which a later version then sees as its old fragments. `data_predates_version` + // is evaluated against the fixed baseline (there are no intermediate + // commits), and the new-fragment branch handles a bitmap that was already + // coverage-remapped + persisted before the data was remapped (e.g. while + // remapping a *sibling* index). 
+ let baseline_version = curr_index_meta.dataset_version; + let (should_remap, bitmap_after_remap) = match curr_index_meta.fragment_bitmap.clone() { + Some(mut index_frag_bitmap) => { + let mut should_remap = false; + for version in frag_reuse_index.details.versions.iter() { + let data_predates_version = baseline_version < version.dataset_version; for group in version.groups.iter() { let mut old_frag_in_index = 0; for old_frag in group.old_frags.iter() { @@ -258,67 +270,97 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { group.old_frags ))); } - index_frag_bitmap - .extend(group.new_frags.clone().into_iter().map(|f| f.id as u32)); + index_frag_bitmap.extend(group.new_frags.iter().map(|f| f.id as u32)); + should_remap = true; + } else if data_predates_version + && group + .new_frags + .iter() + .any(|new_frag| index_frag_bitmap.contains(new_frag.id as u32)) + { + // The bitmap was already coverage-remapped onto this + // group's new fragments and persisted before the data was + // remapped, so the old fragments are gone from the bitmap + // but the index data still needs remapping. should_remap = true; } } - (should_remap, Some(index_frag_bitmap)) } - // if there is no fragment bitmap for the index, - // we attempt remapping but will not update the fragment bitmap. 
- None => (true, None), - }; - - if should_remap { - let remap_result = index::remap_index(dataset, &curr_index_id, row_id_map).await?; - - let new_index_meta = match remap_result { - RemapResult::Drop => continue, - RemapResult::Keep(new_id) => IndexMetadata { - uuid: new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: curr_index_meta.index_details.clone(), - index_version: curr_index_meta.index_version, - created_at: curr_index_meta.created_at, - base_id: None, - files: curr_index_meta.files.clone(), - }, - RemapResult::Remapped(remapped_index) => IndexMetadata { - uuid: remapped_index.new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: Some(Arc::new(remapped_index.index_details)), - index_version: remapped_index.index_version as i32, - created_at: curr_index_meta.created_at, - base_id: None, - files: remapped_index.files, - }, - }; - - let new_id = new_index_meta.uuid; + (should_remap, Some(index_frag_bitmap)) + } + // if there is no fragment bitmap for the index, + // we attempt remapping but will not update the fragment bitmap. + None => (true, None), + }; - let transaction = Transaction::new( - dataset.manifest.version, - Operation::CreateIndex { - new_indices: vec![new_index_meta], - removed_indices: vec![curr_index_meta.clone()], - }, - None, - ); + if !should_remap { + return Ok(()); + } - dataset - .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + // Compose the row-address remap across all versions. `remap_row_id` already + // chains every version (and passes through addresses a version does not + // touch), so mapping the union of all versions' keys yields a single + // baseline -> final address map applied in one rebuild. 
+ // + // Map every old address; do NOT filter by the current `fragment_bitmap`. In + // the sibling-coverage-remap case the bitmap was already advanced onto the + // new fragments while the index data still holds old addresses, so filtering + // by it would drop exactly the keys this index needs and leave its data + // stale (an empty map makes `index::remap_index` return `Keep`). The map is + // bounded by the rows the reuse index touched; addresses this index does not + // store are simply never looked up. + let composed_row_id_map: HashMap> = frag_reuse_index + .row_id_maps + .iter() + .flat_map(|row_id_map| row_id_map.keys().copied()) + .map(|old_addr| (old_addr, frag_reuse_index.remap_row_id(old_addr))) + .collect(); + + let remap_result = index::remap_index(dataset, index_id, &composed_row_id_map).await?; + + let new_index_meta = match remap_result { + // The composed remap emptied the index (every row deleted). Matching the + // prior per-version behavior, leave the existing index untouched and + // commit nothing -- there is no remap to apply. 
+ RemapResult::Drop => return Ok(()), + RemapResult::Keep(new_id) => IndexMetadata { + uuid: new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: curr_index_meta.index_details.clone(), + index_version: curr_index_meta.index_version, + created_at: curr_index_meta.created_at, + base_id: None, + files: curr_index_meta.files.clone(), + }, + RemapResult::Remapped(remapped_index) => IndexMetadata { + uuid: remapped_index.new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: Some(Arc::new(remapped_index.index_details)), + index_version: remapped_index.index_version as i32, + created_at: curr_index_meta.created_at, + base_id: None, + files: remapped_index.files, + }, + }; - curr_index_id = new_id; - } - } + let transaction = Transaction::new( + dataset.manifest.version, + Operation::CreateIndex { + new_indices: vec![new_index_meta], + removed_indices: vec![curr_index_meta], + }, + None, + ); + + dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await?; Ok(()) } diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 6b19150c17b..09cd7023e74 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -3591,33 +3591,35 @@ impl Scanner { .clone(); let mut columns = vec![column]; - if let Some(expr) = filter_plan.full_expr.as_ref() { - let filter_columns = Planner::column_names_in_expr(expr); - columns.extend(filter_columns); + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + columns.extend(Planner::column_names_in_expr(refine_expr)); } - let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap()); - let mut scan_node = self.scan_fragments( - true, - false, - false, - false, - 
false, - flat_fts_scan_schema, - Arc::new(fragments), - None, - false, - ); + let scan_projection = self + .dataset + .empty_projection() + .with_row_id() + .union_columns(&columns, OnMissing::Error)?; - if let Some(expr) = filter_plan.full_expr.as_ref() { - // If there is a prefilter we need to manually apply it to the new data - scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?); + let PlannedFilteredScan { mut plan, .. } = self + .filtered_read( + filter_plan, + scan_projection, + /*make_deletions_null=*/ false, + Some(Arc::new(fragments)), + None, + /*is_prefilter=*/ true, + ) + .await?; + + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?); } let flat_match_plan = Arc::new(FlatMatchQueryExec::new( self.dataset.clone(), query.clone(), params.clone(), - scan_node, + plan, )); Ok(flat_match_plan) } @@ -8412,6 +8414,198 @@ mod test { .unwrap(); } + #[tokio::test] + async fn test_ngram_regex_index_scan() { + use arrow::array::AsArray; + + // A small, fixed corpus written across multiple fragments so the ngram + // index spans fragment boundaries. 
+ let values = [ + "rhino", // 0 + "rhinos nose", // 1 + "cat", // 2 + "dog", // 3 + "cat dog", // 4 + "elephant", // 5 + "catalog", // 6 + "scatter", // 7 + "rhino horn", // 8 + "mouse", // 9 + "category", // 10 + "dogma", // 11 + ]; + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "s", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 4, // 12 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = Dataset::write(reader, "memory://test_ngram_regex", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["s"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + assert!( + dataset.get_fragments().len() > 1, + "expected a multi-fragment dataset" + ); + + // Scan with `filter` and return the matched `s` values, sorted. + async fn matched(dataset: &Dataset, filter: &str) -> Vec { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in batches { + let col = batch.column_by_name("s").unwrap().as_string::(); + out.extend(col.iter().flatten().map(|s| s.to_string())); + } + out.sort(); + out + } + + // `regexp_like`: a plain literal substring. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // `regexp_match` (coerced to `IsNotNull(regexp_match(...))`) accelerates too. + assert_eq!( + matched(&dataset, "regexp_match(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Anchored: recheck must drop trigram false positives -- the `cat` + // trigram also occurs in cat dog / catalog / scatter / category. 
+ assert_eq!(matched(&dataset, "regexp_like(s, 'cat$')").await, ["cat"]); + // AND across `.*`: row 8 ("rhino horn") shares the rhino trigrams but + // lacks the nose trigrams, so only "rhinos nose" survives. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino.*nose')").await, + ["rhinos nose"] + ); + // Alternation. + assert_eq!( + matched(&dataset, "regexp_like(s, '(catalog|elephant)')").await, + ["catalog", "elephant"] + ); + // A non-accelerable pattern (no trigram derivable) still returns correct + // results via a full recheck. + assert_eq!(matched(&dataset, "regexp_like(s, 'o.m')").await, ["dogma"]); + // A case-insensitive flag is not accelerated (the index normalization + // disagrees with Unicode case folding) but must still return correct + // results via a full recheck -- here matching despite the upper-case + // pattern. This exercises the three-argument `regexp_like` flags path. + assert_eq!( + matched(&dataset, "regexp_like(s, 'RHINO', 'i')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + + // Infix LIKE is accelerated through the same machinery (a plain-literal + // `regexp_like` is rewritten to LIKE before it reaches the index). + assert_eq!( + matched(&dataset, "s LIKE '%rhino%'").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Prefix LIKE: recheck drops "scatter", which contains the `cat` trigram + // but does not start with "cat". + assert_eq!( + matched(&dataset, "s LIKE 'cat%'").await, + ["cat", "cat dog", "catalog", "category"] + ); + + // The ngram index is actually engaged for every accelerated form. 
+ for filter in [ + "regexp_like(s, 'rhino')", + "regexp_match(s, 'rhino')", + "s LIKE '%rhino%'", + ] { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let plan = scan.create_plan().await.unwrap(); + let plan_str = format!( + "{}", + datafusion::physical_plan::displayable(plan.as_ref()).indent(true) + ); + assert!( + plan_str.contains("ScalarIndexQuery") && plan_str.contains("NGram"), + "expected ngram index usage for `{filter}`, got plan:\n{plan_str}" + ); + } + } + + #[tokio::test] + async fn test_ngram_regex_non_accelerable_recheck() { + // `a.b` yields no trigram, so the index returns "recheck everything". + // This must still produce ALL correct matches across fragments, not an + // empty set (a regression test for the AtLeast recheck path, which a + // single-match case would not catch). + let unit = ["acb", "dog", "axb", "cat", "qqq", "rhino"]; + let values: Vec<&str> = unit.iter().copied().cycle().take(60).collect(); + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "text", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 20, // 60 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = + Dataset::write(reader, "memory://test_ngram_regex_ne", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + async fn count(dataset: &Dataset, filter: &str) -> usize { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + batches.iter().map(|b| b.num_rows()).sum() + } + + // "acb" and "axb" each appear 10 times in the 60 rows -> 20 matches. 
+ assert_eq!(count(&dataset, "regexp_match(text, 'a.b')").await, 20); + assert_eq!(count(&dataset, "regexp_like(text, 'a.b')").await, 20); + } + #[tokio::test] async fn test_like_prefix_with_btree_index() { // Create dataset with string data that has various prefixes @@ -8843,6 +9037,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ); } + /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column. + /// The dictionary cycles through "a", "b", "c" so each value appears in a + /// predictable, repeated pattern. + async fn dictionary_string_dataset() -> Dataset { + use arrow_array::{Int16Array, Int16DictionaryArray}; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "etld", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])); + + let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let indices = Int16Array::from((0..30).map(|i| i % 3).collect::>()); + let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap(); + + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write(reader, "memory://test_dict_filter", None) + .await + .unwrap() + } + + /// Regression test for filtering a dictionary-encoded string column via the + /// SQL string path (`Scanner::filter`). This used to fail to plan with + /// "could not convert to literal of type 'Dictionary(Int16, Utf8)'". + #[tokio::test] + async fn test_filter_on_dictionary_string_column() { + let dataset = dictionary_string_dataset().await; + + // Equality predicate. + let count = dataset + .scan() + .filter("etld = 'a'") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 10); + + // IN-list predicate. 
+ let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + + /// An `IN`/`=` predicate on a dictionary column with a scalar index should be + /// pushed down to the index rather than falling back to a full scan. + #[tokio::test] + async fn test_dictionary_string_column_uses_scalar_index() { + use lance_index::scalar::BuiltinIndexType; + + let mut dataset = dictionary_string_dataset().await; + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index(&["etld"], IndexType::Scalar, None, ¶ms, true) + .await + .unwrap(); + + let mut scanner = dataset.scan(); + scanner.filter("etld IN ('a', 'b')").unwrap(); + let plan = scanner.create_plan().await.unwrap(); + let plan_str = format!("{:?}", plan); + assert!( + plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"), + "IN on a dictionary column should use the scalar index, but got: {}", + plan_str + ); + + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + #[tokio::test] async fn test_like_prefix_with_segmented_zone_map() { use lance_index::scalar::BuiltinIndexType; @@ -10191,7 +10472,12 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows"); - let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] + // The flat-FTS path now reads through `FilteredReadExec`, matching the + // brute-force KNN path. With no prefilter the scan still produces no + // pushdown, but the operator differs by storage version: legacy emits + // a `LanceScan`, v2 emits a `LanceRead` with empty filters. 
+ let expected = if data_storage_version == LanceFileVersion::Legacy { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] @@ -10199,7 +10485,18 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") UnionExec MatchQuery: column=s, query=hello FlatMatchQuery: column=s, query=hello - LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#; + LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"# + } else { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] + Take: columns="_rowid, _score, (s)" + CoalesceBatchesExec: target_batch_size=8192 + SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + MatchQuery: column=s, query=hello + FlatMatchQuery: column=s, query=hello + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=--, refine_filter=--"# + }; dataset.append_new_data().await?; assert_plan_equals( &dataset.dataset, @@ -10232,6 +10529,10 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows and prefilter"); + // After routing flat FTS through `FilteredReadExec`, the BTree on `i` + // pushes into the unindexed-fragment scan too — no more `FilterExec` on + // top of an unfiltered `LanceScan`. Legacy uses the `MaterializeIndex` + // shape, v2 uses `LanceRead` with `full_filter` set. 
let expected = if data_storage_version == LanceFileVersion::Legacy { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" @@ -10247,8 +10548,14 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: i@0 > 10 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + CoalescePartitionsExec + UnionExec + Take: columns="_rowid, (s)" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"# } else { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" @@ -10260,8 +10567,8 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- ScalarIndexQuery: query=[i > 10]@i_idx(BTree) FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- + ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"# }; assert_plan_equals( &dataset.dataset, diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index f5d792979df..5ef35a33ab7 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -1,13 +1,18 @@ // 
SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashSet, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use super::fragment::FileFragment; use super::{ Dataset, transaction::{Operation, Transaction}, + write::cleanup_data_fragments, }; +use crate::index::DatasetIndexExt; use crate::{Error, Result, io::exec::Planner}; use arrow::compute::CastOptions; use arrow::compute::can_cast_types; @@ -239,7 +244,7 @@ pub(super) async fn add_columns_to_fragments( read_columns: Option>, fragments: &[FileFragment], batch_size: Option, -) -> Result<(Vec, Schema)> { +) -> Result<(Vec, Schema, Vec)> { // Check names early (before calling add_columns_impl) to avoid extra work if // the names are wrong. let version = dataset.manifest.data_storage_format.lance_file_version()?; @@ -261,10 +266,10 @@ pub(super) async fn add_columns_to_fragments( } let transforms = optimizer.optimize(dataset, transforms)?; - let (output_schema, fragments) = match transforms { + let (output_schema, new_fragments, fragments_to_cleanup) = match transforms { NewColumnTransform::BatchUDF(udf) => { check_names(udf.output_schema.as_ref())?; - let fragments = add_columns_impl( + let result = add_columns_impl( fragments, read_columns, udf.mapper, @@ -273,7 +278,11 @@ pub(super) async fn add_columns_to_fragments( None, ) .await?; - Result::Ok((udf.output_schema, fragments)) + Result::Ok(( + udf.output_schema, + result.fragments, + result.fragments_to_cleanup, + )) } NewColumnTransform::SqlExpressions(expressions) => { // We just transform the SQL expression into a UDF backed by DataFusion @@ -336,22 +345,22 @@ pub(super) async fn add_columns_to_fragments( let mapper = Box::new(mapper); let read_columns = Some(read_schema.field_names().into_iter().cloned().collect()); - let fragments = + let result = add_columns_impl(fragments, read_columns, mapper, batch_size, None, None).await?; - Ok((output_schema, fragments)) + 
Ok((output_schema, result.fragments, result.fragments_to_cleanup)) } NewColumnTransform::Stream(stream) => { let output_schema = stream.schema(); check_names(output_schema.as_ref())?; let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::Reader(reader) => { let output_schema = reader.schema(); check_names(output_schema.as_ref())?; let stream = reader.into_stream(); let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::AllNulls(output_schema) => { check_names(output_schema.as_ref())?; @@ -379,14 +388,20 @@ pub(super) async fn add_columns_to_fragments( )); } - Ok((output_schema, fragments)) + Ok((output_schema, fragments, Vec::new())) } }?; - let mut schema = dataset.schema().merge(output_schema.as_ref())?; + let mut schema = match dataset.schema().merge(output_schema.as_ref()) { + Ok(schema) => schema, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; schema.set_field_id(Some(dataset.manifest.max_field_id())); - Ok((fragments, schema)) + Ok((new_fragments, schema, fragments_to_cleanup)) } pub(super) async fn add_columns( @@ -395,7 +410,7 @@ pub(super) async fn add_columns( read_columns: Option>, batch_size: Option, ) -> Result<()> { - let (fragments, schema) = add_columns_to_fragments( + let (fragments, schema, fragments_to_cleanup) = add_columns_to_fragments( dataset, transforms, read_columns, @@ -406,11 +421,75 @@ pub(super) async fn add_columns( let operation = Operation::Merge { fragments, schema }; let transaction = Transaction::new(dataset.manifest.version, operation, None); - dataset + match dataset .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + .await + { + Ok(()) => Ok(()), + Err(e) => { + 
cleanup_new_column_data_files(&dataset.get_fragments(), &fragments_to_cleanup).await; + Err(e) + } + } +} - Ok(()) +async fn cleanup_new_column_data_files(fragments: &[FileFragment], new_fragments: &[Fragment]) { + let Some(first_fragment) = fragments.first() else { + return; + }; + + // add_columns rewrites fragment metadata in place, so cleanup must delete + // only files created by the current attempt and must not touch pre-existing + // files that still belong to the fragment. + let original_files_by_fragment = fragments + .iter() + .map(|fragment| { + let files = fragment + .metadata + .files + .iter() + .map(|file| (file.base_id, file.path.clone())) + .collect::>(); + (fragment.id() as u64, files) + }) + .collect::>(); + + let fragments_to_cleanup = new_fragments + .iter() + .filter_map(|fragment| { + let original_files = original_files_by_fragment.get(&fragment.id)?; + let files = fragment + .files + .iter() + .filter(|file| !original_files.contains(&(file.base_id, file.path.clone()))) + .cloned() + .collect::>(); + + if files.is_empty() { + None + } else { + let mut fragment = fragment.clone(); + fragment.files = files; + Some(fragment) + } + }) + .collect::>(); + + cleanup_data_fragments( + &first_fragment.dataset().object_store, + &first_fragment.dataset().base, + &fragments_to_cleanup, + ) + .await; +} + +struct AddColumnFragments { + /// Fragments produced by the add-columns operation and returned to the + /// caller for the final merge commit. + fragments: Vec, + /// Uncommitted fragments whose newly written data files must be removed if + /// the operation fails before the merge commit completes. 
+ fragments_to_cleanup: Vec, } #[allow(clippy::type_complexity)] @@ -421,63 +500,96 @@ async fn add_columns_impl( batch_size: Option, result_cache: Option>, schemas: Option<(Schema, Schema)>, -) -> Result> { +) -> Result { let read_columns_ref = read_columns.as_deref(); let mapper_ref = mapper.as_ref(); - let fragments = futures::stream::iter(fragments) - .then(|fragment| { - let cache_ref = result_cache.clone(); - let schemas_ref = &schemas; - async move { - if let Some(cache) = &cache_ref { - let fragment_id = fragment.id() as u32; - let fragment = cache.get_fragment(fragment_id)?; - if let Some(fragment) = fragment { - return Ok(fragment); - } + + let mut new_fragments = Vec::with_capacity(fragments.len()); + let mut fragments_to_cleanup = Vec::with_capacity(fragments.len()); + + for fragment in fragments { + if let Some(cache) = &result_cache { + let fragment_id = fragment.id() as u32; + let fragment = match cache.get_fragment(fragment_id) { + Ok(fragment) => fragment, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); } + }; + if let Some(fragment) = fragment { + new_fragments.push(fragment); + continue; + } + } - let mut updater = fragment - .updater(read_columns_ref, schemas_ref.clone(), batch_size) - .await?; - - let mut batch_index = 0; - // TODO: the structure of the updater prevents batch-level parallelism here, - // but there is no reason why we couldn't do this in parallel. - while let Some(batch) = updater.next().await? 
{ - let batch_info = BatchInfo { - fragment_id: fragment.id() as u32, - batch_index, - }; + let mut updater = match fragment + .updater(read_columns_ref, schemas.clone(), batch_size) + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; + let fragment_result = async { + let mut batch_index = 0; + // TODO: the structure of the updater prevents batch-level parallelism here, + // but there is no reason why we couldn't do this in parallel. + while let Some(batch) = updater.next().await? { + let batch_info = BatchInfo { + fragment_id: fragment.id() as u32, + batch_index, + }; - let new_batch = if let Some(cache) = &cache_ref { - if let Some(batch) = cache.get_batch(&batch_info)? { - batch - } else { - let new_batch = mapper_ref(batch)?; - cache.insert_batch(batch_info, new_batch.clone())?; - new_batch - } + let new_batch = if let Some(cache) = &result_cache { + if let Some(batch) = cache.get_batch(&batch_info)? { + batch } else { - mapper_ref(batch)? - }; + let new_batch = mapper_ref(batch)?; + cache.insert_batch(batch_info, new_batch.clone())?; + new_batch + } + } else { + mapper_ref(batch)? + }; - updater.update(new_batch).await?; - batch_index += 1; - } + updater.update(new_batch).await?; + batch_index += 1; + } - let fragment = updater.finish().await?; + let new_fragment = updater.finish().await?; + fragments_to_cleanup.push(new_fragment.clone()); - if let Some(cache) = &cache_ref { - cache.insert_fragment(fragment.clone())?; - } + if let Some(cache) = &result_cache { + // Once the checkpoint store owns this fragment, retries may load + // it back instead of rewriting it. Removing it from the cleanup + // set avoids deleting data that has already been checkpointed. 
+ cache.insert_fragment(new_fragment.clone())?; + fragments_to_cleanup.pop(); + } - Ok::<_, Error>(fragment) + Ok::<_, Error>(new_fragment) + } + .await; + + match fragment_result { + Ok(new_fragment) => { + new_fragments.push(new_fragment); } - }) - .try_collect::>() - .await?; - Ok(fragments) + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + } + } + + Ok(AddColumnFragments { + fragments: new_fragments, + fragments_to_cleanup, + }) } async fn add_columns_from_stream( @@ -489,49 +601,80 @@ async fn add_columns_from_stream( let mut new_fragments = Vec::with_capacity(fragments.len()); let mut last_seen_batch: Option = None; for fragment in fragments { - let mut updater = fragment + let mut updater = match fragment .updater::(Some(&[]), schemas.clone(), batch_size) - .await?; - while let Some(batch) = updater.next().await? { - debug_assert_eq!(batch.num_columns(), 1); - let mut rows_remaining = batch.num_rows(); + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } + }; + let result: Result = async { + while let Some(batch) = updater.next().await? { + debug_assert_eq!(batch.num_columns(), 1); + let mut rows_remaining = batch.num_rows(); + + // The updater yields an empty batch when every row in a read batch + // has been deleted (e.g. a whole batch falls within the deletion + // vector). There is nothing to pull from the stream in that case, so + // feed an empty batch back to keep the updater in sync and continue. 
+ if rows_remaining == 0 { + updater + .update(RecordBatch::new_empty(stream.schema())) + .await?; + continue; + } - let mut batches = Vec::new(); + let mut batches = Vec::new(); - while rows_remaining > 0 { - let next_batch = if let Some(last_seen_batch) = last_seen_batch { - last_seen_batch - } else { - stream.next().await.ok_or_else(|| { - Error::invalid_input( - "Stream ended before producing values for all rows in dataset", - ) - })?? - }; - let num_rows = next_batch.num_rows(); - if num_rows > rows_remaining { - let new_batch = next_batch.slice(0, rows_remaining); - batches.push(new_batch); - last_seen_batch = - Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); - rows_remaining = 0; - } else { - batches.push(next_batch); - rows_remaining -= num_rows; - last_seen_batch = None; + while rows_remaining > 0 { + let next_batch = if let Some(last_seen) = last_seen_batch.take() { + last_seen + } else { + stream.next().await.ok_or_else(|| { + Error::invalid_input( + "Stream ended before producing values for all rows in dataset", + ) + })?? 
+ }; + let num_rows = next_batch.num_rows(); + if num_rows > rows_remaining { + let new_batch = next_batch.slice(0, rows_remaining); + batches.push(new_batch); + last_seen_batch = + Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); + rows_remaining = 0; + } else { + batches.push(next_batch); + rows_remaining -= num_rows; + last_seen_batch = None; + } } - } - let new_batch = - arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; + let new_batch = + arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; - updater.update(new_batch).await?; + updater.update(new_batch).await?; + } + updater.finish().await + } + .await; + + match result { + Ok(new_fragment) => new_fragments.push(new_fragment), + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } } - new_fragments.push(updater.finish().await?); } // Ensure the stream is fully consumed if last_seen_batch.is_some() || stream.next().await.is_some() { + cleanup_new_column_data_files(fragments, &new_fragments).await; return Err(Error::invalid_input_source( "Stream produced more values than expected for dataset".into(), )); @@ -605,6 +748,41 @@ pub(super) async fn alter_columns( new_schema.validate()?; + // If any column being cast has an attached index, fail fast. Cast operations + // rewrite the underlying column data and silently invalidate any index on the + // affected column(s). The current behavior is to drop such indices without + // warning, which has caused production incidents where vector search silently + // regressed to brute-force scan. We require users to explicitly drop the + // index before altering the column type, so the action is never silent. 
+ if !cast_fields.is_empty() { + let indices = dataset.load_indices().await?; + let affected: Vec<&lance_table::format::IndexMetadata> = indices + .iter() + .filter(|idx| { + cast_fields + .iter() + .any(|(old, _)| idx.fields.contains(&old.id)) + }) + .collect(); + if !affected.is_empty() { + let affected_cols: Vec = cast_fields + .iter() + .filter(|(old, _)| affected.iter().any(|i| i.fields.contains(&old.id))) + .map(|(old, _)| old.name.clone()) + .collect(); + let affected_idx_names: Vec = affected.iter().map(|i| i.name.clone()).collect(); + return Err(Error::invalid_input(format!( + "Cannot cast column(s) [{}] to a new type: they have {} index(es) \ + attached: [{}]. Cast rewrites column data and invalidates any index \ + on the affected column(s). Drop the index(es) with drop_index() \ + before altering, then recreate them after the cast completes.", + affected_cols.join(", "), + affected.len(), + affected_idx_names.join(", "), + ))); + } + } + // If we aren't casting a column, we don't need to touch the fragments. let transaction = if cast_fields.is_empty() { Transaction::new( @@ -653,7 +831,7 @@ pub(super) async fn alter_columns( }; let mapper = Box::new(mapper); - let fragments = add_columns_impl( + let result = add_columns_impl( &dataset.get_fragments(), Some(read_columns), mapper, @@ -666,7 +844,8 @@ pub(super) async fn alter_columns( // Some data files may no longer contain any columns in the dataset (e.g. 
if every // remaining column has been altered into a different data file) and so we remove them let schema_field_ids = new_schema.field_ids().into_iter().collect::>(); - let fragments = fragments + let fragments = result + .fragments .into_iter() .map(|mut frag| { frag.files.retain(|f| { @@ -734,56 +913,751 @@ pub(super) async fn drop_columns(dataset: &mut Dataset, columns: &[&str]) -> Res .apply_commit(transaction, &Default::default(), &Default::default()) .await?; - Ok(()) -} + Ok(()) +} + +/// Exclude the fields from `other` Schema, and returns a new Schema. +pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result { + let other: Schema = other.try_into().map_err(|_| { + Error::schema("The other schema is not compatible with this schema".to_string()) + })?; + let mut fields = vec![]; + for field in source.fields.iter() { + if let Some(other_field) = other.field(&field.name) { + if version.support_remove_sub_column(field) + && let Some(f) = field.exclude(other_field) + { + fields.push(f) + } + } else { + fields.push(field.clone()); + } + } + Ok(Schema { + fields, + metadata: source.metadata.clone(), + }) +} + +#[cfg(test)] +mod test { + use std::{collections::HashMap, fs, num::NonZero, path::Path as StdPath, sync::Mutex}; + + use crate::dataset::WriteParams; + use arrow_array::{ + ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray, + }; + + use super::*; + use arrow_schema::Fields as ArrowFields; + use lance_core::utils::tempfile::TempStrDir; + use lance_file::version::LanceFileVersion; + use lance_table::format::{BasePath, DataFile}; + use rstest::rstest; + + // Used to validate that futures returned are Send. + fn require_send(t: T) -> T { + t + } + + fn file_paths_in(dir: impl AsRef) -> Vec { + fn collect_files( + base_dir: &StdPath, + dir: &StdPath, + files: &mut Vec, + ) -> std::io::Result<()> { + if !dir.exists() { + return Ok(()); + } + for entry in std::fs::read_dir(dir)? 
{ + let path = entry?.path(); + if path.is_dir() { + collect_files(base_dir, &path, files)?; + } else if path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|file_name| !file_name.starts_with('.')) + { + files.push( + path.strip_prefix(base_dir) + .unwrap() + .to_string_lossy() + .to_string(), + ); + } + } + Ok(()) + } + + let base_dir = dir.as_ref(); + let mut files = Vec::new(); + collect_files(base_dir, base_dir, &mut files).unwrap(); + files.sort(); + files + } + + fn data_file_paths_in(base_dir: &str) -> Vec { + file_paths_in(StdPath::new(base_dir).join("data")) + } + + #[tokio::test] + async fn test_append_columns_exprs() -> Result<()> { + let num_rows = 5; + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::Legacy), + ..Default::default() + }), + ) + .await?; + dataset.validate().await?; + + // Adding a duplicate column name will break + let fut = dataset.add_columns( + NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]), + None, + None, + ); + // (Quick validation that the future is Send) + let res = require_send(fut).await; + assert!(matches!(res, Err(Error::InvalidInput { .. }))); + + // Can add a column that is independent of any existing ones + dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]), + None, + None, + ) + .await?; + + // Can add a column derived from an existing one. 
+ dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]), + None, + None, + ) + .await?; + + // Can derive a column from existing ones across multiple data files. + dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "triple_id".into(), + "id + double_id".into(), + )]), + None, + None, + ) + .await?; + + // These can be read back, the dataset is valid + dataset.validate().await?; + + let data = dataset.scan().try_into_batch().await?; + let expected_schema = ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("value", DataType::Float64, true), + ArrowField::new("double_id", DataType::Int32, false), + ArrowField::new("triple_id", DataType::Int32, false), + ]); + assert_eq!(data.schema().as_ref(), &expected_schema); + assert_eq!(data.num_rows(), num_rows); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_with_fully_deleted_batch() -> Result<()> { + // Regression test: when an entire read batch has been deleted, the + // updater yields a 0-row batch. The inner loop then never runs and + // `batches` stays empty, so `concat_batches(&batches[0]..)` used to + // panic with "index out of bounds: the len is 0 but the index is 0". + // + // A single fragment holds 105 rows; deleting the trailing 5 rows means + // that, when read with batch_size=50, the third batch [100..105) is + // fully filtered out and produces an empty batch. 
+ let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..105))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 200, // keep all rows in a single fragment + ..Default::default() + }), + ) + .await?; + + // Delete the entire trailing batch [100..105). + dataset.delete("i >= 100").await?; + assert_eq!(dataset.count_rows(None).await?, 100); + + let new_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "j", + DataType::Int32, + false, + )])); + let new_batch = RecordBatch::try_new( + new_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..100))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(new_batch)], new_schema.clone()); + + // Read with batch_size=50 so the deleted trailing rows form a full empty batch. 
+ dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, Some(50)) + .await?; + + let data = dataset.scan().try_into_batch().await?; + assert_eq!(data.num_rows(), 100); + assert_eq!( + data.column_by_name("j").unwrap().as_ref(), + &Int32Array::from_iter_values(0..100) + ); + + Ok(()) + } + + #[rstest] + #[tokio::test] + async fn test_add_columns_cleans_up_blob_v2_data_on_stream_error( + #[values( + ("inline", b"inline".to_vec()), + ("packed", vec![1u8; 128 * 1024]), + ("dedicated", vec![2u8; 5 * 1024 * 1024]), + ("external", b"external".to_vec()) + )] + blob_case: (&str, Vec), + ) -> Result<()> { + let (blob_kind, payload) = blob_case; + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let external_dir = tempfile::tempdir()?; + let external_path = external_dir.path().join("blob.bin"); + fs::write(&external_path, &payload)?; + let external_baseline_files = file_paths_in(external_dir.path()); + let external_baseline_payload = fs::read(&external_path)?; + + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + initial_bases: Some(vec![BasePath::new( + 1, + external_dir.path().to_string_lossy().to_string(), + Some("external".to_string()), + false, + )]), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let mut blob_builder = crate::BlobArrayBuilder::new(2); + if blob_kind == "external" { + blob_builder.push_uri(external_path.to_string_lossy())?; + } else { + blob_builder.push_bytes(payload)?; + } + blob_builder.push_bytes(b"extra")?; + let blob_array = blob_builder.finish()?; + let blob_schema = 
Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let blob_batch = RecordBatch::try_new(blob_schema.clone(), vec![blob_array])?; + let reader = RecordBatchIterator::new(vec![Ok(blob_batch)], blob_schema); + + let err = dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("Stream produced more values than expected for dataset") + ); + + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean up new data files and blob v2 sidecars on failure" + ); + assert_eq!( + file_paths_in(external_dir.path()), + external_baseline_files, + "cleanup must not delete external files" + ); + assert_eq!( + fs::read(&external_path)?, + external_baseline_payload, + "cleanup must not modify external files" + ); + dataset.validate().await?; + + Ok(()) + } + + #[tokio::test] + async fn test_cleanup_preserves_checkpointed_fragment_files() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let original_fragments = dataset.get_fragments(); + assert_eq!(original_fragments.len(), 2); + + let data_dir = StdPath::new(test_uri).join("data"); + let cached_file = data_dir.join("checkpointed.lance"); + let cached_blob_dir = data_dir.join("checkpointed"); + fs::write(&cached_file, b"checkpointed data")?; + fs::create_dir_all(&cached_blob_dir)?; + fs::write( + cached_blob_dir.join("00000000000000000000000000000001.blob"), + b"blob", + 
)?; + + let mut checkpointed_fragment = original_fragments[0].metadata().clone(); + checkpointed_fragment.files.push(DataFile::new( + "checkpointed.lance", + vec![dataset.manifest.max_field_id() + 1], + vec![0], + 2, + 2, + NonZero::new(17), + None, + )); + + #[derive(Default)] + struct CheckpointedFragmentStore { + fragment: Mutex>, + } + + impl UDFCheckpointStore for CheckpointedFragmentStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 0 { + Ok(self.fragment.lock().unwrap().clone()) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Ok(()) + } + } + + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(|_| Err(Error::invalid_input("injected UDF failure"))), + output_schema: Arc::new(ArrowSchema::new(vec![ArrowField::new( + "checkpointed", + DataType::Int32, + true, + )])), + result_checkpoint: Some(Arc::new(CheckpointedFragmentStore { + fragment: Mutex::new(Some(checkpointed_fragment)), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + + assert!( + cached_file.exists(), + "cleanup must not delete fragment files restored from a checkpoint" + ); + assert!( + cached_blob_dir.exists(), + "cleanup must not delete blob sidecars restored from a checkpoint" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_current_blob_v2_writer_on_udf_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = 
TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let call_count = Arc::new(Mutex::new(0usize)); + let mapper_call_count = call_count.clone(); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut call_count = mapper_call_count.lock().unwrap(); + *call_count += 1; + if *call_count == 2 { + return Err(Error::invalid_input("injected UDF failure")); + } + + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: None, + }); + + let err = dataset + .add_columns(transforms, None, Some(1)) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written by the current unfinished writer" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_on_checkpoint_lookup_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + 
Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + + struct FailingLookupStore { + inserted: Arc>>, + } + + impl UDFCheckpointStore for FailingLookupStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 1 { + Err(Error::invalid_input("injected checkpoint lookup failure")) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } + + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) 
+ }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingLookupStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint lookup failure") + ); + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_finished_blob_v2_writer_on_checkpoint_insert_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + struct FailingInsertStore; -/// Exclude the fields from `other` Schema, and returns a new Schema. 
-pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result { - let other: Schema = other.try_into().map_err(|_| { - Error::schema("The other schema is not compatible with this schema".to_string()) - })?; - let mut fields = vec![]; - for field in source.fields.iter() { - if let Some(other_field) = other.field(&field.name) { - if version.support_remove_sub_column(field) - && let Some(f) = field.exclude(other_field) - { - fields.push(f) + impl UDFCheckpointStore for FailingInsertStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Err(Error::invalid_input("injected checkpoint insert failure")) } - } else { - fields.push(field.clone()); } + + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) 
+ }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingInsertStore)), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint insert failure") + ); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean finished writer files when checkpoint insert fails" + ); + + Ok(()) } - Ok(Schema { - fields, - metadata: source.metadata.clone(), - }) -} -#[cfg(test)] -mod test { - use std::collections::HashMap; - use std::sync::Mutex; + #[tokio::test] + async fn test_add_columns_cleans_blob_v2_files_on_declared_schema_merge_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); - use crate::dataset::WriteParams; - use arrow_array::{ - ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray, - }; + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); - use super::*; - use arrow_schema::Fields as ArrowFields; - use lance_core::utils::tempfile::TempStrDir; - use lance_file::version::LanceFileVersion; - use rstest::rstest; + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + 
vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema: Arc::new(ArrowSchema::new(vec![ + ArrowField::new("declared", DataType::Int32, true), + ArrowField::new("declared", DataType::Int32, true), + ])), + result_checkpoint: None, + }); - // Used to validate that futures returned are Send. - fn require_send(t: T) -> T { - t + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(matches!(err, Error::Schema { .. })); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written before declared schema merge fails" + ); + + Ok(()) } #[tokio::test] - async fn test_append_columns_exprs() -> Result<()> { - let num_rows = 5; + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_after_later_failure() + -> Result<()> { let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( "id", DataType::Int32, @@ -791,75 +1665,101 @@ mod test { )])); let batch = RecordBatch::try_new( schema.clone(), - vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))], + vec![Arc::new(Int32Array::from_iter_values(0..2))], )?; - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); let test_dir = TempStrDir::default(); - let test_uri = &test_dir; + let test_uri = test_dir.as_str(); let mut dataset = Dataset::write( reader, test_uri, Some(WriteParams { - data_storage_version: Some(LanceFileVersion::Legacy), + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), ..Default::default() }), ) .await?; - dataset.validate().await?; - // Adding a duplicate column name will break - let fut = dataset.add_columns( - NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]), - None, - None, - ); - // (Quick validation that the future is Send) - let res = require_send(fut).await; - 
assert!(matches!(res, Err(Error::InvalidInput { .. }))); + struct InsertThenFailStore { + inserted: Arc>>, + } - // Can add a column that is independent of any existing ones - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]), - None, - None, - ) - .await?; + impl UDFCheckpointStore for InsertThenFailStore { + fn get_batch(&self, info: &BatchInfo) -> Result> { + if info.fragment_id == 1 { + Err(Error::invalid_input("injected later checkpoint failure")) + } else { + Ok(None) + } + } - // Can add a column derived from an existing one. - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]), - None, - None, - ) - .await?; + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } - // Can derive a column from existing ones across multiple data files. - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![( - "triple_id".into(), - "id + double_id".into(), - )]), - None, - None, - ) - .await?; + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } - // These can be read back, the dataset is valid - dataset.validate().await?; + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } - let data = dataset.scan().try_into_batch().await?; - let expected_schema = ArrowSchema::new(vec![ - ArrowField::new("id", DataType::Int32, false), - ArrowField::new("value", DataType::Float64, true), - ArrowField::new("double_id", DataType::Int32, false), - ArrowField::new("triple_id", DataType::Int32, false), - ]); - assert_eq!(data.schema().as_ref(), &expected_schema); - assert_eq!(data.num_rows(), num_rows); + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = 
crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(InsertThenFailStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected later checkpoint failure") + ); + + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); Ok(()) } @@ -1784,7 +2684,6 @@ mod test { ) -> Result<()> { // Create a table with 2 scalar columns, 1 vector column - use crate::index::DatasetIndexExt; use arrow::datatypes::{Int32Type, Int64Type}; use arrow_array::{Float16Array, Float32Array, Int64Array, ListArray}; use half::f16; @@ -1885,7 +2784,10 @@ mod test { assert_eq!(f.files.len(), 2); }); - // Cast scalar column with index, should not keep index (TODO: keep it) + // Cast scalar column with index. The index must be dropped first; cast + // is now a fail-fast operation when an index is attached, see + // test_alter_columns_cast_fails_with_attached_index for that path. 
+ dataset.drop_index("i_idx").await?; dataset .alter_columns(&[ColumnAlteration::new("i".into()).cast_to(DataType::Int64)]) .await?; @@ -1906,7 +2808,8 @@ mod test { ]); assert_eq!(&ArrowSchema::from(dataset.schema()), &expected_schema); - // We currently lose the index when casting a column + // The scalar index on `i` is gone (we dropped it); the vector index on + // `vec` is still present. let indices = dataset.load_indices().await?; assert_eq!(indices.len(), 1); @@ -1915,7 +2818,8 @@ mod test { assert_eq!(f.files.len(), 3); }); - // Cast vector column, should not keep index (TODO: keep it) + // Cast vector column. Drop its index first (same reason as above). + dataset.drop_index("vec_idx").await?; dataset .alter_columns(&[ ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( @@ -1983,6 +2887,120 @@ mod test { Ok(()) } + /// Cast on a column with an attached index must fail fast rather than + /// silently dropping the index. This guards against the historical behavior + /// where cast would rewrite column data and the index would vanish without + /// any error or warning, causing vector search to silently regress to a + /// brute-force scan. + #[rstest] + #[tokio::test] + async fn test_alter_columns_cast_fails_with_attached_index( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) -> Result<()> { + use lance_arrow::FixedSizeListArrayExt; + use lance_index::IndexType; + use lance_linalg::distance::MetricType; + use lance_testing::datagen::generate_random_array; + + use crate::index::vector::VectorIndexParams; + + // Build a small dataset with one indexed vector column. 
+ let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + false, + )])); + let nrows = 256; + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new( + ::try_new_from_values( + generate_random_array(64 * nrows as usize), + 64, + ) + .unwrap(), + )], + )?; + + let test_dir = TempStrDir::default(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![Ok(batch)], schema.clone()), + &test_dir, + Some(WriteParams { + data_storage_version: Some(data_storage_version), + ..Default::default() + }), + ) + .await?; + + // Build an IVF_PQ index on the vector column. + let params = VectorIndexParams::ivf_pq(4, 8, 8, MetricType::L2, 50); + dataset + .create_index(&["vec"], IndexType::Vector, None, ¶ms, false) + .await?; + + let indices_before = dataset.load_indices().await?; + assert_eq!(indices_before.len(), 1, "precondition: index exists"); + let index_name = indices_before[0].name.clone(); + + // Attempting to cast the indexed column must fail with a clear message + // that names the offending index(es). + let result = dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await; + let err = result.expect_err("cast on indexed column should fail"); + let msg = err.to_string(); + assert!( + msg.contains("vec") && msg.contains(&index_name), + "error should mention column and index name, got: {msg}" + ); + assert!( + msg.contains("drop_index"), + "error should suggest the remediation, got: {msg}" + ); + + // The dataset must be unchanged: schema is still float32, index still present. 
+ assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + ); + let indices_after = dataset.load_indices().await?; + assert_eq!(indices_after.len(), 1, "index should still exist"); + assert_eq!(indices_after[0].name, index_name); + + // Sanity check: after dropping the index, the same cast should succeed. + dataset.drop_index(&index_name).await?; + dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await?; + assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + ), + ); + + Ok(()) + } + #[rstest] #[tokio::test] async fn test_drop_columns( diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index beb6e2b99fd..267296c984b 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -1137,6 +1137,78 @@ async fn test_fts_without_index() { assert_eq!(results.num_rows(), 1); } +#[tokio::test] +async fn test_fts_without_index_uses_scalar_index_for_prefilter() { + // Verify that flat FTS (no inverted index on text) routes its prefilter + // through `FilteredReadExec` so a scalar index on the filter column is + // actually used. Six rows with two distinct ids: a prefilter of `id = 1` + // must match exactly the three text rows tagged with id=1. 
+ let text = StringArray::from(vec![ + "alpha bravo", + "charlie delta", + "alpha echo", + "foxtrot", + "alpha golf", + "hotel india", + ]); + let ids = Int32Array::from(vec![1, 1, 1, 2, 2, 2]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![ + Field::new("text", text.data_type().to_owned(), false), + Field::new("id", ids.data_type().to_owned(), false), + ]) + .into(), + vec![Arc::new(text) as ArrayRef, Arc::new(ids) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + let test_uri = TempStrDir::default(); + let mut dataset = Dataset::write(batches, &test_uri, None).await.unwrap(); + + // Scalar index on `id` only — no FTS index on `text`. + dataset + .create_index( + &["id"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let mut scan = dataset.scan(); + scan.prefilter(true) + .full_text_search( + FullTextSearchQuery::new("alpha".to_owned()) + .with_columns(&["text".to_string()]) + .unwrap(), + ) + .unwrap() + .filter("id = 1") + .unwrap(); + + let plan = scan.analyze_plan().await.unwrap(); + // The flat-FTS path now reads via `FilteredReadExec` (prints as `LanceRead`) + // with the prefilter plumbed into it, so the scalar index on `id` is used. + assert_contains!(&plan, "FlatMatchQuery"); + assert_contains!(&plan, "LanceRead"); + assert_contains!(&plan, "full_filter=id = Int32(1)"); + // The legacy plan ran a `LanceScan` wrapped in a manual `LanceFilterExec`; + // make sure we did not regress to that shape. + assert_not_contains!(&plan, "LanceScan:"); + + let results = scan.try_into_batch().await.unwrap(); + // Only rows with id=1 AND text matching "alpha": rows 0 ("alpha bravo") + // and 2 ("alpha echo"). Row 4 ("alpha golf") has id=2 and must be excluded. 
+ assert_eq!( + results.num_rows(), + 2, + "expected the two id=1 rows that match `alpha`, got plan:\n{plan}" + ); +} + #[tokio::test] async fn test_fts_rank() { let params = InvertedIndexParams::default(); @@ -2078,11 +2150,7 @@ mod fts_serializing_backend { ) -> Option { let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - return Some( - stored_codec - .deserialize(&bytes.clone()) - .expect("deserialization should succeed"), - ); + return stored_codec.deserialize(&bytes.clone()).hit(); } drop(guard); self.passthrough.get(key, codec).await diff --git a/rust/lance/src/dataset/tests/dataset_versioning.rs b/rust/lance/src/dataset/tests/dataset_versioning.rs index a0bc7816a32..c04dd0f3183 100644 --- a/rust/lance/src/dataset/tests/dataset_versioning.rs +++ b/rust/lance/src/dataset/tests/dataset_versioning.rs @@ -211,6 +211,77 @@ async fn test_version_id_fast_path() { assert_eq!(historical.latest_version_id().await.unwrap(), 2); } +#[rstest] +#[tokio::test] +async fn test_stale_checks_cover_fast_successor_and_latest_version( + #[values(false, true)] enable_v2_manifest_paths: bool, +) { + let expected_scheme = if enable_v2_manifest_paths { + ManifestNamingScheme::V2 + } else { + ManifestNamingScheme::V1 + }; + let test_uri = TempStrDir::default(); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::UInt32, + false, + )])); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(0..5))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema.clone()); + + let original = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(original.manifest_location().naming_scheme, expected_scheme); + assert!(!original.is_stale().await.unwrap()); + assert!(!original.has_successor_version().await.unwrap()); + + let data = 
RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(5..10))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema); + let updated = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + mode: WriteMode::Append, + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + + assert!(original.is_stale().await.unwrap()); + assert!(original.has_successor_version().await.unwrap()); + assert_eq!(updated.manifest_location().naming_scheme, expected_scheme); + assert!(!updated.is_stale().await.unwrap()); + assert!(!updated.has_successor_version().await.unwrap()); + + let historical = updated.checkout_version(1).await.unwrap(); + assert_eq!( + historical.manifest_location().naming_scheme, + expected_scheme + ); + assert!(historical.is_stale().await.unwrap()); + assert!(historical.has_successor_version().await.unwrap()); +} + #[rstest] #[tokio::test] async fn test_restore( diff --git a/rust/lance/src/dataset/updater.rs b/rust/lance/src/dataset/updater.rs index b9bc34f8706..90ef8df914b 100644 --- a/rust/lance/src/dataset/updater.rs +++ b/rust/lance/src/dataset/updater.rs @@ -6,13 +6,13 @@ use futures::StreamExt; use lance_core::datatypes::{OnMissing, OnTypeMismatch}; use lance_core::utils::deletion::DeletionVector; use lance_core::{Error, Result, datatypes::Schema}; -use lance_table::format::Fragment; +use lance_table::format::{DataFile, Fragment}; use lance_table::utils::stream::ReadBatchFutStream; use super::Dataset; use super::fragment::FragmentReader; use super::scanner::get_default_batch_size; -use super::write::{GenericWriter, open_writer}; +use super::write::{GenericWriter, cleanup_data_fragments, open_update_writer}; use crate::dataset::FileFragment; use crate::dataset::utils::SchemaAdapter; @@ -146,13 +146,7 @@ impl Updater { .data_storage_format .lance_file_version()?; - open_writer( - &self.fragment.dataset().object_store, - &schema, - 
&self.fragment.dataset().base, - data_storage_version, - ) - .await + open_update_writer(self.dataset(), &schema, data_storage_version).await } /// Update one batch. @@ -221,6 +215,34 @@ impl Updater { Ok(self.fragment.metadata().clone()) } + /// Clean up any data file and blob sidecars created by the current unfinished writer. + pub(super) async fn cleanup_unfinished_writer(&mut self) { + let Some(writer) = self.writer.take() else { + return; + }; + let (path, base_id) = writer.data_file_path(); + let path = path.to_string(); + drop(writer); + + if path.is_empty() { + return; + } + + let mut fragment = Fragment::new(self.fragment.id() as u64); + // cleanup_data_fragments only needs path/base_id to remove the unfinished + // data file and any blob sidecars. Build a minimal synthetic fragment so + // we can reuse the shared cleanup path without fabricating full metadata. + fragment + .files + .push(DataFile::new(path, vec![], vec![], 0, 0, None, base_id)); + cleanup_data_fragments( + &self.dataset().object_store, + &self.dataset().base, + &[fragment], + ) + .await; + } + /// Get the final schema of the fragment after the update. /// /// This may be None if the schema is not known. 
This can happen if it was diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 1e73618fc6b..ff0a119158c 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -6,7 +6,10 @@ use chrono::TimeDelta; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use futures::{Stream, StreamExt, TryStreamExt}; -use lance_arrow::BLOB_META_KEY; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME, +}; use lance_core::datatypes::{ NullabilityComparison, OnMissing, OnTypeMismatch, SchemaCompareOptions, }; @@ -35,7 +38,9 @@ use tracing::{info, instrument}; use crate::Dataset; use crate::dataset::blob::{ - BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, preprocess_blob_batches, + BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, + blob_dedicated_threshold_from_metadata, blob_inline_threshold_from_metadata, + preprocess_blob_batches, }; use crate::session::Session; @@ -170,6 +175,77 @@ fn validate_external_blob_write_params(params: &WriteParams) -> Result<()> { Ok(()) } +fn validate_blob_threshold_metadata_for_append( + input_schema: &Schema, + dataset_schema: &Schema, +) -> Result<()> { + for input_field in &input_schema.fields { + let Some(dataset_field) = dataset_schema.field(&input_field.name) else { + continue; + }; + let input_is_blob_v2 = input_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + let dataset_is_blob_v2 = dataset_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + if !input_is_blob_v2 && !dataset_is_blob_v2 { + continue; + } + + let has_inline_threshold = input_field + .metadata + .contains_key(BLOB_INLINE_SIZE_THRESHOLD_META_KEY); + let has_dedicated_threshold = input_field + 
.metadata + .contains_key(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY); + if !has_inline_threshold && !has_dedicated_threshold { + continue; + } + + if has_inline_threshold { + let input_inline_threshold = + blob_inline_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_inline_threshold = + blob_inline_threshold_from_metadata(&dataset_field.metadata, &dataset_field.name)?; + if input_inline_threshold != dataset_inline_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + input_inline_threshold, + input_field.name, + dataset_inline_threshold, + ))); + } + } + if has_dedicated_threshold { + let input_dedicated_threshold = + blob_dedicated_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_dedicated_threshold = blob_dedicated_threshold_from_metadata( + &dataset_field.metadata, + &dataset_field.name, + )?; + if input_dedicated_threshold != dataset_dedicated_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. 
Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + input_dedicated_threshold, + input_field.name, + dataset_dedicated_threshold, + ))); + } + } + } + + Ok(()) +} + /// Auto cleanup parameters #[derive(Debug, Clone)] pub struct AutoCleanupParams { @@ -507,7 +583,7 @@ pub async fn do_write_fragments( }; let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 - && schema.fields.iter().any(|field| field.is_blob_v2()) + && schema.fields_pre_order().any(|field| field.is_blob_v2()) { Some(Arc::new( build_external_base_resolver(dataset, ¶ms).await?, @@ -953,6 +1029,7 @@ pub async fn write_fragments_internal( ..Default::default() }, )?; + validate_blob_threshold_metadata_for_append(&converted_schema, dataset.schema())?; let write_schema = dataset.schema().project_by_schema( &converted_schema, OnMissing::Error, @@ -984,7 +1061,8 @@ pub async fn write_fragments_internal( (converted_schema, params.storage_version_or_default()) }; - if storage_version < LanceFileVersion::V2_2 && schema.fields.iter().any(|f| f.is_blob_v2()) { + if storage_version < LanceFileVersion::V2_2 && schema.fields_pre_order().any(|f| f.is_blob_v2()) + { return Err(Error::invalid_input(format!( "Blob v2 requires file version >= 2.2 (got {:?})", storage_version @@ -992,13 +1070,10 @@ pub async fn write_fragments_internal( } if storage_version >= LanceFileVersion::V2_2 - && schema - .fields - .iter() - .any(|f| f.metadata.contains_key(BLOB_META_KEY)) + && let Some(blob_field_path) = legacy_blob_field_path(&schema) { return Err(Error::invalid_input(format!( - "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." + "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. 
Found legacy blob field: {blob_field_path}. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." ))); } @@ -1017,10 +1092,23 @@ pub async fn write_fragments_internal( Ok((fragments, schema)) } +fn legacy_blob_field_path(schema: &Schema) -> Option { + schema + .fields_pre_order() + .find(|field| field.metadata.contains_key(BLOB_META_KEY)) + .map(|field| { + schema + .field_path(field.id) + .unwrap_or_else(|_| field.name.clone()) + }) +} + #[async_trait::async_trait] pub trait GenericWriter: Send { /// Write the given batches to the file async fn write(&mut self, batches: &[RecordBatch]) -> Result<()>; + /// Get the file path and base ID for the data file being written. + fn data_file_path(&self) -> (&str, Option); /// Get the current position in the file /// /// We use this to know when the file is too large and we need to start @@ -1047,6 +1135,9 @@ where async fn write(&mut self, batches: &[RecordBatch]) -> Result<()> { self.writer.write(batches).await } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await? as u64) } @@ -1087,6 +1178,9 @@ impl GenericWriter for V2WriterAdapter { } Ok(()) } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await?) } @@ -1140,6 +1234,39 @@ pub async fn open_writer( .await } +pub(super) async fn open_update_writer( + dataset: &Dataset, + schema: &Schema, + storage_version: LanceFileVersion, +) -> Result> { + // add_columns / alter_columns reuse the normal writer stack, but they do not + // flow through WriteParams. Rebuild the external base resolver here so blob + // v2 reference columns can resolve dataset-registered external URIs. 
+ let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 + && schema.fields_pre_order().any(|f| f.is_blob_v2()) + { + Some(Arc::new( + build_external_base_resolver(Some(dataset), &WriteParams::default()).await?, + )) + } else { + None + }; + + open_writer_with_options( + &dataset.object_store, + schema, + &dataset.base, + storage_version, + WriterOptions { + add_data_dir: true, + external_base_resolver, + source_store_registry: dataset.session.store_registry(), + ..Default::default() + }, + ) + .await +} + #[derive(Default)] struct WriterOptions { add_data_dir: bool, @@ -1216,7 +1343,7 @@ async fn open_writer_with_options( source_store_registry, source_store_params, blob_pack_file_size_threshold, - )) + )?) } else { None }; diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index 20209ed7f30..bfd702c9c3b 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -442,7 +442,7 @@ struct WriteContext<'a> { mod test { use std::collections::HashMap; - use arrow_array::{BinaryArray, Int32Array, RecordBatchReader, StructArray}; + use arrow_array::{ArrayRef, BinaryArray, Int32Array, RecordBatchReader, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema}; use lance_arrow::BLOB_META_KEY; @@ -559,6 +559,41 @@ mod test { } } + #[tokio::test] + async fn create_v2_2_dataset_rejects_nested_legacy_blob_schema() { + let image_field = Field::new("image_bytes", DataType::Binary, true).with_metadata( + HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]), + ); + let schema = Arc::new(Schema::new(vec![Field::new( + "summary_image_nested", + DataType::Struct(vec![image_field.clone()].into()), + true, + )])); + let image_values: ArrayRef = Arc::new(BinaryArray::from(vec![Some(b"abc".as_slice())])); + let nested_values = StructArray::from(vec![(Arc::new(image_field), image_values)]); + let batch = RecordBatch::try_new(schema.clone(), 
vec![Arc::new(nested_values)]).unwrap(); + + let dataset = InsertBuilder::new("memory://forced-nested-blob-v2") + .with_params(&WriteParams { + mode: WriteMode::Create, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await; + + let err = dataset.unwrap_err(); + match err { + Error::InvalidInput { source, .. } => { + let message = source.to_string(); + assert!(message.contains("Legacy blob columns")); + assert!(message.contains("summary_image_nested.image_bytes")); + assert!(message.contains("lance.blob.v2")); + } + other => panic!("unexpected error: {other:?}"), + } + } + mod external_error { use super::*; use std::fmt; diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index 1f3414db4f8..b14421c963f 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -2224,18 +2224,13 @@ impl Merger { &self.output_schema } - // Retrieves a bitmap of rows where at least one of the columns in the range - // col_offset..coll_offset+num_cols is not null. - // - fn not_all_null( - batch: &RecordBatch, - col_offset: usize, - num_cols: usize, - ) -> Result { + // Retrieves a bitmap of rows where at least one of the given columns is + // not null. + fn not_all_null(batch: &RecordBatch, cols: &[usize]) -> Result { // For our purposes we know there is always at least 1 on key - debug_assert_ne!(num_cols, 0); - let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(col_offset))?; - for idx in col_offset + 1..col_offset + num_cols { + debug_assert!(!cols.is_empty()); + let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(cols[0]))?; + for &idx in &cols[1..] 
{ let is_valid = arrow::compute::is_not_null(batch.column(idx))?; at_least_one_valid = arrow::compute::or(&at_least_one_valid, &is_valid)?; } @@ -2263,8 +2258,37 @@ impl Merger { right_offset: usize, num_keys: usize, ) -> Result<(BooleanArray, BooleanArray, BooleanArray)> { - let in_left = Self::not_all_null(combined_batch, 0, num_keys)?; - let in_right = Self::not_all_null(combined_batch, right_offset, num_keys)?; + // The outer join distinguishes its three cases by which side's join + // keys were NULL-padded: a present row always has non-null keys, while + // the absent side is filled with NULLs. We therefore test the *key* + // columns, located by name. They are NOT necessarily the first + // `num_keys` columns — a partial-schema source can place a payload + // column (e.g. an all-null vector) at position 0, and checking + // positions [0, num_keys) there misreads an all-null leading payload + // column as an absent join side, silently dropping every matched row + // (https://github.com/lancedb/lancedb/issues/3515). The target half + // carries the same columns in the same order, offset by `right_offset`. 
+ let source_key_cols = self + .params + .on + .iter() + .map(|key| { + combined_batch.schema().index_of(key).map_err(|_| { + Error::internal(format!( + "merge insert key column '{}' not found in joined batch", + key + )) + }) + }) + .collect::>>()?; + debug_assert_eq!(source_key_cols.len(), num_keys); + let target_key_cols = source_key_cols + .iter() + .map(|c| c + right_offset) + .collect::>(); + + let in_left = Self::not_all_null(combined_batch, &source_key_cols)?; + let in_right = Self::not_all_null(combined_batch, &target_key_cols)?; let in_both = arrow::compute::and(&in_left, &in_right)?; let left_only = arrow::compute::and(&in_left, &arrow::compute::not(&in_right)?)?; let right_only = arrow::compute::and(&arrow::compute::not(&in_left)?, &in_right)?; @@ -3517,6 +3541,116 @@ mod tests { } } + /// Reproduces https://github.com/lancedb/lancedb/issues/3515: + /// a partial-schema `merge_insert` with a scalar index on the join key, + /// where every fragment is covered by the index (no unindexed data), + /// silently updates 0 rows instead of the expected matches. + #[rstest::rstest] + #[tokio::test] + async fn test_repro_3515_partial_schema_fully_indexed( + #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1, LanceFileVersion::V2_2)] + version: LanceFileVersion, + ) { + const N: usize = 1000; + const UPD: usize = 128; + let vec_field = Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4), + true, + ); + let full_schema = Arc::new(Schema::new(vec![ + vec_field.clone(), + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("file_size", DataType::Int64, true), + ])); + + // 1000 rows: vector all-null, path "/img/{i}.jpg", status "pending". 
+ let paths = StringArray::from((0..N).map(|i| format!("/img/{i}.jpg")).collect::>()); + let statuses = StringArray::from(vec!["pending"; N]); + let file_sizes = Int64Array::from((0..N as i64).map(|i| 1000 + i).collect::>()); + let null_vectors = arrow_array::new_null_array(vec_field.data_type(), N); + let batch = RecordBatch::try_new( + full_schema.clone(), + vec![ + null_vectors, + Arc::new(paths), + Arc::new(statuses), + Arc::new(file_sizes), + ], + ) + .unwrap(); + + let mut ds = Dataset::write( + RecordBatchIterator::new([Ok(batch)], full_schema.clone()), + "memory://", + Some(WriteParams { + data_storage_version: Some(version), + ..Default::default() + }), + ) + .await + .unwrap(); + + // Scalar index on the merge key, covering every fragment. + ds.create_index( + &["path"], + IndexType::Scalar, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let ds = Arc::new(ds); + + // Partial-schema source (no `file_size`): update the first 128 rows. + let upd_schema = Arc::new(Schema::new(vec![ + vec_field, + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + ])); + let upd_paths = StringArray::from( + (0..UPD) + .map(|i| format!("/img/{i}.jpg")) + .collect::>(), + ); + let upd_vectors = + FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.1f32; 4 * UPD]), 4) + .unwrap(); + let upd_statuses = StringArray::from(vec!["indexed"; UPD]); + let updates = RecordBatch::try_new( + upd_schema.clone(), + vec![ + Arc::new(upd_vectors), + Arc::new(upd_paths), + Arc::new(upd_statuses), + ], + ) + .unwrap(); + + let (ds, stats) = MergeInsertBuilder::try_new(ds.clone(), vec!["path".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::DoNothing) + .try_build() + .unwrap() + .execute_reader(RecordBatchIterator::new([Ok(updates)], upd_schema)) + .await + .unwrap(); + + assert_eq!( + stats.num_updated_rows, UPD as u64, + "expected {UPD} updated rows on 
{version:?}, got {}", + stats.num_updated_rows + ); + let n_indexed = ds + .count_rows(Some("status = 'indexed'".to_string())) + .await + .unwrap(); + assert_eq!(n_indexed, UPD, "expected {UPD} rows flipped to 'indexed'"); + } + #[tokio::test] async fn test_indexed_merge_insert() { let test_dir = TempStrDir::default(); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 8984d507408..1a3a3aa54ec 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -29,9 +29,7 @@ use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndex}; use lance_index::optimize::OptimizeOptions; use lance_index::pb::index::Implementation; pub use lance_index::progress::{IndexBuildProgress, NoopIndexBuildProgress}; -use lance_index::scalar::expression::{ - IndexInformationProvider, MultiQueryParser, ScalarQueryParser, -}; +use lance_index::scalar::expression::{IndexInformationProvider, MultiQueryParser}; use lance_index::scalar::inverted::{InvertedIndex, InvertedIndexPlugin}; use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::registry::{TrainingCriteria, TrainingOrdering}; @@ -57,7 +55,7 @@ use lance_io::utils::{ read_version, }; use lance_table::format::{Fragment, SelfDescribingFileReader}; -use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata, list_index_files_with_sizes}; use lance_table::io::manifest::read_manifest_indexes; use roaring::RoaringBitmap; use scalar::index_matches_criteria; @@ -166,7 +164,8 @@ pub(crate) async fn build_index_metadata_from_segments( let mut new_indices = Vec::with_capacity(segments.len()); for segment in segments { let (uuid, fragment_bitmap, index_details, index_version) = segment.into_parts(); - if index_details.type_url.ends_with("InvertedIndexDetails") { + let is_inverted_index = index_details.type_url.ends_with("InvertedIndexDetails"); + if is_inverted_index { let metadata = IndexMetadata { uuid, name: index_name.to_string(), @@ 
-183,7 +182,10 @@ pub(crate) async fn build_index_metadata_from_segments( .await?; } let index_dir = dataset.indices_dir().clone().join(uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + let mut files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + if is_inverted_index { + retain_committed_inverted_files(&mut files); + } new_indices.push(IndexMetadata { uuid, name: index_name.to_string(), @@ -201,6 +203,10 @@ pub(crate) async fn build_index_metadata_from_segments( Ok(new_indices) } +fn retain_committed_inverted_files(files: &mut Vec) { + files.retain(|file| !file.path.starts_with("staging/")); +} + fn validate_segment_index_details(index_name: &str, segments: &[IndexMetadata]) -> Result<()> { let mut type_url = None::<&str>; for segment in segments { @@ -652,10 +658,10 @@ pub struct ScalarIndexInfo { } impl IndexInformationProvider for ScalarIndexInfo { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) - .map(|(ty, parser)| (ty, parser.as_ref() as &dyn ScalarQueryParser)) + .map(|(ty, parser)| (ty, parser.as_ref())) } fn fragment_bitmap(&self, column: &str, index_name: &str) -> Option { @@ -1891,9 +1897,15 @@ impl DatasetIndexInternalExt for Dataset { if let Some(entry) = self.index_cache.get_with_key(&state_key).await { log::debug!("Found IvfIndexState in cache uuid: {}", uuid); let partition_cache = self.index_cache.with_key_prefix(&state_key.key()); + let frag_reuse_index = self.open_frag_reuse_index(metrics).await?; return entry .0 - .reconstruct(object_store, self.metadata_cache.as_ref(), partition_cache) + .reconstruct( + object_store, + self.metadata_cache.as_ref(), + partition_cache, + frag_reuse_index, + ) .await; } @@ -2158,6 +2170,15 @@ impl DatasetIndexInternalExt for Dataset { }; let (index, ivf_entry) = result?; 
metrics.record_index_load(); + // Attribute the one-time index-open I/O (file footers, IVF centroids, + // quantization metadata) to this query's metrics. This runs only on a + // real open; cache hits return earlier, so a warm query reports zero + // index-open I/O. + if let Some(io_stats) = metrics.io_stats() + && let Some(open_stats) = index.open_io_stats() + { + io_stats.add_scan_stats(&open_stats); + } if let Some(ivf_entry) = ivf_entry { let state_key = IvfIndexStateCacheKey::new(uuid, frag_reuse_uuid.as_ref()); self.index_cache diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 388f3170251..037e1086d57 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -94,9 +94,9 @@ pub async fn build_old_data_filter( } } -/// Split the stored fragment coverage of `segments` into fragments still live -/// in `dataset` (`effective`) and fragments that compaction or deletion has -/// already retired (`deleted`). +/// Split the stored fragment coverage of `segments` into fragments still live in +/// `dataset` (`effective`) and fragments that compaction or deletion has already +/// retired (`deleted`). pub fn split_segment_coverage<'a>( dataset: &Dataset, segments: impl IntoIterator, @@ -114,44 +114,32 @@ pub fn split_segment_coverage<'a>( (effective, deleted) } -/// Build one [`OldIndexDataFilter`] per segment, each derived from that -/// segment's *own* effective (still-live) and retired fragment coverage. +/// Build one [`OldIndexDataFilter`] per segment, each derived from that segment's +/// *own* effective (still-live) and retired fragment coverage, plus the union of +/// every segment's still-live coverage. 
pub async fn build_per_segment_filters( dataset: &Dataset, segments: &[&IndexMetadata], -) -> Result>> { +) -> Result<(RoaringBitmap, Vec>)> { + let mut effective_union = RoaringBitmap::new(); let mut filters = Vec::with_capacity(segments.len()); for segment in segments { + if segment.fragment_bitmap.is_none() { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + ))); + } let effective = segment .effective_fragment_bitmap(&dataset.fragment_bitmap) .unwrap_or_default(); let deleted = segment .deleted_fragment_bitmap(&dataset.fragment_bitmap) .unwrap_or_default(); + effective_union |= &effective; filters.push(build_old_data_filter(dataset, &effective, &deleted).await?); } - Ok(filters) -} - -/// Validate that every segment carries fragment coverage, then return the -/// combined still-live coverage (for the merged segment's fragment bitmap) -/// together with one [`OldIndexDataFilter`] per segment. -pub async fn effective_coverage_and_filters( - dataset: &Dataset, - segments: &[IndexMetadata], -) -> Result<(RoaringBitmap, Vec>)> { - for segment in segments { - if segment.fragment_bitmap.is_none() { - return Err(Error::invalid_input(format!( - "CreateIndex: segment {} is missing fragment coverage", - segment.uuid - ))); - } - } - let (effective, _deleted) = split_segment_coverage(dataset, segments); - let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); - let filters = build_per_segment_filters(dataset, &segment_refs).await?; - Ok((effective, filters)) + Ok((effective_union, filters)) } async fn load_unindexed_training_data( @@ -292,11 +280,11 @@ async fn merge_scalar_indices<'a>( load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed) .await?; let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?; - let old_data_filters = - build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; match index_type { IndexType::BTree => 
{ + let (_, old_data_filters) = + build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; crate::index::scalar::btree::open_and_merge_segments( dataset.as_ref(), field_path, @@ -315,6 +303,8 @@ async fn merge_scalar_indices<'a>( .update(new_data_stream, &new_store, None) .await? } else { + let (_, old_data_filters) = + build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; crate::index::scalar::bitmap::open_and_merge_segments( dataset.as_ref(), field_path, @@ -327,10 +317,6 @@ async fn merge_scalar_indices<'a>( } } _ => { - // Non-segmented scalar types only reach this branch with a single - // selected segment, so the union filter equals that segment's - // filter. Built lazily here so the segmented BTree/Bitmap paths - // above don't pay an extra row-id-sequence load they never use. let old_data_filter = build_old_data_filter( dataset.as_ref(), &effective_old_frags, @@ -840,7 +826,7 @@ mod tests { use rstest::rstest; use crate::dataset::builder::DatasetBuilder; - use crate::dataset::optimize::compact_files; + use crate::dataset::optimize::{CompactionOptions, compact_files}; use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; @@ -2014,99 +2000,6 @@ mod tests { ); } - #[tokio::test] - async fn test_optimize_btree_no_duplicate_row_addr() { - let test_dir = TempStrDir::default(); - let test_uri = test_dir.as_str(); - - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("payload", DataType::Int32, false), - ])); - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1])), - Arc::new(Int32Array::from(vec![10])), - ], - ) - .unwrap(); - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); - - let 
params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); - dataset - .create_index( - &["id"], - IndexType::BTree, - Some("id_idx".into()), - &params, - true, - ) - .await - .unwrap(); - - // Reordered source columns (payload, id) force the partial-schema - // RewriteColumns path instead of a row rewrite. - let source_schema = Arc::new(Schema::new(vec![ - Field::new("payload", DataType::Int32, false), - Field::new("id", DataType::Int32, false), - ])); - let source_batch = RecordBatch::try_new( - source_schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![100])), - Arc::new(Int32Array::from(vec![1])), - ], - ) - .unwrap(); - let merge_job = - MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()]) - .unwrap() - .when_matched(WhenMatched::UpdateAll) - .try_build() - .unwrap(); - let source_reader = Box::new(RecordBatchIterator::new( - [Ok(source_batch)], - source_schema.clone(), - )); - merge_job - .execute(reader_to_stream(source_reader)) - .await - .unwrap(); - - // Build a delta BTree segment over the now-unindexed fragment. - let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); - dataset - .optimize_indices(&OptimizeOptions::append()) - .await - .unwrap(); - assert_eq!( - dataset.load_indices_by_name("id_idx").await.unwrap().len(), - 2, - "append must create a delta segment over the rewritten fragment" - ); - - // Force the old segment + delta segment to merge. 
- dataset - .optimize_indices(&OptimizeOptions::merge(2)) - .await - .unwrap(); - - let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); - let rows = dataset - .scan() - .filter("id = 1") - .unwrap() - .project(&["id"]) - .unwrap() - .try_into_batch() - .await - .unwrap() - .num_rows(); - assert_eq!(rows, 1, "id = 1 must return exactly one row after merge"); - } - #[tokio::test] async fn test_optimize_bitmap_no_stale_postings() { async fn query_count(dataset: &Dataset, value: &str) -> usize { @@ -2225,6 +2118,107 @@ mod tests { ); } + #[tokio::test] + async fn test_optimize_bitmap_merge_remaps_deferred_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("cat", DataType::Int32, false)])); + let make = |range: std::ops::Range<i32>| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(range))], + ) + .unwrap() + }; + + // Two fragments: [0, 50) and [50, 100). + let reader = + RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index( + &["cat"], + IndexType::Bitmap, + Some("cat_idx".into()), + &params, + true, + ) + .await + .unwrap(); + + // Deferred-remap compaction fuses the two fragments into one and leaves a + // pending FragReuseIndex; the bitmap segment is not eagerly remapped, so + // its on-disk postings still reference the pre-compaction fragments. + compact_files( + &mut dataset, + CompactionOptions { + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + // Append a third fragment, left unindexed. 
+ let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .append( + RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()), + None, + ) + .await + .unwrap(); + + // Merge the deferred-remapped old segment with the new delta. + dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + // A value from the compacted fragments must still be found via the index; + // a missing remap would point the posting at a retired fragment address. + let hit = dataset + .scan() + .filter("cat = 25") + .unwrap() + .project(&["cat"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!( + hit, 1, + "compacted-then-merged row must remain queryable via the bitmap index" + ); + let total = dataset + .scan() + .filter("cat >= 0") + .unwrap() + .project(&["cat"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(total, 150, "no rows may be lost across compaction + merge"); + } + #[tokio::test] async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { async fn query_id_count(dataset: &Dataset, id: &str) -> usize { @@ -2359,4 +2353,205 @@ mod tests { assert_eq!(after_default[0].uuid, original_uuid); assert_eq!(dataset.manifest.version, original_version); } + + #[rstest] + #[case::address_row_ids(false)] + #[case::stable_row_ids(true)] + #[tokio::test] + async fn test_optimize_btree_no_duplicate_row_addr(#[case] use_stable_row_ids: bool) { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("payload", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![10])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], 
schema.clone()); + let write_params = WriteParams { + enable_stable_row_ids: use_stable_row_ids, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + &params, + true, + ) + .await + .unwrap(); + + // Reordered source columns (payload, id) force the partial-schema + // RewriteColumns path instead of a full row rewrite. + let source_schema = Arc::new(Schema::new(vec![ + Field::new("payload", DataType::Int32, false), + Field::new("id", DataType::Int32, false), + ])); + let source_batch = RecordBatch::try_new( + source_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![100])), + Arc::new(Int32Array::from(vec![1])), + ], + ) + .unwrap(); + let merge_job = + MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .try_build() + .unwrap(); + let source_reader = Box::new(RecordBatchIterator::new( + [Ok(source_batch)], + source_schema.clone(), + )); + merge_job + .execute(reader_to_stream(source_reader)) + .await + .unwrap(); + + // Build a delta BTree segment over the now-unindexed fragment. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 2, + "append must create a delta segment over the rewritten fragment" + ); + + // Force the old segment + delta segment to merge. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let rows = dataset + .scan() + .filter("id = 1") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 1, "id = 1 must return exactly one row after merge"); + } + + #[tokio::test] + async fn test_optimize_btree_merge_remaps_deferred_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let make = |range: std::ops::Range<i32>| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(range))], + ) + .unwrap() + }; + + // Two fragments: [0, 50) and [50, 100). + let reader = + RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + &params, + true, + ) + .await + .unwrap(); + + // Deferred-remap compaction fuses the two fragments into one and leaves a + // pending FragReuseIndex; the index segment is not eagerly remapped. + compact_files( + &mut dataset, + CompactionOptions { + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + // Append a third fragment, left unindexed. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .append( + RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()), + None, + ) + .await + .unwrap(); + + // Merge the deferred-remapped old segment with the new delta. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + // A value from the compacted fragments must still be found via the index. + let hit = dataset + .scan() + .filter("id = 25") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!( + hit, 1, + "compacted-then-merged row must remain queryable via the index" + ); + let total = dataset + .scan() + .filter("id >= 0") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(total, 150, "no rows may be lost across compaction + merge"); + } } diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 37b4df81404..507c2d23114 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -1998,6 +1998,23 @@ mod tests { let segments = input_segments.clone(); assert_eq!(segments.len(), input_segments.len()); + crate::index::scalar::inverted::finalize_segment_files_if_needed( + &dataset, + &input_segments[0], + ) + .await + .unwrap(); + let stale_staging_path = dataset + .indices_dir() + .join(input_segments[0].uuid.to_string()) + .join("staging") + .join("orphan.lance"); + dataset + .object_store + .put(&stale_staging_path, b"stale") + .await + .unwrap(); + dataset .commit_existing_index_segments("text_idx", "text", segments) .await @@ -2021,6 +2038,19 @@ mod tests { let indices = dataset.load_indices_by_name("text_idx").await.unwrap(); assert_eq!(indices.len(), input_segments.len()); + let finalized_segment = indices + .iter() + .find(|index| index.uuid == input_segments[0].uuid) + .expect("finalized segment should be committed"); + assert!( + finalized_segment + .files + .as_ref() + .expect("committed segment should track files") + .iter() + .all(|file| !file.path.starts_with("staging/")), + "stale staging files must not be committed in IndexMetadata.files" + 
); } #[tokio::test] diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 794ce399108..585e4e2ce72 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -12,6 +12,8 @@ pub(crate) mod zonemap; pub use inverted::{load_segment_details, load_segments}; +pub use crate::index::scalar_logical::{LogicalScalarIndex, load_named_scalar_segments}; + use std::sync::{Arc, LazyLock}; use uuid::Uuid; diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs index d5bbdcf2961..a947de1cbe3 100644 --- a/rust/lance/src/index/scalar/bitmap.rs +++ b/rust/lance/src/index/scalar/bitmap.rs @@ -54,13 +54,10 @@ pub(in crate::index) async fn merge_segments( })?; let field_path = dataset.schema().field_path(field_id)?; - // Intersect each segment's stored coverage with the dataset's current - // fragments so we don't claim coverage on row addresses that compaction or - // pruning has already retired. + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let (fragment_bitmap, old_data_filters) = - crate::index::append::effective_coverage_and_filters(dataset, &segments).await?; + crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?; - let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?; let new_uuid = Uuid::new_v4(); diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs index 268048da4dd..4339b8c183b 100644 --- a/rust/lance/src/index/scalar/btree.rs +++ b/rust/lance/src/index/scalar/btree.rs @@ -117,18 +117,15 @@ pub(crate) async fn merge_segments( })?; let field_path = dataset.schema().field_path(field_id)?; - // Intersect each segment's stored bitmap with the dataset's current - // fragments so we don't claim coverage on IDs that compaction or pruning - // has already retired. 
+ let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let (fragment_bitmap, old_data_filters) = - crate::index::append::effective_coverage_and_filters(dataset, &segments).await?; + crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?; let output_uuid = Uuid::new_v4(); let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?; // Pure segment consolidation: no dataset scan, so `new_data` is an empty // stream and the merge is driven entirely by the source page data. let empty_new_data = empty_btree_update_stream(dataset, field_id)?; - let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let created_index = open_and_merge_segments( dataset, &field_path, diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index 75465cc817c..f3a7b637202 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ b/rust/lance/src/index/scalar_logical.rs @@ -31,7 +31,17 @@ pub struct LogicalScalarIndex { } impl LogicalScalarIndex { - fn try_new(name: String, column: String, segments: Vec>) -> Result { + /// Merge several already-opened segments of one scalar index into a single + /// searchable [`ScalarIndex`]. + /// + /// Used internally by `open_named_scalar_index`, and exposed so a + /// distributed query engine can open an explicit subset of a scalar + /// index's segments and present them as one index. + pub fn try_new( + name: String, + column: String, + segments: Vec>, + ) -> Result { let Some(first) = segments.first() else { return Err(Error::invalid_input(format!( "LogicalScalarIndex '{}' on column '{}' must contain at least one segment", @@ -210,7 +220,14 @@ fn index_intersects_dataset(index: &IndexMetadata, dataset: &Dataset) -> bool { .is_some_and(|index_bitmap| index_bitmap.intersection_len(&dataset.fragment_bitmap) > 0) } -async fn load_named_scalar_segments( +/// List the committed, dataset-intersecting segments of a named scalar index. 
+/// +/// Returns one [`IndexMetadata`] per usable segment. The result length is the +/// segment count: `1` means a single (non-segmented) index, `> 1` means the +/// index is split across multiple segments that a distributed engine may route +/// to different executors. All returned segments are validated to share the +/// same underlying index type. +pub async fn load_named_scalar_segments( dataset: &Dataset, column: &str, index_name: &str, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 0eb66ea2ede..af48bc94c41 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -9,6 +9,7 @@ use std::{any::Any, collections::HashMap}; pub mod builder; pub(crate) mod details; +pub mod hamming; pub mod ivf; pub mod pq; pub mod utils; diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 579449cc087..1e4fec8c762 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -1045,7 +1045,7 @@ impl IvfIndexBuilder continue; } - let part_storage = existing_index.load_partition_storage(part_id).await?; + let part_storage = existing_index.load_partition_storage(part_id, None).await?; let mut part_batches = part_storage.to_batches()?.collect::>(); // for PQ, the PQ codes are transposed, so we need to transpose them back match Q::quantization_type() { diff --git a/rust/lance/src/index/vector/hamming.rs b/rust/lance/src/index/vector/hamming.rs new file mode 100644 index 00000000000..ba6ea98c42d --- /dev/null +++ b/rust/lance/src/index/vector/hamming.rs @@ -0,0 +1,938 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Hamming distance clustering for IVF_FLAT indices. +//! +//! This module provides functionality to perform pairwise hamming distance +//! computation and clustering on specific partitions of IVF_FLAT indices. 
+ +use std::time::Instant; + +use arrow_array::RecordBatchReader; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_schema::DataType; +use lance_core::{Error, Result}; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::vector::VectorIndex; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex}; +use lance_index::vector::flat::storage::FLAT_COLUMN; +use lance_index::vector::storage::VectorStore; +use lance_linalg::distance::{ + ClusteringResult, cluster_pairwise_result, extract_hashes_from_fixed_list, + pairwise_hamming_distance_parallel, +}; +use rand::rng; +use rand::seq::index::sample; + +use crate::dataset::Dataset; +use crate::index::{DatasetIndexExt, DatasetIndexInternalExt}; + +use super::ivf::v2::IVFIndex; + +/// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. +/// +/// This function loads a specific partition from an IVF_FLAT index on a hash column, +/// computes pairwise hamming distances between all hashes in the partition, +/// filters by threshold, and clusters the results using union-find. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `index_name` - Name of the IVF_FLAT index on the hash column +/// * `partition_id` - The partition ID within the IVF_FLAT index +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The index doesn't exist or is not an IVF_FLAT index +/// - The indexed column has wrong type (must be `FixedSizeList`) +/// - The partition ID is out of range +pub async fn hamming_clustering_for_ivf_partition( + dataset: &Dataset, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, +) -> Result> { + // Load indices and find the IVF_FLAT index + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be 
FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Open the vector index + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + // Try to downcast to IVFIndex (IVF_FLAT for binary data) + let ivf_index = index + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + // Check partition ID is valid + let num_partitions = ivf_index.ivf_model().num_partitions(); + if partition_id >= num_partitions { + return Err(Error::invalid_input(format!( + "Partition ID {} is out of range (0..{})", + partition_id, num_partitions + ))); + } + + // Load the partition storage + let storage = ivf_index.load_partition_storage(partition_id, None).await?; + + // Get row IDs + let row_id_slice: Vec = storage.row_ids().copied().collect(); + + if row_id_slice.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Get vectors from the storage batches + let batches: Vec<_> = storage.to_batches()?.collect(); + if batches.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Extract the hash vectors from the FLAT_COLUMN + let mut all_hashes = Vec::new(); + for batch in &batches { + let vectors = batch + .column_by_name(FLAT_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in storage", FLAT_COLUMN)) + })? 
+ .as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(vectors)?; + all_hashes.extend(hashes); + } + + // Compute pairwise hamming distances with threshold filtering + let pairwise_result = pairwise_hamming_distance_parallel( + &all_hashes, + Some(&row_id_slice), + Some(hamming_threshold), + ); + + // Cluster the results + let clustering = cluster_pairwise_result(&pairwise_result); + + Ok(clustering.into_reader(None)) +} + +/// Get partition statistics for an IVF_FLAT index. +pub async fn get_ivf_partition_info( + dataset: &Dataset, + index_name: &str, +) -> Result> { + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + let ivf_index = index + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + let num_partitions = ivf_index.ivf_model().num_partitions(); + let mut partition_infos = Vec::with_capacity(num_partitions); + + for i in 0..num_partitions { + partition_infos.push(PartitionInfo { + partition_id: i, + size: ivf_index.ivf_model().partition_size(i), + }); + } + + Ok(partition_infos) +} + +/// Information about an IVF partition. 
+#[derive(Debug, Clone)] +pub struct PartitionInfo { + pub partition_id: usize, + pub size: usize, +} + +/// Perform pairwise hamming distance clustering on sampled rows from a dataset. +/// +/// This function samples N rows randomly from the dataset, extracts hashes, +/// computes pairwise hamming distances, and clusters the results. +/// It's useful for benchmarking and testing without requiring an IVF index. +/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `sample_size` - Number of rows to sample (if None or >= total rows, uses all rows) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub async fn hamming_clustering_for_sample( + dataset: &Dataset, + column: &str, + sample_size: Option, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get total row count + let total_rows: usize = dataset + .get_fragments() + .iter() + .filter_map(|f| f.metadata().physical_rows) + .sum(); + + let use_sampling = sample_size.is_some_and(|s| s < 
total_rows); + let effective_sample = sample_size.unwrap_or(total_rows).min(total_rows); + + // Read data + let (hashes, row_ids) = if use_sampling { + // Random sample using take() with _rowid (take uses positional indices) + let indices: Vec = sample(&mut rng(), total_rows, effective_sample) + .iter() + .map(|i| i as u64) + .collect(); + + let batch = dataset + .take( + &indices, + crate::dataset::ProjectionRequest::from_columns( + [column, "_rowid"], + dataset.schema(), + ), + ) + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in take result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + } else { + // Full scan + let batch = dataset + .scan() + .project(&[column])? 
+ .with_row_id() + .try_into_batch() + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + }; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. +/// +/// This function reads a contiguous range of rows from a specific fragment, +/// extracts hashes, computes pairwise hamming distances, and clusters the results. +/// Unlike sampling, this reads sequential rows which is useful for distributed +/// processing where each worker handles a specific range of a fragment. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `fragment_id` - The fragment ID to read from +/// * `start_row` - The starting row offset within the fragment +/// * `num_rows` - Number of rows to read from the start position +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The fragment doesn't exist +/// - The column has wrong type (must be `FixedSizeList`) +/// - The row range is out of bounds +pub async fn hamming_clustering_for_range( + dataset: &Dataset, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get the fragment + let fragment = dataset.get_fragment(fragment_id).ok_or_else(|| { + Error::invalid_input(format!("Fragment with ID {} not found", fragment_id)) + })?; + + // Get fragment metadata for physical row count + let fragment_meta = fragment.metadata().clone(); + let physical_rows = fragment_meta 
+ .physical_rows + .ok_or_else(|| Error::invalid_input("Fragment has no physical_rows metadata"))?; + + // Validate the range + if start_row >= physical_rows { + return Err(Error::invalid_input(format!( + "start_row {} is out of range for fragment with {} physical rows", + start_row, physical_rows + ))); + } + + // Adjust num_rows if it exceeds available rows + let effective_num_rows = num_rows.min(physical_rows - start_row); + + if effective_num_rows == 0 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Use scanner with the specific fragment and limit/offset + let batch = dataset + .scan() + .with_fragments(vec![fragment_meta]) + .project(&[column])? + .with_row_id() + .limit(Some(effective_num_rows as i64), Some(start_row as i64))? + .try_into_batch() + .await?; + + // Extract row IDs + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + // Extract hashes + let hash_col = batch + .column_by_name(column) + .ok_or_else(|| Error::invalid_input(format!("Column '{}' not found in result", column)))?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_id_vec), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on provided hashes (no I/O). +/// +/// This is useful for benchmarking the pure compute performance without I/O. +/// Logs timing information via tracing. 
+/// +/// # Arguments +/// +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub fn hamming_clustering_from_hashes( + hashes: &[u64], + row_ids: Option<&[u64]>, + hamming_threshold: u32, +) -> Box { + let num_rows = hashes.len(); + if num_rows < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return empty.into_reader(None); + } + + let total_pairs = (num_rows as u64) * (num_rows as u64 - 1) / 2; + + // Compute pairwise hamming distances + let t_compute_start = Instant::now(); + let pairwise = pairwise_hamming_distance_parallel(hashes, row_ids, Some(hamming_threshold)); + let compute_time = t_compute_start.elapsed(); + + // Cluster edges + let t_cluster_start = Instant::now(); + let clustering = cluster_pairwise_result(&pairwise); + let cluster_time = t_cluster_start.elapsed(); + + // Log timing info + let pairs_per_sec = if compute_time.as_secs_f64() > 0.0 { + total_pairs as f64 / compute_time.as_secs_f64() + } else { + 0.0 + }; + tracing::info!( + num_rows, + total_pairs, + edges = pairwise.len(), + compute_time_ms = compute_time.as_millis(), + cluster_time_ms = cluster_time.as_millis(), + pairs_per_sec_millions = pairs_per_sec / 1_000_000.0, + num_clusters = clustering.num_clusters(), + num_duplicates = clustering.num_duplicates(), + "Hamming clustering completed" + ); + + clustering.into_reader(None) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Array; + + /// Helper to collect all clusters from a reader. 
+ fn collect_clusters(reader: Box) -> Vec<(u64, Vec)> { + let mut clusters = Vec::new(); + for batch in reader { + let batch = batch.unwrap(); + let reps = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let dups = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + let rep = reps.value(i); + let dup_arr = dups.value(i); + let dup_values = dup_arr + .as_any() + .downcast_ref::() + .unwrap(); + let duplicates: Vec = dup_values.values().to_vec(); + clusters.push((rep, duplicates)); + } + } + clusters + } + + #[test] + fn test_hamming_clustering_from_hashes_basic() { + // Create some test hashes with known distances + let hashes = vec![ + 0b0000u64, // hash 0 + 0b0001u64, // hash 1 - distance 1 from hash 0 + 0b0011u64, // hash 2 - distance 1 from hash 1, distance 2 from hash 0 + 0b1111u64, // hash 3 - distance 2 from hash 2, distance 4 from hash 0 + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 1); + let clusters = collect_clusters(reader); + + // With threshold 1, pairs (0,1) and (1,2) should be connected + // This forms one cluster: {0, 1, 2} + assert_eq!(clusters.len(), 1); + assert_eq!(clusters[0].1.len(), 2); // 2 duplicates in the cluster + } + + #[test] + fn test_hamming_clustering_from_hashes_no_clusters() { + // All hashes are far apart + let hashes = vec![ + 0x0000000000000000u64, + 0xFFFFFFFFFFFFFFFFu64, + 0xAAAAAAAAAAAAAAAAu64, + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 5); + let clusters = collect_clusters(reader); + + // With threshold 5, no pairs should be connected (min distance is 32) + assert_eq!(clusters.len(), 0); + } + + #[test] + fn test_hamming_clustering_from_hashes_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001u64]; + let row_ids = vec![100u64, 200u64]; + + let reader = hamming_clustering_from_hashes(&hashes, Some(&row_ids), 1); + let clusters = collect_clusters(reader); + + assert_eq!(clusters.len(), 1); + 
assert_eq!(clusters[0].0, 100); // representative + assert_eq!(clusters[0].1, vec![200]); // duplicates + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ivf::IvfBuildParams; + use std::sync::Arc; + use tempfile::tempdir; + + // Create test data with hash column (FixedSizeList) + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Generate hashes with some duplicates (similar hashes) + let num_rows = 100; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create groups of similar hashes + let base = (i / 10) as u64; // 10 groups + let variation = (i % 10) as u64; + let hash = base.wrapping_mul(0x123456789) ^ variation; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = + FixedSizeListArray::try_new_from_values(values, 8).expect("create hash array"); + + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + // Write dataset + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Create IVF_FLAT index with 4 partitions + let ivf_params = IvfBuildParams::new(4); + let params = crate::index::vector::VectorIndexParams::with_ivf_flat_params( + lance_linalg::distance::MetricType::Hamming, + ivf_params, + ); + + dataset + .create_index( + &["hash"], + crate::index::IndexType::Vector, + None, + ¶ms, + false, + ) + .await + .unwrap(); + + // Load and test + let dataset = 
crate::Dataset::open(uri).await.unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let index_name = &indices[0].name; + + // Test clustering on partition 0 + let reader = hamming_clustering_for_ivf_partition(&dataset, index_name, 0, 10) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Verify we get valid results (may or may not have clusters depending on data distribution) + // At minimum, verify no panics and valid schema + for (rep, dups) in &clusters { + assert!(*rep < num_rows as u64 * 10); // row IDs should be reasonable + for dup in dups { + assert!(*dup < num_rows as u64 * 10); + } + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition_invalid_index() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Test with non-existent index + let result = hamming_clustering_for_ivf_partition(&dataset, "nonexistent", 0, 10).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + } + + #[tokio::test] + async fn test_hamming_clustering_for_sample_integration() { + use arrow_array::{FixedSizeListArray, 
RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create some identical hashes (groups of 5) + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test full scan (no sampling) + let reader = hamming_clustering_for_sample(&dataset, "hash", None, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 (exact match) and groups of 5 identical hashes, + // we should have 10 clusters with 4 duplicates each + assert_eq!(clusters.len(), 10); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test with sampling + let reader = hamming_clustering_for_sample(&dataset, "hash", Some(20), 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + // With sampling, we may get fewer clusters + assert!(clusters.len() <= 10); + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_integration() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use 
lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes (groups of 5 identical hashes) + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Get fragment info + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 1); + let fragment_id = fragments[0].id() as usize; + + // Test reading range from the fragment + // Reading rows 0-25 should cover groups 0-4 (5 groups, each with 5 rows) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 0, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 and 25 rows (groups 0-4), we should have 5 clusters + // Each cluster has 4 duplicates (5 identical hashes - 1 representative = 4 duplicates) + assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test reading a different range (rows 25-50) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 25, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Should have 5 clusters (groups 5-9) + 
assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_invalid_fragment() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test with non-existent fragment + let result = hamming_clustering_for_range(&dataset, "hash", 999, 0, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + + // Test with out-of-range start_row + let result = hamming_clustering_for_range(&dataset, "hash", 0, 1000, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("out of range"), "Error: {}", err); + } +} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 579990fc03b..fb01339ead9 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -6266,7 +6266,7 @@ mod tests { ); // PQ code is on residual space - let pq_store = ivf_idx.load_partition_storage(0).await.unwrap(); + let pq_store = 
ivf_idx.load_partition_storage(0, None).await.unwrap(); pq_store .codebook() .values() diff --git a/rust/lance/src/index/vector/ivf/partition_serde.rs b/rust/lance/src/index/vector/ivf/partition_serde.rs index 83ced18c598..ad737620a94 100644 --- a/rust/lance/src/index/vector/ivf/partition_serde.rs +++ b/rust/lance/src/index/vector/ivf/partition_serde.rs @@ -3,32 +3,17 @@ //! Serialization and zero-copy deserialization for IVF partition cache entries. //! -//! The format is: -//! -//! ```text -//! [header_len: u64 LE] -//! [header: JSON bytes] -//! [sub_index Arrow IPC stream] -//! [... quantizer-specific IPC streams ...] -//! [storage Arrow IPC stream] -//! ``` -//! -//! Each IPC section is a self-delimiting Arrow IPC stream (schema + batches + EOS -//! marker), written directly to the underlying writer without buffering. On -//! deserialization, each message is read into a per-message buffer and zero-copy -//! decoded via [`lance_arrow::ipc`]. +//! Each entry is a protobuf header (see `lance-index/protos-cache/cache.proto`, with the +//! distance and rotation types as proto enums) followed by 64-byte-aligned +//! Arrow IPC sections in a fixed, version-keyed order: the sub-index, then any +//! quantizer-specific arrays (PQ codebook, RabitQ Matrix rotation), then the +//! quantizer storage batches. Sections decode zero-copy via [`lance_arrow::ipc`]. 
-use std::io::Write; use std::sync::Arc; use arrow_array::{FixedSizeListArray, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_at, read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_ipc_stream_batches, write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; @@ -38,11 +23,15 @@ use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::pq::storage::ProductQuantizationMetadata; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use lance_index::vector::sq::ScalarQuantizer; -use lance_index::vector::sq::storage::ScalarQuantizationMetadata; use lance_index::vector::storage::VectorStore; use lance_index::vector::v3::subindex::IvfSubIndex; use lance_linalg::distance::DistanceType; -use serde::{Deserialize, Serialize}; + +use lance_index::cache_pb::{ + DistanceType as PbDistanceType, FlatPartitionHeader, PqPartitionHeader, RabitPartitionHeader, + RabitQueryEstimator as PbRabitQueryEstimator, RotationType as PbRotationType, + SqPartitionHeader, +}; use super::v2::PartitionEntry; @@ -68,7 +57,7 @@ type ArcAny = Arc; fn serialize_partition_entry( any: &ArcAny, - writer: &mut dyn Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> where S: IvfSubIndex + 'static, @@ -81,14 +70,16 @@ where concrete.serialize(writer) } -fn deserialize_partition_entry(data: &Bytes) -> lance_core::Result +fn deserialize_partition_entry( + reader: &mut CacheEntryReader<'_>, +) -> lance_core::Result where S: IvfSubIndex + 'static, Q: Quantization + 'static, Concrete: Quantization + 'static, PartitionEntry: CacheCodecImpl, { - let concrete = PartitionEntry::::deserialize(data)?; + let concrete = 
PartitionEntry::::deserialize(reader)?; let any: ArcAny = Arc::new(concrete); Ok(any .downcast::>() @@ -109,6 +100,8 @@ where PartitionEntry: CacheCodecImpl, { lance_core::cache::CacheCodec::new( + as CacheCodecImpl>::TYPE_ID, + as CacheCodecImpl>::CURRENT_VERSION, serialize_partition_entry::, deserialize_partition_entry::, ) @@ -118,51 +111,64 @@ where // Common helpers // --------------------------------------------------------------------------- -fn distance_type_to_u8(dt: DistanceType) -> u8 { +// Distance and rotation discriminants travel as proto enums in the header; +// these map to/from the in-memory Rust enums. + +fn distance_type_to_proto(dt: DistanceType) -> PbDistanceType { + match dt { + DistanceType::L2 => PbDistanceType::L2, + DistanceType::Cosine => PbDistanceType::Cosine, + DistanceType::Dot => PbDistanceType::Dot, + DistanceType::Hamming => PbDistanceType::Hamming, + } +} + +fn proto_to_distance_type(dt: PbDistanceType) -> DistanceType { match dt { - DistanceType::L2 => 0, - DistanceType::Cosine => 1, - DistanceType::Dot => 2, - DistanceType::Hamming => 3, + PbDistanceType::L2 => DistanceType::L2, + PbDistanceType::Cosine => DistanceType::Cosine, + PbDistanceType::Dot => DistanceType::Dot, + PbDistanceType::Hamming => DistanceType::Hamming, } } -fn u8_to_distance_type(v: u8) -> Result { - match v { - 0 => Ok(DistanceType::L2), - 1 => Ok(DistanceType::Cosine), - 2 => Ok(DistanceType::Dot), - 3 => Ok(DistanceType::Hamming), - _ => Err(Error::io(format!("unknown distance type: {v}"))), +fn rotation_type_to_proto(rt: RQRotationType) -> PbRotationType { + match rt { + RQRotationType::Matrix => PbRotationType::Matrix, + RQRotationType::Fast => PbRotationType::Fast, } } -fn rotation_type_to_u8(rt: RQRotationType) -> u8 { +fn proto_to_rotation_type(rt: PbRotationType) -> RQRotationType { match rt { - RQRotationType::Matrix => 0, - RQRotationType::Fast => 1, + PbRotationType::Matrix => RQRotationType::Matrix, + PbRotationType::Fast => RQRotationType::Fast, 
} } -fn u8_to_rotation_type(v: u8) -> Result { - match v { - 0 => Ok(RQRotationType::Matrix), - 1 => Ok(RQRotationType::Fast), - _ => Err(Error::io(format!("unknown rotation type: {v}"))), +fn query_estimator_to_proto(qe: RabitQueryEstimator) -> PbRabitQueryEstimator { + match qe { + RabitQueryEstimator::ResidualQuery => PbRabitQueryEstimator::ResidualQuery, + RabitQueryEstimator::RawQuery => PbRabitQueryEstimator::RawQuery, } } -/// Write a JSON-serializable header using [`write_len_prefixed_bytes`]. -fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let header_json = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &header_json)?; - Ok(()) +fn proto_to_query_estimator(qe: PbRabitQueryEstimator) -> RabitQueryEstimator { + match qe { + PbRabitQueryEstimator::ResidualQuery => RabitQueryEstimator::ResidualQuery, + PbRabitQueryEstimator::RawQuery => RabitQueryEstimator::RawQuery, + } } -/// Read a JSON header written by [`write_json_header`]. -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes).map_err(|e| Error::io(e.to_string())) +/// Read a storage section expected to hold exactly one batch. 
+fn read_single_storage_batch(r: &mut CacheEntryReader<'_>) -> Result { + let mut batches = r.read_ipc_batches()?; + match batches.len() { + 1 => Ok(batches.remove(0)), + n => Err(Error::io(format!( + "expected exactly 1 storage batch, got {n}" + ))), + } } /// Wrap a `FixedSizeListArray` in a single-column `RecordBatch` with the given @@ -202,17 +208,11 @@ fn batch_to_codebook(batch: &RecordBatch) -> Result { // PQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PqPartitionHeader { - distance_type: u8, - nbits: u32, - num_sub_vectors: usize, - dimension: usize, - transposed: bool, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.PQ"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let metadata = self.storage.metadata(); let distance_type = self.storage.distance_type(); @@ -221,32 +221,28 @@ impl CacheCodecImpl for PartitionEntry { })?; let header = PqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), + distance_type: distance_type_to_proto(distance_type) as i32, nbits: metadata.nbits, - num_sub_vectors: metadata.num_sub_vectors, - dimension: metadata.dimension, + num_sub_vectors: metadata.num_sub_vectors as u64, + dimension: metadata.dimension as u64, transposed: metadata.transposed, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream(&codebook_to_batch(codebook)?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc(&codebook_to_batch(codebook)?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: PqPartitionHeader = read_json_header(data, 
&mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let codebook_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let codebook_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; let codebook = batch_to_codebook(&codebook_batch)?; @@ -254,8 +250,8 @@ impl CacheCodecImpl for PartitionEntry { let metadata = ProductQuantizationMetadata { codebook_position: 0, nbits: header.nbits, - num_sub_vectors: header.num_sub_vectors, - dimension: header.dimension, + num_sub_vectors: header.num_sub_vectors as usize, + dimension: header.dimension as usize, codebook: Some(codebook), codebook_tensor: Vec::new(), transposed: header.transposed, @@ -276,41 +272,35 @@ impl CacheCodecImpl for PartitionEntry { // Flat (Float32) // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct FlatPartitionHeader { - distance_type: u8, - dim: usize, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Flat"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: 
metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -327,34 +317,34 @@ impl CacheCodecImpl for PartitionEntry { // --------------------------------------------------------------------------- impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.FlatBin"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: metadata.dim, 
+ distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -370,56 +360,41 @@ impl CacheCodecImpl for PartitionEntry { // SQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct SqPartitionHeader { - distance_type: u8, - num_bits: u16, - dim: usize, - bounds_start: f64, - bounds_end: f64, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.SQ"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + 
let metadata = self.storage.metadata(); let header = SqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, - dim: metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, + dim: metadata.dim as u64, bounds_start: metadata.bounds.start, bounds_end: metadata.bounds.end, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - // SQ storage may contain multiple batches; stream them all in one IPC stream. - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + // SQ storage may contain multiple batches; write them all in one section. + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: SqPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: SqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batches = - read_ipc_stream_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batches = r.read_ipc_batches()?; let index = S::load(sub_index_batch)?; - let metadata = ScalarQuantizationMetadata { - dim: header.dim, - num_bits: header.num_bits, - bounds: header.bounds_start..header.bounds_end, - }; + let num_bits = header.num_bits as u16; let storage = ::Storage::try_new( - metadata.num_bits, + num_bits, distance_type, - metadata.bounds, + header.bounds_start..header.bounds_end, storage_batches, None, )?; @@ -432,88 +407,69 @@ impl CacheCodecImpl for PartitionEntry { // RabitQ 
// --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct RabitPartitionHeader { - distance_type: u8, - num_bits: u8, - code_dim: u32, - #[serde(default = "default_rabit_query_estimator")] - query_estimator: RabitQueryEstimator, - /// 0 = Matrix, 1 = Fast - rotation_type: u8, - /// Fast rotation signs (only set when rotation_type == Fast). - fast_rotation_signs: Option>, -} - -fn default_rabit_query_estimator() -> RabitQueryEstimator { - RabitQueryEstimator::ResidualQuery -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Rabit"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = RabitPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, code_dim: metadata.code_dim, - query_estimator: metadata.query_estimator, - rotation_type: rotation_type_to_u8(metadata.rotation_type), + rotation_type: rotation_type_to_proto(metadata.rotation_type) as i32, + query_estimator: query_estimator_to_proto(metadata.query_estimator) as i32, fast_rotation_signs: metadata.fast_rotation_signs.clone(), }; - write_json_header(writer, &header)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - - // Write the rotation matrix IPC stream only for Matrix rotation; the - // Fast rotation case stores its signs compactly in the JSON header. + // Write the rotation matrix IPC section only for Matrix rotation; the + // Fast rotation case stores its signs compactly in the proto header. 
if metadata.rotation_type == RQRotationType::Matrix { let mat = metadata.rotate_mat.as_ref().ok_or_else(|| { Error::io( "RabitQ Matrix metadata missing rotate_mat during serialization".to_string(), ) })?; - write_ipc_stream(&fsl_to_batch(mat, "rotate_mat")?, writer)?; + w.write_ipc(&fsl_to_batch(mat, "rotate_mat")?)?; } - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: RabitPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; - let rotation_type = u8_to_rotation_type(header.rotation_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: RabitPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); + let rotation_type = proto_to_rotation_type(header.rotation_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; let rotate_mat = if rotation_type == RQRotationType::Matrix { - let mat_batch = read_ipc_stream_single_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; + let mat_batch = r.read_ipc()?; Some(batch_to_fsl(&mat_batch)?) } else { None }; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; + // Read the proto enum accessor before moving fields out of `header`. 
+ let query_estimator = proto_to_query_estimator(header.query_estimator()); let metadata = RabitQuantizationMetadata { rotate_mat, rotate_mat_position: None, fast_rotation_signs: header.fast_rotation_signs, rotation_type, code_dim: header.code_dim, - num_bits: header.num_bits, + num_bits: header.num_bits as u8, // The storage batch already has packed codes; skip re-packing. packed: true, - query_estimator: header.query_estimator, + query_estimator, }; let storage = ::Storage::try_from_batch( storage_batch, @@ -551,6 +507,21 @@ mod tests { use lance_index::vector::flat::storage::FlatFloatStorage; use lance_index::vector::sq::storage::ScalarQuantizationStorage; + /// Serialize a codec body (no envelope) for tests. + fn ser_body(entry: &T) -> Vec { + let mut buf = Vec::new(); + entry + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a codec body (no envelope) at the current build's version. + fn de_body(bytes: Vec) -> Result { + let data = bytes::Bytes::from(bytes); + T::deserialize(&mut CacheEntryReader::new(&data, 0, T::CURRENT_VERSION)) + } + // ----- PQ helpers ------------------------------------------------------- fn make_test_codebook(dim: usize, num_sub_vectors: usize) -> FixedSizeListArray { @@ -618,12 +589,9 @@ mod tests { storage, }; - let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -671,12 +639,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.distance_type(), entry.storage.distance_type() @@ 
-694,12 +658,9 @@ mod tests { storage, }; - let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -712,13 +673,9 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); + let mut bytes = ser_body(&entry); bytes.truncate(3); - assert!( - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .is_err() - ); + assert!(de_body::>(bytes).is_err()); } // ----- Flat helpers ----------------------------------------------------- @@ -756,11 +713,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.metadata().dim, @@ -786,11 +740,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -803,11 +754,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -828,11 +776,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - 
PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -884,11 +829,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); @@ -914,12 +856,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -960,11 +898,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.len(), 30); let orig_ids: Vec = entry.storage.row_ids().copied().collect(); @@ -978,14 +913,27 @@ mod tests { num_rows: usize, code_dim: usize, distance_type: DistanceType, + ) -> ::Storage { + make_rabit_storage( + num_rows, + code_dim, + distance_type, + RQRotationType::Fast, + RabitQueryEstimator::ResidualQuery, + ) + } + + fn make_rabit_storage( + num_rows: usize, + code_dim: usize, + distance_type: DistanceType, + rotation_type: RQRotationType, + query_estimator: RabitQueryEstimator, ) -> ::Storage { use lance_arrow::FixedSizeListArrayExt; - let quantizer = RabitQuantizer::new_with_rotation::( - 1, - code_dim as i32, - RQRotationType::Fast, - ); + let 
quantizer = + RabitQuantizer::new_with_rotation::(1, code_dim as i32, rotation_type); let values: Vec = (0..num_rows * code_dim) .map(|i| (i % 100) as f32 / 100.0 - 0.5) .collect(); @@ -997,7 +945,8 @@ mod tests { .as_fixed_size_list() .clone(); - let metadata = quantizer.metadata(None); + let mut metadata = quantizer.metadata(None); + metadata.query_estimator = query_estimator; let batch = RecordBatch::try_from_iter(vec![ ( lance_core::ROW_ID, @@ -1044,11 +993,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); @@ -1082,22 +1028,125 @@ mod tests { fn test_rabitq_distance_types() { for dt in [DistanceType::L2, DistanceType::Cosine, DistanceType::Dot] { let storage = make_rabit_storage_fast(10, 32, dt); - let expected_distance_type = if dt == DistanceType::Cosine { - DistanceType::L2 - } else { - dt - }; let entry = PartitionEntry:: { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + // The codec round-trips the distance type faithfully. + assert_eq!( + restored.storage.distance_type(), + entry.storage.distance_type() + ); + } + } + + #[test] + fn test_roundtrip_rabitq_raw_query_estimator() { + // The query estimator is a non-default value here; it must survive the + // round trip so raw-query search keeps working after a cache reload. 
+ let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Fast, + RabitQueryEstimator::RawQuery, + ); + assert_eq!( + storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + assert_eq!( + restored.storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + } + + /// Matrix rotation writes an extra `rotate_mat` IPC section between the + /// sub-index and storage sections; exercise that the codec preserves it. + #[test] + fn test_roundtrip_flat_rabitq_matrix() { + let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Matrix, + RabitQueryEstimator::ResidualQuery, + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + + let m = entry.storage.metadata(); + let rm = restored.storage.metadata(); + assert_eq!(rm.rotation_type, RQRotationType::Matrix); + assert_eq!(rm.code_dim, m.code_dim); + assert_eq!(rm.num_bits, m.num_bits); + // The rotation matrix itself must survive the round trip. + let orig_mat = m + .rotate_mat + .as_ref() + .expect("matrix rotation has rotate_mat"); + let rest_mat = rm + .rotate_mat + .as_ref() + .expect("restored matrix rotation has rotate_mat"); + assert_eq!( + orig_mat.values().as_primitive::().values(), + rest_mat.values().as_primitive::().values(), + ); + } + + /// SQ storage (a multi-batch IPC section) must decode zero-copy through the + /// full envelope even though the proto header and sub-index section push it + /// to a non-aligned starting offset. 
+ #[test] + fn test_partition_storage_is_zero_copy_through_envelope() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage: make_sq_storage(64, 32, DistanceType::L2), + }; + let codec = CacheCodec::from_impl::>(); + let any: Arc = Arc::new(entry); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored + .downcast::>() .unwrap(); - assert_eq!(restored.storage.distance_type(), expected_distance_type); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + let first = restored.storage.to_batches().unwrap().next().unwrap(); + for col in first.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "storage buffer was realigned out of the input — misaligned IPC section", + ); + } } } @@ -1135,17 +1184,12 @@ mod tests { let entry = IvfStateEntryBox(Arc::new(state)); - let mut bytes = Vec::new(); - CacheCodecImpl::serialize(&entry, &mut bytes).unwrap(); - - let restored = - ::deserialize(&bytes::Bytes::from(bytes.clone())) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::(bytes.clone()).unwrap(); // Re-serialize the restored entry and compare bytes — a stronger check // than field-by-field comparison and avoids needing to downcast. 
- let mut restored_bytes = Vec::new(); - CacheCodecImpl::serialize(&restored, &mut restored_bytes).unwrap(); + let restored_bytes = ser_body(&restored); assert_eq!(bytes, restored_bytes); } } diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 4ea076ed420..5b29752f7c1 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -3,10 +3,10 @@ //! IVF - Inverted File index. -use std::io::Write as IoWrite; use std::marker::PhantomData; use std::{ any::Any, + borrow::Cow, collections::{BinaryHeap, HashMap}, sync::{Arc, Mutex}, }; @@ -25,8 +25,10 @@ use futures::future::BoxFuture; use futures::prelude::stream::{self, TryStreamExt}; use futures::{StreamExt, TryFutureExt}; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::write_len_prefixed_bytes; -use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, +}; use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS}; @@ -34,12 +36,14 @@ use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::LanceEncodingsIo; use lance_file::reader::{CachedFileMetadata, FileReader, FileReaderOptions}; +use lance_index::cache_pb::IvfStateHeader; use lance_index::frag_reuse::FragReuseIndex; use lance_index::metrics::{LocalMetricsCollector, MetricsCollector, NoOpMetricsCollector}; use lance_index::vector::VectorIndexCacheEntry; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{blocked_ex_code_bytes, padded_query_len}; +use lance_index::vector::bq::rabit_ex_bits; use lance_index::vector::bq::storage::{RabitQueryEstimator, SEGMENT_NUM_CODES}; -use 
lance_index::vector::bq::{rabit_ex_bits, rabit_ex_code_bytes}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::graph::OrderedNode; use lance_index::vector::hnsw::HNSW; @@ -64,7 +68,7 @@ use lance_index::{ }; use lance_index::{INDEX_METADATA_SCHEMA_KEY, IndexMetadata}; use lance_io::local::to_local_path; -use lance_io::scheduler::SchedulerConfig; +use lance_io::scheduler::{IoStats, ScanStats, SchedulerConfig}; use lance_io::utils::CachedFileSize; use lance_io::{ ReadBatchParams, object_store::ObjectStore, scheduler::ScanScheduler, traits::Reader, @@ -152,16 +156,18 @@ fn rotated_partition_centroid_slice( cache.rotated_centroids.get(start..end) } -fn rabit_ex_dist_table_len(dim: usize, num_bits: u8) -> usize { - rabit_ex_bits(num_bits) - .map(|ex_bits| { - if ex_bits == 0 { - 0 - } else { - dim * (1usize << usize::from(ex_bits)) - } - }) - .unwrap_or(dim * 256) +/// `f32` scratch needed for the ex-bit query state: a zero-padded query copy +/// when the rotated dim is not a multiple of the 64-dim kernel block (the +/// FastScan ex LUT is built directly from the query, with no f32 table). 
+fn rabit_ex_scratch_len(dim: usize, num_bits: u8) -> usize { + let multi_bit = rabit_ex_bits(num_bits) + .map(|ex_bits| ex_bits > 0) + .unwrap_or(true); + if !multi_bit || dim.is_multiple_of(64) { + 0 + } else { + padded_query_len(dim) + } } fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { @@ -169,7 +175,7 @@ fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { let ex_dist_table_len = rabit_ex_bits(num_bits) .ok() .and_then(|ex_bits| match ex_bits { - 2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(), + 2 | 4 | 8 => Some(blocked_ex_code_bytes(dim, ex_bits)), _ => None, }) .map(|ex_code_len| ex_code_len * 2 * SEGMENT_NUM_CODES) @@ -183,12 +189,12 @@ fn rabit_query_scratch_capacity( num_bits: u8, ) -> QueryScratchCapacity { let dist_table_len = dim * 4; - let ex_dist_table_len = rabit_ex_dist_table_len(dim, num_bits); + let ex_scratch_len = rabit_ex_scratch_len(dim, num_bits); let u8_scratch_len = rabit_u8_scratch_len(dim, num_bits); QueryScratchCapacity::new( max_partition_len, - dim + dist_table_len + ex_dist_table_len, + dim + dist_table_len + ex_scratch_len, max_partition_len.max(dist_table_len), u8_scratch_len, ) @@ -213,28 +219,6 @@ impl DeepSizeOf for IvfIndexState { } } -/// Serialization header for the `IvfIndexState` wire format. -/// -/// Kept as a flat, non-generic struct so the JSON header format is stable -/// regardless of `Q`. `quantizer_metadata_json` holds the serialized -/// `Q::Metadata`; large blobs (PQ codebook, RQ matrix) follow as raw bytes. -#[derive(serde::Serialize, serde::Deserialize)] -struct IvfIndexStateHeader { - index_file_path: String, - uuid: String, - distance_type: String, - sub_index_metadata: Vec, - sub_index_type: String, - quantization_type: String, - quantizer_metadata_json: String, - #[serde(default)] - cache_key_prefix: String, - #[serde(default)] - index_file_size: u64, - #[serde(default)] - aux_file_size: u64, -} - /// Object-safe interface for a type-erased `IvfIndexState`. 
/// /// Stored as `Arc` inside [`IvfStateEntryBox`], which is @@ -242,13 +226,14 @@ struct IvfIndexStateHeader { /// wrapper lets the cache infrastructure work with a sized type while the /// hot paths call `reconstruct` without knowing `Q`. pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()>; + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()>; fn reconstruct<'a>( &'a self, object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>>; } @@ -266,42 +251,39 @@ impl DeepSizeOf for IvfStateEntryBox { } } -/// Wire format (unchanged from the non-generic `IvfIndexState`): -/// `[header_json_len: u64 LE][header JSON][ivf_pb_len: u64 LE][ivf protobuf] -/// [extra_len: u64 LE][extra bytes][aux_ivf_pb_len: u64 LE][aux_ivf protobuf]` +/// Wire format: +/// ```text +/// HEADER : IvfStateHeader proto (paths, types, quantizer metadata JSON) +/// RAW_BLOB : IVF model protobuf +/// RAW_BLOB : quantizer extra-metadata buffer (may be empty) +/// RAW_BLOB : auxiliary IVF model protobuf +/// ``` impl CacheCodecImpl for IvfStateEntryBox { - fn serialize(&self, writer: &mut dyn IoWrite) -> Result<()> { - self.0.serialize_state(writer) - } + const TYPE_ID: &'static str = "lance.vector.ivf.IvfState"; + const CURRENT_VERSION: u32 = 1; - fn deserialize(data: &bytes::Bytes) -> Result { - use lance_arrow::ipc::read_len_prefixed_bytes_at; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + self.0.serialize_state(w) + } - // Parse the common wire format, then dispatch on quantization_type to + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + // Parse the common header, then dispatch on quantization_type to // construct the right IvfIndexState. 
- let mut offset = 0; - let header_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; - let header: IvfIndexStateHeader = serde_json::from_slice(&header_bytes) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; + let header: IvfStateHeader = r.read_header()?; - let ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let ivf_bytes = r.read_raw()?; let ivf = IvfModel::try_from( pb::Ivf::decode(ivf_bytes.as_ref()) .map_err(|e| lance_core::Error::io(format!("IvfIndexState IVF decode: {e}")))?, )?; - let extra_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let extra_bytes = r.read_raw()?; - // aux_ivf was added after initial deployment; fall back to ivf on - // clean EOF (legacy format without the field). - let aux_ivf = if offset + 8 <= data.len() { - let aux_ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let aux_ivf_bytes = r.read_raw()?; + let aux_ivf = IvfModel::try_from(pb::Ivf::decode(aux_ivf_bytes.as_ref()).map_err(|e| { lance_core::Error::io(format!("IvfIndexState aux IVF decode: {e}")) - })?)? - } else { - ivf.clone() - }; + })?)?; let distance_type = DistanceType::try_from(header.distance_type.as_str())?; let sub_index_type = SubIndexType::try_from(header.sub_index_type.as_str())?; @@ -310,7 +292,7 @@ impl CacheCodecImpl for IvfStateEntryBox { // Helper: parse Q::Metadata from the JSON+extra_bytes in the header, // then build an IvfStateEntryBox wrapping IvfIndexState. 
fn make_entry( - header: IvfIndexStateHeader, + header: IvfStateHeader, ivf: IvfModel, aux_ivf: IvfModel, extra_bytes: bytes::Bytes, @@ -396,13 +378,13 @@ impl CacheCodecImpl for IvfStateEntryBox { } impl IvfStateEntry for IvfIndexState { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()> { + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let quantizer_metadata_json = serde_json::to_string(&self.metadata) .map_err(|e| lance_core::Error::io(format!("IvfIndexState metadata: {e}")))?; let extra = self.metadata.extra_metadata()?; let extra = extra.as_deref().unwrap_or(&[]); - let header = IvfIndexStateHeader { + let header = IvfStateHeader { index_file_path: self.index_file_path.clone(), uuid: self.uuid.to_string(), distance_type: self.distance_type.to_string(), @@ -414,15 +396,13 @@ impl IvfStateEntry for IvfIndexState { index_file_size: self.index_file_size, aux_file_size: self.aux_file_size, }; - let header_json = serde_json::to_vec(&header) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; let ivf_bytes = pb::Ivf::try_from(&self.ivf)?.encode_to_vec(); let aux_ivf_bytes = pb::Ivf::try_from(&self.aux_ivf)?.encode_to_vec(); - write_len_prefixed_bytes(writer, &header_json)?; - write_len_prefixed_bytes(writer, &ivf_bytes)?; - write_len_prefixed_bytes(writer, extra)?; - write_len_prefixed_bytes(writer, &aux_ivf_bytes)?; + w.write_header(&header)?; + w.write_raw(&ivf_bytes)?; + w.write_raw(extra)?; + w.write_raw(&aux_ivf_bytes)?; Ok(()) } @@ -431,6 +411,7 @@ impl IvfStateEntry for IvfIndexState { object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>> { Box::pin(async move { match self.sub_index_type { @@ -440,6 +421,7 @@ impl IvfStateEntry for IvfIndexState { object_store, file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -449,6 +431,7 @@ impl IvfStateEntry for IvfIndexState { object_store, 
file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -614,6 +597,11 @@ pub struct IVFIndex { index_cache: WeakLanceCache, io_parallelism: usize, + /// Cumulative I/O performed while opening this index (file footers, IVF + /// centroids, quantization metadata). Captured once in `try_new`; exposed + /// via [`VectorIndex::open_io_stats`] so the opening query can attribute the + /// one-time open cost to its plan metrics. + open_io_stats: ScanStats, scratch_pool: Arc, use_query_residual: bool, use_residual_scratch: bool, @@ -1090,6 +1078,12 @@ impl IVFIndex { let use_residual_scratch = Self::use_residual_scratch(&ivf, use_query_residual); let rq_search_cache = Self::build_rq_search_cache(&ivf, &storage)?; + // The scheduler is freshly created above and, at this point, has served + // only the open-time reads (file footers, IVF centroids, quantization + // metadata) -- partition reads happen later, during queries. So its + // cumulative stats are exactly the one-time index-open I/O. + let open_io_stats = scheduler.stats(); + Ok(Self { uri: to_local_path(&uri), index_path: uri.as_ref().to_string(), @@ -1105,6 +1099,7 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + open_io_stats, _marker: PhantomData, }) } @@ -1142,6 +1137,10 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + // Reconstruction from cached state re-opens readers on its own path; + // the open-time I/O is not attributed here (it is a one-time cost, + // and the first open via `try_new` already accounts for it). 
+ open_io_stats: ScanStats::default(), _marker: PhantomData, } } @@ -1169,7 +1168,8 @@ impl IVFIndex { .get_or_insert_with_key(cache_key, || async { info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - self.load_partition_entry(partition_id).await + self.load_partition_entry(partition_id, metrics.io_stats()) + .await }) .await?; Ok(entry as Arc) @@ -1179,11 +1179,18 @@ impl IVFIndex { } info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - Ok(Arc::new(self.load_partition_entry(partition_id).await?)) + Ok(Arc::new( + self.load_partition_entry(partition_id, metrics.io_stats()) + .await?, + )) } } - async fn load_partition_entry(&self, partition_id: usize) -> Result> { + async fn load_partition_entry( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result> { let schema = Arc::new(self.reader.schema().as_ref().into()); let batch = match self.reader.metadata().num_rows { 0 => RecordBatch::new_empty(schema), @@ -1192,8 +1199,17 @@ impl IVFIndex { if row_range.is_empty() { RecordBatch::new_empty(schema) } else { - let batches = self - .reader + // When I/O is being measured, read through a reader whose + // scheduler also records into the per-query sink (a cheap + // clone sharing all cached metadata, no file re-open). + // Otherwise borrow the shared reader as-is, with no clone. 
+ let reader = match &io_stats { + Some(io_stats) => { + Cow::Owned(self.reader.with_io_stats(io_stats.recorder())) + } + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(row_range), u32::MAX, @@ -1212,15 +1228,19 @@ impl IVFIndex { self.sub_index_metadata[partition_id].clone(), )?; let idx = S::load(batch)?; - let storage = self.load_partition_storage(partition_id).await?; + let storage = self.load_partition_storage(partition_id, io_stats).await?; Ok(PartitionEntry { index: idx, storage, }) } - pub async fn load_partition_storage(&self, partition_id: usize) -> Result { - self.storage.load_partition(partition_id).await + pub async fn load_partition_storage( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result { + self.storage.load_partition(partition_id, io_stats).await } /// preprocess the query vector given the partition id. @@ -1800,6 +1820,10 @@ impl VectorIndex for IVFInd fn metric_type(&self) -> DistanceType { self.distance_type } + + fn open_io_stats(&self) -> Option { + Some(self.open_io_stats) + } } pub type IvfFlatIndex = IVFIndex; @@ -1812,6 +1836,7 @@ async fn reconstruct_typed( object_store: Arc, file_metadata_cache: &LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> Result> { let io_parallelism = object_store.io_parallelism(); @@ -1867,7 +1892,7 @@ async fn reconstruct_typed( state.aux_ivf.clone(), state.metadata.clone(), state.distance_type, - None, + frag_reuse_index, ); let rq_search_cache = IVFIndex::::rq_search_cache_from_state(state, &storage)?; @@ -1908,7 +1933,8 @@ mod tests { use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::bq::{ RQBuildParams, RQRotationType, - storage::{RABIT_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator}, + ex_dot::{blocked_ex_code_bytes, padded_query_len}, + storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator}, transform::{EX_ADD_FACTORS_COLUMN, 
EX_SCALE_FACTORS_COLUMN}, }; use lance_index::vector::storage::VectorStore; @@ -1983,14 +2009,17 @@ mod tests { } #[test] - fn test_rabit_ex_dist_table_len_uses_num_bits() { + fn test_rabit_ex_scratch_len_uses_num_bits() { + // Block-aligned dims read the rotated query in place. let dim = 960; + for num_bits in [1, 3, 5, 7, 9] { + assert_eq!(super::rabit_ex_scratch_len(dim, num_bits), 0); + } - assert_eq!(super::rabit_ex_dist_table_len(dim, 1), 0); - assert_eq!(super::rabit_ex_dist_table_len(dim, 3), dim * 4); - assert_eq!(super::rabit_ex_dist_table_len(dim, 5), dim * 16); - assert_eq!(super::rabit_ex_dist_table_len(dim, 7), dim * 64); - assert_eq!(super::rabit_ex_dist_table_len(dim, 9), dim * 256); + // Unaligned multi-bit queries add one padded query copy. + let dim = 968; + assert_eq!(super::rabit_ex_scratch_len(dim, 1), 0); + assert_eq!(super::rabit_ex_scratch_len(dim, 7), padded_query_len(dim)); } #[test] @@ -2012,7 +2041,7 @@ mod tests { let capacity = super::rabit_query_scratch_capacity(dim, max_partition_len, 5); assert_eq!(capacity.distances, max_partition_len); - assert_eq!(capacity.query_f32, dim + dim * 4 + dim * 16); + assert_eq!(capacity.query_f32, dim + dim * 4); assert_eq!(capacity.u16, max_partition_len); assert_eq!(capacity.u8, dim * 16); assert_eq!(capacity.u32, 0); @@ -2723,7 +2752,7 @@ mod tests { async fn load_partition_row_ids(index: &IvfPq, partition_idx: usize) -> Vec { index .storage - .load_partition(partition_idx) + .load_partition(partition_idx, None) .await .unwrap() .row_ids() @@ -4403,18 +4432,24 @@ mod tests { } #[rstest] - #[case::l2(DistanceType::L2)] - #[case::cosine(DistanceType::Cosine)] + #[case::l2(DistanceType::L2, 9)] + #[case::cosine(DistanceType::Cosine, 9)] + // ex_bits=3 and ex_bits=5 have no FastScan support and use the bit-plane + // repack, so these searches go through the exact ex-dot rerank kernels + // end to end. 
+ #[case::l2_plane_repack_3bit(DistanceType::L2, 4)] + #[case::l2_plane_repack_5bit(DistanceType::L2, 6)] #[tokio::test] async fn test_build_ivf_rq_multi_bit_persists_split_codes_and_searches( #[case] distance_type: DistanceType, + #[case] num_bits: u8, ) { let test_dir = TempStrDir::default(); let test_uri = test_dir.as_str(); let (mut dataset, vectors) = generate_test_dataset::(test_uri, 0.0..1.0).await; let ivf_params = IvfBuildParams::new(4); - let rq_params = RQBuildParams::with_rotation_type(9, RQRotationType::Fast); + let rq_params = RQBuildParams::with_rotation_type(num_bits, RQRotationType::Fast); let params = VectorIndexParams::with_ivf_rq_params(distance_type, ivf_params, rq_params); dataset .create_index(&["vector"], IndexType::Vector, None, ¶ms, true) @@ -4427,16 +4462,18 @@ mod tests { let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); let index_uuid = indices[0].uuid.to_string(); let rq_meta = get_rq_metadata(&dataset, scheduler.clone(), &index_uuid).await; - assert_eq!(rq_meta.num_bits, 9); + assert_eq!(rq_meta.num_bits, num_bits); assert_eq!(rq_meta.query_estimator, RabitQueryEstimator::RawQuery); let reader = open_rq_aux_reader(&dataset, scheduler, &index_uuid).await; let schema = reader.schema(); - let ex_field = schema.field(RABIT_EX_CODE_COLUMN).unwrap(); + let ex_field = schema.field(RABIT_BLOCKED_EX_CODE_COLUMN).unwrap(); let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { panic!("RQ ex-code field should be FixedSizeList"); }; - assert_eq!(ex_code_bytes, 32); + let expected_ex_code_bytes = + blocked_ex_code_bytes(rq_meta.rotated_dim(), num_bits - 1) as i32; + assert_eq!(ex_code_bytes, expected_ex_code_bytes); assert!(schema.field(EX_ADD_FACTORS_COLUMN).is_some()); assert!(schema.field(EX_SCALE_FACTORS_COLUMN).is_some()); @@ -6178,11 +6215,9 @@ mod tests { // Try serialized store first let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - 
return Some( - stored_codec - .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .expect("deserialization should succeed"), - ); + return stored_codec + .deserialize(&bytes::Bytes::copy_from_slice(bytes)) + .hit(); } drop(guard); // Fall through to passthrough diff --git a/rust/lance/src/io/commit/external_manifest.rs b/rust/lance/src/io/commit/external_manifest.rs index df2b84a4878..eee4fbf07b6 100644 --- a/rust/lance/src/io/commit/external_manifest.rs +++ b/rust/lance/src/io/commit/external_manifest.rs @@ -365,6 +365,32 @@ mod test { assert_eq!(ds.version().version, 6); assert_eq!(ds.count_rows(None).await.unwrap(), 60); + { + inner_store.lock().await.remove(&(ds.base.to_string(), 6)); + } + assert!( + handler + .version_exists( + &ds.base, + 6, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + .await + .unwrap() + ); + assert!( + !handler + .version_exists( + &ds.base, + 7, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + .await + .unwrap() + ); + // Open without external store handler again, should see the newly sync'd commit let ds = DatasetBuilder::from_uri(ds_uri).load().await.unwrap(); assert_eq!(ds.version().version, 6); diff --git a/rust/lance/src/io/commit/namespace_manifest.rs b/rust/lance/src/io/commit/namespace_manifest.rs index 92d5e7bc789..f4f012adcca 100644 --- a/rust/lance/src/io/commit/namespace_manifest.rs +++ b/rust/lance/src/io/commit/namespace_manifest.rs @@ -14,8 +14,24 @@ use lance_table::io::commit::{ManifestLocation, ManifestNamingScheme}; use object_store::ObjectStore as OSObjectStore; use object_store::path::Path; +use lance_namespace::error::NamespaceError; + use crate::dataset::branch_location::BranchLocation; +/// Whether `e` says the requested chain (table or branch) does not exist, as +/// opposed to a failure talking to the namespace. +fn is_chain_not_found(e: &lance_core::Error) -> bool { + if let lance_core::Error::Namespace { source, .. 
} = e + && let Some(ns_err) = source.downcast_ref::() + { + return matches!( + ns_err, + NamespaceError::TableNotFound { .. } | NamespaceError::TableBranchNotFound { .. } + ); + } + false +} + #[derive(Debug)] pub struct LanceNamespaceExternalManifestStore { namespace_client: Arc, @@ -90,7 +106,15 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { ..Default::default() }; - let response = self.namespace_client.list_table_versions(request).await?; + let response = match self.namespace_client.list_table_versions(request).await { + Ok(response) => response, + // A chain that does not exist yet (e.g. probing a branch location + // before the branch is created) has no latest version; the + // ExternalManifestStore contract reports that as None, not an + // error, so existence checks can treat it as a missing dataset. + Err(e) if is_chain_not_found(&e) => return Ok(None), + Err(e) => return Err(e), + }; if response.versions.is_empty() { return Ok(None); @@ -182,3 +206,93 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { )) } } + +#[cfg(test)] +mod tests { + use super::*; + use lance_namespace::models::ListTableVersionsResponse; + + /// A namespace whose list_table_versions always fails with the configured + /// error, to pin how get_latest_version classifies failures. 
+ #[derive(Debug)] + struct FailingNamespace { + error: fn() -> lance_core::Error, + } + + #[async_trait] + impl LanceNamespace for FailingNamespace { + fn namespace_id(&self) -> String { + "failing".to_string() + } + + async fn list_table_versions( + &self, + _request: ListTableVersionsRequest, + ) -> Result { + Err((self.error)()) + } + } + + fn store_with(error: fn() -> lance_core::Error) -> LanceNamespaceExternalManifestStore { + LanceNamespaceExternalManifestStore::new( + Arc::new(FailingNamespace { error }), + vec!["t".to_string()], + Path::parse("data/t.lance").unwrap(), + ) + } + + /// A chain that does not exist (missing table or branch) has no latest + /// version; everything else is a real failure and must propagate so an + /// outage is never mistaken for an absent dataset. + #[tokio::test] + async fn test_get_latest_version_error_classification() { + use lance_namespace::error::NamespaceError; + + let absent = [ + store_with(|| { + NamespaceError::TableNotFound { + message: "missing table".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::TableBranchNotFound { + message: "missing branch".to_string(), + } + .into() + }), + ]; + for store in absent { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + matches!(latest, Ok(None)), + "a missing chain must read as no latest version, got: {:?}", + latest + ); + } + + let failures = [ + store_with(|| { + NamespaceError::Internal { + message: "server error".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::Throttling { + message: "slow down".to_string(), + } + .into() + }), + store_with(|| lance_core::Error::io("connection reset".to_string())), + ]; + for store in failures { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + latest.is_err(), + "a real failure must propagate, got: {:?}", + latest + ); + } + } +} diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 
c4c79dcee5e..0ceddf7c5ee 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -926,6 +926,9 @@ impl ExecutionPlan for ANNIvfPartitionExec { }) .buffered(self.index_uuids.len().min(target_partitions).max(1)) .finally(move || { + // Partition ranking reads centroids from memory, so this is + // typically zero; flushed for symmetry with ANNSubIndex. + metrics_clone.index_metrics.flush_io(); metrics_clone.baseline_metrics.done(); metrics_clone .baseline_metrics @@ -1627,6 +1630,9 @@ impl ExecutionPlan for ANNIvfSubIndexExec { // will not start until the early search is complete across all deltas. .try_flatten_unordered(None) .finally(move || { + // Publish the exact index-file I/O measured for this query + // (cache misses only) to the iops/requests/bytes_read gauges. + metrics_clone.index_metrics.flush_io(); metrics_clone .baseline_metrics .elapsed_compute() @@ -2919,6 +2925,128 @@ mod tests { assert_find_partitions_elapsed_recorded(&stats); } + /// The ANN operators report the exact index-file I/O performed for a query + /// (bytes_read / iops), measured only on cache misses. A cold search loads + /// partitions from storage and reports non-zero I/O; an immediately + /// following warm search serves every partition from the index cache and + /// reports zero -- which is the cache-effectiveness signal the metric adds. 
+ #[tokio::test] + async fn test_io_metrics_cold_vs_warm() { + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + let run = |holder: &StatsHolder| { + let setter = holder.get_setter(); + async { + fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .scan_stats_callback(setter) + .project(&Vec::::new()) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap() + } + }; + + // Cold: a freshly opened dataset has an empty index cache, so the + // sub-index search must read partitions (and their quantization storage) + // from disk. Those reads flow through the per-query I/O sink. + let cold_holder = StatsHolder::default(); + run(&cold_holder).await; + let cold = cold_holder.consume(); + assert!( + cold.parts_loaded > 0, + "cold search should load partitions, got parts_loaded={}", + cold.parts_loaded + ); + assert!( + cold.bytes_read > 0, + "cold search should report index-file I/O, got bytes_read={}", + cold.bytes_read + ); + assert!( + cold.iops > 0, + "cold search should report index-file IOPS, got iops={}", + cold.iops + ); + + // Warm: the same query on the same dataset finds every partition it + // needs already cached, so no index-file I/O is performed. + let warm_holder = StatsHolder::default(); + run(&warm_holder).await; + let warm = warm_holder.consume(); + assert_eq!( + warm.parts_loaded, 0, + "warm search should not reload partitions, got parts_loaded={}", + warm.parts_loaded + ); + assert_eq!( + warm.bytes_read, 0, + "warm search should report no index-file I/O, got bytes_read={}", + warm.bytes_read + ); + } + + /// The new I/O metrics must actually surface in `EXPLAIN ANALYZE` text on + /// the ANN operators: non-zero on a cold query (partition reads on + /// `ANNSubIndex`, index-open reads on `ANNIvfPartition`) and zero on a warm + /// query (everything served from the index cache). 
+ #[tokio::test] + async fn test_io_metrics_visible_in_explain_analyze() { + // Returns the value of `metric=` from the analyzed-plan line for `node`. + fn node_metric<'a>(plan: &'a str, node: &str, metric: &str) -> &'a str { + let line = plan + .lines() + .find(|l| l.trim_start().starts_with(node)) + .unwrap_or_else(|| panic!("plan missing node {node}:\n{plan}")); + let after = line + .split_once(&format!("{metric}=")) + .unwrap_or_else(|| panic!("node {node} line missing {metric}=:\n{line}")) + .1; + after.split([',', ']']).next().unwrap().trim() + } + + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + // Cold: a freshly opened dataset must show real index-file I/O. + let cold = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + // Sub-index partition reads. + assert_ne!(node_metric(&cold, "ANNSubIndex", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNSubIndex", "iops"), "0"); + // Index-open reads (centroids/metadata) now attributed to the partition + // operator -- the value this part of the change adds. + assert_ne!(node_metric(&cold, "ANNIvfPartition", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNIvfPartition", "iops"), "0"); + + // Warm: same query, everything cache-resident -> zero index-file I/O. 
+ let warm = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + assert_eq!(node_metric(&warm, "ANNSubIndex", "bytes_read"), "0"); + assert_eq!(node_metric(&warm, "ANNIvfPartition", "bytes_read"), "0"); + } + #[rstest] #[tokio::test] async fn test_no_prefilter_results(#[values(1, 20)] num_deltas: usize) { diff --git a/rust/lance/src/io/exec/take.rs b/rust/lance/src/io/exec/take.rs index 977a9c88dce..c3642cdb043 100644 --- a/rust/lance/src/io/exec/take.rs +++ b/rust/lance/src/io/exec/take.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; +use std::task::Poll; use arrow::array::AsArray; use arrow::compute::{TakeOptions, concat_batches}; @@ -27,6 +28,7 @@ use lance_arrow::RecordBatchExt; use lance_core::datatypes::{Field, OnMissing, Projection}; use lance_core::error::{DataFusionResult, LanceOptionExt}; use lance_core::utils::address::RowAddress; +use lance_core::utils::futures::FinallyStreamExt; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::{ROW_ADDR, ROW_ID}; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; @@ -353,10 +355,6 @@ impl TakeStream { (None, None) => {} } - self.metrics - .baseline_metrics - .record_output(new_data.num_rows()); - self.metrics.batches_processed.add(1); Ok(batch.merge_with_schema(&new_data, self.output_schema.as_ref())?) 
} @@ -364,8 +362,10 @@ impl TakeStream { self: Arc, input: S, ) -> impl Stream> { - let scan_scheduler = self.scan_scheduler.clone(); - let metrics = self.metrics.clone(); + let result_scan_scheduler = self.scan_scheduler.clone(); + let final_scan_scheduler = self.scan_scheduler.clone(); + let result_metrics = self.metrics.clone(); + let final_metrics = self.metrics.clone(); let batches = input .enumerate() .map(move |(batch_index, batch)| { @@ -378,8 +378,24 @@ impl TakeStream { }) .boxed(); batches - .inspect_ok(move |_| metrics.io_metrics.record(&scan_scheduler)) .try_buffered(get_num_compute_intensive_cpus()) + .map(move |result| { + if result.is_ok() { + result_metrics.batches_processed.add(1); + } + result_metrics.io_metrics.record(&result_scan_scheduler); + match result_metrics + .baseline_metrics + .record_poll(Poll::Ready(Some(result))) + { + Poll::Ready(Some(result)) => result, + _ => unreachable!("record_poll returned a different poll state"), + } + }) + .finally(move || { + final_metrics.baseline_metrics.done(); + final_metrics.io_metrics.record(&final_scan_scheduler); + }) } } @@ -839,6 +855,80 @@ mod tests { } } + #[tokio::test(flavor = "current_thread")] + async fn test_take_records_output_and_io_metrics() { + use datafusion::physical_plan::metrics::MetricValue; + use lance_datafusion::utils::{BYTES_READ_METRIC, IOPS_METRIC, REQUESTS_METRIC}; + let TestFixture { + dataset, + _tmp_dir_guard, + } = test_fixture().await; + + let row_addrs = UInt64Array::from(vec![0_u64, 1, 2, 3, 4]); + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + ROW_ADDR, + DataType::UInt64, + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(row_addrs)]).unwrap(); + let stream = futures::stream::iter(vec![Ok(batch)]); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + let input = Arc::new(OneShotExec::new(stream)); + + let projection = dataset + .empty_projection() + .union_column("s", OnMissing::Error) + .unwrap(); + + 
let take_exec = TakeExec::try_new(dataset, input, projection) + .unwrap() + .unwrap(); + + let stream = take_exec + .execute(0, Arc::new(TaskContext::default())) + .unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 5); + + let metrics = take_exec.metrics().unwrap(); + + let output_batches: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBatches(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let output_bytes: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBytes(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let gauge = |name: &str| -> usize { + metrics + .iter_gauges() + .find_map(|(metric_name, gauge)| { + (metric_name.as_ref() == name).then(|| gauge.value()) + }) + .unwrap_or(0) + }; + + let bytes_read = gauge(BYTES_READ_METRIC); + let iops = gauge(IOPS_METRIC); + let requests = gauge(REQUESTS_METRIC); + + assert_eq!(metrics.output_rows(), Some(5)); + assert_eq!(metrics.find_count("batches_processed").unwrap().value(), 1); + assert!( + output_batches > 0 && output_bytes > 0 && bytes_read > 0 && iops > 0 && requests > 0, + "expected positive TakeExec metrics, got output_batches={output_batches}, output_bytes={output_bytes}, bytes_read={bytes_read}, iops={iops}, requests={requests}" + ); + } + #[tokio::test] async fn test_take_order() { let TestFixture { diff --git a/rust/lance/src/io/exec/utils.rs b/rust/lance/src/io/exec/utils.rs index af3c5095f75..6e2d50d3736 100644 --- a/rust/lance/src/io/exec/utils.rs +++ b/rust/lance/src/io/exec/utils.rs @@ -6,7 +6,7 @@ use lance_datafusion::utils::{ IOPS_METRIC, PARTS_LOADED_METRIC, REQUESTS_METRIC, }; use lance_index::metrics::MetricsCollector; -use lance_io::scheduler::ScanScheduler; +use lance_io::scheduler::{IoStats, ScanScheduler, ScanStats}; use lance_table::format::IndexMetadata; use pin_project::pin_project; use std::future::Future; @@ 
-502,12 +502,17 @@ impl IoMetrics { } pub fn record(&self, scan_scheduler: &ScanScheduler) { - let current_stats = scan_scheduler.stats(); + self.record_stats(scan_scheduler.stats()); + } - // Use set_max to ensure gauge always shows the highest value seen - self.iops.set_max(current_stats.iops as usize); - self.requests.set_max(current_stats.requests as usize); - self.bytes_read.set_max(current_stats.bytes_read as usize); + /// Record a snapshot of cumulative I/O statistics. + /// + /// Uses `set_max` because the underlying counters are cumulative; the gauge + /// always reflects the highest (i.e. final) value seen. + pub fn record_stats(&self, stats: ScanStats) { + self.iops.set_max(stats.iops as usize); + self.requests.set_max(stats.requests as usize); + self.bytes_read.set_max(stats.bytes_read as usize); } } @@ -516,6 +521,12 @@ pub struct IndexMetrics { indices_loaded: Count, parts_loaded: Count, index_comparisons: Count, + /// Per-query sink that accumulates exact index-file I/O as partitions are + /// loaded from storage. Shared by all clones of this `IndexMetrics`, so + /// concurrent partition loads all funnel into the same counters. Published + /// to `io_metrics` for display via [`IndexMetrics::flush_io`]. + io_stats: IoStats, + io_metrics: IoMetrics, } impl IndexMetrics { @@ -524,8 +535,18 @@ impl IndexMetrics { indices_loaded: metrics.new_count(INDICES_LOADED_METRIC, partition), parts_loaded: metrics.new_count(PARTS_LOADED_METRIC, partition), index_comparisons: metrics.new_count(INDEX_COMPARISONS_METRIC, partition), + io_stats: IoStats::new(), + io_metrics: IoMetrics::new(metrics, partition), } } + + /// Publish the I/O accumulated in the per-query sink to the displayed + /// `iops`/`requests`/`bytes_read` metrics. Call once when the operator's + /// stream finishes; the sink only accumulates on cache misses, so a fully + /// cache-resident query publishes zeros. 
+ pub fn flush_io(&self) { + self.io_metrics.record_stats(self.io_stats.snapshot()); + } } impl MetricsCollector for IndexMetrics { @@ -538,6 +559,9 @@ impl MetricsCollector for IndexMetrics { fn record_comparisons(&self, num_comparisons: usize) { self.index_comparisons.add(num_comparisons); } + fn io_stats(&self) -> Option { + Some(self.io_stats.clone()) + } } #[cfg(test)] diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs index 284e10a9b6f..729cf2ffbe7 100644 --- a/rust/lance/src/lib.rs +++ b/rust/lance/src/lib.rs @@ -90,7 +90,7 @@ pub mod pb { include!(concat!(env!("OUT_DIR"), "/lance.pb.rs")); } -pub use blob::{BlobArrayBuilder, blob_field}; +pub use blob::{BlobArrayBuilder, BlobFieldOptions, blob_field, blob_field_with_options}; pub use dataset::Dataset; use lance_index::vector::DIST_COL; diff --git a/rust/lance/src/session.rs b/rust/lance/src/session.rs index 484d53c066a..8d5e9717570 100644 --- a/rust/lance/src/session.rs +++ b/rust/lance/src/session.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use std::sync::Arc; -use lance_core::cache::{CacheBackend, LanceCache}; +use lance_core::cache::{CacheBackend, CacheKeyIterator, LanceCache}; use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_index::IndexType; @@ -209,6 +209,44 @@ impl Session { pub async fn index_cache_stats(&self) -> lance_core::cache::CacheStats { self.index_cache.0.stats().await } + + /// Return an iterator over keys currently held by the index cache. + /// + /// Returns `None` when the index cache backend does not support key + /// inventory. + /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.index_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn index_cache_keys(&self) -> Option> { + self.index_cache.0.keys().await + } + + /// Return an iterator over keys currently held by the metadata cache. 
+ /// + /// Returns `None` when the metadata cache backend does not support key + /// inventory. + /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.metadata_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn metadata_cache_keys(&self) -> Option> { + self.metadata_cache.0.keys().await + } } impl Default for Session { @@ -224,10 +262,23 @@ impl Default for Session { #[cfg(test)] mod tests { use super::*; - use lance_core::cache::UnsizedCacheKey; + use lance_core::cache::{CacheKey, UnsizedCacheKey}; use lance_index::vector::VectorIndex; use std::borrow::Cow; + struct TestKey(&'static str); + impl CacheKey for TestKey { + type ValueType = Vec; + + fn key(&self) -> Cow<'_, str> { + Cow::Borrowed(self.0) + } + + fn type_name() -> &'static str { + "TestVec" + } + } + struct TestUnsizedKey(&'static str); impl UnsizedCacheKey for TestUnsizedKey { type ValueType = dyn VectorIndex; @@ -251,4 +302,41 @@ mod tests { .is_none() ); } + + #[tokio::test] + async fn test_session_cache_keys() { + let session = Session::new(10_000, 10_000, Default::default()); + + session + .index_cache + .insert_with_key(&TestKey("index-key"), Arc::new(vec![1])) + .await; + session + .metadata_cache + .0 + .insert_with_key(&TestKey("metadata-key"), Arc::new(vec![2])) + .await; + + let index_keys = session + .index_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(index_keys.len(), 1); + assert_eq!(index_keys[0].prefix(), ""); + assert_eq!(index_keys[0].key(), "index-key"); + assert_eq!(index_keys[0].type_name(), "TestVec"); + + let metadata_keys = session + .metadata_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(metadata_keys.len(), 1); + assert_eq!(metadata_keys[0].prefix(), ""); + assert_eq!(metadata_keys[0].key(), "metadata-key"); + assert_eq!(metadata_keys[0].type_name(), "TestVec"); + + assert_ne!(index_keys, 
metadata_keys); + } }