From a83479b6efcd2be6928fb9e8ee8109813cc6df6c Mon Sep 17 00:00:00 2001
From: zhangyue19921010 <zhangyue.1010@bytedance.com>
Date: Wed, 10 Jun 2026 11:09:28 +0800
Subject: [PATCH 1/3] feat(index): consolidate bitmap segments and unindexed
 data on optimize

---
 rust/lance-index/src/scalar.rs        |   9 +
 rust/lance-index/src/scalar/bitmap.rs | 444 ++------------------------
 rust/lance-select/src/mask.rs         |   5 +
 rust/lance/src/index/append.rs        | 207 +++++++++++-
 rust/lance/src/index/create.rs        | 217 +++++++------
 rust/lance/src/index/scalar.rs        |  47 +--
 rust/lance/src/index/scalar/bitmap.rs |  82 +++--
 rust/lance/src/index/scalar/btree.rs  |  28 +-
 8 files changed, 415 insertions(+), 624 deletions(-)
diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs
index 772dfaf4089..2aef324de83 100644
--- a/rust/lance-index/src/scalar.rs
+++ b/rust/lance-index/src/scalar.rs
@@ -928,6 +928,15 @@ impl OldIndexDataFilter {
                 .collect(),
         }
     }
+
+    /// Filter a posting list of row addresses in place, retaining only the rows
+    /// selected by this filter.
+    pub fn retain_row_addrs(&self, addrs: &mut RowAddrTreeMap) {
+        match self {
+            Self::Fragments { to_keep, .. } => addrs.retain_fragments_in(to_keep),
+            Self::RowIds(valid_row_ids) => *addrs &= valid_row_ids,
+        }
+    }
 }
 
 impl UpdateCriteria {
diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs
index 10254e699c5..7b88e7c5d29 100644
--- a/rust/lance-index/src/scalar/bitmap.rs
+++ b/rust/lance-index/src/scalar/bitmap.rs
@@ -3,8 +3,7 @@
 
 use std::{
     any::Any,
-    cmp::Reverse,
-    collections::{BTreeMap, BinaryHeap, HashMap},
+    collections::{BTreeMap, HashMap},
     fmt::Debug,
     ops::Bound,
     sync::Arc,
@@ -29,14 +28,12 @@ use lance_core::{
     error::LanceOptionExt,
     utils::tokio::get_num_compute_intensive_cpus,
 };
-use lance_io::object_store::ObjectStore;
 use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps};
-use object_store::path::Path;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use tracing::{instrument, warn};
 
-use super::{AnyQuery, IndexStore, ScalarIndex};
+use super::{AnyQuery, IndexStore, OldIndexDataFilter, ScalarIndex};
 use super::{
     BuiltinIndexType, SargableQuery, ScalarIndexParams, SearchResult, btree::OrderableScalarValue,
 };
@@ -58,18 +55,10 @@ use crate::{scalar::IndexReader, scalar::expression::ScalarQueryParser};
 
 pub const BITMAP_LOOKUP_NAME: &str = "bitmap_page_lookup.lance";
 pub const INDEX_STATS_METADATA_KEY: &str = "lance:index_stats";
-const BITMAP_PART_LOOKUP_PREFIX: &str = "part_";
-const BITMAP_PART_LOOKUP_SUFFIX: &str = "_bitmap_page_lookup.lance";
-const EXPLICIT_SHARD_ID_TAG: u64 = 0;
-const IMPLICIT_FRAGMENT_ID_TAG: u64 = 1;
 
 const MAX_BITMAP_ARRAY_LENGTH: usize = i32::MAX as usize - 1024 * 1024; // leave headroom
 
 const MAX_ROWS_PER_CHUNK: usize = 2 * 1024;
-// Smaller than MAX_ROWS_PER_CHUNK to bound the per-cursor in-memory batch
-// footprint during a k-way merge (N cursors × chunk), while still amortising
-// I/O over a reasonable number of rows per read.
-const MERGE_ROWS_PER_CHUNK: usize = 512;
 
 const BITMAP_INDEX_VERSION: u32 = 0;
 
@@ -883,64 +872,6 @@ impl BitmapBatchWriter {
     }
 }
 
-fn bitmap_shard_file_name(partition_id: u64) -> String {
-    format!("{BITMAP_PART_LOOKUP_PREFIX}{partition_id}{BITMAP_PART_LOOKUP_SUFFIX}")
-}
-
-fn tagged_bitmap_partition_id(id: u32, tag: u64) -> u64 {
-    ((id as u64) << 32) | tag
-}
-
-fn bitmap_shard_partition_id(fragment_ids: &[u32], shard_id: Option<u32>) -> Result<u64> {
-    if fragment_ids.is_empty() {
-        return Err(Error::invalid_input(
-            "Bitmap shard build requires at least one fragment id".to_string(),
-        ));
-    }
-
-    if let Some(shard_id) = shard_id {
-        return Ok(tagged_bitmap_partition_id(shard_id, EXPLICIT_SHARD_ID_TAG));
-    }
-
-    let [fragment_id] = fragment_ids else {
-        return Err(Error::invalid_input(format!(
-            "Bitmap distributed build over multiple fragments requires an explicit shard_id. \
-             Received {} fragment ids: {:?}. Please assign mutually exclusive shard_id values \
-             to disjoint fragment groups.",
-            fragment_ids.len(),
-            fragment_ids
-        )));
-    };
-
-    Ok(tagged_bitmap_partition_id(
-        *fragment_id,
-        IMPLICIT_FRAGMENT_ID_TAG,
-    ))
-}
-
-fn extract_bitmap_shard_id(filename: &str) -> Result<u64> {
-    let partition_id = filename
-        .strip_prefix(BITMAP_PART_LOOKUP_PREFIX)
-        .and_then(|name| name.strip_suffix(BITMAP_PART_LOOKUP_SUFFIX))
-        .ok_or_else(|| {
-            Error::internal(format!("Invalid bitmap shard file name format: {filename}"))
-        })?;
-    partition_id.parse::<u64>().map_err(|_| {
-        Error::internal(format!(
-            "Failed to parse bitmap partition id from file name: {filename}"
-        ))
-    })
-}
-
-fn deserialize_bitmap(bitmap_bytes: &[u8], file_name: &str) -> Result<RowAddrTreeMap> {
-    RowAddrTreeMap::deserialize_from(bitmap_bytes).map_err(|error| {
-        Error::corrupt_file(
-            Path::from(file_name),
-            format!("Failed to deserialize bitmap bytes: {error}"),
-        )
-    })
-}
-
 async fn new_bitmap_batch_writer(
     index_store: &dyn IndexStore,
     file_name: &str,
@@ -954,218 +885,6 @@ async fn new_bitmap_batch_writer(
     Ok(BitmapBatchWriter::new(index_file))
 }
 
-#[derive(Clone, Debug, Eq, PartialEq)]
-struct BitmapHeapItem {
-    key: OrderableScalarValue,
-    shard_idx: usize,
-}
-
-impl Ord for BitmapHeapItem {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.key
-            .cmp(&other.key)
-            .then_with(|| self.shard_idx.cmp(&other.shard_idx))
-    }
-}
-
-impl PartialOrd for BitmapHeapItem {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-struct BitmapShardCursor {
-    file_name: String,
-    reader: Arc<dyn IndexReader>,
-    total_rows: usize,
-    next_row_offset: usize,
-    batch: Option<RecordBatch>,
-    batch_row_idx: usize,
-}
-
-impl BitmapShardCursor {
-    async fn try_new(file_name: String, reader: Arc<dyn IndexReader>) -> Result<Option<Self>> {
-        let total_rows = reader.num_rows();
-        if total_rows == 0 {
-            return Ok(None);
-        }
-
-        let mut cursor = Self {
-            file_name,
-            reader,
-            total_rows,
-            next_row_offset: 0,
-            batch: None,
-            batch_row_idx: 0,
-        };
-        if cursor.advance().await? {
-            Ok(Some(cursor))
-        } else {
-            Ok(None)
-        }
-    }
-
-    fn peek_key(&self) -> Result<OrderableScalarValue> {
-        let batch = self.batch.as_ref().ok_or_else(|| {
-            Error::internal(format!(
-                "Bitmap shard {} has no active batch",
-                self.file_name
-            ))
-        })?;
-        let key = ScalarValue::try_from_array(batch.column(0), self.batch_row_idx)?;
-        Ok(OrderableScalarValue(key))
-    }
-
-    fn take_current(&mut self) -> Result<(ScalarValue, RowAddrTreeMap)> {
-        let batch = self.batch.as_ref().ok_or_else(|| {
-            Error::internal(format!(
-                "Bitmap shard {} has no active batch",
-                self.file_name
-            ))
-        })?;
-        let keys = batch.column(0);
-        let binary_bitmaps = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .ok_or_else(|| {
-                Error::corrupt_file(
-                    Path::from(self.file_name.as_str()),
-                    "Bitmap shard batch has non-binary bitmap column".to_string(),
-                )
-            })?;
-        let key = ScalarValue::try_from_array(keys, self.batch_row_idx)?;
-        let bitmap = deserialize_bitmap(binary_bitmaps.value(self.batch_row_idx), &self.file_name)?;
-        self.batch_row_idx += 1;
-        Ok((key, bitmap))
-    }
-
-    async fn advance(&mut self) -> Result<bool> {
-        loop {
-            if let Some(batch) = &self.batch
-                && self.batch_row_idx < batch.num_rows()
-            {
-                return Ok(true);
-            }
-
-            if self.next_row_offset >= self.total_rows {
-                self.batch = None;
-                return Ok(false);
-            }
-
-            let end_row = (self.next_row_offset + MERGE_ROWS_PER_CHUNK).min(self.total_rows);
-            let batch = self
-                .reader
-                .read_range(self.next_row_offset..end_row, None)
-                .await?;
-            self.next_row_offset = end_row;
-            self.batch = Some(batch);
-            self.batch_row_idx = 0;
-        }
-    }
-}
-
-async fn advance_cursor_and_push(
-    cursors: &mut [BitmapShardCursor],
-    heap: &mut BinaryHeap<Reverse<BitmapHeapItem>>,
-    shard_idx: usize,
-) -> Result<()> {
-    if cursors[shard_idx].advance().await? {
-        heap.push(Reverse(BitmapHeapItem {
-            key: cursors[shard_idx].peek_key()?,
-            shard_idx,
-        }));
-    }
-    Ok(())
-}
-
-async fn drain_same_key_bitmaps(
-    cursors: &mut [BitmapShardCursor],
-    heap: &mut BinaryHeap<Reverse<BitmapHeapItem>>,
-    item: BitmapHeapItem,
-) -> Result<(ScalarValue, RowAddrTreeMap)> {
-    let (key, mut merged_bitmap) = cursors[item.shard_idx].take_current()?;
-    let merged_key = OrderableScalarValue(key);
-    advance_cursor_and_push(cursors, heap, item.shard_idx).await?;
-
-    loop {
-        let Some(Reverse(next_item)) = heap.peek() else {
-            break;
-        };
-        if next_item.key != merged_key {
-            break;
-        }
-
-        let shard_idx = next_item.shard_idx;
-        let _ = heap.pop();
-        let (_, bitmap) = cursors[shard_idx].take_current()?;
-        merged_bitmap |= &bitmap;
-        advance_cursor_and_push(cursors, heap, shard_idx).await?;
-    }
-
-    Ok((merged_key.0, merged_bitmap))
-}
-
-async fn list_bitmap_shard_files(
-    object_store: &ObjectStore,
-    index_dir: &Path,
-    progress: &dyn IndexBuildProgress,
-) -> Result<Vec<String>> {
-    let mut shard_files = Vec::new();
-    let mut list_stream = object_store.list(Some(index_dir.clone()));
-    while let Some(item) = list_stream.next().await {
-        match item {
-            Ok(meta) => {
-                let file_name = meta.location.filename().unwrap_or_default();
-                if file_name.starts_with(BITMAP_PART_LOOKUP_PREFIX)
-                    && file_name.ends_with(BITMAP_PART_LOOKUP_SUFFIX)
-                {
-                    shard_files.push(file_name.to_string());
-                    progress
-                        .stage_progress("scan_bitmap_shards", shard_files.len() as u64)
-                        .await?;
-                }
-            }
-            Err(err) => {
-                return Err(Error::io(format!(
-                    "Failed to list bitmap shard files in {}: {err}",
-                    index_dir
-                )));
-            }
-        }
-    }
-    let mut shard_files = shard_files
-        .into_iter()
-        .map(|file_name| extract_bitmap_shard_id(&file_name).map(|shard_id| (shard_id, file_name)))
-        .collect::<Result<Vec<_>>>()?;
-    shard_files.sort_unstable_by_key(|(shard_id, _)| *shard_id);
-    let shard_files = shard_files
-        .into_iter()
-        .map(|(_, file_name)| file_name)
-        .collect::<Vec<_>>();
-    if shard_files.is_empty() {
-        return Err(Error::invalid_input(format!(
-            "No bitmap shard files found in index directory: {}; \
-             call build_index for each fragment before calling merge_index_metadata",
-            index_dir
-        )));
-    }
-    Ok(shard_files)
-}
-
-async fn cleanup_bitmap_shard_files(store: &dyn IndexStore, shard_files: &[String]) {
-    for file_name in shard_files {
-        if let Err(error) = store.delete_index_file(file_name).await {
-            warn!(
-                "Failed to delete bitmap shard file '{}': {}. \
-                 This does not affect the merged bitmap index, but the shard file \
-                 may need manual cleanup.",
-                file_name, error
-            );
-        }
-    }
-}
-
 #[derive(Debug, Default)]
 pub struct BitmapIndexPlugin;
 
@@ -1305,23 +1024,6 @@ impl BitmapIndexPlugin {
         Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME).await
     }
 
-    async fn train_bitmap_shard(
-        data: SendableRecordBatchStream,
-        index_store: &dyn IndexStore,
-        fragment_ids: &[u32],
-        shard_id: Option<u32>,
-        progress: Arc<dyn crate::progress::IndexBuildProgress>,
-    ) -> Result<()> {
-        let partition_id = bitmap_shard_partition_id(fragment_ids, shard_id)?;
-        let file_name = bitmap_shard_file_name(partition_id);
-        progress
-            .stage_start("build_bitmap_shard", None, "rows")
-            .await?;
-        Self::streaming_build_and_write(data, None, index_store, &file_name).await?;
-        progress.stage_complete("build_bitmap_shard").await?;
-        Ok(())
-    }
-
     /// Builds and writes a bitmap index in a streaming fashion from value-sorted
     /// input. Only one value's bitmap is in memory at a time, reducing peak memory
     /// from O(unique_values * avg_bitmap) to O(largest_single_bitmap).
@@ -1499,104 +1201,21 @@ impl BitmapIndexPlugin {
             })
             .collect()
     }
-
-    /// Merge per-shard bitmap lookup files into a single bitmap index file.
-    ///
-    /// Each shard file is already sorted by key and can contain many distinct keys.
-    /// This method does not materialize an entire shard in memory. Instead, it keeps
-    /// one cursor per shard, where each cursor tracks the shard's current row within
-    /// a small in-memory batch. A min-heap stores the current key for each shard.
-    ///
-    /// The merge then proceeds as a streaming K-way merge:
-    /// - pop the smallest current key across all shards
-    /// - union the bitmap for that key with any other shards currently positioned on
-    ///   the same key
-    /// - advance only those shards that participated in the union and push their next
-    ///   keys back into the heap
-    ///
-    /// This keeps memory usage proportional to the number of shards plus the bitmaps
-    /// currently being merged, instead of the total number of keys across all shards.
-    async fn merge_shards(
-        store: &dyn IndexStore,
-        shard_files: &[String],
-        progress: Arc<dyn IndexBuildProgress>,
-    ) -> Result<()> {
-        progress
-            .stage_start("merge_bitmap_shards", None, "bitmaps")
-            .await?;
-
-        let mut cursors = Vec::with_capacity(shard_files.len());
-        let mut heap = BinaryHeap::with_capacity(shard_files.len());
-        let mut value_type: Option<DataType> = None;
-
-        for file_name in shard_files {
-            let reader = store.open_index_file(file_name).await?;
-            let shard_value_type = reader.schema().fields[0].data_type().clone();
-            if let Some(existing_type) = &value_type {
-                if existing_type != &shard_value_type {
-                    return Err(Error::invalid_input(format!(
-                        "Bitmap shard {} has value type {:?}, expected {:?}",
-                        file_name, shard_value_type, existing_type
-                    )));
-                }
-            } else {
-                value_type = Some(shard_value_type);
-            }
-            if let Some(cursor) = BitmapShardCursor::try_new(file_name.clone(), reader).await? {
-                let key = cursor.peek_key()?;
-                let shard_idx = cursors.len();
-                cursors.push(cursor);
-                heap.push(Reverse(BitmapHeapItem { key, shard_idx }));
-            }
-        }
-
-        let value_type = value_type.ok_or_else(|| {
-            Error::invalid_input("Bitmap shard merge requires at least one shard file".to_string())
-        })?;
-        let mut writer = new_bitmap_batch_writer(store, BITMAP_LOOKUP_NAME, &value_type).await?;
-        let mut merged_keys = 0u64;
-
-        while let Some(Reverse(item)) = heap.pop() {
-            let (key, merged_bitmap) =
-                drain_same_key_bitmaps(&mut cursors, &mut heap, item).await?;
-            writer.emit(key, &merged_bitmap).await?;
-            merged_keys += 1;
-            progress
-                .stage_progress("merge_bitmap_shards", merged_keys)
-                .await?;
-        }
-
-        progress.stage_complete("merge_bitmap_shards").await?;
-        progress
-            .stage_start("write_bitmap_index", Some(1), "files")
-            .await?;
-        writer.finish().await?;
-        progress.stage_progress("write_bitmap_index", 1).await?;
-        progress.stage_complete("write_bitmap_index").await?;
-        Ok(())
-    }
-}
-
-pub async fn merge_index_files(
-    object_store: &ObjectStore,
-    index_dir: &Path,
-    store: Arc<dyn IndexStore>,
-    progress: Arc<dyn IndexBuildProgress>,
-) -> Result<()> {
-    progress
-        .stage_start("scan_bitmap_shards", None, "files")
-        .await?;
-    let shard_files = list_bitmap_shard_files(object_store, index_dir, progress.as_ref()).await?;
-    progress.stage_complete("scan_bitmap_shards").await?;
-
-    BitmapIndexPlugin::merge_shards(store.as_ref(), &shard_files, progress).await?;
-    cleanup_bitmap_shard_files(store.as_ref(), &shard_files).await;
-    Ok(())
 }
 
+/// Consolidate the materialized state of several bitmap segments (and,
+/// optionally, a stream of not-yet-indexed `new_data`) into a single canonical
+/// bitmap written to `dest_store`.
+///
+/// `old_data_filter` is applied only to the rows coming from `source_indices`,
+/// dropping row addresses whose fragments compaction/deletion has retired; rows
+/// from `new_data` are inserted unfiltered. The whole merged state is held in
+/// memory, as bitmap segment consolidation has always done.
 pub async fn merge_bitmap_indices(
     source_indices: &[Arc<BitmapIndex>],
+    new_data: Option<SendableRecordBatchStream>,
     dest_store: &dyn IndexStore,
+    old_data_filter: Option<OldIndexDataFilter>,
     progress: Arc<dyn IndexBuildProgress>,
 ) -> Result<CreatedIndex> {
     if source_indices.is_empty() {
@@ -1636,6 +1255,18 @@ pub async fn merge_bitmap_indices(
             .await?;
     }
     progress.stage_complete("merge_bitmap_segments").await?;
+    if let Some(old_data_filter) = old_data_filter {
+        merged_state.retain(|_, postings| {
+            old_data_filter.retain_row_addrs(postings);
+            !postings.is_empty()
+        });
+    }
+
+    // Fold the not-yet-indexed rows into the same in-memory state.
+    if let Some(new_data) = new_data {
+        (merged_state, _) =
+            BitmapIndexPlugin::build_bitmap_index_state(new_data, merged_state).await?;
+    }
 
     progress
         .stage_start("write_bitmap_index", Some(1), "files")
@@ -1700,8 +1331,8 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
         data: SendableRecordBatchStream,
         index_store: &dyn IndexStore,
         request: Box<dyn TrainingRequest>,
-        fragment_ids: Option<Vec<u32>>,
-        progress: Arc<dyn crate::progress::IndexBuildProgress>,
+        _fragment_ids: Option<Vec<u32>>,
+        _progress: Arc<dyn crate::progress::IndexBuildProgress>,
     ) -> Result<CreatedIndex> {
         let request = request
             .as_any()
@@ -1712,23 +1343,14 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
                         .to_string(),
                 )
             })?;
-        if let Some(fragment_ids) = fragment_ids.as_ref() {
-            Self::train_bitmap_shard(
-                data,
-                index_store,
-                fragment_ids,
-                request.parameters.shard_id,
-                progress,
-            )
-            .await?;
-        } else if request.parameters.shard_id.is_some() {
-            return Err(Error::invalid_input(
-                "Bitmap shard_id requires fragment_ids and is only supported for distributed shard builds"
-                    .to_string(),
-            ));
-        } else {
-            Self::train_bitmap_index(data, index_store).await?;
+        if request.parameters.shard_id.is_some() {
+            warn!(
+                "Bitmap `shard_id` is deprecated and now ignored; each build now produces one \
+                 canonical segment. Use the segmented-index APIs instead. The `shard_id` field \
+                 will be removed in a future release."
+            );
         }
+        Self::train_bitmap_index(data, index_store).await?;
         Ok(CreatedIndex {
             index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default())
                 .unwrap(),
diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs
index a10ad9a6f50..c44b77fe770 100644
--- a/rust/lance-select/src/mask.rs
+++ b/rust/lance-select/src/mask.rs
@@ -572,6 +572,11 @@ impl RowAddrTreeMap {
             .retain(|frag_id, _| frag_id_set.contains(frag_id));
     }
 
+    /// Retain only the rows whose fragment id is contained in `keep`.
+    pub fn retain_fragments_in(&mut self, keep: &RoaringBitmap) {
+        self.inner.retain(|frag_id, _| keep.contains(*frag_id));
+    }
+
     /// Compute the serialized size of the set.
     pub fn serialized_size(&self) -> usize {
         // Starts at 4 because of the u32 num_entries
diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs
index a89b64df276..f6e5ce54219 100644
--- a/rust/lance/src/index/append.rs
+++ b/rust/lance/src/index/append.rs
@@ -94,6 +94,45 @@ pub async fn build_old_data_filter(
     }
 }
 
+/// Split the stored fragment coverage of `segments` into fragments still live
+/// in `dataset` (`effective`) and fragments that compaction or deletion has
+/// already retired (`deleted`).
+pub fn split_segment_coverage<'a>(
+    dataset: &Dataset,
+    segments: impl IntoIterator<Item = &'a IndexMetadata>,
+) -> (RoaringBitmap, RoaringBitmap) {
+    let mut effective = RoaringBitmap::new();
+    let mut deleted = RoaringBitmap::new();
+    for segment in segments {
+        if let Some(eff) = segment.effective_fragment_bitmap(&dataset.fragment_bitmap) {
+            effective |= eff;
+        }
+        if let Some(del) = segment.deleted_fragment_bitmap(&dataset.fragment_bitmap) {
+            deleted |= del;
+        }
+    }
+    (effective, deleted)
+}
+
+/// Validate that every segment carries fragment coverage, split that coverage
+/// into still-live and retired fragments, and build the matching [`OldIndexDataFilter`].
+pub async fn effective_coverage_and_filter(
+    dataset: &Dataset,
+    segments: &[IndexMetadata],
+) -> Result<(RoaringBitmap, Option<OldIndexDataFilter>)> {
+    for segment in segments {
+        if segment.fragment_bitmap.is_none() {
+            return Err(Error::invalid_input(format!(
+                "CreateIndex: segment {} is missing fragment coverage",
+                segment.uuid
+            )));
+        }
+    }
+    let (effective, deleted) = split_segment_coverage(dataset, segments);
+    let old_data_filter = build_old_data_filter(dataset, &effective, &deleted).await?;
+    Ok((effective, old_data_filter))
+}
+
 async fn load_unindexed_training_data(
     dataset: &Dataset,
     field_path: &str,
@@ -194,16 +233,8 @@ async fn merge_scalar_indices<'a>(
         .await?;
 
     // Effective = bitmap ∩ live fragments; deleted = bitmap \ live fragments.
-    let mut effective_old_frags = RoaringBitmap::new();
-    let mut deleted_old_frags = RoaringBitmap::new();
-    for idx in selected_old_indices {
-        if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) {
-            effective_old_frags |= effective;
-        }
-        if let Some(deleted) = idx.deleted_fragment_bitmap(&dataset.fragment_bitmap) {
-            deleted_old_frags |= deleted;
-        }
-    }
+    let (effective_old_frags, deleted_old_frags) =
+        split_segment_coverage(dataset.as_ref(), selected_old_indices.iter().copied());
 
     let mut frag_bitmap = base_unindexed_bitmap.clone();
     frag_bitmap |= &effective_old_frags;
@@ -211,7 +242,7 @@ async fn merge_scalar_indices<'a>(
 
     // Scalar Index that expos an N:1 segment-merge primitive reachable without
     // rescanning the dataset
-    let has_segment_merge_primitive = matches!(index_type, IndexType::BTree);
+    let has_segment_merge_primitive = matches!(index_type, IndexType::BTree | IndexType::Bitmap);
 
     // Merge new data into the existing segment(s) instead of rebuilding from
     // scratch, when both hold:
@@ -256,6 +287,25 @@ async fn merge_scalar_indices<'a>(
                 )
                 .await?
             }
+            IndexType::Bitmap => {
+                if selected_old_indices.len() == 1 {
+                    // Memory optimization: a single segment can absorb the new data
+                    // via `BitmapIndex::update` without holding the whole merged state in memory.
+                    reference_index
+                        .update(new_data_stream, &new_store, None)
+                        .await?
+                } else {
+                    crate::index::scalar::bitmap::open_and_merge_segments(
+                        dataset.as_ref(),
+                        field_path,
+                        selected_old_indices,
+                        new_data_stream,
+                        &new_store,
+                        old_data_filter,
+                    )
+                    .await?
+                }
+            }
             _ => {
                 reference_index
                     .update(new_data_stream, &new_store, old_data_filter)
@@ -1710,6 +1760,141 @@ mod tests {
         assert_eq!(rows, 2, "value 'd' lives in appended fragment");
     }
 
+    #[tokio::test]
+    async fn test_optimize_bitmap_multi_segment_consolidation() {
+        async fn query_count(dataset: &Dataset, value: &str) -> usize {
+            dataset
+                .scan()
+                .filter(&format!("category = '{}'", value))
+                .unwrap()
+                .project(&["category"])
+                .unwrap()
+                .try_into_batch()
+                .await
+                .unwrap()
+                .num_rows()
+        }
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "category",
+            DataType::Utf8,
+            false,
+        )]));
+        let make_batch = |labels: &[&str]| {
+            let arr = StringArray::from_iter_values(labels.iter().copied());
+            RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap()
+        };
+
+        // Three fragments, each committed as its own Bitmap segment so optimize
+        // sees a multi-segment logical index.
+        // frag0={a,b}, frag1={a,c}, frag2={b,c}.
+        let reader = RecordBatchIterator::new(
+            vec![
+                Ok(make_batch(&["a", "b"])),
+                Ok(make_batch(&["a", "c"])),
+                Ok(make_batch(&["b", "c"])),
+            ],
+            schema.clone(),
+        );
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 2,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap);
+        let fragments = dataset.get_fragments();
+        assert_eq!(fragments.len(), 3);
+        let frag0_id = fragments[0].id() as u32;
+        let mut staged_segments = Vec::new();
+        for fragment in &fragments {
+            staged_segments.push(
+                crate::index::create::CreateIndexBuilder::new(
+                    &mut dataset,
+                    &["category"],
+                    IndexType::Bitmap,
+                    &params,
+                )
+                .name("cat_idx".into())
+                .fragments(vec![fragment.id() as u32])
+                .execute_uncommitted()
+                .await
+                .unwrap(),
+            );
+        }
+        dataset
+            .commit_existing_index_segments("cat_idx", "category", staged_segments)
+            .await
+            .unwrap();
+        assert_eq!(
+            dataset.load_indices_by_name("cat_idx").await.unwrap().len(),
+            3
+        );
+
+        dataset.delete("category IN ('a', 'b')").await.unwrap();
+        let live_frag_ids: Vec<u32> = dataset
+            .get_fragments()
+            .iter()
+            .map(|f| f.id() as u32)
+            .collect();
+        assert!(
+            !live_frag_ids.contains(&frag0_id),
+            "frag0 should be retired after deleting all its rows"
+        );
+        assert_eq!(live_frag_ids.len(), 2);
+
+        // Append a fourth fragment, leave it unindexed.
+        let appended = RecordBatchIterator::new(vec![Ok(make_batch(&["a", "d"]))], schema.clone());
+        let mut dataset = Dataset::write(
+            appended,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 2,
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        // merge(3) selects all three old segments (one now backed only by the
+        // retired frag0) and consolidates them, together with the unindexed
+        // fragment, into a single segment.
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(3))
+            .await
+            .unwrap();
+
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+
+        // Live rows after the delete + append: frag1={c}, frag2={c}, frag3={a,d}.
+        // The retired frag0's 'a'/'b' rows must not resurface.
+        assert_eq!(query_count(&dataset, "a").await, 1);
+        assert_eq!(query_count(&dataset, "b").await, 0);
+        assert_eq!(query_count(&dataset, "c").await, 2);
+        assert_eq!(query_count(&dataset, "d").await, 1);
+
+        // The segments collapsed into a single one covering only the still-live
+        // fragments (frag1, frag2, frag3); the retired frag0 was filtered out of
+        // the consolidated coverage.
+        let segments_after = dataset.load_indices_by_name("cat_idx").await.unwrap();
+        assert_eq!(segments_after.len(), 1);
+        let coverage = segments_after[0].fragment_bitmap.as_ref().unwrap();
+        assert_eq!(coverage.len(), 3);
+        assert!(
+            !coverage.contains(frag0_id),
+            "retired frag0 must not appear in the consolidated coverage"
+        );
+    }
+
     #[tokio::test]
     async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() {
         async fn query_id_count(dataset: &Dataset, id: &str) -> usize {
diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs
index ce8e65d8356..2dd1fa3d2e5 100644
--- a/rust/lance/src/index/create.rs
+++ b/rust/lance/src/index/create.rs
@@ -10,7 +10,7 @@ use crate::{
     index::{
         DatasetIndexExt, DatasetIndexInternalExt, IntoIndexSegment,
         build_index_metadata_from_segments,
-        scalar::{build_bitmap_index_segment, build_scalar_index},
+        scalar::build_scalar_index,
         vector::{
             LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index,
             build_empty_vector_index, build_vector_index,
@@ -259,44 +259,17 @@ impl<'a> CreateIndexBuilder<'a> {
                     .preprocessed_data
                     .take()
                     .map(|reader| lance_datafusion::utils::reader_to_stream(Box::new(reader)));
-                if self.index_type == IndexType::Bitmap && self.fragments.is_some() {
-                    if !train {
-                        return Err(Error::invalid_input(
-                            "canonical bitmap segment build requires train=true".to_string(),
-                        ));
-                    }
-                    if preprocesssed_data.is_some() {
-                        return Err(Error::invalid_input(
-                            "canonical bitmap segment build does not accept preprocessed data"
-                                .to_string(),
-                        ));
-                    }
-                    let fragments = self.fragments.clone().ok_or_else(|| {
-                        Error::invalid_input(
-                            "canonical bitmap segment build requires fragment ids".to_string(),
-                        )
-                    })?;
-                    build_bitmap_index_segment(
-                        self.dataset,
-                        column,
-                        &index_id.to_string(),
-                        fragments,
-                        self.progress.clone(),
-                    )
-                    .await?
-                } else {
-                    build_scalar_index(
-                        self.dataset,
-                        column,
-                        &index_id.to_string(),
-                        &params,
-                        train,
-                        self.fragments.clone(),
-                        preprocesssed_data,
-                        self.progress.clone(),
-                    )
-                    .await?
-                }
+                build_scalar_index(
+                    self.dataset,
+                    column,
+                    &index_id.to_string(),
+                    &params,
+                    train,
+                    self.fragments.clone(),
+                    preprocesssed_data,
+                    self.progress.clone(),
+                )
+                .await?
             }
             (IndexType::Scalar, LANCE_SCALAR_INDEX) => {
                 // Guess the index type
@@ -569,6 +542,13 @@ fn is_btree_scalar_params(params: &dyn IndexParams) -> bool {
         .is_some_and(|p| p.index_type.eq_ignore_ascii_case("btree"))
 }
 
+fn is_bitmap_scalar_params(params: &dyn IndexParams) -> bool {
+    params
+        .as_any()
+        .downcast_ref::<ScalarIndexParams>()
+        .is_some_and(|p| p.index_type.eq_ignore_ascii_case("bitmap"))
+}
+
 /// Validate that a user-supplied `index_uuid` is permitted for this build.
 fn ensure_index_uuid_allowed(
     index_type: IndexType,
@@ -576,17 +556,16 @@ fn ensure_index_uuid_allowed(
     fragments: Option<&Vec<u32>>,
     index_uuid: Option<&str>,
 ) -> Result<()> {
-    let is_btree = index_type == IndexType::BTree
-        || params
-            .as_any()
-            .downcast_ref::<ScalarIndexParams>()
-            .map(|params| params.index_type.eq_ignore_ascii_case("btree"))
-            .unwrap_or(false);
-
-    if index_uuid.is_some() && fragments.is_some_and(|fragments| !fragments.is_empty()) && is_btree
+    let is_segmented_scalar = matches!(index_type, IndexType::BTree | IndexType::Bitmap)
+        || is_btree_scalar_params(params)
+        || is_bitmap_scalar_params(params);
+
+    if index_uuid.is_some()
+        && fragments.is_some_and(|fragments| !fragments.is_empty())
+        && is_segmented_scalar
     {
         return Err(Error::invalid_input(
-            "index_uuid is no longer accepted for BTree distributed index builds; segment UUIDs \
+            "index_uuid is no longer accepted for distributed scalar index builds; segment UUIDs \
              are generated by Lance and returned in the index metadata."
                 .to_string(),
         ));
@@ -617,8 +596,9 @@ fn uses_segment_commit_path(index_type: IndexType, params: &dyn IndexParams) ->
 
     if params_family == LANCE_SCALAR_INDEX {
         match index_type {
-            IndexType::BTree => return true,
+            IndexType::BTree | IndexType::Bitmap => return true,
             IndexType::Scalar if is_btree_scalar_params(params) => return true,
+            IndexType::Scalar if is_bitmap_scalar_params(params) => return true,
             _ => {}
         }
     }
@@ -1165,7 +1145,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_merge_index_metadata_btree_soft_break() {
+    async fn test_merge_index_metadata_soft_break() {
         let tmpdir = TempStrDir::default();
         let dataset_uri = format!("file://{}", tmpdir.as_str());
         let reader = gen_batch()
@@ -1176,20 +1156,24 @@ mod tests {
             );
         let dataset = Dataset::write(reader, &dataset_uri, None).await.unwrap();
 
-        let err = dataset
-            .merge_index_metadata(
-                &Uuid::new_v4().to_string(),
-                IndexType::BTree,
-                None,
-                Arc::new(NoopIndexBuildProgress),
-            )
-            .await
-            .unwrap_err();
-        assert!(
-            err.to_string()
-                .contains("no longer supports merge_index_metadata"),
-            "expected BTree merge_index_metadata soft-break error, got: {err}"
-        );
+        // Both segmented scalar families have left the legacy distributed-merge
+        // entry point and must report the soft-break.
+        for index_type in [IndexType::BTree, IndexType::Bitmap] {
+            let err = dataset
+                .merge_index_metadata(
+                    &Uuid::new_v4().to_string(),
+                    index_type,
+                    None,
+                    Arc::new(NoopIndexBuildProgress),
+                )
+                .await
+                .unwrap_err();
+            assert!(
+                err.to_string()
+                    .contains("no longer supports merge_index_metadata"),
+                "expected {index_type} merge_index_metadata soft-break error, got: {err}"
+            );
+        }
     }
 
     /// Assert a committed segment directory holds exactly one canonical BTree
@@ -1310,7 +1294,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_btree_distributed_index_uuid_rejected() {
+    async fn test_distributed_index_uuid_rejected() {
         let test_dir = TempStrDir::default();
         let dataset = gen_batch()
             .col("value", lance_datagen::array::step::<Int32Type>())
@@ -1324,25 +1308,39 @@ mod tests {
         let mut dataset = dataset;
         let fragment_id = dataset.get_fragments()[0].id() as u32;
 
-        let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree);
-        for index_type in [IndexType::BTree, IndexType::Scalar] {
-            let err = CreateIndexBuilder::new(&mut dataset, &["value"], index_type, &params)
-                .name("value_btree_segments".to_string())
-                .fragments(vec![fragment_id])
-                .index_uuid(Uuid::new_v4().to_string())
-                .execute_uncommitted()
-                .await
-                .unwrap_err();
-            assert!(
-                matches!(err, Error::InvalidInput { .. }),
-                "expected invalid input error, got: {err}"
-            );
-            assert!(
-                err.to_string().contains(
-                    "index_uuid is no longer accepted for BTree distributed index builds"
-                ),
-                "unexpected error: {err}"
-            );
+        // Each segmented scalar family rejects a user-supplied UUID for a
+        // fragment-scoped build, whether requested via its own IndexType or the
+        // generic Scalar wrapper.
+        for (builtin, native_type) in [
+            (
+                lance_index::scalar::BuiltinIndexType::BTree,
+                IndexType::BTree,
+            ),
+            (
+                lance_index::scalar::BuiltinIndexType::Bitmap,
+                IndexType::Bitmap,
+            ),
+        ] {
+            let params = ScalarIndexParams::for_builtin(builtin);
+            for index_type in [native_type, IndexType::Scalar] {
+                let err = CreateIndexBuilder::new(&mut dataset, &["value"], index_type, &params)
+                    .name("value_segments".to_string())
+                    .fragments(vec![fragment_id])
+                    .index_uuid(Uuid::new_v4().to_string())
+                    .execute_uncommitted()
+                    .await
+                    .unwrap_err();
+                assert!(
+                    matches!(err, Error::InvalidInput { .. }),
+                    "expected invalid input error for {index_type}, got: {err}"
+                );
+                assert!(
+                    err.to_string().contains(
+                        "index_uuid is no longer accepted for distributed scalar index builds"
+                    ),
+                    "unexpected error for {index_type}: {err}"
+                );
+            }
         }
     }
 
@@ -1474,30 +1472,37 @@ mod tests {
         let fragments = dataset.get_fragments();
         let fragment_ids: Vec<u32> = fragments.iter().map(|f| f.id() as u32).collect();
         let selected_fragments = fragment_ids[..2].to_vec();
-        let index =
-            CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, &base_params)
-                .name("bitmap_segment".to_string())
-                .fragments(selected_fragments.clone())
-                .execute_uncommitted()
-                .await
-                .unwrap();
 
-        assert_eq!(
-            index
-                .fragment_bitmap
-                .as_ref()
-                .unwrap()
-                .iter()
-                .collect::<Vec<_>>(),
-            selected_fragments
-        );
+        for index_type in [IndexType::Bitmap, IndexType::Scalar] {
+            let index =
+                CreateIndexBuilder::new(&mut dataset, &["category"], index_type, &base_params)
+                    .name(format!("bitmap_segment_{index_type}"))
+                    .fragments(selected_fragments.clone())
+                    .execute_uncommitted()
+                    .await
+                    .unwrap();
 
-        let files = index.files.as_ref().unwrap();
-        assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME));
-        assert!(
-            files.iter().all(|file| !file.path.starts_with("part_")),
-            "staged bitmap segment should only reference canonical files"
-        );
+            assert_eq!(
+                index
+                    .fragment_bitmap
+                    .as_ref()
+                    .unwrap()
+                    .iter()
+                    .collect::<Vec<_>>(),
+                selected_fragments,
+                "{index_type}: unexpected fragment coverage"
+            );
+
+            let files = index.files.as_ref().unwrap();
+            assert!(
+                files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME),
+                "{index_type}: staged segment is missing canonical {BITMAP_LOOKUP_NAME}"
+            );
+            assert!(
+                files.iter().all(|file| !file.path.starts_with("part_")),
+                "{index_type}: staged bitmap segment should only reference canonical files"
+            );
+        }
     }
 
     #[tokio::test]
diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs
index 92b06f0a1a5..c9618dbff27 100644
--- a/rust/lance/src/index/scalar.rs
+++ b/rust/lance/src/index/scalar.rs
@@ -42,7 +42,7 @@ use lance_index::scalar::label_list::{
 use lance_index::scalar::registry::{
     ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, VALUE_COLUMN_NAME,
 };
-use lance_index::scalar::{BuiltinIndexType, CreatedIndex, InvertedIndexParams};
+use lance_index::scalar::{CreatedIndex, InvertedIndexParams};
 use lance_index::scalar::{
     ScalarIndex, ScalarIndexParams, bitmap::BITMAP_LOOKUP_NAME, inverted::INVERT_LIST_FILE,
     lance_format::LanceIndexStore,
@@ -323,51 +323,6 @@ pub(super) async fn build_scalar_index(
     Ok(created_index)
 }
 
-/// Build a canonical bitmap index segment over a caller-selected fragment set.
-///
-/// This is intentionally separate from `build_scalar_index(..., fragment_ids=Some(...))`.
-/// The latter is the legacy distributed scalar-index shard path. Here fragment ids only
-/// restrict the scanned rows; the bitmap plugin receives no shard id and writes the
-/// canonical bitmap layout for the staged segment root.
-#[instrument(level = "debug", skip_all)]
-pub(super) async fn build_bitmap_index_segment(
-    dataset: &Dataset,
-    column: &str,
-    uuid: &str,
-    fragment_ids: Vec<u32>,
-    progress: Arc<dyn IndexBuildProgress>,
-) -> Result<CreatedIndex> {
-    let field = dataset
-        .schema()
-        .field(column)
-        .ok_or(Error::invalid_input_source(
-            format!("No column with name {}", column).into(),
-        ))?;
-    let field: arrow_schema::Field = field.into();
-
-    let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap);
-    let plugin = SCALAR_INDEX_PLUGIN_REGISTRY.get_plugin_by_name(&params.index_type)?;
-    let training_request =
-        plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?;
-    let criteria = training_request.criteria();
-
-    progress.stage_start("load_data", None, "rows").await?;
-    let training_data =
-        load_training_data(dataset, column, criteria, None, true, Some(fragment_ids)).await?;
-    progress.stage_complete("load_data").await?;
-
-    let index_store = LanceIndexStore::from_dataset_for_new(dataset, uuid)?;
-    plugin
-        .train_index(
-            training_data,
-            &index_store,
-            training_request,
-            None,
-            progress,
-        )
-        .await
-}
-
 /// Fetches the scalar index plugin for a given index metadata
 ///
 /// The fast path, on newer datasets, is just a plugin lookup by the type URL of the index details.
diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs
index 11214a9bfdc..7411e41876e 100644
--- a/rust/lance/src/index/scalar/bitmap.rs
+++ b/rust/lance/src/index/scalar/bitmap.rs
@@ -1,16 +1,42 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
+use datafusion::physical_plan::SendableRecordBatchStream;
 use lance_index::metrics::NoOpMetricsCollector;
 use lance_index::scalar::bitmap::BitmapIndex;
 use lance_index::scalar::lance_format::LanceIndexStore;
+use lance_index::scalar::{CreatedIndex, OldIndexDataFilter};
 use lance_table::format::IndexMetadata;
-use roaring::RoaringBitmap;
 use std::sync::Arc;
 use uuid::Uuid;
 
 use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt};
 
+/// Open the given bitmap `segments` and downcast them to [`BitmapIndex`].
+async fn open_bitmap_segments(
+    dataset: &Dataset,
+    field_path: &str,
+    segments: &[&IndexMetadata],
+) -> Result<Vec<Arc<BitmapIndex>>> {
+    let mut source_indices = Vec::with_capacity(segments.len());
+    for &segment in segments {
+        let scalar_index =
+            super::open_scalar_index(dataset, field_path, segment, &NoOpMetricsCollector).await?;
+        let bitmap_index = scalar_index
+            .as_any()
+            .downcast_ref::<BitmapIndex>()
+            .ok_or_else(|| {
+                Error::index(format!(
+                    "Bitmap merge: expected bitmap segment {}, got {:?}",
+                    segment.uuid,
+                    scalar_index.index_type()
+                ))
+            })?;
+        source_indices.push(Arc::new(bitmap_index.clone()));
+    }
+    Ok(source_indices)
+}
+
 /// Merge one caller-defined group of source bitmap segments into a single segment.
 pub(in crate::index) async fn merge_segments(
     dataset: &Dataset,
@@ -28,35 +54,22 @@ pub(in crate::index) async fn merge_segments(
     })?;
     let field_path = dataset.schema().field_path(field_id)?;
 
-    let mut source_indices = Vec::with_capacity(segments.len());
-    let mut fragment_bitmap = RoaringBitmap::new();
-    for segment in &segments {
-        fragment_bitmap |= segment.fragment_bitmap.as_ref().cloned().ok_or_else(|| {
-            Error::invalid_input(format!(
-                "CreateIndex: segment {} is missing fragment coverage",
-                segment.uuid
-            ))
-        })?;
-        let scalar_index =
-            super::open_scalar_index(dataset, &field_path, segment, &NoOpMetricsCollector).await?;
-        let bitmap_index = scalar_index
-            .as_any()
-            .downcast_ref::<BitmapIndex>()
-            .ok_or_else(|| {
-                Error::index(format!(
-                    "merge_existing_index_segments: expected bitmap segment {}, got {:?}",
-                    segment.uuid,
-                    scalar_index.index_type()
-                ))
-            })?;
-        source_indices.push(Arc::new(bitmap_index.clone()));
-    }
+    // Intersect each segment's stored coverage with the dataset's current
+    // fragments so we don't claim coverage on row addresses that compaction or
+    // pruning has already retired.
+    let (fragment_bitmap, old_data_filter) =
+        crate::index::append::effective_coverage_and_filter(dataset, &segments).await?;
+
+    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
+    let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?;
 
     let new_uuid = Uuid::new_v4();
     let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid.to_string())?;
     let created_index = lance_index::scalar::bitmap::merge_bitmap_indices(
         &source_indices,
+        None,
         &new_store,
+        old_data_filter,
         lance_index::progress::noop_progress(),
     )
     .await?;
@@ -74,3 +87,24 @@ pub(in crate::index) async fn merge_segments(
         ..segments[0].clone()
     })
 }
+
+/// Open the given bitmap `segments` and merge their materialized state, together
+/// with `new_data`, into a single canonical bitmap written to `new_store`.
+pub(in crate::index) async fn open_and_merge_segments(
+    dataset: &Dataset,
+    field_path: &str,
+    segments: &[&IndexMetadata],
+    new_data: SendableRecordBatchStream,
+    new_store: &LanceIndexStore,
+    old_data_filter: Option<OldIndexDataFilter>,
+) -> Result<CreatedIndex> {
+    let source_indices = open_bitmap_segments(dataset, field_path, segments).await?;
+    lance_index::scalar::bitmap::merge_bitmap_indices(
+        &source_indices,
+        Some(new_data),
+        new_store,
+        old_data_filter,
+        lance_index::progress::noop_progress(),
+    )
+    .await
+}
diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs
index 34534f6811b..d945ba621bd 100644
--- a/rust/lance/src/index/scalar/btree.rs
+++ b/rust/lance/src/index/scalar/btree.rs
@@ -17,7 +17,6 @@ use lance_index::scalar::lance_format::LanceIndexStore;
 use lance_index::scalar::registry::VALUE_COLUMN_NAME;
 use lance_index::scalar::{CreatedIndex, OldIndexDataFilter};
 use lance_table::format::IndexMetadata;
-use roaring::RoaringBitmap;
 use uuid::Uuid;
 
 use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt};
@@ -121,31 +120,8 @@ pub(crate) async fn merge_segments(
     // Intersect each segment's stored bitmap with the dataset's current
     // fragments so we don't claim coverage on IDs that compaction or pruning
     // has already retired.
-    let dataset_fragments = dataset.fragment_bitmap.as_ref();
-    let mut effective_old_frags = RoaringBitmap::new();
-    let mut deleted_old_frags = RoaringBitmap::new();
-    for segment in &segments {
-        if segment.fragment_bitmap.is_none() {
-            return Err(Error::invalid_input(format!(
-                "CreateIndex: segment {} is missing fragment coverage",
-                segment.uuid
-            )));
-        }
-        if let Some(effective) = segment.effective_fragment_bitmap(dataset_fragments) {
-            effective_old_frags |= effective;
-        }
-        if let Some(deleted) = segment.deleted_fragment_bitmap(dataset_fragments) {
-            deleted_old_frags |= deleted;
-        }
-    }
-
-    let fragment_bitmap = effective_old_frags.clone();
-    let old_data_filter = crate::index::append::build_old_data_filter(
-        dataset,
-        &effective_old_frags,
-        &deleted_old_frags,
-    )
-    .await?;
+    let (fragment_bitmap, old_data_filter) =
+        crate::index::append::effective_coverage_and_filter(dataset, &segments).await?;
 
     let output_uuid = Uuid::new_v4();
     let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid.to_string())?;

From f10201829eba37fb0c59e1a89326ea041c502dbd Mon Sep 17 00:00:00 2001
From: zhangyue19921010 <zhangyue921010@163.com>
Date: Fri, 12 Jun 2026 13:02:40 +0800
Subject: [PATCH 2/3] feat(index): consolidate bitmap segments and unindexed
 data on optimize

---
 rust/lance-index/src/scalar/bitmap.rs |  30 +--
 rust/lance-index/src/scalar/btree.rs  |  15 +-
 rust/lance/src/index/append.rs        | 267 ++++++++++++++++++++++++--
 rust/lance/src/index/scalar/bitmap.rs |  10 +-
 rust/lance/src/index/scalar/btree.rs  |  10 +-
 5 files changed, 294 insertions(+), 38 deletions(-)

diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs
index a765cd94dd2..4a212713e1f 100644
--- a/rust/lance-index/src/scalar/bitmap.rs
+++ b/rust/lance-index/src/scalar/bitmap.rs
@@ -1204,15 +1204,12 @@ impl BitmapIndexPlugin {
 /// optionally, a stream of not-yet-indexed `new_data`) into a single canonical
 /// bitmap written to `dest_store`.
 ///
-/// `old_data_filter` is applied only to the rows coming from `source_indices`,
-/// dropping row addresses whose fragments compaction/deletion has retired; rows
-/// from `new_data` are inserted unfiltered. The whole merged state is held in
-/// memory, as bitmap segment consolidation has always done.
+/// `old_data_filters` holds one optional filter per source segment, applied to that segment only.
 pub async fn merge_bitmap_indices(
     source_indices: &[Arc<BitmapIndex>],
     new_data: Option<SendableRecordBatchStream>,
     dest_store: &dyn IndexStore,
-    old_data_filter: Option<OldIndexDataFilter>,
+    old_data_filters: &[Option<OldIndexDataFilter>],
     progress: Arc<dyn IndexBuildProgress>,
 ) -> Result<CreatedIndex> {
     if source_indices.is_empty() {
@@ -1221,6 +1218,15 @@ pub async fn merge_bitmap_indices(
         ));
     }
 
+    if old_data_filters.len() != source_indices.len() {
+        return Err(Error::invalid_input(format!(
+            "Bitmap merge: expected one old-data filter per source segment \
+             ({} segments) but got {}",
+            source_indices.len(),
+            old_data_filters.len()
+        )));
+    }
+
     let value_type = source_indices[0].value_type().clone();
     let mut merged_state = HashMap::<ScalarValue, RowAddrTreeMap>::new();
 
@@ -1240,7 +1246,13 @@ pub async fn merge_bitmap_indices(
             )));
         }
 
-        let state = source_index.load_bitmap_index_state().await?;
+        let mut state = source_index.load_bitmap_index_state().await?;
+        if let Some(old_data_filter) = &old_data_filters[idx] {
+            state.retain(|_, postings| {
+                old_data_filter.retain_row_addrs(postings);
+                !postings.is_empty()
+            });
+        }
         for (key, bitmap) in state {
             merged_state
                 .entry(key)
@@ -1252,12 +1264,6 @@ pub async fn merge_bitmap_indices(
             .await?;
     }
     progress.stage_complete("merge_bitmap_segments").await?;
-    if let Some(old_data_filter) = old_data_filter {
-        merged_state.retain(|_, postings| {
-            old_data_filter.retain_row_addrs(postings);
-            !postings.is_empty()
-        });
-    }
 
     // Fold the not-yet-indexed rows into the same in-memory state.
     if let Some(new_data) = new_data {
diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs
index 6128248308e..e8e5c42a248 100644
--- a/rust/lance-index/src/scalar/btree.rs
+++ b/rust/lance-index/src/scalar/btree.rs
@@ -1798,7 +1798,7 @@ impl BTreeIndex {
         segments: &[Arc<Self>],
         new_data: SendableRecordBatchStream,
         dest_store: &dyn IndexStore,
-        old_data_filter: Option<OldIndexDataFilter>,
+        old_data_filters: &[Option<OldIndexDataFilter>],
     ) -> Result<CreatedIndex> {
         let Some(first) = segments.first() else {
             return Err(Error::invalid_input(
@@ -1806,6 +1806,15 @@ impl BTreeIndex {
             ));
         };
 
+        if old_data_filters.len() != segments.len() {
+            return Err(Error::invalid_input(format!(
+                "BTree merge: expected one old-data filter per source segment \
+                 ({} segments) but got {}",
+                segments.len(),
+                old_data_filters.len()
+            )));
+        }
+
         for segment in segments.iter().skip(1) {
             if segment.data_type != first.data_type {
                 return Err(Error::index(format!(
@@ -1827,7 +1836,7 @@ impl BTreeIndex {
         }
 
         let mut inputs: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(segments.len() + 1);
-        for segment in segments {
+        for (segment, old_data_filter) in segments.iter().zip(old_data_filters) {
             let stream = segment.data_stream().await?;
             let stream = match old_data_filter.clone() {
                 Some(filter) => filter_row_ids(stream, filter),
@@ -2235,7 +2244,7 @@ impl ScalarIndex for BTreeIndex {
             &[Arc::new(self.clone())],
             new_data,
             dest_store,
-            old_data_filter,
+            &[old_data_filter],
         )
         .await
     }
diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs
index 212e9fe9609..388f3170251 100644
--- a/rust/lance/src/index/append.rs
+++ b/rust/lance/src/index/append.rs
@@ -114,12 +114,32 @@ pub fn split_segment_coverage<'a>(
     (effective, deleted)
 }
 
-/// Validate that every segment carries fragment coverage, split that coverage
-/// into still-live and retired fragments, and build the matching [`OldIndexDataFilter`].
-pub async fn effective_coverage_and_filter(
+/// Build one [`OldIndexDataFilter`] per segment, each derived from that
+/// segment's *own* effective (still-live) and retired fragment coverage.
+pub async fn build_per_segment_filters(
+    dataset: &Dataset,
+    segments: &[&IndexMetadata],
+) -> Result<Vec<Option<OldIndexDataFilter>>> {
+    let mut filters = Vec::with_capacity(segments.len());
+    for segment in segments {
+        let effective = segment
+            .effective_fragment_bitmap(&dataset.fragment_bitmap)
+            .unwrap_or_default();
+        let deleted = segment
+            .deleted_fragment_bitmap(&dataset.fragment_bitmap)
+            .unwrap_or_default();
+        filters.push(build_old_data_filter(dataset, &effective, &deleted).await?);
+    }
+    Ok(filters)
+}
+
+/// Validate that every segment carries fragment coverage, then return the
+/// combined still-live coverage (for the merged segment's fragment bitmap)
+/// together with one [`OldIndexDataFilter`] per segment.
+pub async fn effective_coverage_and_filters(
     dataset: &Dataset,
     segments: &[IndexMetadata],
-) -> Result<(RoaringBitmap, Option<OldIndexDataFilter>)> {
+) -> Result<(RoaringBitmap, Vec<Option<OldIndexDataFilter>>)> {
     for segment in segments {
         if segment.fragment_bitmap.is_none() {
             return Err(Error::invalid_input(format!(
@@ -128,9 +148,10 @@ pub async fn effective_coverage_and_filter(
             )));
         }
     }
-    let (effective, deleted) = split_segment_coverage(dataset, segments);
-    let old_data_filter = build_old_data_filter(dataset, &effective, &deleted).await?;
-    Ok((effective, old_data_filter))
+    let (effective, _deleted) = split_segment_coverage(dataset, segments);
+    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
+    let filters = build_per_segment_filters(dataset, &segment_refs).await?;
+    Ok((effective, filters))
 }
 
 async fn load_unindexed_training_data(
@@ -271,9 +292,8 @@ async fn merge_scalar_indices<'a>(
             load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed)
                 .await?;
         let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?;
-        let old_data_filter =
-            build_old_data_filter(dataset.as_ref(), &effective_old_frags, &deleted_old_frags)
-                .await?;
+        let old_data_filters =
+            build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?;
 
         match index_type {
             IndexType::BTree => {
@@ -283,7 +303,7 @@ async fn merge_scalar_indices<'a>(
                     selected_old_indices,
                     new_data_stream,
                     &new_store,
-                    old_data_filter,
+                    &old_data_filters,
                 )
                 .await?
             }
@@ -301,12 +321,22 @@ async fn merge_scalar_indices<'a>(
                         selected_old_indices,
                         new_data_stream,
                         &new_store,
-                        old_data_filter,
+                        &old_data_filters,
                     )
                     .await?
                 }
             }
             _ => {
+                // Non-segmented scalar types only reach this branch with a single
+                // selected segment, so the union filter equals that segment's
+                // filter. Built lazily here so the segmented BTree/Bitmap paths
+                // above don't pay an extra row-id-sequence load they never use.
+                let old_data_filter = build_old_data_filter(
+                    dataset.as_ref(),
+                    &effective_old_frags,
+                    &deleted_old_frags,
+                )
+                .await?;
                 reference_index
                     .update(new_data_stream, &new_store, old_data_filter)
                     .await?
@@ -790,7 +820,7 @@ mod tests {
     use arrow::datatypes::{Float32Type, UInt32Type};
     use arrow_array::cast::AsArray;
     use arrow_array::{
-        FixedSizeListArray, RecordBatch, RecordBatchIterator, StringArray, UInt32Array,
+        FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array,
     };
     use arrow_schema::{DataType, Field, Schema};
     use futures::TryStreamExt;
@@ -1984,6 +2014,217 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_optimize_btree_no_duplicate_row_addr() {
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        // Seed a one-row dataset: (id = 1, payload = 10).
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("payload", DataType::Int32, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(Int32Array::from(vec![10])),
+            ],
+        )
+        .unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap();
+
+        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree);
+        dataset
+            .create_index(
+                &["id"],
+                IndexType::BTree,
+                Some("id_idx".into()),
+                &params,
+                true,
+            )
+            .await
+            .unwrap();
+
+        // Upsert id = 1 with payload = 100. Reordered source columns (payload, id) force the
+        // partial-schema RewriteColumns path instead of a row rewrite.
+        let source_schema = Arc::new(Schema::new(vec![
+            Field::new("payload", DataType::Int32, false),
+            Field::new("id", DataType::Int32, false),
+        ]));
+        let source_batch = RecordBatch::try_new(
+            source_schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![100])),
+                Arc::new(Int32Array::from(vec![1])),
+            ],
+        )
+        .unwrap();
+        let merge_job =
+            MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()])
+                .unwrap()
+                .when_matched(WhenMatched::UpdateAll)
+                .try_build()
+                .unwrap();
+        let source_reader = Box::new(RecordBatchIterator::new(
+            [Ok(source_batch)],
+            source_schema.clone(),
+        ));
+        merge_job
+            .execute(reader_to_stream(source_reader))
+            .await
+            .unwrap();
+
+        // Reload, then build a delta BTree segment over the now-unindexed fragment.
+        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        dataset
+            .optimize_indices(&OptimizeOptions::append())
+            .await
+            .unwrap();
+        assert_eq!(
+            dataset.load_indices_by_name("id_idx").await.unwrap().len(),
+            2,
+            "append must create a delta segment over the rewritten fragment"
+        );
+
+        // Force the old segment + delta segment to merge.
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(2))
+            .await
+            .unwrap();
+        // Reload and check that filtering on id = 1 yields the row exactly once.
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        let rows = dataset
+            .scan()
+            .filter("id = 1")
+            .unwrap()
+            .project(&["id"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(rows, 1, "id = 1 must return exactly one row after merge");
+    }
+
+    #[tokio::test]
+    async fn test_optimize_bitmap_no_stale_postings() {
+        async fn query_count(dataset: &Dataset, value: &str) -> usize {
+            dataset
+                .scan()
+                .filter(&format!("cat = '{}'", value))
+                .unwrap()
+                .project(&["cat"])
+                .unwrap()
+                .try_into_batch()
+                .await
+                .unwrap()
+                .num_rows()
+        }
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        // Seed a one-row dataset: (key = 1, cat = "a").
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("key", DataType::Int32, false),
+            Field::new("cat", DataType::Utf8, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["a"])),
+            ],
+        )
+        .unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap();
+
+        // A scalar index on the join key forces merge_insert down the in-place
+        // RewriteColumns path, keeping the fragment live.
+        dataset
+            .create_index(
+                &["key"],
+                IndexType::BTree,
+                Some("key_idx".into()),
+                &ScalarIndexParams::for_builtin(BuiltinIndexType::BTree),
+                true,
+            )
+            .await
+            .unwrap();
+        dataset
+            .create_index(
+                &["cat"],
+                IndexType::Bitmap,
+                Some("cat_idx".into()),
+                &ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap),
+                true,
+            )
+            .await
+            .unwrap();
+
+        // Reordered source columns (cat, key) force the in-place RewriteColumns
+        // path; the indexed `cat` value changes 'a' -> 'b' on the same row,
+        // pruning the cat index's coverage of the still-live fragment.
+        let source_schema = Arc::new(Schema::new(vec![
+            Field::new("cat", DataType::Utf8, false),
+            Field::new("key", DataType::Int32, false),
+        ]));
+        let source_batch = RecordBatch::try_new(
+            source_schema.clone(),
+            vec![
+                Arc::new(StringArray::from(vec!["b"])),
+                Arc::new(Int32Array::from(vec![1])),
+            ],
+        )
+        .unwrap();
+        let merge_job =
+            MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["key".to_string()])
+                .unwrap()
+                .when_matched(WhenMatched::UpdateAll)
+                .try_build()
+                .unwrap();
+        let source_reader = Box::new(RecordBatchIterator::new(
+            [Ok(source_batch)],
+            source_schema.clone(),
+        ));
+        merge_job
+            .execute(reader_to_stream(source_reader))
+            .await
+            .unwrap();
+
+        let cat_only = || OptimizeOptions::append().index_names(vec!["cat_idx".to_string()]);
+        // Reload, then append-optimize "cat_idx"; this must add a delta segment (2 total).
+        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        dataset.optimize_indices(&cat_only()).await.unwrap();
+        assert_eq!(
+            dataset.load_indices_by_name("cat_idx").await.unwrap().len(),
+            2,
+            "append must create a delta segment over the rewritten fragment"
+        );
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(2).index_names(vec!["cat_idx".to_string()]))
+            .await
+            .unwrap();
+        // Reload and verify query results and segment count after consolidation.
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        assert_eq!(
+            query_count(&dataset, "a").await,
+            0,
+            "stale 'a' posting must be filtered out of the consolidated segment"
+        );
+        assert_eq!(
+            query_count(&dataset, "b").await,
+            1,
+            "the updated 'b' row must remain queryable"
+        );
+        assert_eq!(
+            dataset.load_indices_by_name("cat_idx").await.unwrap().len(),
+            1,
+            "the segments must collapse into a single consolidated segment"
+        );
+    }
+
     #[tokio::test]
     async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() {
         async fn query_id_count(dataset: &Dataset, id: &str) -> usize {
diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs
index 06218118467..d5bbdcf2961 100644
--- a/rust/lance/src/index/scalar/bitmap.rs
+++ b/rust/lance/src/index/scalar/bitmap.rs
@@ -57,8 +57,8 @@ pub(in crate::index) async fn merge_segments(
     // Intersect each segment's stored coverage with the dataset's current
     // fragments so we don't claim coverage on row addresses that compaction or
     // pruning has already retired.
-    let (fragment_bitmap, old_data_filter) =
-        crate::index::append::effective_coverage_and_filter(dataset, &segments).await?;
+    let (fragment_bitmap, old_data_filters) =
+        crate::index::append::effective_coverage_and_filters(dataset, &segments).await?;
 
     let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
     let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?;
@@ -69,7 +69,7 @@ pub(in crate::index) async fn merge_segments(
         &source_indices,
         None,
         &new_store,
-        old_data_filter,
+        &old_data_filters,
         lance_index::progress::noop_progress(),
     )
     .await?;
@@ -96,14 +96,14 @@ pub(in crate::index) async fn open_and_merge_segments(
     segments: &[&IndexMetadata],
     new_data: SendableRecordBatchStream,
     new_store: &LanceIndexStore,
-    old_data_filter: Option<OldIndexDataFilter>,
+    old_data_filters: &[Option<OldIndexDataFilter>],
 ) -> Result<CreatedIndex> {
     let source_indices = open_bitmap_segments(dataset, field_path, segments).await?;
     lance_index::scalar::bitmap::merge_bitmap_indices(
         &source_indices,
         Some(new_data),
         new_store,
-        old_data_filter,
+        old_data_filters,
         lance_index::progress::noop_progress(),
     )
     .await
diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs
index 081957ecdad..268048da4dd 100644
--- a/rust/lance/src/index/scalar/btree.rs
+++ b/rust/lance/src/index/scalar/btree.rs
@@ -63,7 +63,7 @@ pub(crate) async fn open_and_merge_segments(
     segments: &[&IndexMetadata],
     new_data: SendableRecordBatchStream,
     new_store: &LanceIndexStore,
-    old_data_filter: Option<OldIndexDataFilter>,
+    old_data_filters: &[Option<OldIndexDataFilter>],
 ) -> Result<CreatedIndex> {
     let mut source_indices = Vec::with_capacity(segments.len());
     for &segment in segments {
@@ -81,7 +81,7 @@ pub(crate) async fn open_and_merge_segments(
             })?;
         source_indices.push(Arc::new(btree.clone()));
     }
-    BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filter).await
+    BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filters).await
 }
 
 /// Merge one caller-defined group of source BTree segments into a single
@@ -120,8 +120,8 @@ pub(crate) async fn merge_segments(
     // Intersect each segment's stored bitmap with the dataset's current
     // fragments so we don't claim coverage on IDs that compaction or pruning
     // has already retired.
-    let (fragment_bitmap, old_data_filter) =
-        crate::index::append::effective_coverage_and_filter(dataset, &segments).await?;
+    let (fragment_bitmap, old_data_filters) =
+        crate::index::append::effective_coverage_and_filters(dataset, &segments).await?;
 
     let output_uuid = Uuid::new_v4();
     let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?;
@@ -135,7 +135,7 @@ pub(crate) async fn merge_segments(
         &segment_refs,
         empty_new_data,
         &new_store,
-        old_data_filter,
+        &old_data_filters,
     )
     .await?;
 

From cfc95cb81ab8ffdbd117aed410df319e7fb24487 Mon Sep 17 00:00:00 2001
From: zhangyue19921010 <zhangyue.1010@bytedance.com>
Date: Mon, 22 Jun 2026 20:08:16 +0800
Subject: [PATCH 3/3] merge main

---
 .bumpversion.toml                             |    2 +-
 .../workflows/build_linux_wheel/action.yml    |    2 +-
 .github/workflows/build_mac_wheel/action.yml  |    2 +-
 .../workflows/build_windows_wheel/action.yml  |    2 +-
 .github/workflows/java-publish.yml            |  139 +-
 .github/workflows/pypi-publish.yml            |    6 +-
 .github/workflows/python.yml                  |    2 +-
 CONTRIBUTING.md                               |    2 +-
 Cargo.lock                                    |  551 +-
 Cargo.toml                                    |   51 +-
 ci/create_release_branch.sh                   |   12 +-
 ci/publish_beta.sh                            |   12 +-
 ci/release_common.sh                          |    6 +-
 docs/src/format/file/encoding.md              |    7 +-
 docs/src/format/index/scalar/ngram.md         |    8 +-
 docs/src/guide/blob.md                        |   10 +
 docs/src/guide/object_store.md                |  205 +
 java/lance-jni/Cargo.lock                     |  463 +-
 java/lance-jni/Cargo.toml                     |    2 +-
 java/lance-jni/src/index.rs                   |    2 +
 java/lance-jni/src/mem_wal.rs                 |   37 +
 java/pom.xml                                  |    2 +-
 .../java/org/lance/OpenDatasetBuilder.java    |    4 +-
 .../main/java/org/lance/index/IndexType.java  |    1 +
 .../lance/index/scalar/ScalarIndexParams.java |    6 +-
 .../java/org/lance/index/ScalarIndexTest.java |   78 +
 .../java/org/lance/memwal/MemWalTest.java     |   29 +-
 .../namespace/DirectoryNamespaceTest.java     |   48 +
 memtest/pyproject.toml                        |    3 +-
 python/Cargo.lock                             |  554 +-
 python/Cargo.toml                             |   10 +-
 python/pyproject.toml                         |   13 +-
 python/python/benchmarks/test_search.py       |   12 +-
 python/python/lance/__init__.py               |    4 +-
 python/python/lance/blob.py                   |   67 +-
 python/python/lance/dataset.py                |    9 +-
 python/python/lance/indices/builder.py        |    2 +-
 python/python/lance/lance/__init__.pyi        |   23 +
 python/python/lance/lance/optimize.pyi        |    6 +-
 python/python/lance/namespace.py              |   48 +
 python/python/lance/optimize.py               |    8 +
 python/python/lance/vector.py                 |  147 +
 python/python/tests/test_blob.py              |  367 ++
 python/python/tests/test_dataset.py           |   49 +-
 python/python/tests/test_indices.py           |    6 +-
 python/python/tests/test_mem_wal.py           |    7 +
 python/python/tests/test_namespace_dir.py     |  170 +
 .../tests/test_namespace_integration.py       |   96 +
 python/python/tests/test_optimize.py          |   41 +
 python/python/tests/test_s3_ddb.py            |   52 +
 python/python/tests/test_scalar_index.py      |   50 +-
 python/python/tests/test_vector.py            |   37 +-
 python/python/tests/test_vector_index.py      |    2 +
 python/src/dataset.rs                         |  182 +
 python/src/dataset/optimize.rs                |   17 +-
 python/src/lib.rs                             |    1 +
 python/src/mem_wal.rs                         |   25 +
 python/src/namespace.rs                       |  102 +-
 python/uv.lock                                |  875 +--
 rust/examples/Cargo.toml                      |    2 +-
 rust/lance-arrow/src/ipc.rs                   |  216 +-
 rust/lance-arrow/src/lib.rs                   |    2 +
 rust/lance-core/src/cache/backend.rs          |   12 +
 rust/lance-core/src/cache/codec.rs            |  517 +-
 rust/lance-core/src/cache/entry_io.rs         |  202 +
 rust/lance-core/src/cache/mod.rs              |  150 +-
 rust/lance-core/src/cache/moka.rs             |    9 +-
 rust/lance-core/src/datatypes.rs              |    1 +
 rust/lance-core/src/datatypes/field.rs        |   60 +
 rust/lance-core/src/datatypes/schema.rs       |   90 +-
 rust/lance-core/src/utils.rs                  |    1 +
 rust/lance-core/src/utils/io_stats.rs         |   30 +
 rust/lance-datafusion/src/expr.rs             |  108 +
 rust/lance-datafusion/src/logical_expr.rs     |   54 +
 rust/lance-datagen/Cargo.toml                 |    1 -
 rust/lance-datagen/src/generator.rs           |  226 +-
 rust/lance-encoding/src/decoder.rs            |   52 +-
 .../src/encodings/logical/primitive.rs        |   81 +-
 .../encodings/logical/primitive/miniblock.rs  |   26 +-
 rust/lance-encoding/src/lib.rs                |   16 +
 rust/lance-file/src/io.rs                     |   10 +
 rust/lance-file/src/reader.rs                 |   17 +
 rust/lance-file/src/writer.rs                 |   13 +-
 rust/lance-index/Cargo.toml                   |    1 +
 rust/lance-index/benches/rq.rs                |  529 +-
 rust/lance-index/build.rs                     |   11 +-
 rust/lance-index/protos-cache/cache.proto     |  194 +
 rust/lance-index/src/lib.rs                   |   13 +
 rust/lance-index/src/metrics.rs               |   13 +
 rust/lance-index/src/scalar.rs                |   49 +-
 rust/lance-index/src/scalar/bitmap.rs         |  125 +-
 rust/lance-index/src/scalar/btree.rs          |  470 +-
 rust/lance-index/src/scalar/btree/flat.rs     |  139 +-
 rust/lance-index/src/scalar/expression.rs     |  355 +-
 rust/lance-index/src/scalar/fmindex.rs        |  245 +-
 .../src/scalar/inverted/builder.rs            |  955 +++-
 .../src/scalar/inverted/cache_codec.rs        |  715 ++-
 rust/lance-index/src/scalar/inverted/index.rs |  107 +-
 .../src/scalar/inverted/tokenizer.rs          |   72 +-
 rust/lance-index/src/scalar/inverted/wand.rs  |  143 +-
 rust/lance-index/src/scalar/label_list.rs     |  118 +-
 rust/lance-index/src/scalar/lance_format.rs   |   47 +-
 rust/lance-index/src/scalar/ngram.rs          |  151 +-
 .../src/scalar/ngram/ngram_regex.rs           |  673 +++
 rust/lance-index/src/scalar/zonemap.rs        |   46 +-
 rust/lance-index/src/vector.rs                |    8 +
 rust/lance-index/src/vector/bq.rs             |    3 +
 rust/lance-index/src/vector/bq/builder.rs     |   34 +-
 .../src/vector/bq/dist_table_quant.rs         |  935 +++
 rust/lance-index/src/vector/bq/ex_dot.rs      | 1078 ++++
 rust/lance-index/src/vector/bq/prune.rs       |  527 ++
 rust/lance-index/src/vector/bq/storage.rs     | 1609 ++++--
 rust/lance-index/src/vector/bq/transform.rs   |   21 +-
 .../src/vector/distributed/index_merger.rs    |   28 +-
 rust/lance-index/src/vector/pq/storage.rs     |    7 +-
 rust/lance-index/src/vector/storage.rs        |   27 +-
 rust/lance-io/src/scheduler.rs                |  153 +-
 rust/lance-linalg/Cargo.toml                  |    5 +-
 rust/lance-linalg/benches/hamming.rs          |   52 -
 rust/lance-linalg/src/distance.rs             |    6 +-
 rust/lance-linalg/src/distance/hamming.rs     | 1323 ++++-
 rust/lance-namespace-datafusion/tests/sql.rs  |    2 +
 rust/lance-namespace-impls/BENCHMARK.md       |   73 +
 rust/lance-namespace-impls/Cargo.toml         |   14 +
 .../benches/manifest_commit_sweep.sh          |  146 +
 .../examples/manifest_bench.rs                |  714 +++
 rust/lance-namespace-impls/src/dir.rs         |  605 +-
 .../lance-namespace-impls/src/dir/manifest.rs | 5023 +++++++++++------
 .../src/dir/manifest_feature_flags.rs         |  194 +
 .../lance-namespace-impls/src/rest_adapter.rs |    7 +-
 rust/lance-select/src/mask.rs                 |   15 +-
 rust/lance-table/src/format/index.rs          |   27 +-
 rust/lance-table/src/io/commit.rs             |   20 +
 .../src/io/commit/external_manifest.rs        |   25 +
 rust/lance-tokenizer/Cargo.toml               |    1 +
 rust/lance-tokenizer/src/stop_word_filter.rs  |   80 +-
 .../src/stop_word_filter/stopwords.rs         |    6 +
 rust/lance/Cargo.toml                         |    8 +
 rust/lance/benches/hamming.rs                 |  228 +
 .../benches/mem_wal/write/mem_wal_write.rs    |    2 +
 rust/lance/benches/regex_ngram.rs             |  134 +
 rust/lance/src/blob.rs                        |   99 +-
 rust/lance/src/dataset.rs                     |   54 +-
 rust/lance/src/dataset/blob.rs                | 1285 ++++-
 rust/lance/src/dataset/branch_location.rs     |   59 +-
 rust/lance/src/dataset/cleanup.rs             |  773 ++-
 rust/lance/src/dataset/fragment.rs            |    2 +-
 rust/lance/src/dataset/index/frag_reuse.rs    |  194 +
 rust/lance/src/dataset/mem_wal/api.rs         |   12 +-
 rust/lance/src/dataset/mem_wal/index.rs       |  294 +-
 .../lance/src/dataset/mem_wal/index/pk_key.rs |  204 +
 .../dataset/mem_wal/memtable/batch_store.rs   |   47 +
 .../src/dataset/mem_wal/memtable/flush.rs     |  387 +-
 .../mem_wal/memtable/scanner/builder.rs       |    8 +
 rust/lance/src/dataset/mem_wal/scanner.rs     |    7 +-
 .../src/dataset/mem_wal/scanner/block_list.rs |  934 ++-
 .../src/dataset/mem_wal/scanner/builder.rs    |  218 +-
 .../src/dataset/mem_wal/scanner/collector.rs  |   68 +
 .../dataset/mem_wal/scanner/data_source.rs    |   23 +
 .../lance/src/dataset/mem_wal/scanner/exec.rs |   12 +-
 .../mem_wal/scanner/exec/newest_pk_filter.rs  |  393 ++
 .../src/dataset/mem_wal/scanner/exec/pk.rs    |    2 +-
 .../mem_wal/scanner/exec/pk_block_filter.rs   |  373 ++
 .../mem_wal/scanner/exec/pk_hash_filter.rs    |  350 --
 .../scanner/exec/within_source_dedup.rs       |  432 --
 .../dataset/mem_wal/scanner/flushed_cache.rs  |  189 +-
 .../src/dataset/mem_wal/scanner/fts_search.rs |  214 +-
 .../src/dataset/mem_wal/scanner/planner.rs    |  194 +-
 .../dataset/mem_wal/scanner/point_lookup.rs   |  197 +-
 .../dataset/mem_wal/scanner/vector_search.rs  |  328 +-
 rust/lance/src/dataset/mem_wal/util.rs        |   10 +
 rust/lance/src/dataset/mem_wal/write.rs       |  288 +-
 rust/lance/src/dataset/optimize.rs            |  869 +++
 rust/lance/src/dataset/optimize/remapping.rs  |  188 +-
 rust/lance/src/dataset/scanner.rs             |  357 +-
 rust/lance/src/dataset/schema_evolution.rs    | 1400 ++++-
 rust/lance/src/dataset/tests/dataset_index.rs |   78 +-
 .../src/dataset/tests/dataset_versioning.rs   |   71 +
 rust/lance/src/dataset/updater.rs             |   40 +-
 rust/lance/src/dataset/write.rs               |  147 +-
 rust/lance/src/dataset/write/insert.rs        |   37 +-
 rust/lance/src/dataset/write/merge_insert.rs  |  160 +-
 rust/lance/src/index.rs                       |   39 +-
 rust/lance/src/index/append.rs                |  451 +-
 rust/lance/src/index/create.rs                |   30 +
 rust/lance/src/index/scalar.rs                |    2 +
 rust/lance/src/index/scalar/bitmap.rs         |    7 +-
 rust/lance/src/index/scalar/btree.rs          |    7 +-
 rust/lance/src/index/scalar_logical.rs        |   21 +-
 rust/lance/src/index/vector.rs                |    1 +
 rust/lance/src/index/vector/builder.rs        |    2 +-
 rust/lance/src/index/vector/hamming.rs        |  938 +++
 rust/lance/src/index/vector/ivf.rs            |    2 +-
 .../src/index/vector/ivf/partition_serde.rs   |  628 ++-
 rust/lance/src/index/vector/ivf/v2.rs         |  239 +-
 rust/lance/src/io/commit/external_manifest.rs |   26 +
 .../lance/src/io/commit/namespace_manifest.rs |  116 +-
 rust/lance/src/io/exec/knn.rs                 |  128 +
 rust/lance/src/io/exec/take.rs                |  104 +-
 rust/lance/src/io/exec/utils.rs               |   36 +-
 rust/lance/src/lib.rs                         |    2 +-
 rust/lance/src/session.rs                     |   92 +-
 202 files changed, 30260 insertions(+), 8491 deletions(-)
 create mode 100644 rust/lance-core/src/cache/entry_io.rs
 create mode 100644 rust/lance-core/src/utils/io_stats.rs
 create mode 100644 rust/lance-index/protos-cache/cache.proto
 create mode 100644 rust/lance-index/src/scalar/ngram/ngram_regex.rs
 create mode 100644 rust/lance-index/src/vector/bq/dist_table_quant.rs
 create mode 100644 rust/lance-index/src/vector/bq/ex_dot.rs
 create mode 100644 rust/lance-index/src/vector/bq/prune.rs
 delete mode 100644 rust/lance-linalg/benches/hamming.rs
 create mode 100644 rust/lance-namespace-impls/BENCHMARK.md
 create mode 100644 rust/lance-namespace-impls/benches/manifest_commit_sweep.sh
 create mode 100644 rust/lance-namespace-impls/examples/manifest_bench.rs
 create mode 100644 rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs
 create mode 100644 rust/lance/benches/hamming.rs
 create mode 100644 rust/lance/benches/regex_ngram.rs
 create mode 100644 rust/lance/src/dataset/mem_wal/index/pk_key.rs
 create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs
 create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs
 delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs
 delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs
 create mode 100644 rust/lance/src/index/vector/hamming.rs

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 7d766a80aff..80668862afb 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "8.0.0-beta.11"
+current_version = "8.1.0-beta.0"
 parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)(-(?P<prerelease>(beta|rc))\\.(?P<prerelease_num>\\d+))?"
 serialize = [
     "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}",
diff --git a/.github/workflows/build_linux_wheel/action.yml b/.github/workflows/build_linux_wheel/action.yml
index 9016ae67b1a..d6e6e0f1ada 100644
--- a/.github/workflows/build_linux_wheel/action.yml
+++ b/.github/workflows/build_linux_wheel/action.yml
@@ -3,7 +3,7 @@ name: build-linux-wheel
 description: "Build a manylinux wheel for lance"
 inputs:
   python-minor-version:
-    description: "9, 10, 11, 12"
+    description: "10, 11, 12, 13"
     required: true
   args:
     description: "--release"
diff --git a/.github/workflows/build_mac_wheel/action.yml b/.github/workflows/build_mac_wheel/action.yml
index 9d45bde42aa..0cac76c49cf 100644
--- a/.github/workflows/build_mac_wheel/action.yml
+++ b/.github/workflows/build_mac_wheel/action.yml
@@ -3,7 +3,7 @@ name: build_wheel
 description: "Build a lance wheel"
 inputs:
   python-minor-version:
-    description: "9, 10, 11, 12"
+    description: "10, 11, 12, 13"
     required: true
   args:
     description: "--release"
diff --git a/.github/workflows/build_windows_wheel/action.yml b/.github/workflows/build_windows_wheel/action.yml
index 03b601db019..94475059c75 100644
--- a/.github/workflows/build_windows_wheel/action.yml
+++ b/.github/workflows/build_windows_wheel/action.yml
@@ -3,7 +3,7 @@ name: build_wheel
 description: "Build a lance wheel"
 inputs:
   python-minor-version:
-    description: "9, 10, 11, 12"
+    description: "10, 11, 12, 13"
     required: true
   args:
     description: "--release"
diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml
index a51cf969a87..2b22b60dc92 100644
--- a/.github/workflows/java-publish.yml
+++ b/.github/workflows/java-publish.yml
@@ -28,10 +28,24 @@ permissions:
   contents: read
 
 jobs:
-  linux-arm64:
-    name: Build on Linux Arm64
-    runs-on: ubuntu-24.04-arm64-8x
+  build-linux:
+    name: Build on Linux ${{ matrix.arch }}
+    runs-on: ${{ matrix.runner }}
     timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: x86-64
+            runner: ubuntu-24.04
+            docker_platform: linux/amd64
+            protoc_arch: x86_64
+            artifact: liblance_jni_linux_x86_64.zip
+          - arch: arm64
+            runner: ubuntu-24.04-arm64-8x
+            docker_platform: linux/arm64
+            protoc_arch: aarch_64
+            artifact: liblance_jni_linux_arm_64.zip
     steps:
       - name: Checkout repository
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -41,9 +55,9 @@ jobs:
         uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
       - name: Check glibc version outside docker
         run: ldd --version
-      - name: Build and run in Debian 10 Arm64 container
+      - name: Build and run in Debian 10 container
         run: |
-          docker run --platform linux/arm64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c "
+          docker run --platform ${{ matrix.docker_platform }} -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c "
 
             set -ex
             # Update sources.list to use archive repositories for Debian 10 (EOL)
@@ -81,7 +95,7 @@ jobs:
               unzip
 
             # https://github.com/databendlabs/databend/issues/8035
-            PROTOC_ZIP=protoc-3.15.0-linux-aarch_64.zip
+            PROTOC_ZIP=protoc-3.15.0-linux-${{ matrix.protoc_arch }}.zip
             curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP
             unzip -o \$PROTOC_ZIP -d /usr/local
             rm -f \$PROTOC_ZIP
@@ -102,101 +116,44 @@ jobs:
           "
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
         with:
-          name: liblance_jni_linux_arm_64.zip
+          name: ${{ matrix.artifact }}
           path: java/lance-jni/target/release/liblance_jni.so
           retention-days: 1
           if-no-files-found: error
-  linux-x86:
-    name: Build on Linux x86-64
-    runs-on: ubuntu-24.04
+  build-macos:
+    name: Build on MacOS Arm64
+    runs-on: warp-macos-14-arm64-6x
     timeout-minutes: 60
     steps:
       - name: Checkout repository
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
         with:
           ref: ${{ inputs.ref || github.ref }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
-      - name: Check glibc version outside docker
-        run: ldd --version
-      - name: Build and run in Debian 10 X86-64 container
-        run: |
-          docker run --platform linux/amd64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c "
-
-            set -ex
-            # Update sources.list to use archive repositories for Debian 10 (EOL)
-            echo 'deb http://archive.debian.org/debian/ buster main' > /etc/apt/sources.list
-            echo 'deb http://archive.debian.org/debian-security buster/updates main' >> /etc/apt/sources.list
-            echo 'deb http://archive.debian.org/debian/ buster-updates main' >> /etc/apt/sources.list
-            apt-get update
-
-            DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --assume-yes \
-              apt-transport-https \
-              ca-certificates \
-              curl \
-              gpg \
-              bash \
-              less \
-              openssl \
-              libssl-dev \
-              pkg-config \
-              libsqlite3-dev \
-              libsqlite3-0 \
-              libreadline-dev \
-              git \
-              cmake \
-              dh-autoreconf \
-              clang \
-              g++ \
-              libc++-dev \
-              libc++abi-dev \
-              libprotobuf-dev \
-              libncurses5-dev \
-              libncursesw5-dev \
-              libudev-dev \
-              libhidapi-dev \
-              zip \
-              unzip
-
-            # https://github.com/databendlabs/databend/issues/8035
-            PROTOC_ZIP=protoc-3.15.0-linux-x86_64.zip
-            curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP
-            unzip -o \$PROTOC_ZIP -d /usr/local
-            rm -f \$PROTOC_ZIP
-            protoc --version
-
-            curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable
-            source \$HOME/.cargo/env
-            cargo --version
-
-            cd java/lance-jni
-
-            # https://github.com/rustls/rustls/issues/1967
-            export CC=clang
-            export CXX=clang++
-            ldd --version
-
-            cargo build --release
-          "
+      - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2
+      - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07
+      - name: Install dependencies
+        run: brew install protobuf
+      - name: Build native lib
+        working-directory: java/lance-jni
+        run: cargo build --release
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
         with:
-          name: liblance_jni_linux_x86_64.zip
-          path: java/lance-jni/target/release/liblance_jni.so
+          name: liblance_jni_darwin_aarch64.zip
+          path: java/lance-jni/target/release/liblance_jni.dylib
           retention-days: 1
           if-no-files-found: error
-  macos-arm64:
-    name: Build on MacOS Arm64 and release
-    runs-on: warp-macos-14-arm64-6x
-    timeout-minutes: 60
+  publish:
+    name: Publish Java packages
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
     needs:
-      - linux-arm64
-      - linux-x86
+      - build-linux
+      - build-macos
     steps:
       - name: Checkout repository
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
         with:
           ref: ${{ inputs.ref || github.ref }}
-      - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2
       - name: Set up Java 11
         uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4
         with:
@@ -208,18 +165,16 @@ jobs:
           server-password: SONATYPE_TOKEN
           gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
           gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
-      - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07
-      - name: Install dependencies
-        run: |
-          brew install protobuf
-          brew install gpg
-      - name: Download artifact
+      - name: Download artifacts
         uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
       - name: Copy native libs
         run: |
-          mkdir -p ./java/target/classes/nativelib/linux-x86-64 ./java/target/classes/nativelib/linux-aarch64
+          mkdir -p ./java/target/classes/nativelib/linux-x86-64 \
+            ./java/target/classes/nativelib/linux-aarch64 \
+            ./java/target/classes/nativelib/darwin-aarch64
           cp ./liblance_jni_linux_x86_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-x86-64/liblance_jni.so
           cp ./liblance_jni_linux_arm_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-aarch64/liblance_jni.so
+          cp ./liblance_jni_darwin_aarch64.zip/liblance_jni.dylib ./java/target/classes/nativelib/darwin-aarch64/liblance_jni.dylib
       - name: Set github
         run: |
           git config --global user.email "Lance Github Runner"
@@ -230,7 +185,7 @@ jobs:
           inputs.mode == 'dry_run'
         working-directory: java
         run: |
-          mvn --batch-mode -DskipTests -Drust.release.build=true package
+          mvn --batch-mode -DskipTests -Dskip.build.jni=true package
       - name: Publish with Java 11
         if: |
           github.event_name == 'release' ||
@@ -240,14 +195,14 @@ jobs:
           echo "use-agent" >> ~/.gnupg/gpg.conf
           echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
           export GPG_TTY=$(tty)
-          mvn --batch-mode -DskipTests -Drust.release.build=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh -P shade-jar
+          mvn --batch-mode -DskipTests -Dskip.build.jni=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh
         env:
           SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
           SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
   report-failure:
     name: Report Workflow Failure
     runs-on: ubuntu-latest
-    needs: [linux-arm64, linux-x86, macos-arm64]
+    needs: [build-linux, build-macos, publish]
     if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch')
     permissions:
       contents: read
diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
index b2bfe284fb5..77c76d6fc69 100644
--- a/.github/workflows/pypi-publish.yml
+++ b/.github/workflows/pypi-publish.yml
@@ -35,7 +35,7 @@ jobs:
     name: Python Linux 3.${{ matrix.python-minor-version }} ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
     strategy:
       matrix:
-        python-minor-version: ["9"]
+        python-minor-version: ["10"]
         config:
           - platform: x86_64
             manylinux: "2_17"
@@ -101,7 +101,7 @@ jobs:
     runs-on: ${{ matrix.config.runner }}
     strategy:
       matrix:
-        python-minor-version: ["9"]
+        python-minor-version: ["10"]
         config:
           - target: aarch64-apple-darwin
             runner: warp-macos-14-arm64-6x
@@ -152,7 +152,7 @@ jobs:
     runs-on: windows-latest-4x
     strategy:
       matrix:
-        python-minor-version: ["9"]
+        python-minor-version: ["10"]
     steps:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
         with:
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index f9bb3132b38..cce465807e3 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -97,7 +97,7 @@ jobs:
     timeout-minutes: 45
     strategy:
       matrix:
-        python-minor-version: ["9", "13"]
+        python-minor-version: ["10", "13"]
     name: "Python Linux 3.${{ matrix.python-minor-version }} x86_64"
     runs-on: "ubuntu-24.04-4x"
     defaults:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cf332215e49..8f3ec285f31 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,7 +19,7 @@ If you have any questions, please join our [Discord](https://discord.gg/zMM32dvN
 Currently Lance is implemented in Rust and comes with a Python wrapper. So you'll want to make sure you setup both.
 
 1. Install Rust: https://www.rust-lang.org/tools/install
-2. Install Python 3.9+: https://www.python.org/downloads/
+2. Install Python 3.10+: https://www.python.org/downloads/
 3. Install protoctol buffers: https://grpc.io/docs/protoc-installation/ (make sure you have version 3.20 or higher)
 4. Install commit hooks:
     a. Install pre-commit: https://pre-commit.com/#install
diff --git a/Cargo.lock b/Cargo.lock
index 866eb9b4b0e..11a5fb65a7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -66,21 +66,6 @@ version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "514ce16346f9fc96702fd52f2ae7e383b185516ee6f556efd7c3176be8fe7bea"
 
-[[package]]
-name = "alloc-no-stdlib"
-version = "2.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
-
-[[package]]
-name = "alloc-stdlib"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
-dependencies = [
- "alloc-no-stdlib",
-]
-
 [[package]]
 name = "alloca"
 version = "0.4.0"
@@ -475,7 +460,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -486,7 +471,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1134,9 +1119,9 @@ dependencies = [
 
 [[package]]
 name = "bitvec"
-version = "1.0.1"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837"
 dependencies = [
  "funty",
  "radium",
@@ -1178,9 +1163,9 @@ dependencies = [
 
 [[package]]
 name = "block-buffer"
-version = "0.12.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
+checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa"
 dependencies = [
  "hybrid-array",
 ]
@@ -1194,27 +1179,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "brotli"
-version = "8.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610"
-dependencies = [
- "alloc-no-stdlib",
- "alloc-stdlib",
- "brotli-decompressor",
-]
-
-[[package]]
-name = "brotli-decompressor"
-version = "5.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924"
-dependencies = [
- "alloc-no-stdlib",
- "alloc-stdlib",
-]
-
 [[package]]
 name = "bs58"
 version = "0.5.1"
@@ -1261,7 +1225,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1284,9 +1248,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.11.1"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593"
 
 [[package]]
 name = "bytes-utils"
@@ -1315,9 +1279,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.63"
+version = "1.2.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
+checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -1455,7 +1419,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1944,7 +1908,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim 0.11.1",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1957,7 +1921,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim 0.11.1",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1979,7 +1943,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
 dependencies = [
  "darling_core 0.20.11",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1990,7 +1954,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
 dependencies = [
  "darling_core 0.23.0",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2026,7 +1990,6 @@ dependencies = [
  "datafusion-datasource-arrow",
  "datafusion-datasource-csv",
  "datafusion-datasource-json",
- "datafusion-datasource-parquet",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
@@ -2048,7 +2011,6 @@ dependencies = [
  "log",
  "object_store",
  "parking_lot",
- "parquet",
  "rand 0.9.4",
  "regex",
  "sqlparser",
@@ -2123,7 +2085,6 @@ dependencies = [
  "libc",
  "log",
  "object_store",
- "parquet",
  "paste",
  "sqlparser",
  "tokio",
@@ -2241,36 +2202,6 @@ dependencies = [
  "tokio-stream",
 ]
 
-[[package]]
-name = "datafusion-datasource-parquet"
-version = "53.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997"
-dependencies = [
- "arrow",
- "async-trait",
- "bytes",
- "datafusion-common",
- "datafusion-common-runtime",
- "datafusion-datasource",
- "datafusion-execution",
- "datafusion-expr",
- "datafusion-functions-aggregate-common",
- "datafusion-physical-expr",
- "datafusion-physical-expr-adapter",
- "datafusion-physical-expr-common",
- "datafusion-physical-plan",
- "datafusion-pruning",
- "datafusion-session",
- "futures",
- "itertools 0.14.0",
- "log",
- "object_store",
- "parking_lot",
- "parquet",
- "tokio",
-]
-
 [[package]]
 name = "datafusion-doc"
 version = "53.1.0"
@@ -2479,7 +2410,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd"
 dependencies = [
  "datafusion-doc",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2762,7 +2693,7 @@ dependencies = [
  "darling 0.20.11",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2782,7 +2713,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
 dependencies = [
  "derive_builder_core 0.20.2",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2809,7 +2740,7 @@ version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
 dependencies = [
- "block-buffer 0.12.0",
+ "block-buffer 0.12.1",
  "const-oid 0.10.2",
  "crypto-common 0.2.2",
  "ctutils",
@@ -2844,7 +2775,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2969,7 +2900,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -3078,7 +3009,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
- "zlib-rs",
 ]
 
 [[package]]
@@ -3146,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
 
 [[package]]
 name = "fsst"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "rand 0.9.4",
@@ -3225,7 +3155,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -3448,17 +3378,15 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.4.2"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099"
 dependencies = [
  "cfg-if 1.0.4",
  "js-sys",
  "libc",
  "r-efi 6.0.0",
  "rand_core 0.10.1",
- "wasip2",
- "wasip3",
  "wasm-bindgen",
 ]
 
@@ -3485,7 +3413,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -3532,9 +3460,9 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.4.14"
+version = "0.4.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155"
 dependencies = [
  "atomic-waker",
  "bytes",
@@ -3835,7 +3763,7 @@ dependencies = [
  "tokio",
  "tokio-rustls",
  "tower-service",
- "webpki-roots 1.0.7",
+ "webpki-roots 1.0.8",
 ]
 
 [[package]]
@@ -4095,12 +4023,6 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f"
 
-[[package]]
-name = "id-arena"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
-
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -4322,7 +4244,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4367,7 +4289,7 @@ dependencies = [
  "quote",
  "rustc_version",
  "simd_cesu8",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4386,7 +4308,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
 dependencies = [
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4401,9 +4323,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162"
+checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31"
 dependencies = [
  "cfg-if 1.0.4",
  "futures-util",
@@ -4458,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
 
 [[package]]
 name = "lance"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "all_asserts",
  "approx",
@@ -4561,7 +4483,7 @@ dependencies = [
 
 [[package]]
 name = "lance-arrow"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4609,7 +4531,7 @@ dependencies = [
 
 [[package]]
 name = "lance-bitpacking"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrayref",
  "paste",
@@ -4618,7 +4540,7 @@ dependencies = [
 
 [[package]]
 name = "lance-core"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4658,7 +4580,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datafusion"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4691,7 +4613,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datagen"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4706,21 +4628,20 @@ dependencies = [
  "rand 0.9.4",
  "rand_distr",
  "rand_xoshiro",
- "random_word",
 ]
 
 [[package]]
 name = "lance-derive"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
 name = "lance-encoding"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4765,7 +4686,7 @@ dependencies = [
 
 [[package]]
 name = "lance-examples"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "all_asserts",
  "arrow",
@@ -4791,7 +4712,7 @@ dependencies = [
 
 [[package]]
 name = "lance-file"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4830,7 +4751,7 @@ dependencies = [
 
 [[package]]
 name = "lance-geo"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "datafusion",
  "geo-traits",
@@ -4844,7 +4765,7 @@ dependencies = [
 
 [[package]]
 name = "lance-index"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "approx",
  "arc-swap",
@@ -4906,6 +4827,7 @@ dependencies = [
  "rand_distr",
  "rangemap",
  "rayon",
+ "regex-syntax",
  "roaring",
  "rstest",
  "serde",
@@ -4920,7 +4842,7 @@ dependencies = [
 
 [[package]]
 name = "lance-io"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4968,7 +4890,7 @@ dependencies = [
 
 [[package]]
 name = "lance-linalg"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "approx",
  "arrow-array",
@@ -4983,11 +4905,12 @@ dependencies = [
  "num-traits",
  "proptest",
  "rand 0.9.4",
+ "rayon",
 ]
 
 [[package]]
 name = "lance-namespace"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -4999,7 +4922,7 @@ dependencies = [
 
 [[package]]
 name = "lance-namespace-datafusion"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-schema",
@@ -5015,7 +4938,7 @@ dependencies = [
 
 [[package]]
 name = "lance-namespace-impls"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -5028,6 +4951,8 @@ dependencies = [
  "base64 0.22.1",
  "bytes",
  "chrono",
+ "datafusion-common",
+ "datafusion-physical-plan",
  "futures",
  "hmac 0.12.1",
  "lance",
@@ -5045,24 +4970,27 @@ dependencies = [
  "rand 0.9.4",
  "reqwest 0.12.28",
  "ring",
+ "roaring",
  "rstest",
  "rustls-pki-types",
  "serde",
  "serde_json",
  "sha2 0.10.9",
  "tempfile",
+ "time",
  "tokio",
  "tower",
  "tower-http 0.5.2",
  "url",
+ "uuid",
  "wiremock",
 ]
 
 [[package]]
 name = "lance-namespace-reqwest-client"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be"
+checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d"
 dependencies = [
  "reqwest 0.12.28",
  "serde",
@@ -5074,7 +5002,7 @@ dependencies = [
 
 [[package]]
 name = "lance-select"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -5092,7 +5020,7 @@ dependencies = [
 
 [[package]]
 name = "lance-table"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -5138,16 +5066,16 @@ dependencies = [
 
 [[package]]
 name = "lance-test-macros"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
 name = "lance-testing"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-schema",
@@ -5160,19 +5088,20 @@ dependencies = [
 
 [[package]]
 name = "lance-tokenizer"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "icu_segmenter",
  "jieba-rs",
  "lindera",
  "rust-stemmers",
  "serde",
+ "stop-words",
  "unicode-normalization",
 ]
 
 [[package]]
 name = "lance-tools"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "clap",
  "lance-core",
@@ -5192,12 +5121,6 @@ dependencies = [
  "spin 0.9.8",
 ]
 
-[[package]]
-name = "leb128fmt"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
-
 [[package]]
 name = "lexical-core"
 version = "1.0.6"
@@ -5520,9 +5443,9 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
 
 [[package]]
 name = "memmap2"
@@ -5605,7 +5528,7 @@ dependencies = [
  "cfg-if 1.0.4",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -5647,7 +5570,7 @@ checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -5679,7 +5602,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -5872,7 +5795,7 @@ dependencies = [
  "proc-macro-crate",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6292,9 +6215,9 @@ dependencies = [
 
 [[package]]
 name = "openssl"
-version = "0.10.80"
+version = "0.10.81"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967"
+checksum = "77823a27f0babb03091cb9ed9ef80af3b39dbc82f97e8fa530374b7dafd87a45"
 dependencies = [
  "bitflags 2.13.0",
  "cfg-if 1.0.4",
@@ -6312,7 +6235,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6323,9 +6246,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.116"
+version = "0.9.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4"
+checksum = "b47e7e6bb2c38cd930d25a23b40fa52e068c10e85f3e03a7f5ba5aaca5713695"
 dependencies = [
  "cc",
  "libc",
@@ -6435,26 +6358,19 @@ dependencies = [
  "arrow-schema",
  "arrow-select",
  "base64 0.22.1",
- "brotli",
  "bytes",
  "chrono",
- "flate2",
  "futures",
  "half",
  "hashbrown 0.17.1",
- "lz4_flex",
  "num-bigint",
  "num-integer",
  "num-traits",
- "object_store",
  "paste",
  "seq-macro",
- "simdutf8",
- "snap",
  "thrift",
  "tokio",
  "twox-hash",
- "zstd",
 ]
 
 [[package]]
@@ -6639,7 +6555,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6832,7 +6748,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
 dependencies = [
  "proc-macro2",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6897,7 +6813,7 @@ dependencies = [
  "prost",
  "prost-types",
  "regex",
- "syn 2.0.117",
+ "syn 2.0.118",
  "tempfile",
 ]
 
@@ -6911,7 +6827,7 @@ dependencies = [
  "itertools 0.14.0",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6949,7 +6865,7 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7116,7 +7032,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
 dependencies = [
  "chacha20",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "rand_core 0.10.1",
 ]
 
@@ -7192,19 +7108,6 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
-[[package]]
-name = "random_word"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81"
-dependencies = [
- "ahash",
- "brotli",
- "paste",
- "rand 0.9.4",
- "unicase",
-]
-
 [[package]]
 name = "rangemap"
 version = "1.7.1"
@@ -7294,7 +7197,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7545,7 +7448,7 @@ dependencies = [
  "wasm-bindgen-futures",
  "wasm-streams 0.4.2",
  "web-sys",
- "webpki-roots 1.0.7",
+ "webpki-roots 1.0.8",
 ]
 
 [[package]]
@@ -7652,7 +7555,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7728,7 +7631,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.117",
+ "syn 2.0.118",
  "unicode-ident",
 ]
 
@@ -7965,7 +7868,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde_derive_internals",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8057,7 +7960,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8068,7 +7971,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8103,7 +8006,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8115,7 +8018,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8159,7 +8062,7 @@ dependencies = [
  "darling 0.23.0",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8210,7 +8113,7 @@ checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8349,9 +8252,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
 
 [[package]]
 name = "smallvec"
-version = "1.15.1"
+version = "1.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90"
 
 [[package]]
 name = "snafu"
@@ -8371,15 +8274,9 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
-[[package]]
-name = "snap"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
-
 [[package]]
 name = "socket2"
 version = "0.6.4"
@@ -8468,7 +8365,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8505,6 +8402,15 @@ version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978"
 
+[[package]]
+name = "stop-words"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d"
+dependencies = [
+ "serde_json",
+]
+
 [[package]]
 name = "str_stack"
 version = "0.1.1"
@@ -8551,7 +8457,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8563,7 +8469,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8586,7 +8492,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_yaml",
- "syn 2.0.117",
+ "syn 2.0.118",
  "typify",
  "walkdir",
 ]
@@ -8639,9 +8545,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.117"
+version = "2.0.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -8665,7 +8571,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8722,7 +8628,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
 dependencies = [
  "fastrand",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "once_cell",
  "rustix",
  "windows-sys 0.61.2",
@@ -8753,7 +8659,7 @@ checksum = "c26ef8b00e4d382e59f6a8ddb3cd790b3a5bb29f21a358a9a69ea2f29f13f27b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8762,7 +8668,7 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "944ad38adcbb71eaa682c56bceeb079e4ca82b4b3edc2a0fde5cb297b77dac8d"
 dependencies = [
- "syn 2.0.117",
+ "syn 2.0.118",
  "test-log-core",
 ]
 
@@ -8792,7 +8698,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8803,7 +8709,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8969,7 +8875,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -9201,7 +9107,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -9339,7 +9245,7 @@ dependencies = [
  "semver",
  "serde",
  "serde_json",
- "syn 2.0.117",
+ "syn 2.0.118",
  "thiserror 2.0.18",
  "unicode-ident",
 ]
@@ -9357,7 +9263,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_tokenstream",
- "syn 2.0.117",
+ "syn 2.0.118",
  "typify-impl",
 ]
 
@@ -9415,12 +9321,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
-[[package]]
-name = "unicode-xid"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
-
 [[package]]
 name = "unicode_categories"
 version = "0.1.1"
@@ -9501,7 +9401,7 @@ version = "1.23.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7"
 dependencies = [
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "js-sys",
  "serde_core",
  "wasm-bindgen",
@@ -9576,20 +9476,11 @@ dependencies = [
 
 [[package]]
 name = "wasip2"
-version = "1.0.3+wasi-0.2.9"
+version = "1.0.4+wasi-0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
+checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487"
 dependencies = [
- "wit-bindgen 0.57.1",
-]
-
-[[package]]
-name = "wasip3"
-version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
-dependencies = [
- "wit-bindgen 0.51.0",
+ "wit-bindgen",
 ]
 
 [[package]]
@@ -9603,9 +9494,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563"
+checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a"
 dependencies = [
  "cfg-if 1.0.4",
  "once_cell",
@@ -9616,9 +9507,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.73"
+version = "0.4.75"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf"
+checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -9626,9 +9517,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc"
+checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -9636,48 +9527,26 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b"
+checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd"
 dependencies = [
  "bumpalo",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92"
+checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f"
 dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "wasm-encoder"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
-dependencies = [
- "leb128fmt",
- "wasmparser",
-]
-
-[[package]]
-name = "wasm-metadata"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
-dependencies = [
- "anyhow",
- "indexmap 2.14.0",
- "wasm-encoder",
- "wasmparser",
-]
-
 [[package]]
 name = "wasm-streams"
 version = "0.4.2"
@@ -9704,23 +9573,11 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "wasmparser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
-dependencies = [
- "bitflags 2.13.0",
- "hashbrown 0.15.5",
- "indexmap 2.14.0",
- "semver",
-]
-
 [[package]]
 name = "web-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69"
+checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -9738,9 +9595,9 @@ dependencies = [
 
 [[package]]
 name = "webpki-root-certs"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
+checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -9751,14 +9608,14 @@ version = "0.26.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
 dependencies = [
- "webpki-roots 1.0.7",
+ "webpki-roots 1.0.8",
 ]
 
 [[package]]
 name = "webpki-roots"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -9860,7 +9717,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -9871,7 +9728,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -10125,100 +9982,12 @@ dependencies = [
  "url",
 ]
 
-[[package]]
-name = "wit-bindgen"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
-dependencies = [
- "wit-bindgen-rust-macro",
-]
-
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
 
-[[package]]
-name = "wit-bindgen-core"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
-dependencies = [
- "anyhow",
- "heck",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-bindgen-rust"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
-dependencies = [
- "anyhow",
- "heck",
- "indexmap 2.14.0",
- "prettyplease",
- "syn 2.0.117",
- "wasm-metadata",
- "wit-bindgen-core",
- "wit-component",
-]
-
-[[package]]
-name = "wit-bindgen-rust-macro"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
-dependencies = [
- "anyhow",
- "prettyplease",
- "proc-macro2",
- "quote",
- "syn 2.0.117",
- "wit-bindgen-core",
- "wit-bindgen-rust",
-]
-
-[[package]]
-name = "wit-component"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
-dependencies = [
- "anyhow",
- "bitflags 2.13.0",
- "indexmap 2.14.0",
- "log",
- "serde",
- "serde_derive",
- "serde_json",
- "wasm-encoder",
- "wasm-metadata",
- "wasmparser",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-parser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
-dependencies = [
- "anyhow",
- "id-arena",
- "indexmap 2.14.0",
- "log",
- "semver",
- "serde",
- "serde_derive",
- "serde_json",
- "unicode-xid",
- "wasmparser",
-]
-
 [[package]]
 name = "wkb"
 version = "0.9.2"
@@ -10313,7 +10082,7 @@ dependencies = [
  "csv",
  "futures",
  "futures-util",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "heapify",
  "itertools 0.14.0",
  "lazy_static",
@@ -10443,7 +10212,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "synstructure",
 ]
 
@@ -10464,7 +10233,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -10484,15 +10253,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "synstructure",
 ]
 
 [[package]]
 name = "zeroize"
-version = "1.8.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
 
 [[package]]
 name = "zerotrie"
@@ -10526,15 +10295,9 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
-[[package]]
-name = "zlib-rs"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
-
 [[package]]
 name = "zmij"
 version = "1.0.21"
diff --git a/Cargo.toml b/Cargo.toml
index 1996e2a2d57..f902f10496b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,7 +32,7 @@ resolver = "3"
 
 
 [workspace.package]
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 edition = "2024"
 authors = ["Lance Devs <dev@lance.org>"]
 license = "Apache-2.0"
@@ -57,27 +57,27 @@ rust-version = "1.91.0"
 [workspace.dependencies]
 arc-swap = "1.7"
 libc = "0.2.176"
-lance = { version = "=8.0.0-beta.11", path = "./rust/lance", default-features = false }
-lance-arrow = { version = "=8.0.0-beta.11", path = "./rust/lance-arrow" }
-lance-core = { version = "=8.0.0-beta.11", path = "./rust/lance-core" }
-lance-datafusion = { version = "=8.0.0-beta.11", path = "./rust/lance-datafusion" }
-lance-datagen = { version = "=8.0.0-beta.11", path = "./rust/lance-datagen" }
-lance-derive = { version = "=8.0.0-beta.11", path = "./rust/lance-derive" }
-lance-encoding = { version = "=8.0.0-beta.11", path = "./rust/lance-encoding" }
-lance-file = { version = "=8.0.0-beta.11", path = "./rust/lance-file" }
-lance-geo = { version = "=8.0.0-beta.11", path = "./rust/lance-geo" }
-lance-index = { version = "=8.0.0-beta.11", path = "./rust/lance-index" }
-lance-io = { version = "=8.0.0-beta.11", path = "./rust/lance-io", default-features = false }
-lance-linalg = { version = "=8.0.0-beta.11", path = "./rust/lance-linalg" }
-lance-namespace = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace" }
-lance-namespace-impls = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace-impls" }
+lance = { version = "=8.1.0-beta.0", path = "./rust/lance", default-features = false }
+lance-arrow = { version = "=8.1.0-beta.0", path = "./rust/lance-arrow" }
+lance-core = { version = "=8.1.0-beta.0", path = "./rust/lance-core" }
+lance-datafusion = { version = "=8.1.0-beta.0", path = "./rust/lance-datafusion" }
+lance-datagen = { version = "=8.1.0-beta.0", path = "./rust/lance-datagen" }
+lance-derive = { version = "=8.1.0-beta.0", path = "./rust/lance-derive" }
+lance-encoding = { version = "=8.1.0-beta.0", path = "./rust/lance-encoding" }
+lance-file = { version = "=8.1.0-beta.0", path = "./rust/lance-file" }
+lance-geo = { version = "=8.1.0-beta.0", path = "./rust/lance-geo" }
+lance-index = { version = "=8.1.0-beta.0", path = "./rust/lance-index" }
+lance-io = { version = "=8.1.0-beta.0", path = "./rust/lance-io", default-features = false }
+lance-linalg = { version = "=8.1.0-beta.0", path = "./rust/lance-linalg" }
+lance-namespace = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace" }
+lance-namespace-impls = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace-impls" }
 lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" }
-lance-namespace-reqwest-client = "0.8.4"
-lance-select = { version = "=8.0.0-beta.11", path = "./rust/lance-select" }
-lance-tokenizer = { version = "=8.0.0-beta.11", path = "./rust/lance-tokenizer" }
-lance-table = { version = "=8.0.0-beta.11", path = "./rust/lance-table" }
-lance-test-macros = { version = "=8.0.0-beta.11", path = "./rust/lance-test-macros" }
-lance-testing = { version = "=8.0.0-beta.11", path = "./rust/lance-testing" }
+lance-namespace-reqwest-client = "0.8.6"
+lance-select = { version = "=8.1.0-beta.0", path = "./rust/lance-select" }
+lance-tokenizer = { version = "=8.1.0-beta.0", path = "./rust/lance-tokenizer" }
+lance-table = { version = "=8.1.0-beta.0", path = "./rust/lance-table" }
+lance-test-macros = { version = "=8.1.0-beta.0", path = "./rust/lance-test-macros" }
+lance-testing = { version = "=8.1.0-beta.0", path = "./rust/lance-testing" }
 approx = "0.5.1"
 # Note that this one does not include pyarrow
 arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] }
@@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [
     "num-traits",
     "std",
 ] }
-lance-bitpacking = { version = "=8.0.0-beta.11", path = "./rust/compression/bitpacking" }
+lance-bitpacking = { version = "=8.1.0-beta.0", path = "./rust/compression/bitpacking" }
 bitpacking = "0.9"
 bitvec = "1"
 bytes = "1.11.1"
@@ -133,17 +133,17 @@ datafusion = { version = "53.0.0", default-features = false, features = [
     "unicode_expressions",
 ] }
 datafusion-common = "53.0.0"
-datafusion-functions = { version = "53.0.0", features = ["regex_expressions"] }
+datafusion-functions = { version = "53.0.0", default-features = false, features = ["regex_expressions"] }
 datafusion-sql = "53.0.0"
 datafusion-expr = "53.0.0"
 datafusion-ffi = "53.0.0"
 datafusion-physical-expr = "53.0.0"
 datafusion-physical-plan = "53.0.0"
-datafusion-substrait = "53.0.0"
+datafusion-substrait = { version = "53.0.0", default-features = false }
 dirs = "6.0.0"
 either = "1.0"
 fst = { version = "0.4.7", features = ["levenshtein"] }
-fsst = { version = "=8.0.0-beta.11", path = "./rust/compression/fsst" }
+fsst = { version = "=8.1.0-beta.0", path = "./rust/compression/fsst" }
 futures = "0.3"
 geoarrow-array = "0.8"
 geoarrow-schema = "0.8"
@@ -180,6 +180,7 @@ rand_distr = { version = "0.5.1" }
 rand_xoshiro = "0.7.0"
 rangemap = { version = "1.0" }
 rayon = "1.10"
+regex-syntax = "0.8.10"
 roaring = "0.11.4"
 rstest = "0.26.1"
 serde = { version = "^1" }
diff --git a/ci/create_release_branch.sh b/ci/create_release_branch.sh
index 9c7d9d3e58a..db88f5b6b24 100755
--- a/ci/create_release_branch.sh
+++ b/ci/create_release_branch.sh
@@ -229,9 +229,9 @@ else
     bump-my-version bump -vv --new-version "${RC_VERSION}" --no-tag patch
 
     # Update Cargo.lock files after version bump
-    cargo update
-    (cd python && cargo update)
-    (cd java/lance-jni && cargo update)
+    cargo update --workspace
+    (cd python && cargo update --workspace)
+    (cd java/lance-jni && cargo update --workspace)
 
     # Commit the RC version
     git add -A
@@ -259,9 +259,9 @@ else
     bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch
 
     # Update Cargo.lock files after version bump
-    cargo update
-    (cd python && cargo update)
-    (cd java/lance-jni && cargo update)
+    cargo update --workspace
+    (cd python && cargo update --workspace)
+    (cd java/lance-jni && cargo update --workspace)
 
     git add -A
     git commit -m "chore: bump main to ${NEXT_VERSION}
diff --git a/ci/publish_beta.sh b/ci/publish_beta.sh
index f50798a52e0..06fa5c16a91 100644
--- a/ci/publish_beta.sh
+++ b/ci/publish_beta.sh
@@ -93,9 +93,9 @@ if [[ "${BRANCH}" == "main" ]] && [[ "${CURRENT_VERSION}" =~ -beta\.[0-9]+$ ]];
                 bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch
 
                 # Update Cargo.lock files after version bump
-                cargo update
-                (cd python && cargo update)
-                (cd java/lance-jni && cargo update)
+                cargo update --workspace
+                (cd python && cargo update --workspace)
+                (cd java/lance-jni && cargo update --workspace)
 
                 git add -A
                 git commit -m "chore: bump to ${NEXT_VERSION} based on breaking change detection"
@@ -133,9 +133,9 @@ echo "Bumping beta version"
 bump-my-version bump -vv prerelease_num
 
 # Update Cargo.lock files after version bump
-cargo update
-(cd python && cargo update)
-(cd java/lance-jni && cargo update)
+cargo update --workspace
+(cd python && cargo update --workspace)
+(cd java/lance-jni && cargo update --workspace)
 
 # Get new version
 NEW_VERSION=$(grep '^version = ' Cargo.toml | head -n1 | cut -d'"' -f2)
diff --git a/ci/release_common.sh b/ci/release_common.sh
index cd653212aae..573202d1689 100644
--- a/ci/release_common.sh
+++ b/ci/release_common.sh
@@ -29,9 +29,9 @@ bump_and_commit_version() {
     bump-my-version bump -vv --new-version "${NEW_VERSION}" --no-tag patch
 
     # Update Cargo.lock files after version bump
-    cargo update
-    (cd python && cargo update)
-    (cd java/lance-jni && cargo update)
+    cargo update --workspace
+    (cd python && cargo update --workspace)
+    (cd java/lance-jni && cargo update --workspace)
 
     git add -A
     git commit -m "${COMMIT_MESSAGE}"
diff --git a/docs/src/format/file/encoding.md b/docs/src/format/file/encoding.md
index a3d99ef39cb..4ca053d4fa6 100644
--- a/docs/src/format/file/encoding.md
+++ b/docs/src/format/file/encoding.md
@@ -683,9 +683,10 @@ the default mini-block size is negligible. You should only consider changing thi
 confirmed — through profiling — that mini-block read amplification is saturating your available bandwidth
 (for example, accessing a remote object store over a constrained network link).
 
-The maximum number of values per mini-block can be lowered via an environment variable:
+The maximum number of values per mini-block can be tuned via an environment variable:
 
-- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`): upper bound on the number of values in a single mini-block chunk.
+- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`, maximum `32768`): upper bound on the number of values in a single mini-block chunk.
 
 Reducing this value produces smaller mini-blocks, which reduces the amount of data fetched per read at the
-cost of more mini-blocks and slightly more metadata overhead.
+cost of more mini-blocks and slightly more metadata overhead. Increasing it can reduce metadata overhead and
+improve throughput for highly compressible data, but it may increase random-read amplification.
diff --git a/docs/src/format/index/scalar/ngram.md b/docs/src/format/index/scalar/ngram.md
index bdf78474d50..d437363d264 100644
--- a/docs/src/format/index/scalar/ngram.md
+++ b/docs/src/format/index/scalar/ngram.md
@@ -29,4 +29,10 @@ The N-gram index provides inexact results for the following query types:
 
 | Query Type     | Description              | Operation                                             | Result Type |
 |----------------|--------------------------|-------------------------------------------------------|-------------|
-| **contains**   | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost      |
\ No newline at end of file
+| **contains**   | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost      |
+| **regexp_like** / **regexp_match** | Regular-expression match | Derives a necessary trigram condition from the pattern (AND of intersections, OR of unions), then rechecks the true regex | AtMost |
+| **LIKE** (infix) | Wildcard match such as `%foo%bar%` | Uses the literal segments of the pattern as a trigram condition, then rechecks the LIKE | AtMost |
+
+Patterns from which no trigram can be derived - for example `a.b`, `.*`,
+case-insensitive matches, or literal runs shorter than three characters - fall
+back to rechecking every row. This is always correct, just not accelerated.
diff --git a/docs/src/guide/blob.md b/docs/src/guide/blob.md
index b1f956a19e7..dd13fcaab34 100644
--- a/docs/src/guide/blob.md
+++ b/docs/src/guide/blob.md
@@ -95,6 +95,16 @@ Note:
 - By default, external blob URIs must map to a registered non-dataset-root base path.
 - If you need to reference external objects outside those bases, set
   `allow_external_blob_outside_bases=True` when writing.
+- Blob v2 storage layout thresholds can be configured per column with
+  `blob_field(..., inline_size_threshold=..., dedicated_size_threshold=...)`.
+  The inline threshold controls when values move from the data file to packed
+  `.blob` sidecar storage. The dedicated threshold controls when values move
+  from packed sidecar storage to a dedicated `.blob` file. The dedicated
+  threshold is checked first. For existing columns, these thresholds are stored
+  in the dataset schema; appends that explicitly provide different threshold
+  metadata for the same column are rejected.
+- `blob_pack_file_size_threshold` is a write option for rolling packed `.blob`
+  sidecar files. It does not control inline-vs-packed placement.
 
 ### Example: packed external blobs (single container file)
 
diff --git a/docs/src/guide/object_store.md b/docs/src/guide/object_store.md
index 182b93c0574..f901d2c2411 100644
--- a/docs/src/guide/object_store.md
+++ b/docs/src/guide/object_store.md
@@ -248,3 +248,208 @@ ds = lance.dataset(
 | `tos_access_key_id` | Access key ID used for TOS authentication. Optional if credentials are provided by environment. |
 | `tos_secret_access_key` | Secret access key used for TOS authentication. Optional if credentials are provided by environment. |
 | `tos_security_token` | Security token for temporary credentials. Optional. |
+
+## Tencent Cloud COS Configuration
+
+[COS (Cloud Object Storage)](https://cloud.tencent.com/product/cos) credentials can be set in environment variables prefixed
+with `COS_` or `TENCENTCLOUD_` (for example, `COS_ENDPOINT`, `COS_SECRET_ID`,
+`COS_SECRET_KEY`, `TENCENTCLOUD_REGION`, `TENCENTCLOUD_SECURITY_TOKEN`).
+Alternatively, credentials can be passed as keys in the `storage_options`
+parameter; explicit `storage_options` override environment variables:
+
+=== "Python"
+
+    ```python
+    import lance
+    ds = lance.dataset(
+        "cos://bucket/path",
+        storage_options={
+            "cos_endpoint": "https://cos.ap-guangzhou.myqcloud.com",
+            "cos_secret_id": "my-secret-id",
+            "cos_secret_key": "my-secret-key",
+        }
+    )
+    ```
+
+=== "Rust"
+
+    In this Lance distribution, `tencent` is already part of the **default
+    features** of the `lance` crate, so simply depending on `lance` is enough:
+
+    ```toml
+    [dependencies]
+    lance = "*"
+    ```
+
+    You only need to enable the `tencent` feature explicitly in the following
+    cases:
+
+    - You opted out of default features, e.g.
+      `lance = { version = "*", default-features = false, features = ["tencent", ...] }`.
+    - You depend on `lance-io` directly (without `lance`); `tencent` is **not**
+      a default feature of `lance-io`:
+      `lance-io = { version = "*", features = ["tencent"] }`.
+
+| Key | Description |
+|-----|-------------|
+| `cos_endpoint` | COS endpoint. Required (for example, `https://cos.ap-guangzhou.myqcloud.com`). Can also be set via the `COS_ENDPOINT` environment variable. |
+| `cos_secret_id` | Secret ID used for COS authentication. Optional if credentials are provided by environment. |
+| `cos_secret_key` | Secret key used for COS authentication. Optional if credentials are provided by environment. |
+| `cos_enable_versioning` | Whether to enable object versioning on the bucket. Optional. |
+
+!!! note
+
+    The OpenDAL `CosConfig` currently exposes a limited set of options. Additional
+    settings such as the security token (`TENCENTCLOUD_SECURITY_TOKEN`) and region
+    (`TENCENTCLOUD_REGION`) must be configured via environment variables.
+
+## GooseFS Configuration
+
+[GooseFS](https://cloud.tencent.com/product/goosefs) is a distributed caching
+filesystem. Lance accesses GooseFS through its Master gRPC service. The URL format
+is `goosefs://host:port/path`, where `host:port` is the GooseFS Master address
+(default port: `9200`, may be omitted, e.g. `goosefs://10.0.0.1/path`) and
+`/path` is the filesystem path within GooseFS.
+
+!!! note "About the dataset path"
+
+    `/path` is just an arbitrary directory inside GooseFS — Lance does **not**
+    require the path to end with a `.lance` suffix. Any valid GooseFS directory
+    works, for example:
+
+    - `goosefs://10.0.0.1:9200/data/my-dataset`
+    - `goosefs://10.0.0.1:9200/data/my-dataset.lance`
+    - `goosefs://10.0.0.1:9200/lance-test/lance-io`
+
+    The `.lance` suffix used in the examples below is only a naming convention
+    that makes it easy to recognize a Lance dataset directory at a glance; it
+    has no special meaning to Lance itself. The only requirement is that the
+    same path is used consistently for reads and writes of a given dataset.
+
+=== "Python"
+
+    ```python
+    import lance
+
+    ds = lance.dataset(
+        "goosefs://10.0.0.1:9200/data/my-dataset.lance",
+        storage_options={
+            "goosefs_auth_type": "simple",
+            "goosefs_auth_username": "lance",
+        },
+    )
+    ```
+
+=== "Rust"
+
+    In this Lance distribution, `goosefs` is already part of the **default
+    features** of the `lance` crate, so simply depending on `lance` is enough:
+
+    ```toml
+    [dependencies]
+    lance = "*"
+    ```
+
+    You only need to enable the `goosefs` feature explicitly in the following
+    cases:
+
+    - You opted out of default features, e.g.
+      `lance = { version = "*", default-features = false, features = ["goosefs", ...] }`.
+    - You depend on `lance-io` directly (without `lance`); `goosefs` is **not**
+      a default feature of `lance-io`:
+      `lance-io = { version = "*", features = ["goosefs"] }`.
+
+    Open the underlying `lance_io::object_store::ObjectStore` directly (mirrors
+    the integration test in `rust/lance-io/tests/goosefs_integration.rs`):
+
+    ```rust
+    use lance_io::object_store::ObjectStore;
+
+    let uri = "goosefs://10.0.0.1:9200/lance-test/lance-io";
+    let (store, path) = ObjectStore::from_uri(uri).await?;
+
+    // Read / write through the underlying `object_store::ObjectStore` API
+    store.inner.put(&path, (&b"hello"[..]).into()).await?;
+    let result = store.inner.get(&path).await?;
+    let bytes = result.bytes().await?;
+    ```
+
+    Open a Lance dataset with custom storage options:
+
+    ```rust
+    use std::collections::HashMap;
+    use lance::dataset::DatasetBuilder;
+
+    let mut storage_options = HashMap::new();
+    storage_options.insert("goosefs_master_addr".to_string(), "10.0.0.1:9200".to_string());
+    storage_options.insert("goosefs_auth_type".to_string(), "simple".to_string());
+    storage_options.insert("goosefs_auth_username".to_string(), "lance".to_string());
+
+    let dataset = DatasetBuilder::from_uri("goosefs://10.0.0.1:9200/data/my-dataset.lance")
+        .with_storage_options(storage_options)
+        .load()
+        .await?;
+    ```
+
+=== "Java"
+
+    Pass the GooseFS configuration through `ReadOptions.setStorageOptions`
+    when opening the dataset:
+
+    ```java
+    import org.lance.Dataset;
+    import org.lance.ReadOptions;
+
+    import java.util.HashMap;
+    import java.util.Map;
+
+    Map<String, String> storageOptions = new HashMap<>();
+    storageOptions.put("goosefs_master_addr", "10.0.0.1:9200");
+    storageOptions.put("goosefs_auth_type", "simple");
+    storageOptions.put("goosefs_auth_username", "lance");
+
+    ReadOptions options = new ReadOptions.Builder()
+        .setStorageOptions(storageOptions)
+        .build();
+
+    try (Dataset dataset = Dataset.open()
+            .uri("goosefs://10.0.0.1:9200/data/my-dataset.lance")
+            .readOptions(options)
+            .build()) {
+        // ... use the dataset
+    }
+    ```
+
+    For writes, the same `storageOptions(...)` setter is available on
+    `WriteDatasetBuilder` and `WriteFragmentBuilder`.
+
+The Master address is resolved from the following sources, in priority order:
+
+1. The `goosefs_master_addr` storage option (supports HA: `"addr1:port,addr2:port"`).
+2. The `GOOSEFS_MASTER_ADDR` environment variable.
+3. The host and port from the URL authority.
+
+The following keys can be used either as environment variables or as keys in the
+`storage_options` parameter:
+
+| Key | Description |
+|-----|-------------|
+| `goosefs_master_addr` / `GOOSEFS_MASTER_ADDR` | GooseFS Master address. Supports a single address (`host:port`) or comma-separated HA addresses (`addr1:port,addr2:port`). Optional if the address is provided in the URL. |
+| `goosefs_write_type` / `GOOSEFS_WRITE_TYPE` | Write type, e.g. `MUST_CACHE`, `CACHE_THROUGH`, `THROUGH`, `ASYNC_THROUGH`. Optional. |
+| `goosefs_block_size` / `GOOSEFS_BLOCK_SIZE` | GooseFS block size in bytes (this is the GooseFS-side block size, not Lance's I/O block size). Optional. |
+| `goosefs_chunk_size` / `GOOSEFS_CHUNK_SIZE` | Chunk size in bytes used when reading or writing files. Optional. |
+| `goosefs_auth_type` / `GOOSEFS_AUTH_TYPE` | Authentication type. Either `nosasl` or `simple` (case-insensitive; the value is passed through to OpenDAL). Optional. |
+| `goosefs_auth_username` / `GOOSEFS_AUTH_USERNAME` | Username used in `simple` authentication mode. Optional. |
+
+!!! note "Running the GooseFS integration tests"
+
+    The Rust integration tests for GooseFS live at
+    `rust/lance-io/tests/goosefs_integration.rs` and are gated behind feature
+    flags. They require a reachable GooseFS cluster (configured via the
+    `GOOSEFS_MASTER_ADDR` and `GOOSEFS_AUTH_TYPE` environment variables) and
+    can be run with:
+
+    ```bash
+    cargo test -p lance-io --features "goosefs goosefs-test" \
+        --test goosefs_integration -- --ignored --nocapture --test-threads=1
+    ```
diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock
index f4cfc21ec9c..ee52544ba57 100644
--- a/java/lance-jni/Cargo.lock
+++ b/java/lance-jni/Cargo.lock
@@ -42,21 +42,6 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "alloc-no-stdlib"
-version = "2.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
-
-[[package]]
-name = "alloc-stdlib"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
-dependencies = [
- "alloc-no-stdlib",
-]
-
 [[package]]
 name = "allocator-api2"
 version = "0.2.21"
@@ -944,9 +929,9 @@ dependencies = [
 
 [[package]]
 name = "bitvec"
-version = "1.0.1"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837"
 dependencies = [
  "funty",
  "radium",
@@ -988,9 +973,9 @@ dependencies = [
 
 [[package]]
 name = "block-buffer"
-version = "0.12.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
+checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa"
 dependencies = [
  "hybrid-array",
 ]
@@ -1004,27 +989,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "brotli"
-version = "8.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610"
-dependencies = [
- "alloc-no-stdlib",
- "alloc-stdlib",
- "brotli-decompressor",
-]
-
-[[package]]
-name = "brotli-decompressor"
-version = "5.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924"
-dependencies = [
- "alloc-no-stdlib",
- "alloc-stdlib",
-]
-
 [[package]]
 name = "bs58"
 version = "0.5.1"
@@ -1065,9 +1029,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.11.1"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593"
 
 [[package]]
 name = "bytes-utils"
@@ -1090,9 +1054,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.63"
+version = "1.2.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
+checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -1610,7 +1574,6 @@ dependencies = [
  "datafusion-datasource-arrow",
  "datafusion-datasource-csv",
  "datafusion-datasource-json",
- "datafusion-datasource-parquet",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
@@ -1632,7 +1595,6 @@ dependencies = [
  "log",
  "object_store",
  "parking_lot",
- "parquet",
  "rand 0.9.4",
  "regex",
  "sqlparser",
@@ -1707,7 +1669,6 @@ dependencies = [
  "libc",
  "log",
  "object_store",
- "parquet",
  "paste",
  "sqlparser",
  "tokio",
@@ -1825,36 +1786,6 @@ dependencies = [
  "tokio-stream",
 ]
 
-[[package]]
-name = "datafusion-datasource-parquet"
-version = "53.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997"
-dependencies = [
- "arrow",
- "async-trait",
- "bytes",
- "datafusion-common",
- "datafusion-common-runtime",
- "datafusion-datasource",
- "datafusion-execution",
- "datafusion-expr",
- "datafusion-functions-aggregate-common",
- "datafusion-physical-expr",
- "datafusion-physical-expr-adapter",
- "datafusion-physical-expr-common",
- "datafusion-physical-plan",
- "datafusion-pruning",
- "datafusion-session",
- "futures",
- "itertools 0.14.0",
- "log",
- "object_store",
- "parking_lot",
- "parquet",
- "tokio",
-]
-
 [[package]]
 name = "datafusion-doc"
 version = "53.1.0"
@@ -2298,7 +2229,7 @@ version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
 dependencies = [
- "block-buffer 0.12.0",
+ "block-buffer 0.12.1",
  "const-oid 0.10.2",
  "crypto-common 0.2.2",
  "ctutils",
@@ -2505,7 +2436,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
- "zlib-rs",
 ]
 
 [[package]]
@@ -2549,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
 
 [[package]]
 name = "fsst"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "rand 0.9.4",
@@ -2843,17 +2773,15 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.4.2"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099"
 dependencies = [
  "cfg-if 1.0.4",
  "js-sys",
  "libc",
  "r-efi 6.0.0",
  "rand_core 0.10.1",
- "wasip2",
- "wasip3",
  "wasm-bindgen",
 ]
 
@@ -2921,9 +2849,9 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.4.14"
+version = "0.4.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155"
 dependencies = [
  "atomic-waker",
  "bytes",
@@ -3444,12 +3372,6 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f"
 
-[[package]]
-name = "id-arena"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
-
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -3510,12 +3432,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "integer-encoding"
-version = "3.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
-
 [[package]]
 name = "io-uring"
 version = "0.7.12"
@@ -3701,9 +3617,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162"
+checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31"
 dependencies = [
  "cfg-if 1.0.4",
  "futures-util",
@@ -3723,7 +3639,7 @@ dependencies = [
  "jiff",
  "nom",
  "num-traits",
- "ordered-float 5.3.0",
+ "ordered-float",
  "rand 0.9.4",
  "serde",
  "serde_json",
@@ -3749,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
 
 [[package]]
 name = "lance"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arc-swap",
  "arrow",
@@ -3822,7 +3738,7 @@ dependencies = [
 
 [[package]]
 name = "lance-arrow"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -3864,7 +3780,7 @@ dependencies = [
 
 [[package]]
 name = "lance-bitpacking"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrayref",
  "paste",
@@ -3873,7 +3789,7 @@ dependencies = [
 
 [[package]]
 name = "lance-core"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -3911,7 +3827,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datafusion"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -3943,7 +3859,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datagen"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -3956,12 +3872,11 @@ dependencies = [
  "rand 0.9.4",
  "rand_distr",
  "rand_xoshiro",
- "random_word",
 ]
 
 [[package]]
 name = "lance-derive"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -3970,7 +3885,7 @@ dependencies = [
 
 [[package]]
 name = "lance-encoding"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4005,7 +3920,7 @@ dependencies = [
 
 [[package]]
 name = "lance-file"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4035,7 +3950,7 @@ dependencies = [
 
 [[package]]
 name = "lance-geo"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "datafusion",
  "geo-traits",
@@ -4049,7 +3964,7 @@ dependencies = [
 
 [[package]]
 name = "lance-index"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arc-swap",
  "arrow",
@@ -4104,6 +4019,7 @@ dependencies = [
  "rand_distr",
  "rangemap",
  "rayon",
+ "regex-syntax",
  "roaring",
  "serde",
  "serde_json",
@@ -4116,7 +4032,7 @@ dependencies = [
 
 [[package]]
 name = "lance-io"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4157,7 +4073,7 @@ dependencies = [
 
 [[package]]
 name = "lance-jni"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4193,7 +4109,7 @@ dependencies = [
 
 [[package]]
 name = "lance-linalg"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4208,7 +4124,7 @@ dependencies = [
 
 [[package]]
 name = "lance-namespace"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -4220,7 +4136,7 @@ dependencies = [
 
 [[package]]
 name = "lance-namespace-impls"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-ipc",
@@ -4228,6 +4144,8 @@ dependencies = [
  "async-trait",
  "axum",
  "bytes",
+ "datafusion-common",
+ "datafusion-physical-plan",
  "futures",
  "lance",
  "lance-core",
@@ -4240,19 +4158,22 @@ dependencies = [
  "object_store",
  "rand 0.9.4",
  "reqwest 0.12.28",
+ "roaring",
  "serde",
  "serde_json",
+ "time",
  "tokio",
  "tower",
  "tower-http 0.5.2",
  "url",
+ "uuid",
 ]
 
 [[package]]
 name = "lance-namespace-reqwest-client"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be"
+checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d"
 dependencies = [
  "reqwest 0.12.28",
  "serde",
@@ -4264,7 +4185,7 @@ dependencies = [
 
 [[package]]
 name = "lance-select"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4279,7 +4200,7 @@ dependencies = [
 
 [[package]]
 name = "lance-table"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4316,11 +4237,12 @@ dependencies = [
 
 [[package]]
 name = "lance-tokenizer"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "icu_segmenter",
  "rust-stemmers",
  "serde",
+ "stop-words",
  "unicode-normalization",
 ]
 
@@ -4333,12 +4255,6 @@ dependencies = [
  "spin",
 ]
 
-[[package]]
-name = "leb128fmt"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
-
 [[package]]
 name = "lexical-core"
 version = "1.0.6"
@@ -4571,9 +4487,9 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
 
 [[package]]
 name = "mime"
@@ -5173,15 +5089,6 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
 
-[[package]]
-name = "ordered-float"
-version = "2.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "ordered-float"
 version = "5.3.0"
@@ -5245,42 +5152,6 @@ dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "parquet"
-version = "58.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908"
-dependencies = [
- "ahash",
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-ipc",
- "arrow-schema",
- "arrow-select",
- "base64",
- "brotli",
- "bytes",
- "chrono",
- "flate2",
- "futures",
- "half",
- "hashbrown 0.17.1",
- "lz4_flex",
- "num-bigint",
- "num-integer",
- "num-traits",
- "object_store",
- "paste",
- "seq-macro",
- "simdutf8",
- "snap",
- "thrift",
- "tokio",
- "twox-hash",
- "zstd",
-]
-
 [[package]]
 name = "paste"
 version = "1.0.15"
@@ -5733,7 +5604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
 dependencies = [
  "chacha20",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "rand_core 0.10.1",
 ]
 
@@ -5800,19 +5671,6 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
-[[package]]
-name = "random_word"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81"
-dependencies = [
- "ahash",
- "brotli",
- "paste",
- "rand 0.9.4",
- "unicase",
-]
-
 [[package]]
 name = "rangemap"
 version = "1.7.1"
@@ -6792,9 +6650,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
 
 [[package]]
 name = "smallvec"
-version = "1.15.1"
+version = "1.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90"
 
 [[package]]
 name = "snafu"
@@ -6817,12 +6675,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "snap"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
-
 [[package]]
 name = "socket2"
 version = "0.6.4"
@@ -6916,6 +6768,15 @@ version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978"
 
+[[package]]
+name = "stop-words"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d"
+dependencies = [
+ "serde_json",
+]
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -6983,9 +6844,9 @@ checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a"
 
 [[package]]
 name = "syn"
-version = "2.0.117"
+version = "2.0.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -7066,7 +6927,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
 dependencies = [
  "fastrand",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "once_cell",
  "rustix",
  "windows-sys 0.61.2",
@@ -7130,17 +6991,6 @@ dependencies = [
  "cfg-if 1.0.4",
 ]
 
-[[package]]
-name = "thrift"
-version = "0.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
-dependencies = [
- "byteorder",
- "integer-encoding",
- "ordered-float 2.10.1",
-]
-
 [[package]]
 name = "time"
 version = "0.3.47"
@@ -7616,12 +7466,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
-[[package]]
-name = "unicode-xid"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
-
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
@@ -7676,7 +7520,7 @@ version = "1.23.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7"
 dependencies = [
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "js-sys",
  "serde_core",
  "wasm-bindgen",
@@ -7736,20 +7580,11 @@ dependencies = [
 
 [[package]]
 name = "wasip2"
-version = "1.0.3+wasi-0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
-dependencies = [
- "wit-bindgen 0.57.1",
-]
-
-[[package]]
-name = "wasip3"
-version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+version = "1.0.4+wasi-0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487"
 dependencies = [
- "wit-bindgen 0.51.0",
+ "wit-bindgen",
 ]
 
 [[package]]
@@ -7763,9 +7598,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563"
+checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a"
 dependencies = [
  "cfg-if 1.0.4",
  "once_cell",
@@ -7776,9 +7611,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.73"
+version = "0.4.75"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf"
+checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7786,9 +7621,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc"
+checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -7796,9 +7631,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b"
+checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd"
 dependencies = [
  "bumpalo",
  "proc-macro2",
@@ -7809,35 +7644,13 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92"
+checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f"
 dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "wasm-encoder"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
-dependencies = [
- "leb128fmt",
- "wasmparser",
-]
-
-[[package]]
-name = "wasm-metadata"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
-dependencies = [
- "anyhow",
- "indexmap 2.14.0",
- "wasm-encoder",
- "wasmparser",
-]
-
 [[package]]
 name = "wasm-streams"
 version = "0.4.2"
@@ -7864,23 +7677,11 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "wasmparser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
-dependencies = [
- "bitflags",
- "hashbrown 0.15.5",
- "indexmap 2.14.0",
- "semver",
-]
-
 [[package]]
 name = "web-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69"
+checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7898,18 +7699,18 @@ dependencies = [
 
 [[package]]
 name = "webpki-root-certs"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
+checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267"
 dependencies = [
  "rustls-pki-types",
 ]
 
 [[package]]
 name = "webpki-roots"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -8310,100 +8111,12 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "wit-bindgen"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
-dependencies = [
- "wit-bindgen-rust-macro",
-]
-
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
 
-[[package]]
-name = "wit-bindgen-core"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
-dependencies = [
- "anyhow",
- "heck",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-bindgen-rust"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
-dependencies = [
- "anyhow",
- "heck",
- "indexmap 2.14.0",
- "prettyplease",
- "syn",
- "wasm-metadata",
- "wit-bindgen-core",
- "wit-component",
-]
-
-[[package]]
-name = "wit-bindgen-rust-macro"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
-dependencies = [
- "anyhow",
- "prettyplease",
- "proc-macro2",
- "quote",
- "syn",
- "wit-bindgen-core",
- "wit-bindgen-rust",
-]
-
-[[package]]
-name = "wit-component"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
-dependencies = [
- "anyhow",
- "bitflags",
- "indexmap 2.14.0",
- "log",
- "serde",
- "serde_derive",
- "serde_json",
- "wasm-encoder",
- "wasm-metadata",
- "wasmparser",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-parser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
-dependencies = [
- "anyhow",
- "id-arena",
- "indexmap 2.14.0",
- "log",
- "semver",
- "serde",
- "serde_derive",
- "serde_json",
- "unicode-xid",
- "wasmparser",
-]
-
 [[package]]
 name = "wkb"
 version = "0.9.2"
@@ -8498,7 +8211,7 @@ dependencies = [
  "csv",
  "futures",
  "futures-util",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "heapify",
  "itertools 0.14.0",
  "lazy_static",
@@ -8669,9 +8382,9 @@ dependencies = [
 
 [[package]]
 name = "zeroize"
-version = "1.8.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
 
 [[package]]
 name = "zerotrie"
@@ -8708,12 +8421,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "zlib-rs"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
-
 [[package]]
 name = "zmij"
 version = "1.0.21"
diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml
index f1144423c0d..6210c5daf1d 100644
--- a/java/lance-jni/Cargo.toml
+++ b/java/lance-jni/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lance-jni"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 edition = "2024"
 authors = ["Lance Devs <dev@lance.org>"]
 rust-version = "1.91"
diff --git a/java/lance-jni/src/index.rs b/java/lance-jni/src/index.rs
index 1e533eed9fc..6cb64a05a81 100644
--- a/java/lance-jni/src/index.rs
+++ b/java/lance-jni/src/index.rs
@@ -173,6 +173,8 @@ fn determine_index_type<'local>(
             Some("ZONEMAP")
         } else if lower.contains("bloomfilter") {
             Some("BLOOM_FILTER")
+        } else if lower.contains("rtree") {
+            Some("RTREE")
         } else if lower.contains("ivfhnsw") {
             if lower.contains("sq") {
                 Some("IVF_HNSW_SQ")
diff --git a/java/lance-jni/src/mem_wal.rs b/java/lance-jni/src/mem_wal.rs
index 9ba3fdd7440..20404b6a88b 100644
--- a/java/lance-jni/src/mem_wal.rs
+++ b/java/lance-jni/src/mem_wal.rs
@@ -27,6 +27,7 @@ use jni::sys::{jdouble, jint, jlong};
 use lance::dataset::Dataset as LanceDataset;
 use lance::dataset::mem_wal::scanner::{
     FlushedGeneration, LsmDataSourceCollector, LsmPointLookupPlanner, LsmVectorSearchPlanner,
+    write_pk_sidecar,
 };
 use lance::dataset::mem_wal::write::{MemTableStats, WriteStatsSnapshot};
 use lance::dataset::mem_wal::{
@@ -180,6 +181,42 @@ fn inner_put(env: &mut JNIEnv, this: JObject, stream_addr: jlong) -> Result<()>
     Ok(())
 }
 
+/// Test-support: write a primary-key dedup sidecar (`_pk_index/`) for a flushed-generation
+/// dataset already staged at `gen_path`, mirroring what production flush emits. Lets Java tests
+/// stage a *faithful* flushed generation (dataset + sidecar); production always writes the sidecar
+/// during flush, so a dataset-without-sidecar is not a state the system produces. Mirrors the
+/// Python `_write_pk_sidecar` binding. `stream_addr` is reinterpreted as a raw
+/// `FFI_ArrowArrayStream` pointer without validation; `pk_columns` is read via `get_strings`.
+#[unsafe(no_mangle)]
+pub extern "system" fn Java_org_lance_memwal_MemWalTest_nativeWritePkSidecar(
+    mut env: JNIEnv,
+    _class: JClass,
+    gen_path: JString,
+    stream_addr: jlong,
+    pk_columns: JObject,
+) {
+    ok_or_throw_without_return!(
+        env,
+        inner_write_pk_sidecar(&mut env, gen_path, stream_addr, pk_columns)
+    );
+}
+
+fn inner_write_pk_sidecar(
+    env: &mut JNIEnv,
+    gen_path: JString,
+    stream_addr: jlong, // raw address of an Arrow C stream; cast to a pointer below
+    pk_columns: JObject,
+) -> Result<()> {
+    let gen_path: String = env.get_string(&gen_path)?.into();
+    let pk_columns = env.get_strings(&pk_columns)?;
+    let stream_ptr = stream_addr as *mut FFI_ArrowArrayStream;
+    let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; // SAFETY (unverified): assumes the caller passed the address of a live, exported stream; nothing here checks it
+    let batches: Vec<RecordBatch> = reader.collect::<std::result::Result<_, _>>()?; // drains the whole stream into memory; the first reader error is propagated
+    let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); // borrowed view matching the `&[&str]`-style argument below
+    RT.block_on(write_pk_sidecar(&gen_path, &batches, &pk_refs))?; // drives the async writer to completion on `RT`
+    Ok(())
+}
+
 #[unsafe(no_mangle)]
 pub extern "system" fn Java_org_lance_memwal_ShardWriter_nativeStats<'local>(
     mut env: JNIEnv<'local>,
diff --git a/java/pom.xml b/java/pom.xml
index e5791f8155d..6306ecc63f9 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -7,7 +7,7 @@
     <groupId>org.lance</groupId>
     <artifactId>lance-core</artifactId>
     <name>Lance Core</name>
-    <version>8.0.0-beta.11</version>
+    <version>8.1.0-beta.0</version>
     <packaging>jar</packaging>
 
     <description>Lance Format Java API</description>
diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java
index baece0767a1..32fd5ca7635 100644
--- a/java/src/main/java/org/lance/OpenDatasetBuilder.java
+++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java
@@ -216,8 +216,8 @@ private Dataset buildFromNamespaceClient() {
     // Call describe_table to get location and storage options
     DescribeTableRequest request = new DescribeTableRequest();
     request.setId(tableId);
-    // Only set version if present
-    options.getVersion().ifPresent(v -> request.setVersion(Long.valueOf(v)));
+    // Do not set the dataset version here. Some namespace implementations only support describing
+    // the latest table metadata; the requested version is applied when opening the dataset below.
 
     DescribeTableResponse response = namespaceClient.describeTable(request);
 
diff --git a/java/src/main/java/org/lance/index/IndexType.java b/java/src/main/java/org/lance/index/IndexType.java
index 3a03934effd..1fff86fc7e0 100644
--- a/java/src/main/java/org/lance/index/IndexType.java
+++ b/java/src/main/java/org/lance/index/IndexType.java
@@ -24,6 +24,7 @@ public enum IndexType {
   MEM_WAL(7),
   ZONEMAP(8),
   BLOOM_FILTER(9),
+  RTREE(10),
   VECTOR(100),
   IVF_FLAT(101),
   IVF_SQ(102),
diff --git a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java
index 345a55f20b2..b3408e2d68d 100644
--- a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java
+++ b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java
@@ -31,7 +31,7 @@ private ScalarIndexParams(Builder builder) {
    * Create a new ScalarIndexParams with the given index type and no parameters.
    *
    * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist",
-   *     "ngram")
+   *     "ngram", "rtree")
    * @return ScalarIndexParams
    */
   public static ScalarIndexParams create(String indexType) {
@@ -42,7 +42,7 @@ public static ScalarIndexParams create(String indexType) {
    * Create a new ScalarIndexParams with the given index type and JSON parameters.
    *
    * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist",
-   *     "ngram")
+   *     "ngram", "rtree")
    * @param jsonParams JSON string containing index-specific parameters
    * @return ScalarIndexParams
    */
@@ -58,7 +58,7 @@ public static class Builder {
      * Create a new builder for scalar index parameters.
      *
      * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist",
-     *     "ngram")
+     *     "ngram", "rtree")
      */
     public Builder(String indexType) {
       this.indexType = indexType;
diff --git a/java/src/test/java/org/lance/index/ScalarIndexTest.java b/java/src/test/java/org/lance/index/ScalarIndexTest.java
index b993a7e8a5f..cb090e7c955 100644
--- a/java/src/test/java/org/lance/index/ScalarIndexTest.java
+++ b/java/src/test/java/org/lance/index/ScalarIndexTest.java
@@ -25,14 +25,18 @@
 import org.apache.arrow.c.Data;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
 import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.complex.StructVector;
 import org.apache.arrow.vector.ipc.ArrowReader;
 import org.apache.arrow.vector.ipc.ArrowStreamReader;
 import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -318,4 +322,78 @@ public void testCreateZonemapIndex(@TempDir Path tempDir) throws Exception {
       }
     }
   }
+
+  @Test
+  public void testCreateRTreeIndex(@TempDir Path tempDir) throws Exception {
+    String datasetPath = tempDir.resolve("rtree_test").toString();
+    ArrowType f64 = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE);
+    Field geometryField =
+        new Field(
+            "geometry",
+            new FieldType(
+                true,
+                new ArrowType.Struct(),
+                null,
+                Collections.singletonMap("ARROW:extension:name", "geoarrow.point")),
+            Arrays.asList(Field.notNullable("x", f64), Field.notNullable("y", f64)));
+    Schema schema = new Schema(Collections.singletonList(geometryField), null);
+
+    int rowCount = 3; // row i holds the point (i, 2 * i)
+    try (RootAllocator allocator = new RootAllocator();
+        VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
+      root.allocateNew();
+      StructVector geometry = (StructVector) root.getVector("geometry");
+      Float8Vector x = (Float8Vector) geometry.getChild("x");
+      Float8Vector y = (Float8Vector) geometry.getChild("y");
+      for (int i = 0; i < rowCount; i++) {
+        geometry.setIndexDefined(i); // mark struct slot i as non-null
+        x.setSafe(i, (double) i);
+        y.setSafe(i, i * 2.0);
+      }
+      geometry.setValueCount(rowCount);
+      root.setRowCount(rowCount);
+
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) {
+        writer.start();
+        writer.writeBatch();
+        writer.end();
+      }
+
+      try (ArrowStreamReader reader =
+              new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator);
+          Dataset dataset =
+              Dataset.write()
+                  .reader(reader)
+                  .uri(datasetPath)
+                  .allocator(allocator)
+                  .mode(WriteParams.WriteMode.CREATE)
+                  .execute()) {
+        // The point data round-trips through Lance: row 2 must read back as (2.0, 4.0).
+        assertEquals(rowCount, dataset.countRows());
+        try (ArrowReader scan = dataset.newScan(new ScanOptions.Builder().build()).scanBatches()) {
+          assertTrue(scan.loadNextBatch());
+          StructVector readGeometry =
+              (StructVector) scan.getVectorSchemaRoot().getVector("geometry");
+          assertEquals(2.0, ((Float8Vector) readGeometry.getChild("x")).get(2));
+          assertEquals(4.0, ((Float8Vector) readGeometry.getChild("y")).get(2));
+        }
+
+        // Creating and listing an RTree index via the typed IndexType works end-to-end.
+        Index index =
+            dataset.createIndex(
+                Collections.singletonList("geometry"),
+                IndexType.RTREE,
+                Optional.of("rtree_geometry_index"),
+                IndexParams.builder()
+                    .setScalarIndexParams(ScalarIndexParams.create("rtree"))
+                    .build(),
+                true);
+        assertEquals(IndexType.RTREE, index.indexType());
+        assertTrue(
+            dataset.listIndexes().contains("rtree_geometry_index"),
+            "Expected 'rtree_geometry_index' in: " + dataset.listIndexes());
+      }
+    }
+  }
 }
diff --git a/java/src/test/java/org/lance/memwal/MemWalTest.java b/java/src/test/java/org/lance/memwal/MemWalTest.java
index ee26932dd59..5af3bd3f474 100644
--- a/java/src/test/java/org/lance/memwal/MemWalTest.java
+++ b/java/src/test/java/org/lance/memwal/MemWalTest.java
@@ -50,6 +50,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.UUID;
@@ -142,6 +143,30 @@ private static Dataset writeAppendOnlyDataset(
     }
   }
 
+  /**
+   * Stage a <em>faithful</em> flushed generation at {@code genPath}: the Lance dataset plus its
+   * primary-key dedup sidecar ({@code _pk_index/}), mirroring what production flush emits. The LSM
+   * scanner's cross-generation block-list opens the sidecar, so a dataset alone (no sidecar) is not
+   * a state production produces. Mirrors the Python {@code _write_flushed_gen} test helper.
+   */
+  private static void writeFlushedGen(
+      BufferAllocator allocator, String genPath, long[] ids, String prefix) throws Exception {
+    writeLookupDataset(allocator, genPath, ids, prefix).close(); // stage the dataset
+    try (VectorSchemaRoot root = lookupRoot(allocator, ids, prefix);
+        ArrowReader reader = toReader(allocator, root);
+        ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) {
+      Data.exportArrayStream(allocator, reader, stream); // expose rows to native code
+      nativeWritePkSidecar(genPath, stream.memoryAddress(), Collections.singletonList("id"));
+    }
+  }
+
+  /**
+   * Test-support native: write the primary-key dedup sidecar for a flushed-generation dataset
+   * already staged at {@code genPath}. See {@link #writeFlushedGen}.
+   */
+  private static native void nativeWritePkSidecar(
+      String genPath, long streamAddress, List<String> pkColumns); // ArrowArrayStream address
+
   /** Read an LSM scanner fully into an {@code id -> name} map. */
   private static Map<Long, String> readByName(ArrowReader reader) throws Exception {
     Map<Long, String> byId = new HashMap<>();
@@ -367,7 +392,7 @@ void testLsmScannerFromSnapshots(@TempDir Path tempDir) throws Exception {
 
       // Flushed generation overwrites id=2.
       String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1";
-      writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close();
+      writeFlushedGen(allocator, genPath, new long[] {2}, "gen1");
 
       ShardSnapshot snapshot =
           new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2);
@@ -393,7 +418,7 @@ void testPointLookup(@TempDir Path tempDir) throws Exception {
       dataset.initializeMemWal(new InitializeMemWalParams());
 
       String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1";
-      writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close();
+      writeFlushedGen(allocator, genPath, new long[] {2}, "gen1");
 
       ShardSnapshot snapshot =
           new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2);
diff --git a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java
index f425ddcc4f9..c622bac9fcd 100644
--- a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java
+++ b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java
@@ -189,6 +189,33 @@ void testNamespaceId() {
         "namespaceId should contain 'DirectoryNamespace', got: " + namespaceId);
   }
 
+  @Test
+  void testOpenSpecificVersionDoesNotPassVersionToDescribeTable() throws Exception {
+    VersionRejectingNamespace versionRejectingNamespace =
+        new VersionRejectingNamespace(innerNamespaceClient);
+    namespaceClient = versionRejectingNamespace;
+    List<String> tableId = Arrays.asList("test_table");
+
+    namespaceClient.createTable(new CreateTableRequest().id(tableId), createTestTableData());
+    namespaceClient.insertIntoTable(
+        new InsertIntoTableRequest().id(tableId).mode("append"), createTestTableData());
+
+    try (Dataset versionOne =
+        Dataset.open()
+            .allocator(allocator)
+            .namespaceClient(namespaceClient)
+            .tableId(tableId)
+            .readOptions(new ReadOptions.Builder().setVersion(1L).build())
+            .build()) {
+      assertEquals(1, versionOne.version()); // opened at the requested version 1
+      assertEquals(3, versionOne.countRows()); // presumably just the createTable rows
+    }
+
+    assertTrue(
+        versionRejectingNamespace.getDescribeTableCallCount() > 0,
+        "Expected describeTable to be called when opening through namespace");
+  }
+
   @Test
   void testCreateAndListNamespaces() {
     // Create a namespace
@@ -1439,4 +1466,25 @@ private byte[] createVectorTableData(int numRows, int dim) throws Exception {
       return out.toByteArray();
     }
   }
+
+  private static class VersionRejectingNamespace extends CustomNamespace {
+    private final AtomicInteger describeTableCallCount = new AtomicInteger();
+
+    VersionRejectingNamespace(DirectoryNamespace inner) {
+      super(inner);
+    }
+
+    @Override
+    public DescribeTableResponse describeTable(DescribeTableRequest request) {
+      describeTableCallCount.incrementAndGet(); // count every call, even ones that then fail
+      assertNull( // fail the test if a version reaches describeTable
+          request.getVersion(),
+          "Dataset version should be passed to dataset open, not describeTable");
+      return super.describeTable(request); // defer to CustomNamespace
+    }
+
+    int getDescribeTableCallCount() {
+      return describeTableCallCount.get();
+    }
+  }
 }
diff --git a/memtest/pyproject.toml b/memtest/pyproject.toml
index 396d7c442e0..4418d0e19c8 100644
--- a/memtest/pyproject.toml
+++ b/memtest/pyproject.toml
@@ -7,7 +7,7 @@ name = "lance-memtest"
 version = "0.1.0"
 description = "Memory allocation testing utilities for Python test suites"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 authors = [
     { name = "Lance Developers" }
@@ -17,7 +17,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
diff --git a/python/Cargo.lock b/python/Cargo.lock
index 879195811cf..126714795cc 100644
--- a/python/Cargo.lock
+++ b/python/Cargo.lock
@@ -185,15 +185,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "ar_archive_writer"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348"
-dependencies = [
- "object",
-]
-
 [[package]]
 name = "arc-swap"
 version = "1.9.1"
@@ -517,7 +508,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -528,7 +519,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1068,9 +1059,9 @@ dependencies = [
 
 [[package]]
 name = "bitvec"
-version = "1.0.1"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837"
 dependencies = [
  "funty",
  "radium",
@@ -1112,9 +1103,9 @@ dependencies = [
 
 [[package]]
 name = "block-buffer"
-version = "0.12.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
+checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa"
 dependencies = [
  "hybrid-array",
 ]
@@ -1130,9 +1121,9 @@ dependencies = [
 
 [[package]]
 name = "brotli"
-version = "8.0.3"
+version = "8.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610"
+checksum = "5cc91aac060a7a1e25823bdccbfb6af1875b88f17c6daac97894eed8207166b3"
 dependencies = [
  "alloc-no-stdlib",
  "alloc-stdlib",
@@ -1141,9 +1132,9 @@ dependencies = [
 
 [[package]]
 name = "brotli-decompressor"
-version = "5.0.1"
+version = "5.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924"
+checksum = "3a32acac15fe1967bc3986b2a6347dffc965602354ea6f450ad07e8bfd253583"
 dependencies = [
  "alloc-no-stdlib",
  "alloc-stdlib",
@@ -1195,7 +1186,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1218,9 +1209,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.11.1"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593"
 
 [[package]]
 name = "bytes-utils"
@@ -1232,15 +1223,6 @@ dependencies = [
  "either",
 ]
 
-[[package]]
-name = "bzip2"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c"
-dependencies = [
- "libbz2-rs-sys",
-]
-
 [[package]]
 name = "cbc"
 version = "0.1.2"
@@ -1252,9 +1234,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.63"
+version = "1.2.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
+checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -1365,7 +1347,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1430,13 +1412,9 @@ version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf"
 dependencies = [
- "bzip2",
  "compression-core",
  "flate2",
- "liblzma",
  "memchr",
- "zstd",
- "zstd-safe",
 ]
 
 [[package]]
@@ -1764,7 +1742,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1777,7 +1755,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1788,7 +1766,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
 dependencies = [
  "darling_core 0.20.11",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1799,7 +1777,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
 dependencies = [
  "darling_core 0.23.0",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -1826,7 +1804,6 @@ dependencies = [
  "arrow-schema",
  "async-trait",
  "bytes",
- "bzip2",
  "chrono",
  "datafusion-catalog",
  "datafusion-catalog-listing",
@@ -1836,7 +1813,6 @@ dependencies = [
  "datafusion-datasource-arrow",
  "datafusion-datasource-csv",
  "datafusion-datasource-json",
- "datafusion-datasource-parquet",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
@@ -1853,14 +1829,11 @@ dependencies = [
  "datafusion-physical-plan",
  "datafusion-session",
  "datafusion-sql",
- "flate2",
  "futures",
  "itertools 0.14.0",
- "liblzma",
  "log",
  "object_store",
  "parking_lot",
- "parquet",
  "rand 0.9.4",
  "regex",
  "sqlparser",
@@ -1868,7 +1841,6 @@ dependencies = [
  "tokio",
  "url",
  "uuid",
- "zstd",
 ]
 
 [[package]]
@@ -1938,7 +1910,6 @@ dependencies = [
  "object_store",
  "parquet",
  "paste",
- "recursive",
  "sqlparser",
  "tokio",
  "web-time",
@@ -1962,10 +1933,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6"
 dependencies = [
  "arrow",
- "async-compression",
  "async-trait",
  "bytes",
- "bzip2",
  "chrono",
  "datafusion-common",
  "datafusion-common-runtime",
@@ -1976,18 +1945,14 @@ dependencies = [
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
- "flate2",
  "futures",
  "glob",
  "itertools 0.14.0",
- "liblzma",
  "log",
  "object_store",
  "rand 0.9.4",
  "tokio",
- "tokio-util",
  "url",
- "zstd",
 ]
 
 [[package]]
@@ -2138,7 +2103,6 @@ dependencies = [
  "indexmap 2.14.0",
  "itertools 0.14.0",
  "paste",
- "recursive",
  "serde_json",
  "sqlparser",
 ]
@@ -2330,7 +2294,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd"
 dependencies = [
  "datafusion-doc",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2348,7 +2312,6 @@ dependencies = [
  "indexmap 2.14.0",
  "itertools 0.14.0",
  "log",
- "recursive",
  "regex",
  "regex-syntax",
 ]
@@ -2373,7 +2336,6 @@ dependencies = [
  "parking_lot",
  "paste",
  "petgraph",
- "recursive",
  "tokio",
 ]
 
@@ -2425,7 +2387,6 @@ dependencies = [
  "datafusion-physical-plan",
  "datafusion-pruning",
  "itertools 0.14.0",
- "recursive",
 ]
 
 [[package]]
@@ -2544,7 +2505,6 @@ dependencies = [
  "datafusion-functions-nested",
  "indexmap 2.14.0",
  "log",
- "recursive",
  "regex",
  "sqlparser",
 ]
@@ -2608,7 +2568,7 @@ dependencies = [
  "darling 0.20.11",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2618,7 +2578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
 dependencies = [
  "derive_builder_core",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2639,7 +2599,7 @@ version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
 dependencies = [
- "block-buffer 0.12.0",
+ "block-buffer 0.12.1",
  "const-oid 0.10.2",
  "crypto-common 0.2.2",
  "ctutils",
@@ -2674,7 +2634,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -2899,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
 
 [[package]]
 name = "fsst"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "rand 0.9.4",
@@ -2976,7 +2936,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -3202,17 +3162,15 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.4.2"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099"
 dependencies = [
  "cfg-if 1.0.4",
  "js-sys",
  "libc",
  "r-efi 6.0.0",
  "rand_core 0.10.1",
- "wasip2",
- "wasip3",
  "wasm-bindgen",
 ]
 
@@ -3233,7 +3191,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -3280,9 +3238,9 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.4.14"
+version = "0.4.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155"
 dependencies = [
  "atomic-waker",
  "bytes",
@@ -3803,12 +3761,6 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f"
 
-[[package]]
-name = "id-arena"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
-
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -3979,7 +3931,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4024,7 +3976,7 @@ dependencies = [
  "quote",
  "rustc_version",
  "simd_cesu8",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4043,7 +3995,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
 dependencies = [
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -4058,9 +4010,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162"
+checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31"
 dependencies = [
  "cfg-if 1.0.4",
  "futures-util",
@@ -4115,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
 
 [[package]]
 name = "lance"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arc-swap",
  "arrow",
@@ -4189,7 +4141,7 @@ dependencies = [
 
 [[package]]
 name = "lance-arrow"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4231,7 +4183,7 @@ dependencies = [
 
 [[package]]
 name = "lance-bitpacking"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrayref",
  "paste",
@@ -4240,7 +4192,7 @@ dependencies = [
 
 [[package]]
 name = "lance-core"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4278,7 +4230,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datafusion"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4310,7 +4262,7 @@ dependencies = [
 
 [[package]]
 name = "lance-datagen"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4323,21 +4275,20 @@ dependencies = [
  "rand 0.9.4",
  "rand_distr",
  "rand_xoshiro",
- "random_word",
 ]
 
 [[package]]
 name = "lance-derive"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
 name = "lance-encoding"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4372,7 +4323,7 @@ dependencies = [
 
 [[package]]
 name = "lance-file"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -4402,7 +4353,7 @@ dependencies = [
 
 [[package]]
 name = "lance-geo"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "datafusion",
  "geo-traits",
@@ -4416,7 +4367,7 @@ dependencies = [
 
 [[package]]
 name = "lance-index"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arc-swap",
  "arrow",
@@ -4472,6 +4423,7 @@ dependencies = [
  "rand_distr",
  "rangemap",
  "rayon",
+ "regex-syntax",
  "roaring",
  "serde",
  "serde_json",
@@ -4484,7 +4436,7 @@ dependencies = [
 
 [[package]]
 name = "lance-io"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-arith",
@@ -4525,7 +4477,7 @@ dependencies = [
 
 [[package]]
 name = "lance-linalg"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4536,11 +4488,12 @@ dependencies = [
  "lance-core",
  "num-traits",
  "rand 0.9.4",
+ "rayon",
 ]
 
 [[package]]
 name = "lance-namespace"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -4552,7 +4505,7 @@ dependencies = [
 
 [[package]]
 name = "lance-namespace-impls"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-ipc",
@@ -4560,6 +4513,8 @@ dependencies = [
  "async-trait",
  "axum",
  "bytes",
+ "datafusion-common",
+ "datafusion-physical-plan",
  "futures",
  "lance",
  "lance-core",
@@ -4572,19 +4527,22 @@ dependencies = [
  "object_store",
  "rand 0.9.4",
  "reqwest 0.12.28",
+ "roaring",
  "serde",
  "serde_json",
+ "time",
  "tokio",
  "tower",
  "tower-http 0.5.2",
  "url",
+ "uuid",
 ]
 
 [[package]]
 name = "lance-namespace-reqwest-client"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be"
+checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d"
 dependencies = [
  "reqwest 0.12.28",
  "serde",
@@ -4596,7 +4554,7 @@ dependencies = [
 
 [[package]]
 name = "lance-select"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4611,7 +4569,7 @@ dependencies = [
 
 [[package]]
 name = "lance-table"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4650,13 +4608,14 @@ dependencies = [
 
 [[package]]
 name = "lance-tokenizer"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
  "icu_segmenter",
  "jieba-rs",
  "lindera",
  "rust-stemmers",
  "serde",
+ "stop-words",
  "unicode-normalization",
 ]
 
@@ -4669,12 +4628,6 @@ dependencies = [
  "spin",
 ]
 
-[[package]]
-name = "leb128fmt"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
-
 [[package]]
 name = "lexical-core"
 version = "1.0.6"
@@ -4732,12 +4685,6 @@ dependencies = [
  "lexical-util",
 ]
 
-[[package]]
-name = "libbz2-rs-sys"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c"
-
 [[package]]
 name = "libc"
 version = "0.2.186"
@@ -4754,26 +4701,6 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "liblzma"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899"
-dependencies = [
- "liblzma-sys",
-]
-
-[[package]]
-name = "liblzma-sys"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
-
 [[package]]
 name = "libm"
 version = "0.2.16"
@@ -4997,9 +4924,9 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
 
 [[package]]
 name = "memmap2"
@@ -5096,7 +5023,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -5241,7 +5168,7 @@ dependencies = [
  "proc-macro-crate",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -5272,15 +5199,6 @@ dependencies = [
  "objc2-core-foundation",
 ]
 
-[[package]]
-name = "object"
-version = "0.37.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "object_store"
 version = "0.13.2"
@@ -5927,7 +5845,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6034,7 +5952,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
 dependencies = [
  "proc-macro2",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6080,7 +5998,7 @@ dependencies = [
  "prost",
  "prost-types",
  "regex",
- "syn 2.0.117",
+ "syn 2.0.118",
  "tempfile",
 ]
 
@@ -6094,7 +6012,7 @@ dependencies = [
  "itertools 0.14.0",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6106,16 +6024,6 @@ dependencies = [
  "prost",
 ]
 
-[[package]]
-name = "psm"
-version = "0.1.31"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea"
-dependencies = [
- "ar_archive_writer",
- "cc",
-]
-
 [[package]]
 name = "ptr_meta"
 version = "0.3.1"
@@ -6133,13 +6041,14 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
 name = "pylance"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 dependencies = [
+ "alloc-stdlib",
  "arrow",
  "arrow-array",
  "arrow-cast",
@@ -6228,7 +6137,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6241,7 +6150,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-build-config",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6393,7 +6302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
 dependencies = [
  "chacha20",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "rand_core 0.10.1",
 ]
 
@@ -6460,19 +6369,6 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
-[[package]]
-name = "random_word"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81"
-dependencies = [
- "ahash",
- "brotli",
- "paste",
- "rand 0.9.4",
- "unicase",
-]
-
 [[package]]
 name = "rangemap"
 version = "1.7.1"
@@ -6505,26 +6401,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "recursive"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
-dependencies = [
- "recursive-proc-macro-impl",
- "stacker",
-]
-
-[[package]]
-name = "recursive-proc-macro-impl"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
-dependencies = [
- "quote",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "redb"
 version = "3.1.3"
@@ -6571,7 +6447,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -6920,7 +6796,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7185,7 +7061,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde_derive_internals",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7277,7 +7153,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7288,7 +7164,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7323,7 +7199,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7335,7 +7211,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7379,7 +7255,7 @@ dependencies = [
  "darling 0.23.0",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7533,9 +7409,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
 
 [[package]]
 name = "smallvec"
-version = "1.15.1"
+version = "1.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90"
 
 [[package]]
 name = "snafu"
@@ -7555,7 +7431,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7609,7 +7485,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7"
 dependencies = [
  "log",
- "recursive",
  "sqlparser_derive",
 ]
 
@@ -7621,7 +7496,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7630,19 +7505,6 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
-[[package]]
-name = "stacker"
-version = "0.1.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190"
-dependencies = [
- "cc",
- "cfg-if 1.0.4",
- "libc",
- "psm",
- "windows-sys 0.61.2",
-]
-
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -7671,6 +7533,15 @@ version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978"
 
+[[package]]
+name = "stop-words"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d"
+dependencies = [
+ "serde_json",
+]
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -7705,7 +7576,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7717,7 +7588,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7740,7 +7611,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_yaml",
- "syn 2.0.117",
+ "syn 2.0.118",
  "typify",
  "walkdir",
 ]
@@ -7770,9 +7641,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.117"
+version = "2.0.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -7796,7 +7667,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7859,7 +7730,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
 dependencies = [
  "fastrand",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "once_cell",
  "rustix",
  "windows-sys 0.61.2",
@@ -7891,7 +7762,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -7902,7 +7773,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8025,7 +7896,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8247,7 +8118,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8386,7 +8257,7 @@ dependencies = [
  "semver",
  "serde",
  "serde_json",
- "syn 2.0.117",
+ "syn 2.0.118",
  "thiserror 2.0.18",
  "unicode-ident",
 ]
@@ -8404,7 +8275,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_tokenstream",
- "syn 2.0.117",
+ "syn 2.0.118",
  "typify-impl",
 ]
 
@@ -8447,12 +8318,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
-[[package]]
-name = "unicode-xid"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
-
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
@@ -8507,7 +8372,7 @@ version = "1.23.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7"
 dependencies = [
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "js-sys",
  "serde_core",
  "wasm-bindgen",
@@ -8567,20 +8432,11 @@ dependencies = [
 
 [[package]]
 name = "wasip2"
-version = "1.0.3+wasi-0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
-dependencies = [
- "wit-bindgen 0.57.1",
-]
-
-[[package]]
-name = "wasip3"
-version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+version = "1.0.4+wasi-0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487"
 dependencies = [
- "wit-bindgen 0.51.0",
+ "wit-bindgen",
 ]
 
 [[package]]
@@ -8594,9 +8450,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563"
+checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a"
 dependencies = [
  "cfg-if 1.0.4",
  "once_cell",
@@ -8607,9 +8463,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.73"
+version = "0.4.75"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf"
+checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -8617,9 +8473,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc"
+checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -8627,48 +8483,26 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b"
+checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd"
 dependencies = [
  "bumpalo",
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.123"
+version = "0.2.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92"
+checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f"
 dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "wasm-encoder"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
-dependencies = [
- "leb128fmt",
- "wasmparser",
-]
-
-[[package]]
-name = "wasm-metadata"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
-dependencies = [
- "anyhow",
- "indexmap 2.14.0",
- "wasm-encoder",
- "wasmparser",
-]
-
 [[package]]
 name = "wasm-streams"
 version = "0.4.2"
@@ -8695,23 +8529,11 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "wasmparser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
-dependencies = [
- "bitflags 2.13.0",
- "hashbrown 0.15.5",
- "indexmap 2.14.0",
- "semver",
-]
-
 [[package]]
 name = "web-sys"
-version = "0.3.100"
+version = "0.3.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69"
+checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -8729,18 +8551,18 @@ dependencies = [
 
 [[package]]
 name = "webpki-root-certs"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
+checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267"
 dependencies = [
  "rustls-pki-types",
 ]
 
 [[package]]
 name = "webpki-roots"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -8842,7 +8664,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -8853,7 +8675,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -9075,100 +8897,12 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "wit-bindgen"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
-dependencies = [
- "wit-bindgen-rust-macro",
-]
-
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
 
-[[package]]
-name = "wit-bindgen-core"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
-dependencies = [
- "anyhow",
- "heck",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-bindgen-rust"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
-dependencies = [
- "anyhow",
- "heck",
- "indexmap 2.14.0",
- "prettyplease",
- "syn 2.0.117",
- "wasm-metadata",
- "wit-bindgen-core",
- "wit-component",
-]
-
-[[package]]
-name = "wit-bindgen-rust-macro"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
-dependencies = [
- "anyhow",
- "prettyplease",
- "proc-macro2",
- "quote",
- "syn 2.0.117",
- "wit-bindgen-core",
- "wit-bindgen-rust",
-]
-
-[[package]]
-name = "wit-component"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
-dependencies = [
- "anyhow",
- "bitflags 2.13.0",
- "indexmap 2.14.0",
- "log",
- "serde",
- "serde_derive",
- "serde_json",
- "wasm-encoder",
- "wasm-metadata",
- "wasmparser",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-parser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
-dependencies = [
- "anyhow",
- "id-arena",
- "indexmap 2.14.0",
- "log",
- "semver",
- "serde",
- "serde_derive",
- "serde_json",
- "unicode-xid",
- "wasmparser",
-]
-
 [[package]]
 name = "wkb"
 version = "0.9.2"
@@ -9263,7 +8997,7 @@ dependencies = [
  "csv",
  "futures",
  "futures-util",
- "getrandom 0.4.2",
+ "getrandom 0.4.3",
  "heapify",
  "itertools 0.14.0",
  "lazy_static",
@@ -9387,7 +9121,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "synstructure",
 ]
 
@@ -9408,7 +9142,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
@@ -9428,15 +9162,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
  "synstructure",
 ]
 
 [[package]]
 name = "zeroize"
-version = "1.8.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
 
 [[package]]
 name = "zerotrie"
@@ -9470,7 +9204,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.117",
+ "syn 2.0.118",
 ]
 
 [[package]]
diff --git a/python/Cargo.toml b/python/Cargo.toml
index f7d6280644a..240c046e5ff 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pylance"
-version = "8.0.0-beta.11"
+version = "8.1.0-beta.0"
 edition = "2024"
 authors = ["Lance Devs <dev@lance.org>"]
 license = "Apache-2.0"
@@ -19,9 +19,13 @@ arrow-cast = "58.0.0"
 arrow-data = "58.0.0"
 arrow-schema = "58.0.0"
 object_store = "0.13.2"
-datafusion = "53.0.0"
+datafusion = { version = "53.0.0", default-features = false }
 datafusion-ffi = "53.0.0"
 datafusion-common = "53.0.0"
+# Keep the Python FFI build on the working Brotli allocator resolution until
+# datafusion-ffi no longer enables datafusion-proto/default.
+# See https://github.com/lance-format/lance/issues/7271.
+alloc-stdlib = "=0.2.2"
 async-trait = "0.1"
 chrono = "0.4.42"
 env_logger = "0.11.7"
@@ -56,7 +60,7 @@ prost = "0.14.1"
 prost-types = "0.14.1"
 pyo3 = { version = "0.28", features = [
     "extension-module",
-    "abi3-py39",
+    "abi3-py310",
     "py-clone",
     "chrono",
 ] }
diff --git a/python/pyproject.toml b/python/pyproject.toml
index a1e69855a0f..d863fe38517 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,13 +1,13 @@
 [project]
 name = "pylance"
 dynamic = ["version"]
-dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.0,<0.9"]
+dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.5,<0.9"]
 description = "python wrapper for Lance columnar format"
 authors = [{ name = "Lance Devs", email = "dev@lance.org" }]
 license = { file = "LICENSE" }
 repository = "https://github.com/lancedb/lance"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 keywords = [
     "data-format",
     "data-science",
@@ -30,7 +30,6 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -61,7 +60,7 @@ tests = [
     # Only test tensorflow on linux for now. We will deprecate tensorflow soon.
     "tensorflow; sys_platform == 'linux'",
     "tqdm",
-    "datafusion>=53,<54; python_version >= '3.10'",
+    "datafusion>=53,<54",
 ]
 dev = ["ruff==0.11.2", "pyright"]
 benchmarks = ["pytest-benchmark"]
@@ -74,7 +73,7 @@ geo = [
 [dependency-groups]
 tests = [
     "boto3==1.40.43",
-    "datasets==4.1.1; python_version >= '3.10'",
+    "datasets==4.1.1",
     "duckdb==1.4.0",
     "ml_dtypes==0.5.3",
     "pillow==11.3.0",
@@ -82,9 +81,9 @@ tests = [
     "polars[pyarrow,pandas]==1.34.0",
     "psutil==7.1.0",
     "pytest==8.4.2",
-    "tensorflow==2.20.0; sys_platform == 'linux' and python_version >= '3.10'",
+    "tensorflow==2.20.0; sys_platform == 'linux'",
     "tqdm==4.67.1",
-    "datafusion==53.0.0; python_version >= '3.10'",
+    "datafusion==53.0.0",
 ]
 dev = [
     "maturin==1.13.3",
diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py
index 61076e61687..b4e33338cb1 100644
--- a/python/python/benchmarks/test_search.py
+++ b/python/python/benchmarks/test_search.py
@@ -78,10 +78,12 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset:
         rows_remaining -= next_batch_length
         table = create_table(next_batch_length, offset)
         if offset == 0:
-            dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False)
+            dataset = lance.write_dataset(
+                table, tmp_path, data_storage_version="stable"
+            )
         else:
             dataset = lance.write_dataset(
-                table, tmp_path, mode="append", use_legacy_format=False
+                table, tmp_path, mode="append", data_storage_version="stable"
             )
         offset += next_batch_length
 
@@ -98,7 +100,7 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset:
     dataset.create_scalar_index("category", "BITMAP")
     dataset.create_scalar_index("genres", "LABEL_LIST")
 
-    return lance.dataset(tmp_path, index_cache_size=64 * 1024)
+    return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024)
 
 
 def create_delete_dataset(data_dir):
@@ -113,7 +115,7 @@ def create_delete_dataset(data_dir):
     dataset = lance.dataset(tmp_path)
     dataset.delete("filterable % 2 != 0")
 
-    return lance.dataset(tmp_path, index_cache_size=64 * 1024)
+    return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024)
 
 
 def create_new_rows_dataset(data_dir):
@@ -129,7 +131,7 @@ def create_new_rows_dataset(data_dir):
     table = create_table(NEW_ROWS, offset=NUM_ROWS)
     dataset = lance.write_dataset(table, tmp_path, mode="append")
 
-    return lance.dataset(tmp_path, index_cache_size=64 * 1024)
+    return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024)
 
 
 class Datasets(NamedTuple):
diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py
index f58b169a47a..be99eb05cc5 100644
--- a/python/python/lance/__init__.py
+++ b/python/python/lance/__init__.py
@@ -230,7 +230,9 @@ def dataset(
                 "Both 'namespace_client' and 'table_id' must be provided together."
             )
 
-        request = DescribeTableRequest(id=table_id, version=version)
+        # Resolve the latest table metadata here. The requested dataset version is
+        # applied by the lower-level dataset open path after namespace resolution.
+        request = DescribeTableRequest(id=table_id, version=None)
         response = namespace_client.describe_table(request)
 
         uri = response.location
diff --git a/python/python/lance/blob.py b/python/python/lance/blob.py
index 46faf760cdd..a87c9302736 100644
--- a/python/python/lance/blob.py
+++ b/python/python/lance/blob.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright The Lance Authors
 
+import ctypes
 import io
 from dataclasses import dataclass
 from typing import IO, Any, Iterator, Optional, Union
@@ -9,6 +10,12 @@
 
 from .lance import LanceBlobFile
 
+_BLOB_INLINE_SIZE_THRESHOLD_META_KEY = b"lance-encoding:blob-inline-size-threshold"
+_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY = (
+    b"lance-encoding:blob-dedicated-size-threshold"
+)
+_MAX_RUST_USIZE = ctypes.c_size_t(-1).value
+
 
 @dataclass(frozen=True)
 class Blob:
@@ -190,9 +197,63 @@ def blob_array(values: list[Any]) -> BlobArray:
     return BlobArray.from_pylist(values)
 
 
-def blob_field(name: str, *, nullable: bool = True) -> pa.Field:
-    """Construct an Arrow field for a Lance blob column."""
-    return pa.field(name, BlobType(), nullable=nullable)
+def _validate_threshold(name: str, value: Optional[int], *, allow_zero: bool) -> None:
+    if value is None:
+        return
+    if isinstance(value, bool) or not isinstance(value, int):
+        raise TypeError(f"{name} must be an int, got {type(value).__name__}")
+    if allow_zero:
+        if value < 0:
+            raise ValueError(f"{name} must be non-negative")
+    elif value <= 0:
+        raise ValueError(f"{name} must be positive")
+    if value > _MAX_RUST_USIZE:
+        raise OverflowError(f"{name} must fit in a Rust usize")
+
+
+def blob_field(
+    name: str,
+    *,
+    nullable: bool = True,
+    inline_size_threshold: Optional[int] = None,
+    dedicated_size_threshold: Optional[int] = None,
+) -> pa.Field:
+    """
+    Construct an Arrow field for a Lance blob column.
+
+    Parameters
+    ----------
+    name : str
+        Field name.
+    nullable : bool, default True
+        Whether the blob column accepts null values.
+    inline_size_threshold : optional, int
+        Maximum payload size in bytes to keep inline in the data file before
+        using packed blob storage.
+    dedicated_size_threshold : optional, int
+        Maximum payload size in bytes to store in packed blob storage before
+        using dedicated blob storage. This threshold is checked before
+        ``inline_size_threshold``.
+    """
+    _validate_threshold("inline_size_threshold", inline_size_threshold, allow_zero=True)
+    _validate_threshold(
+        "dedicated_size_threshold", dedicated_size_threshold, allow_zero=False
+    )
+
+    field = pa.field(name, BlobType(), nullable=nullable)
+    if inline_size_threshold is None and dedicated_size_threshold is None:
+        return field
+
+    metadata = dict(field.metadata or {})
+    if inline_size_threshold is not None:
+        metadata[_BLOB_INLINE_SIZE_THRESHOLD_META_KEY] = str(
+            inline_size_threshold
+        ).encode()
+    if dedicated_size_threshold is not None:
+        metadata[_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY] = str(
+            dedicated_size_threshold
+        ).encode()
+    return field.with_metadata(metadata)
 
 
 class BlobIterator:
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index e96d9305ce5..45dc1b253d3 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -950,6 +950,9 @@ def create_branch(
         ds._base_store_params = self._base_store_params
         ds._namespace_client = self._namespace_client
         ds._table_id = self._table_id
+        ds._namespace_client_managed_versioning = (
+            self._namespace_client_managed_versioning
+        )
         ds._default_scan_options = self._default_scan_options
         ds._read_params = self._read_params
         return ds
@@ -1350,7 +1353,10 @@ def data_storage_version(self) -> str:
     @property
     def has_stable_row_ids(self) -> bool:
         """
-        Whether this dataset has stable row IDs enabled
+        Whether this dataset has stable row IDs enabled.
+
+        This is based on the dataset manifest feature flag and does not depend on
+        whether the current version has any fragments.
         """
         return self._ds.has_stable_row_ids
 
@@ -4579,6 +4585,7 @@ def commit_batch(
         ds._base_store_params = base_store_params
         ds._namespace_client = None
         ds._table_id = None
+        ds._namespace_client_managed_versioning = False
         ds._default_scan_options = None
         ds._read_params = None
         return BulkCommitResult(
diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py
index d3d61c5f8ff..6059166d6ba 100644
--- a/python/python/lance/indices/builder.py
+++ b/python/python/lance/indices/builder.py
@@ -150,7 +150,7 @@ def train_ivf(
                 max_iters=max_iters,
             )
             num_dims = ivf_centroids.shape[1]
-            ivf_centroids.shape = -1
+            ivf_centroids = ivf_centroids.reshape(-1)
             flat_centroids_array = pa.array(ivf_centroids)
             centroids_array = pa.FixedSizeListArray.from_arrays(
                 flat_centroids_array, num_dims
diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi
index 38d82738063..26ad75a27b7 100644
--- a/python/python/lance/lance/__init__.pyi
+++ b/python/python/lance/lance/__init__.pyi
@@ -226,6 +226,8 @@ class _Dataset:
     def replace_field_metadata(self, field_name: str, metadata: Dict[str, str]): ...
     @property
     def data_storage_version(self) -> str: ...
+    @property
+    def has_stable_row_ids(self) -> bool: ...
     def index_statistics(self, index_name: str) -> str: ...
     def serialized_manifest(self) -> bytes: ...
     def describe_indices(self) -> List[IndexDescription]: ...
@@ -461,6 +463,27 @@ class _Dataset:
     def get_transactions(
         self, recent_transactions=10
     ) -> List[Optional[Transaction]]: ...
+    def hamming_clustering_for_ivf_partition(
+        self,
+        index_name: str,
+        partition_id: int,
+        hamming_threshold: int,
+    ) -> pa.RecordBatchReader: ...
+    def get_ivf_partition_info(self, index_name: str) -> List[dict]: ...
+    def hamming_clustering_for_sample(
+        self,
+        column: str,
+        sample_size: Optional[int],
+        hamming_threshold: int,
+    ) -> pa.RecordBatchReader: ...
+    def hamming_clustering_for_range(
+        self,
+        column: str,
+        fragment_id: int,
+        start_row: int,
+        num_rows: int,
+        hamming_threshold: int,
+    ) -> pa.RecordBatchReader: ...
 
 class _MergeInsertBuilder:
     def __init__(self, dataset: _Dataset, on: str | Iterable[str]): ...
diff --git a/python/python/lance/lance/optimize.pyi b/python/python/lance/lance/optimize.pyi
index 9a26d23c003..c4b6b6546e6 100644
--- a/python/python/lance/lance/optimize.pyi
+++ b/python/python/lance/lance/optimize.pyi
@@ -12,7 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-from typing import List
+from typing import List, Optional
 
 from lance import LanceDataset
 from lance.fragment import FragmentMetadata
@@ -51,5 +51,7 @@ class Compaction:
     def plan(dataset: "LanceDataset", options: CompactionOptions) -> CompactionPlan: ...
     @staticmethod
     def commit(
-        dataset: "LanceDataset", rewrites: List[RewriteResult]
+        dataset: "LanceDataset",
+        rewrites: List[RewriteResult],
+        options: Optional[CompactionOptions] = None,
     ) -> CompactionMetrics: ...
diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py
index f448e5c3368..fec3a1cfb1e 100644
--- a/python/python/lance/namespace.py
+++ b/python/python/lance/namespace.py
@@ -32,6 +32,8 @@
     CreateMaterializedViewResponse,
     CreateNamespaceRequest,
     CreateNamespaceResponse,
+    CreateTableBranchRequest,
+    CreateTableBranchResponse,
     CreateTableIndexRequest,
     CreateTableIndexResponse,
     CreateTableRequest,
@@ -42,6 +44,8 @@
     DeclareTableResponse,
     DeleteFromTableRequest,
     DeleteFromTableResponse,
+    DeleteTableBranchRequest,
+    DeleteTableBranchResponse,
     DeleteTableTagRequest,
     DeleteTableTagResponse,
     DeregisterTableRequest,
@@ -70,6 +74,8 @@
     LanceNamespace,
     ListNamespacesRequest,
     ListNamespacesResponse,
+    ListTableBranchesRequest,
+    ListTableBranchesResponse,
     ListTableIndicesRequest,
     ListTableIndicesResponse,
     ListTablesRequest,
@@ -850,6 +856,27 @@ def update_table_tag(
         response_dict = self._inner.update_table_tag(request.model_dump())
         return UpdateTableTagResponse.from_dict(response_dict)
 
+    def create_table_branch(
+        self, request: CreateTableBranchRequest
+    ) -> CreateTableBranchResponse:
+        """Create a new branch forked from a table version."""
+        response_dict = self._inner.create_table_branch(request.model_dump())
+        return CreateTableBranchResponse.from_dict(response_dict)
+
+    def list_table_branches(
+        self, request: ListTableBranchesRequest
+    ) -> ListTableBranchesResponse:
+        """List all branches of a table."""
+        response_dict = self._inner.list_table_branches(request.model_dump())
+        return ListTableBranchesResponse.from_dict(response_dict)
+
+    def delete_table_branch(
+        self, request: DeleteTableBranchRequest
+    ) -> DeleteTableBranchResponse:
+        """Delete a branch from a table."""
+        response_dict = self._inner.delete_table_branch(request.model_dump())
+        return DeleteTableBranchResponse.from_dict(response_dict)
+
     # Operation metrics methods
 
     def retrieve_ops_metrics(self) -> Dict[str, int]:
@@ -1420,6 +1447,27 @@ def update_table_tag(
         response_dict = self._inner.update_table_tag(request.model_dump())
         return UpdateTableTagResponse.from_dict(response_dict)
 
+    def create_table_branch(
+        self, request: CreateTableBranchRequest
+    ) -> CreateTableBranchResponse:
+        """Create a new branch forked from a table version."""
+        response_dict = self._inner.create_table_branch(request.model_dump())
+        return CreateTableBranchResponse.from_dict(response_dict)
+
+    def list_table_branches(
+        self, request: ListTableBranchesRequest
+    ) -> ListTableBranchesResponse:
+        """List all branches of a table."""
+        response_dict = self._inner.list_table_branches(request.model_dump())
+        return ListTableBranchesResponse.from_dict(response_dict)
+
+    def delete_table_branch(
+        self, request: DeleteTableBranchRequest
+    ) -> DeleteTableBranchResponse:
+        """Delete a branch from a table."""
+        response_dict = self._inner.delete_table_branch(request.model_dump())
+        return DeleteTableBranchResponse.from_dict(response_dict)
+
     # Operation metrics methods
 
     def retrieve_ops_metrics(self) -> Dict[str, int]:
diff --git a/python/python/lance/optimize.py b/python/python/lance/optimize.py
index 8b98308d442..3ac7547960b 100644
--- a/python/python/lance/optimize.py
+++ b/python/python/lance/optimize.py
@@ -57,6 +57,14 @@ class CompactionOptions(TypedDict):
     The batch size to use when scanning input fragments.  You may want
     to reduce this if you are running out of memory during compaction.
 
+    The default will use the same default from ``scanner``.
+    """
+    io_buffer_size: Optional[int]
+    """
+    The number of bytes to allow to queue up in the I/O buffer when scanning
+    input fragments.  Increasing this can avoid a deadlock that occurs when a
+    single batch of data is larger than the I/O buffer size.
+
     The default will use the same default from ``scanner``.
     """
     compaction_mode: Optional[
diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py
index 34a6154a321..5ce5e8b61e5 100644
--- a/python/python/lance/vector.py
+++ b/python/python/lance/vector.py
@@ -749,3 +749,150 @@ def _partition_and_pq_codes_assignment() -> Iterable[pa.RecordBatch]:
         data_file.path for frag in ds.get_fragments() for data_file in frag.data_files()
     ]
     return dst_dataset_uri, shuffle_buffers
+
+
+# =============================================================================
+# Hamming Distance Clustering
+# =============================================================================
+
+
+def hamming_clustering_for_ivf_partition(
+    dataset: "LanceDataset",
+    index_name: str,
+    partition_id: int,
+    hamming_threshold: int,
+) -> pa.RecordBatchReader:
+    """
+    Perform Hamming-distance clustering on a partition of an IVF_FLAT index.
+
+    Loads a partition from an IVF_FLAT index on a hash column, computes
+    pairwise Hamming distances between all hashes in the partition,
+    filters by threshold, and clusters the results using union-find.
+
+    Parameters
+    ----------
+    dataset : LanceDataset
+        The Lance dataset containing the hash column with an IVF_FLAT index.
+    index_name : str
+        Name of the IVF_FLAT index on the hash column
+    partition_id : int
+        The partition ID within the IVF_FLAT index
+    hamming_threshold : int
+        Maximum Hamming distance at which two hashes are considered similar
+
+    Returns
+    -------
+    pa.RecordBatchReader
+        A reader yielding batches with columns:
+
+        - 'representative': uint64 - The representative row ID for each cluster
+        - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    """
+    return dataset._ds.hamming_clustering_for_ivf_partition(
+        index_name, partition_id, hamming_threshold
+    )
+
+
+def get_ivf_partition_info(
+    dataset: "LanceDataset",
+    index_name: str,
+) -> List[dict]:
+    """
+    Get partition information for an IVF_FLAT index.
+
+    Parameters
+    ----------
+    dataset : LanceDataset
+        The Lance dataset containing the hash column with an IVF_FLAT index.
+    index_name : str
+        Name of the IVF_FLAT index
+
+    Returns
+    -------
+    list[dict]
+        List of partition info dicts with 'partition_id' and 'size'
+    """
+    return dataset._ds.get_ivf_partition_info(index_name)
+
+
+def hamming_clustering_for_sample(
+    dataset: "LanceDataset",
+    column: str,
+    sample_size: Optional[int] = None,
+    hamming_threshold: int = 10,
+) -> pa.RecordBatchReader:
+    """
+    Perform pairwise Hamming-distance clustering on a sample of the dataset.
+
+    Randomly samples rows from the dataset, computes pairwise Hamming distances
+    between all hashes in the sample, filters by threshold, and clusters the
+    results using union-find.
+
+    Parameters
+    ----------
+    dataset : LanceDataset
+        The Lance dataset containing the hash column.
+    column : str
+        Name of the hash column (must be FixedSizeList<UInt8, 8>)
+    sample_size : int, optional
+        Number of rows to sample. If None, uses all rows.
+    hamming_threshold : int, default 10
+        Maximum Hamming distance at which two hashes are considered similar
+
+    Returns
+    -------
+    pa.RecordBatchReader
+        A reader yielding batches with columns:
+
+        - 'representative': uint64 - The representative row ID for each cluster
+        - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    """
+    return dataset._ds.hamming_clustering_for_sample(
+        column, sample_size, hamming_threshold
+    )
+
+
+def hamming_clustering_for_range(
+    dataset: "LanceDataset",
+    column: str,
+    fragment_id: int,
+    start_row: int,
+    num_rows: int,
+    hamming_threshold: int = 10,
+) -> pa.RecordBatchReader:
+    """
+    Perform pairwise Hamming-distance clustering on a contiguous range of rows.
+
+    Reads a contiguous range of rows from a specific fragment, computes pairwise
+    Hamming distances between all hashes in the range, filters by threshold,
+    and clusters the results using union-find.
+
+    Unlike sampling, this reads sequential rows which is useful for distributed
+    processing where each worker handles a specific range of a fragment.
+
+    Parameters
+    ----------
+    dataset : LanceDataset
+        The Lance dataset containing the hash column.
+    column : str
+        Name of the hash column (must be FixedSizeList<UInt8, 8>)
+    fragment_id : int
+        The fragment ID to read from
+    start_row : int
+        The starting row offset within the fragment
+    num_rows : int
+        Number of rows to read from the start position
+    hamming_threshold : int, default 10
+        Maximum Hamming distance at which two hashes are considered similar
+
+    Returns
+    -------
+    pa.RecordBatchReader
+        A reader yielding batches with columns:
+
+        - 'representative': uint64 - The representative row ID for each cluster
+        - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    """
+    return dataset._ds.hamming_clustering_for_range(
+        column, fragment_id, start_row, num_rows, hamming_threshold
+    )
diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py
index 5a896d21c5d..fc879c9cbaa 100644
--- a/python/python/tests/test_blob.py
+++ b/python/python/tests/test_blob.py
@@ -45,6 +45,56 @@ def _external_blob_table(blob_path, payload=b"hello"):
     return pa.table({"blob": lance.blob_array([blob_path.as_uri()])})
 
 
+def _add_columns_blob_v2_values(tmp_path):
+    external_base = tmp_path / "external_base"
+    external_blob = external_base / "external_blob.bin"
+    external_blob.parent.mkdir(parents=True, exist_ok=True)
+    external_blob.write_bytes(b"external")
+
+    payloads = [
+        b"inline",
+        b"p" * (64 * 1024 + 1024),
+        b"d" * (4 * 1024 * 1024 + 1024),
+        b"external",
+    ]
+    values = [payloads[0], payloads[1], payloads[2], external_blob.as_uri()]
+    initial_bases = [DatasetBasePath(external_base.as_uri(), name="external", id=1)]
+    return values, payloads, initial_bases
+
+
+def _assert_blob_v2_add_columns_result(dataset, column, payloads):
+    desc = dataset.to_table(columns=[column]).column(column).chunk(0)
+
+    assert desc.field("kind").to_pylist() == [0, 1, 2, 3]
+    assert desc.field("blob_id").to_pylist()[3] == 1
+    assert desc.field("blob_uri").to_pylist()[3] == "external_blob.bin"
+
+    blobs = dataset.take_blobs(column, indices=range(len(payloads)))
+    assert [blob.readall() for blob in blobs] == payloads
+
+
+def _dataset_file_set(dataset_path):
+    return {
+        path.relative_to(dataset_path)
+        for path in dataset_path.rglob("*")
+        if path.is_file()
+    }
+
+
+def _write_two_fragment_blob_v2_seed_dataset(tmp_path, name):
+    values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path)
+    dataset_path = tmp_path / name
+    ds = lance.write_dataset(
+        pa.table({"id": range(8)}),
+        dataset_path,
+        data_storage_version="2.2",
+        initial_bases=initial_bases,
+        max_rows_per_file=4,
+        max_rows_per_group=4,
+    )
+    return ds, dataset_path, values, payloads
+
+
 def _out_of_order_blob_selection(dataset_with_blobs, selection_kind):
     addresses = _blob_row_addresses(dataset_with_blobs)
     expected = [(addresses[4], b"quux"), (addresses[0], b"foo")]
@@ -533,6 +583,160 @@ def test_blob_extension_write_inline(tmp_path):
         assert f.read() == b"foo"
 
 
+def test_blob_field_threshold_metadata():
+    field = lance.blob_field(
+        "blob",
+        inline_size_threshold=16 * 1024,
+        dedicated_size_threshold=2 * 1024 * 1024,
+    )
+
+    assert field.metadata[b"lance-encoding:blob-inline-size-threshold"] == b"16384"
+    assert field.metadata[b"lance-encoding:blob-dedicated-size-threshold"] == b"2097152"
+
+
+@pytest.mark.parametrize(
+    ("kwargs", "error", "message"),
+    [
+        pytest.param(
+            {"inline_size_threshold": -1},
+            ValueError,
+            "inline_size_threshold must be non-negative",
+            id="negative_inline",
+        ),
+        pytest.param(
+            {"dedicated_size_threshold": 0},
+            ValueError,
+            "dedicated_size_threshold must be positive",
+            id="zero_dedicated",
+        ),
+        pytest.param(
+            {"dedicated_size_threshold": -1},
+            ValueError,
+            "dedicated_size_threshold must be positive",
+            id="negative_dedicated",
+        ),
+        pytest.param(
+            {"inline_size_threshold": True},
+            TypeError,
+            "inline_size_threshold must be an int",
+            id="bool_inline",
+        ),
+        pytest.param(
+            {"dedicated_size_threshold": True},
+            TypeError,
+            "dedicated_size_threshold must be an int",
+            id="bool_dedicated",
+        ),
+        pytest.param(
+            {"inline_size_threshold": 1.5},
+            TypeError,
+            "inline_size_threshold must be an int",
+            id="float_inline",
+        ),
+        pytest.param(
+            {"inline_size_threshold": 2**100},
+            OverflowError,
+            "inline_size_threshold must fit in a Rust usize",
+            id="overflow_inline",
+        ),
+        pytest.param(
+            {"dedicated_size_threshold": 2**100},
+            OverflowError,
+            "dedicated_size_threshold must fit in a Rust usize",
+            id="overflow_dedicated",
+        ),
+    ],
+)
+def test_blob_field_rejects_invalid_thresholds(kwargs, error, message):
+    with pytest.raises(error, match=message):
+        lance.blob_field("blob", **kwargs)
+
+
+def test_blob_extension_inline_threshold_per_column(tmp_path):
+    payload = b"x" * 2048
+    schema = pa.schema(
+        [
+            lance.blob_field("inline_blob", inline_size_threshold=4096),
+            lance.blob_field("packed_blob", inline_size_threshold=1024),
+        ]
+    )
+    table = pa.table(
+        {
+            "inline_blob": lance.blob_array([payload]),
+            "packed_blob": lance.blob_array([payload]),
+        },
+        schema=schema,
+    )
+    ds = lance.write_dataset(
+        table,
+        tmp_path / "test_ds_v2_inline_threshold_per_column",
+        data_storage_version="2.2",
+    )
+
+    desc = ds.to_table(columns=["inline_blob", "packed_blob"])
+    assert desc.column("inline_blob").chunk(0).field("kind").to_pylist() == [0]
+    assert desc.column("packed_blob").chunk(0).field("kind").to_pylist() == [1]
+
+
+def test_blob_extension_threshold_metadata_persists_after_reopen(tmp_path):
+    dataset_path = tmp_path / "test_ds_v2_threshold_metadata_persists"
+    schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)])
+    table = pa.table({"blob": lance.blob_array([b"x"])}, schema=schema)
+
+    lance.write_dataset(table, dataset_path, data_storage_version="2.2")
+    reopened = lance.dataset(dataset_path)
+
+    assert (
+        reopened.schema.field("blob").metadata[
+            b"lance-encoding:blob-inline-size-threshold"
+        ]
+        == b"1024"
+    )
+
+
+def test_blob_extension_append_rejects_explicit_threshold_mismatch(tmp_path):
+    dataset_path = tmp_path / "test_ds_v2_append_threshold_mismatch"
+    initial_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=4096)])
+    initial = pa.table(
+        {"blob": lance.blob_array([b"x" * 2048])},
+        schema=initial_schema,
+    )
+    lance.write_dataset(initial, dataset_path, data_storage_version="2.2")
+
+    append_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)])
+    append = pa.table(
+        {"blob": lance.blob_array([b"x" * 2048])},
+        schema=append_schema,
+    )
+
+    with pytest.raises(
+        OSError, match="Cannot append data with blob threshold metadata"
+    ):
+        lance.write_dataset(append, dataset_path, mode="append")
+
+
+def test_blob_extension_dedicated_threshold_precedes_inline_threshold(tmp_path):
+    payload = b"x" * 2048
+    schema = pa.schema(
+        [
+            lance.blob_field(
+                "blob",
+                inline_size_threshold=4096,
+                dedicated_size_threshold=1024,
+            )
+        ]
+    )
+    table = pa.table({"blob": lance.blob_array([payload])}, schema=schema)
+    ds = lance.write_dataset(
+        table,
+        tmp_path / "test_ds_v2_dedicated_precedes_inline",
+        data_storage_version="2.2",
+    )
+
+    desc = ds.to_table(columns=["blob"]).column("blob").chunk(0)
+    assert desc.field("kind").to_pylist() == [2]
+
+
 def test_blob_extension_write_external(tmp_path):
     blob_path = tmp_path / "external_blob.bin"
     blob_path.write_bytes(b"hello")
@@ -608,6 +812,137 @@ def test_blob_extension_write_external_ingest_rejects_reference_only_options(tmp
         )
 
 
+def test_blob_extension_add_columns_record_batch_reader_all_kinds(tmp_path):
+    values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path)
+    ds = lance.write_dataset(
+        pa.table({"id": range(4)}),
+        tmp_path / "test_add_columns_reader_blob_v2",
+        data_storage_version="2.2",
+        initial_bases=initial_bases,
+    )
+
+    ds.add_columns(pa.table({"blob": lance.blob_array(values)}).to_reader())
+
+    _assert_blob_v2_add_columns_result(ds, "blob", payloads)
+
+
+@pytest.mark.parametrize(
+    "failure_mode",
+    [
+        pytest.param("raises_after_first_fragment", id="reader_raises_mid_stream"),
+        pytest.param("wrong_schema", id="reader_yields_wrong_schema"),
+        pytest.param("too_many_rows", id="reader_produces_too_many_rows"),
+    ],
+)
+def test_blob_extension_add_columns_record_batch_reader_failure_cleans_files(
+    tmp_path,
+    failure_mode,
+):
+    ds, dataset_path, values, payloads = _write_two_fragment_blob_v2_seed_dataset(
+        tmp_path,
+        f"test_add_columns_reader_blob_v2_fail_cleanup_{failure_mode}",
+    )
+    external_blob_path = tmp_path / "external_base" / "external_blob.bin"
+    files_before = _dataset_file_set(dataset_path)
+
+    schema = pa.schema([lance.blob_field("blob")])
+    first_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema)
+    second_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema)
+
+    if failure_mode == "raises_after_first_fragment":
+        match = "reader failed after first fragment"
+
+        def failing_reader():
+            yield first_fragment_batch
+            raise RuntimeError("reader failed after first fragment")
+
+    elif failure_mode == "wrong_schema":
+        match = "field names"
+
+        def failing_reader():
+            yield first_fragment_batch
+            yield pa.record_batch([pa.array(range(4))], ["not_blob"])
+
+    else:
+        match = "Stream produced more values than expected for dataset"
+
+        def failing_reader():
+            yield first_fragment_batch
+            yield second_fragment_batch
+            yield pa.record_batch([lance.blob_array([payloads[0]])], schema=schema)
+
+    with pytest.raises(OSError, match=match):
+        ds.add_columns(failing_reader(), reader_schema=schema)
+
+    assert ds.version == 1
+    assert _dataset_file_set(dataset_path) == files_before
+    assert external_blob_path.exists()
+
+
+def test_blob_extension_add_columns_batch_udf_failure_cleans_files(tmp_path):
+    ds, dataset_path, values, _ = _write_two_fragment_blob_v2_seed_dataset(
+        tmp_path,
+        "test_add_columns_udf_blob_v2_fail_cleanup",
+    )
+    external_blob_path = tmp_path / "external_base" / "external_blob.bin"
+    files_before = _dataset_file_set(dataset_path)
+    call_count = 0
+
+    @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")]))
+    def fail_on_second_fragment(batch):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 2:
+            raise RuntimeError("udf failed after first fragment")
+        blob_values = [values[row.as_py() % len(values)] for row in batch["id"]]
+        return pa.record_batch(
+            [lance.blob_array(blob_values)],
+            ["blob"],
+        )
+
+    with pytest.raises(OSError, match="udf failed after first fragment"):
+        ds.add_columns(fail_on_second_fragment, read_columns=["id"], batch_size=4)
+
+    assert call_count == 2
+    assert ds.version == 1
+    assert _dataset_file_set(dataset_path) == files_before
+    assert external_blob_path.exists()
+
+
+def test_blob_extension_add_columns_batch_udf_all_kinds(tmp_path):
+    values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path)
+    ds = lance.write_dataset(
+        pa.table({"id": range(4)}),
+        tmp_path / "test_add_columns_udf_blob_v2",
+        data_storage_version="2.2",
+        initial_bases=initial_bases,
+    )
+
+    @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")]))
+    def make_blob_column(batch):
+        return pa.record_batch(
+            [lance.blob_array([values[row.as_py()] for row in batch["id"]])],
+            ["blob"],
+        )
+
+    ds.add_columns(make_blob_column, read_columns=["id"])
+
+    _assert_blob_v2_add_columns_result(ds, "blob", payloads)
+
+
+def test_blob_extension_add_columns_all_nulls_blob_v2(tmp_path):
+    ds = lance.write_dataset(
+        pa.table({"id": range(4)}),
+        tmp_path / "test_add_columns_all_nulls_blob_v2",
+        data_storage_version="2.2",
+    )
+
+    ds.add_columns(lance.blob_field("blob"))
+
+    assert ds.to_table(columns=["blob"]).column("blob").to_pylist() == [None] * 4
+    assert ds.take_blobs("blob", indices=range(4)) == []
+
+
 def test_blob_extension_write_fragments_external_denied_by_default(tmp_path):
     blob_path = tmp_path / "external_blob.bin"
 
@@ -1125,6 +1460,38 @@ def test_read_blobs_resolves_nested_field_path(dataset_with_nested_blobs):
     assert [data for _, data in results] == [b"foo", b"baz"]
 
 
+def test_write_nested_blob_v2_and_take_by_field_path(tmp_path):
+    packed = b"x" * (70 * 1024)
+    blob_field = lance.blob_field("blob")
+    info_fields = [pa.field("name", pa.string()), blob_field]
+    info_type = pa.struct(info_fields)
+    info_array = pa.StructArray.from_arrays(
+        [pa.array(["a", "b", "c"]), lance.blob_array([b"foo", packed, None])],
+        fields=info_fields,
+    )
+    table = pa.table(
+        [info_array],
+        schema=pa.schema([pa.field("info", info_type)]),
+    )
+
+    dataset = lance.write_dataset(
+        table,
+        tmp_path / "nested_blob_v2",
+        data_storage_version="2.2",
+    )
+
+    desc = dataset.to_table(columns=["info.blob"]).column("info.blob").chunk(0)
+    assert desc.field("kind").to_pylist()[:2] == [0, 1]
+
+    blobs = dataset.take_blobs("info.blob", indices=[0, 1])
+    with blobs[0] as f:
+        assert f.read() == b"foo"
+    with blobs[1] as f:
+        assert f.read() == packed
+
+    assert dataset.take_blobs("info.blob", indices=[2]) == []
+
+
 def test_to_pandas_returns_blob_files_for_projected_nested_fields(
     dataset_with_nested_blobs,
 ):
diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py
index 4af363868e1..45866f3c4da 100644
--- a/python/python/tests/test_dataset.py
+++ b/python/python/tests/test_dataset.py
@@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path):
     assert dataset.to_table() == table
 
 
+@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"])
+def test_write_zero_dimension_fixed_size_list(
+    tmp_path: Path, data_storage_version: str
+):
+    # Zero-dimension fixed-size lists must be rejected with a clean error
+    # instead of a divide-by-zero panic (#5102)
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field("vec", pa.list_(pa.float32(), 0)),
+        ]
+    )
+    table = pa.table({"id": [1], "vec": [[]]}, schema=schema)
+    with pytest.raises(OSError, match="dimension must be a positive integer"):
+        lance.write_dataset(
+            table, tmp_path / "ds.lance", data_storage_version=data_storage_version
+        )
+
+
 def test_dataset_overwrite(tmp_path: Path):
     table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}])
     base_dir = tmp_path / "test"
@@ -424,16 +443,27 @@ def test_enable_stable_row_ids(tmp_path: Path):
     assert table_after["_rowaddr"][3].as_py() == (2 << 32) + 3
 
 
-def test_has_stable_row_ids_property(tmp_path: Path):
-    table = pa.Table.from_pylist([{"a": 1}, {"a": 2}])
+@pytest.mark.parametrize("enable_stable_row_ids", [True, False])
+@pytest.mark.parametrize(
+    "rows",
+    [[{"a": 1}, {"a": 2}], []],
+    ids=["non_empty", "empty"],
+)
+def test_has_stable_row_ids_property(tmp_path: Path, enable_stable_row_ids: bool, rows):
+    schema = pa.schema([pa.field("a", pa.int64())])
+    table = pa.Table.from_pylist(rows, schema=schema)
 
-    stable_path = tmp_path / "stable"
-    lance.write_dataset(table, stable_path, enable_stable_row_ids=True)
-    assert lance.dataset(stable_path).has_stable_row_ids is True
+    path = tmp_path / f"stable_row_ids_{enable_stable_row_ids}_{len(rows)}"
+    lance.write_dataset(
+        table,
+        path,
+        enable_stable_row_ids=enable_stable_row_ids,
+    )
+    ds = lance.dataset(path)
 
-    non_stable_path = tmp_path / "non_stable"
-    lance.write_dataset(table, non_stable_path, enable_stable_row_ids=False)
-    assert lance.dataset(non_stable_path).has_stable_row_ids is False
+    assert ds.count_rows() == len(rows)
+    assert len(ds.get_fragments()) == (0 if len(rows) == 0 else 1)
+    assert ds.has_stable_row_ids is enable_stable_row_ids
 
 
 def _list_manifests(versions_dir):
@@ -1742,6 +1772,7 @@ def test_commit_batch_append():
     result = lance.LanceDataset.commit_batch(dataset, [txn2, txn3])
     dataset = result["dataset"]
     assert dataset.version == 2
+    assert dataset.checkout_version(1).version == 1
     assert len(dataset.get_fragments()) == 3
     assert dataset.to_table() == pa.concat_tables([data1, data2, data3])
     merged_txn = result["merged"]
@@ -5538,6 +5569,8 @@ def test_branches(tmp_path: Path):
     branch1 = ds_main.create_branch("branch1")
     ds_main.branches.replace_metadata("branch1", {"description": "branch one"})
     assert branch1.version == 1
+    # The dataset returned by create_branch must be fully constructed
+    assert branch1.checkout_version(("main", None)).version == 1
     branch1_append = pa.Table.from_pydict({"a": [7, 8], "b": [9, 10]})
     branch1 = lance.write_dataset(branch1_append, branch1, mode="append")
     assert branch1.version == 2
diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py
index 7f6595f2ecc..02cf64541d6 100644
--- a/python/python/tests/test_indices.py
+++ b/python/python/tests/test_indices.py
@@ -25,7 +25,7 @@
 
 def make_ds(num_rows: int, rows_per_frag: int, tmpdir: pathlib.Path, dtype: str):
     vectors = np.random.randn(num_rows, DIMENSION).astype(dtype)
-    vectors.shape = -1
+    vectors = vectors.reshape(-1)
     vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)
     table = pa.Table.from_arrays([vectors], names=["vectors"])
     uri = str(tmpdir / "dataset")
@@ -53,7 +53,7 @@ def small_rand_dataset(tmpdir, request):
 @pytest.fixture
 def mostly_null_dataset(tmpdir, request):
     vectors = np.random.randn(NUM_ROWS, DIMENSION).astype(np.float32)
-    vectors.shape = -1
+    vectors = vectors.reshape(-1)
     vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)
     vectors = vectors.to_pylist()
     vectors = [vec if i % 10 == 0 else None for i, vec in enumerate(vectors)]
@@ -219,7 +219,7 @@ def test_ivf_centroids_fragment_ids(tmpdir):
         ],
         axis=0,
     )
-    vectors.shape = -1
+    vectors = vectors.reshape(-1)
     table = pa.Table.from_arrays(
         [pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)], names=["vectors"]
     )
diff --git a/python/python/tests/test_mem_wal.py b/python/python/tests/test_mem_wal.py
index b8c859cb637..c21e88b2416 100644
--- a/python/python/tests/test_mem_wal.py
+++ b/python/python/tests/test_mem_wal.py
@@ -60,9 +60,16 @@ def _write_flushed_gen(base_path: str, shard_id: str, gen_folder: str, data: pa.
 
     The collector resolves flushed generation paths as:
         {base_dataset_path}/_mem_wal/{shard_id}/{gen_folder}
+
+    Production flush also writes a primary-key dedup sidecar (`_pk_index/`) that
+    the LSM scanner opens to dedup across generations; stage it here too so the
+    flushed generation faithfully matches what flush produces.
     """
+    from lance.lance import _write_pk_sidecar
+
     gen_path = os.path.join(base_path, "_mem_wal", shard_id, gen_folder)
     lance.write_dataset(data, gen_path, schema=_LOOKUP_SCHEMA)
+    _write_pk_sidecar(gen_path, data, ["id"])
 
 
 def test_point_lookup_with_memtables(tmp_path):
diff --git a/python/python/tests/test_namespace_dir.py b/python/python/tests/test_namespace_dir.py
index 1991b82946e..fa1bc93b422 100644
--- a/python/python/tests/test_namespace_dir.py
+++ b/python/python/tests/test_namespace_dir.py
@@ -29,6 +29,8 @@
     CountTableRowsRequest,
     CreateNamespaceRequest,
     CreateNamespaceResponse,
+    CreateTableBranchRequest,
+    CreateTableBranchResponse,
     CreateTableIndexRequest,
     CreateTableIndexResponse,
     CreateTableRequest,
@@ -37,6 +39,8 @@
     CreateTableVersionResponse,
     DeclareTableRequest,
     DeclareTableResponse,
+    DeleteTableBranchRequest,
+    DeleteTableBranchResponse,
     DeregisterTableRequest,
     DeregisterTableResponse,
     DescribeNamespaceRequest,
@@ -54,6 +58,8 @@
     InsertIntoTableResponse,
     ListNamespacesRequest,
     ListNamespacesResponse,
+    ListTableBranchesRequest,
+    ListTableBranchesResponse,
     ListTableIndicesRequest,
     ListTableIndicesResponse,
     ListTablesRequest,
@@ -71,6 +77,8 @@
     InvalidInputError,
     NamespaceNotEmptyError,
     NamespaceNotFoundError,
+    TableBranchAlreadyExistsError,
+    TableBranchNotFoundError,
     TableNotFoundError,
 )
 
@@ -151,6 +159,21 @@ def create_table_version(
     ) -> CreateTableVersionResponse:
         return self._inner.create_table_version(request)
 
+    def create_table_branch(
+        self, request: CreateTableBranchRequest
+    ) -> CreateTableBranchResponse:
+        return self._inner.create_table_branch(request)
+
+    def list_table_branches(
+        self, request: ListTableBranchesRequest
+    ) -> ListTableBranchesResponse:
+        return self._inner.list_table_branches(request)
+
+    def delete_table_branch(
+        self, request: DeleteTableBranchRequest
+    ) -> DeleteTableBranchResponse:
+        return self._inner.delete_table_branch(request)
+
     def create_table_index(
         self, request: CreateTableIndexRequest
     ) -> CreateTableIndexResponse:
@@ -564,6 +587,110 @@ def test_register_table_rejects_path_traversal(self, temp_ns_client):
         assert "Path traversal is not allowed" in str(exc_info.value)
 
 
+class TestTableBranchOperations:
+    """Branch CRUD through the Python bindings - mirrors the Rust branch
+    CRUD tests."""
+
+    def test_branch_crud_round_trip(self, temp_ns_client):
+        create_ns_req = CreateNamespaceRequest(id=["workspace"])
+        temp_ns_client.create_namespace(create_ns_req)
+        ipc_data = table_to_ipc_bytes(create_test_data())
+        table_id = ["workspace", "branched_table"]
+        temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data)
+
+        temp_ns_client.create_table_branch(
+            CreateTableBranchRequest(id=table_id, name="dev")
+        )
+        listed = temp_ns_client.list_table_branches(
+            ListTableBranchesRequest(id=table_id)
+        )
+        assert "dev" in listed.branches
+        assert listed.branches["dev"].parent_version == 1
+
+        # Duplicate creation and deleting a missing branch surface the typed
+        # branch errors (codes 23 and 22), not InternalError.
+        temp_ns_client.create_table_branch(
+            CreateTableBranchRequest(id=table_id, name="dev2")
+        )
+        with pytest.raises(TableBranchAlreadyExistsError):
+            temp_ns_client.create_table_branch(
+                CreateTableBranchRequest(id=table_id, name="dev2")
+            )
+
+        temp_ns_client.delete_table_branch(
+            DeleteTableBranchRequest(id=table_id, name="dev")
+        )
+        listed = temp_ns_client.list_table_branches(
+            ListTableBranchesRequest(id=table_id)
+        )
+        assert "dev" not in listed.branches
+        with pytest.raises(TableBranchNotFoundError):
+            temp_ns_client.delete_table_branch(
+                DeleteTableBranchRequest(id=table_id, name="dev")
+            )
+
+    def test_create_branch_from_other_branch(self, temp_ns_client):
+        """Forking from a non-main source branch records the right parent."""
+        create_ns_req = CreateNamespaceRequest(id=["workspace"])
+        temp_ns_client.create_namespace(create_ns_req)
+        ipc_data = table_to_ipc_bytes(create_test_data())
+        table_id = ["workspace", "fork_table"]
+        temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data)
+
+        temp_ns_client.create_table_branch(
+            CreateTableBranchRequest(id=table_id, name="dev")
+        )
+        temp_ns_client.create_table_branch(
+            CreateTableBranchRequest(id=table_id, name="child", from_branch="dev")
+        )
+        listed = temp_ns_client.list_table_branches(
+            ListTableBranchesRequest(id=table_id)
+        )
+        assert listed.branches["child"].parent_branch == "dev"
+
+
+class _ForeignCodeError(Exception):
+    """Not a LanceNamespaceError, but carries the same integer code as
+    TABLE_NOT_FOUND."""
+
+    code = 4
+
+
+class _RaisingNamespace(LanceNamespace):
+    """A namespace whose describe_table raises the configured exception."""
+
+    def __init__(self, exc: Exception):
+        self._exc = exc
+
+    def namespace_id(self) -> str:
+        return "raising"
+
+    def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
+        raise self._exc
+
+
+class TestPythonNamespaceErrorMapping:
+    """The Rust adapter must trust the `code` attribute only on the
+    lance_namespace exception hierarchy."""
+
+    def test_namespace_error_identity_preserved(self):
+        ns = _RaisingNamespace(TableNotFoundError("no such table"))
+        with pytest.raises(TableNotFoundError, match="no such table"):
+            lance.dataset(namespace_client=ns, table_id=["t"])
+
+        # Branch error codes (22/23) survive the round trip too.
+        ns = _RaisingNamespace(TableBranchNotFoundError("no such branch"))
+        with pytest.raises(TableBranchNotFoundError, match="no such branch"):
+            lance.dataset(namespace_client=ns, table_id=["t"])
+
+    def test_foreign_code_attribute_not_trusted(self):
+        # The foreign exception must surface as itself, not be reinterpreted
+        # as a namespace error via its `code` attribute.
+        ns = _RaisingNamespace(_ForeignCodeError("boom"))
+        with pytest.raises(_ForeignCodeError, match="boom"):
+            lance.dataset(namespace_client=ns, table_id=["t"])
+
+
 class TestChildNamespaceOperations:
     """Tests for operations in child namespaces - mirrors Rust tests."""
 
@@ -979,6 +1106,49 @@ def test_external_manifest_store_invokes_namespace_apis(use_custom):
         ), "describe_table_version should be called once when opening version 1"
 
 
+def test_dataset_namespace_open_does_not_pass_version_to_describe_table():
+    """Dataset versions are applied at dataset open and never sent to describe_table."""
+
+    class VersionRejectingNamespace(CustomNamespace):
+        def __init__(self, inner: lance.namespace.DirectoryNamespace):
+            super().__init__(inner)
+            self.describe_versions = []  # request.version of each describe_table
+
+        def describe_table(
+            self, request: DescribeTableRequest
+        ) -> DescribeTableResponse:
+            self.describe_versions.append(request.version)
+            assert request.version is None
+            return super().describe_table(request)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        inner_ns_client = lance.namespace.DirectoryNamespace(root=tmpdir)
+        ns_client = VersionRejectingNamespace(inner_ns_client)
+        table_id = ["test_table"]
+
+        table1 = pa.Table.from_pylist([{"a": 1}, {"a": 2}])
+        ds = lance.write_dataset(
+            table1, namespace_client=ns_client, table_id=table_id, mode="create"
+        )
+        assert ds.count_rows() == 2
+        assert ds.version == 1
+
+        table2 = pa.Table.from_pylist([{"a": 3}])
+        ds = lance.write_dataset(
+            table2, namespace_client=ns_client, table_id=table_id, mode="append"
+        )
+        assert ds.count_rows() == 3
+        assert ds.version == 2
+
+        version_one = lance.dataset(
+            namespace_client=ns_client, table_id=table_id, version=1
+        )
+        assert version_one.count_rows() == 2
+        assert version_one.version == 1
+        assert ns_client.describe_versions  # guard: the all() below is not vacuous
+        assert all(version is None for version in ns_client.describe_versions)
+
+
 @pytest.mark.skipif(
     sys.platform == "win32",
     reason="Windows file locking prevents reliable concurrent filesystem operations",
diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py
index 4605b755816..fc08370d247 100644
--- a/python/python/tests/test_namespace_integration.py
+++ b/python/python/tests/test_namespace_integration.py
@@ -31,6 +31,8 @@
 from lance_namespace import (
     CreateNamespaceRequest,
     CreateNamespaceResponse,
+    CreateTableBranchRequest,
+    CreateTableBranchResponse,
     CreateTableRequest,
     CreateTableResponse,
     CreateTableVersionRequest,
@@ -136,6 +138,11 @@ def create_table_version(
     ) -> CreateTableVersionResponse:
         return self._inner.create_table_version(request)
 
+    def create_table_branch(
+        self, request: CreateTableBranchRequest
+    ) -> CreateTableBranchResponse:
+        return self._inner.create_table_branch(request)
+
     def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]:
         return self._inner.retrieve_ops_metrics()
 
@@ -199,6 +206,7 @@ def create_tracking_namespace(
     storage_options: dict,
     credential_expires_in_seconds: int = 60,
     use_custom: bool = False,
+    managed_versioning: bool = False,
 ):
     """Create a DirectoryNamespace with ops metrics and credential vending enabled.
 
@@ -212,6 +220,9 @@ def create_tracking_namespace(
         storage_options: Storage options to pass through (credentials, endpoint, etc.)
         credential_expires_in_seconds: Interval in seconds for credential expiration
         use_custom: If True, wrap in CustomNamespace for testing custom implementations
+        managed_versioning: If True, enable the manifest catalog so table versions
+            are tracked by the namespace and commits route through
+            create_table_version
 
     Returns:
         Tuple of (namespace_client, inner_namespace_client) where inner is always
@@ -238,6 +249,10 @@ def create_tracking_namespace(
     dir_props["vend_input_storage_options_refresh_interval_millis"] = str(
         credential_expires_in_seconds * 1000
     )
+    if managed_versioning:
+        dir_props["manifest_enabled"] = "true"
+        dir_props["table_version_tracking_enabled"] = "true"
+        dir_props["table_version_storage_enabled"] = "true"
 
     inner_ns_client = DirectoryNamespace(**dir_props)
     ns_client = _wrap_if_custom(inner_ns_client, use_custom)
@@ -558,6 +573,87 @@ def test_namespace_write_overwrite_mode(s3_bucket: str, use_custom: bool):
     assert get_describe_call_count(inner_ns_client) == call_count_before_reads
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
+def test_namespace_managed_branches(s3_bucket: str, use_custom: bool):
+    """Branches on a managed-versioning table over S3.
+
+    Branch commits must route through the catalog (create_table_version) and
+    leave main's chain untouched. A cross-branch checkout at an overlapping
+    version number must resolve the requested chain: branch version numbers
+    continue from the fork point, so the same number exists on both chains
+    with different data.
+    """
+    storage_options = copy.deepcopy(CONFIG)
+
+    ns_client, inner_ns_client = create_tracking_namespace(
+        bucket_name=s3_bucket,
+        storage_options=storage_options,
+        credential_expires_in_seconds=3600,
+        use_custom=use_custom,
+        managed_versioning=True,
+    )
+
+    table_name = uuid.uuid4().hex
+    table_id = ["test_ns", table_name]
+
+    def commit_count() -> int:
+        return inner_ns_client.retrieve_ops_metrics().get("create_table_version", 0)
+
+    lance.write_dataset(
+        pa.Table.from_pylist([{"a": 1}]),
+        namespace_client=ns_client,
+        table_id=table_id,
+        mode="create",
+        storage_options=storage_options,
+    )
+    ds = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 2}]),
+        namespace_client=ns_client,
+        table_id=table_id,
+        mode="append",
+        storage_options=storage_options,
+    )
+    assert commit_count() >= 2  # create + append both reached the catalog
+
+    ns_client.create_table_branch(
+        CreateTableBranchRequest(id=table_id, name="dev", from_version=2)
+    )
+
+    dev = ds.checkout_version(("dev", None))
+    commits_before_branch_append = commit_count()
+    dev = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 3}]),
+        dev,
+        mode="append",
+        storage_options=storage_options,
+    )
+    assert commit_count() == commits_before_branch_append + 1  # one catalog commit
+    assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3]
+
+    # Diverge main to the same version number as dev's tip (v3), different data.
+    ds = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 100}]),
+        namespace_client=ns_client,
+        table_id=table_id,
+        mode="append",
+        storage_options=storage_options,
+    )
+    assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100]
+
+    on_dev = ds.checkout_version(("dev", 3))
+    assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3]
+    back_on_main = dev.checkout_version(("main", None))
+    assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100]
+
+    fresh = lance.dataset(
+        namespace_client=ns_client,
+        table_id=table_id,
+        storage_options=storage_options,
+    )
+    assert sorted(fresh.to_table()["a"].to_pylist()) == [1, 2, 100]
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"])
 def test_namespace_distributed_write(s3_bucket: str, use_custom: bool):
diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py
index ccd889db116..049ce2cc3a5 100644
--- a/python/python/tests/test_optimize.py
+++ b/python/python/tests/test_optimize.py
@@ -324,6 +324,47 @@ def test_defer_index_remap(tmp_path: Path):
     assert any(idx.name == "__lance_frag_reuse" for idx in indices)
 
 
+@pytest.mark.parametrize("use_commit_options", [True, False])
+def test_defer_index_remap_via_commit_options(tmp_path: Path, use_commit_options: bool):
+    """Compaction.commit respects defer_index_remap passed in options.
+
+    When options={"defer_index_remap": True} is supplied to Compaction.commit
+    the __lance_frag_reuse system index must appear in describe_indices().
+    When the options argument is omitted, no such system index may be listed.
+    """
+    base_dir = tmp_path / f"dataset_commit_opts_{use_commit_options}"
+    data = pa.table({"i": range(6_000), "val": range(6_000)})
+    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000)
+    dataset.create_scalar_index("i", "BTREE")
+    dataset.delete("i < 500")
+
+    plan = Compaction.plan(
+        dataset,
+        options=dict(target_rows_per_fragment=2_000, num_threads=1),
+    )
+    rewrites = [task.execute(dataset) for task in plan.tasks]
+
+    if use_commit_options:
+        Compaction.commit(dataset, rewrites, options={"defer_index_remap": True})
+    else:
+        Compaction.commit(dataset, rewrites)
+
+    dataset = lance.dataset(base_dir)  # reopen to read the committed state
+    indices = dataset.describe_indices()
+    has_frag_reuse = any(idx.name == "__lance_frag_reuse" for idx in indices)
+
+    if use_commit_options:
+        assert has_frag_reuse, (
+            "expected __lance_frag_reuse system index when defer_index_remap=True "
+            "is passed to Compaction.commit"
+        )
+    else:
+        assert not has_frag_reuse, (
+            "did not expect __lance_frag_reuse system index when options is omitted "
+            "from Compaction.commit"
+        )
+
+
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path):
     """describe_indices() and list_indices() must agree on the index_type
diff --git a/python/python/tests/test_s3_ddb.py b/python/python/tests/test_s3_ddb.py
index b9c9e4be6c0..dc9744115e2 100644
--- a/python/python/tests/test_s3_ddb.py
+++ b/python/python/tests/test_s3_ddb.py
@@ -212,6 +212,58 @@ def writh_dataset_with_start_barrier():
     assert lance.dataset(table_dir).count_rows() == expected_version * 2
 
 
+@pytest.mark.integration
+def test_s3_ddb_branches(s3_bucket: str, ddb_table: str):
+    """Branches on a table committed through the DynamoDB external manifest
+    store.
+
+    The DDB store keys version chains by base uri, so each branch chain must
+    get its own entries via its branch-qualified path. Both chains are given
+    the same version number with diverged data so a wrong-chain resolution
+    cannot pass silently.
+    """
+    storage_options = copy.deepcopy(CONFIG)
+    table_name = uuid.uuid4().hex
+    table_dir = f"s3+ddb://{s3_bucket}/{table_name}?ddbTableName={ddb_table}"
+
+    # main: v1 (a=1), v2 (a=2)
+    lance.write_dataset(
+        pa.Table.from_pylist([{"a": 1}]), table_dir, storage_options=storage_options
+    )
+    ds = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 2}]),
+        table_dir,
+        mode="append",
+        storage_options=storage_options,
+    )
+
+    # Fork "dev" at v2 and append to it (dev v3), then append to main so it
+    # reaches v3 as well, with different data.
+    dev = ds.create_branch("dev", 2)
+    dev = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 3}]),
+        dev,
+        mode="append",
+        storage_options=storage_options,
+    )
+    ds = lance.write_dataset(
+        pa.Table.from_pylist([{"a": 100}]),
+        table_dir,
+        mode="append",
+        storage_options=storage_options,
+    )
+
+    assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3]
+    assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100]
+
+    # Cross-branch checkout at the overlapping version number (3) must return
+    # each chain's own rows, starting from a handle on the other chain.
+    on_dev = ds.checkout_version(("dev", 3))
+    assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3]
+    back_on_main = dev.checkout_version(("main", None))
+    assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100]
+
+
 @pytest.mark.integration
 def test_s3_unsafe(s3_bucket: str):
     storage_options = copy.deepcopy(CONFIG)
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index 7ddfbbc0dc8..b6e882633f5 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -648,7 +648,10 @@ def make_fts_search(ds):
     assert "ScalarIndexQuery" in plan
     assert "MaterializeIndex" not in plan
     assert "FlatMatchQuery" in plan
-    assert "LanceScan" in plan
+    # Flat FTS now reads via FilteredReadExec (prints as `LanceRead`) so the
+    # BTree on `id` pushes into the unindexed-fragment scan too.
+    assert "LanceRead" in plan
+    assert "LanceScan" not in plan
     assert make_fts_search(ds).to_table().num_rows == 12
 
     # Update vector index but NOT scalar index
@@ -868,6 +871,51 @@ def test_fts_custom_stop_words(tmp_path):
     assert len(results["_rowid"].to_pylist()) == 1
 
 
+def test_fts_stop_words_respect_language_for_simple_tokenizer(tmp_path):
+    data = pa.table({"text": ["the lance data", "的 lance data"]})
+    ds = lance.write_dataset(data, tmp_path, mode="overwrite")
+    ds.create_scalar_index(
+        "text",
+        "INVERTED",
+        base_tokenizer="simple",
+        stem=False,
+    )
+
+    results = ds.to_table(full_text_query="the", with_row_id=True)
+    assert results.num_rows == 0  # expected: "the" removed as a stop word
+
+    results = ds.to_table(full_text_query="的", with_row_id=True)
+    assert results["text"].to_pylist() == ["的 lance data"]  # "的" stays searchable
+
+
+def test_fts_icu_stop_words_are_all_or_none(tmp_path):
+    data = pa.table({"text": ["the 的 lance data", "useful data"]})
+    ds = lance.write_dataset(data, tmp_path / "enabled", mode="overwrite")
+    ds.create_scalar_index(
+        "text",
+        "INVERTED",
+        base_tokenizer="icu",
+        stem=False,
+        remove_stop_words=True,  # expect "the" and "的" both unsearchable
+    )
+
+    assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 0
+    assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 0
+    assert ds.to_table(full_text_query="lance", with_row_id=True).num_rows == 1
+
+    ds = lance.write_dataset(data, tmp_path / "disabled", mode="overwrite")
+    ds.create_scalar_index(
+        "text",
+        "INVERTED",
+        base_tokenizer="icu",
+        stem=False,
+        remove_stop_words=False,  # expect "the" and "的" both searchable
+    )
+
+    assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 1
+    assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 1
+
+
 def test_rowid_order(dataset):
     dataset.create_scalar_index("doc", index_type="INVERTED", with_position=False)
     results = dataset.scanner(
diff --git a/python/python/tests/test_vector.py b/python/python/tests/test_vector.py
index c02c8312f88..4ea4e7d425e 100644
--- a/python/python/tests/test_vector.py
+++ b/python/python/tests/test_vector.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pyarrow as pa
 import pytest
-from lance.vector import vec_to_table
+from lance.vector import hamming_clustering_for_sample, vec_to_table
 
 
 def test_dict():
@@ -147,3 +147,38 @@ def test_binary_vectors_invalid_metric(tmp_path):
                 "metric": "l2",
             }
         ).to_table()
+
+
+def _hash_table(hashes):
+    """Build a table with a ``hash`` column of FixedSizeList<UInt8, 8>.
+
+    ``hashes``: one sequence of 8 byte values per row; rows are flattened below.
+    """
+    flat = [byte for row in hashes for byte in row]  # row-major byte stream
+    values = pa.FixedSizeListArray.from_arrays(
+        pa.array(flat, type=pa.uint8()), list_size=8
+    )
+    return pa.Table.from_arrays([values], names=["hash"])
+
+
+def test_hamming_clustering_for_sample(tmp_path):
+    hash_a = [0, 0, 0, 0, 0, 0, 0, 0]
+    hash_b = [255, 0, 0, 0, 0, 0, 0, 0]  # Hamming distance 8 from hash_a
+    hash_c = [1, 2, 3, 4, 5, 6, 7, 8]  # far from both
+    # Rows 0,1,2 share hash_a; rows 3,4 share hash_b; row 5 is unique.
+    table = _hash_table([hash_a, hash_a, hash_a, hash_b, hash_b, hash_c])
+    dataset = lance.write_dataset(table, tmp_path / "hashes")
+
+    # threshold 0 => only exact-match hashes cluster together. Full scan
+    # (sample_size=None) yields deterministic row ids 0..5.
+    result = hamming_clustering_for_sample(dataset, "hash", None, 0).read_all()
+
+    clusters = {
+        rep: sorted(dups)
+        for rep, dups in zip(
+            result["representative"].to_pylist(),
+            result["duplicates"].to_pylist(),
+        )
+    }
+    # Singleton row 5 is not emitted; clusters are keyed by representative row id.
+    assert clusters == {0: [1, 2], 3: [4]}
diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py
index 292b8079706..4e3addfedb8 100644
--- a/python/python/tests/test_vector_index.py
+++ b/python/python/tests/test_vector_index.py
@@ -1772,6 +1772,8 @@ def test_index_cast_centroids(tmp_path):
     values = pa.array([x for arr in centroids for x in arr], pa.float32())
     centroids = pa.FixedSizeListArray.from_arrays(values, 128)
 
+    # Cast invalidates the attached index; drop it first per the new contract.
+    dataset.drop_index(index_name)
     dataset.alter_columns(dict(path="vector", data_type=pa.list_(pa.float16(), 128)))
 
     # centroids are f32, but the column is now f16
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
index 8bfa81aeae4..31eaa96a654 100644
--- a/python/src/dataset.rs
+++ b/python/src/dataset.rs
@@ -3428,6 +3428,188 @@ impl Dataset {
             self.ds.clone(),
         ))
     }
+
+    /// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index.
+    ///
+    /// Loads one partition of an IVF_FLAT index built on a hash column, computes pairwise
+    /// Hamming distances between its hashes, filters by threshold and clusters the results
+    /// using union-find. Errors from the underlying Rust call are raised as ``ValueError``.
+    ///
+    /// Parameters
+    /// ----------
+    /// index_name : str
+    ///     Name of the IVF_FLAT index on the hash column
+    /// partition_id : int
+    ///     The partition ID within the IVF_FLAT index
+    /// hamming_threshold : int
+    ///     Maximum hamming distance to consider as similar
+    ///
+    /// Returns
+    /// -------
+    /// pyarrow.RecordBatchReader
+    ///     A reader yielding batches with columns:
+    ///     - 'representative': uint64 - The representative row ID for each cluster
+    ///     - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    #[pyo3(signature = (index_name, partition_id, hamming_threshold))]
+    fn hamming_clustering_for_ivf_partition(
+        &self,
+        py: Python<'_>,
+        index_name: &str,
+        partition_id: usize,
+        hamming_threshold: u32,
+    ) -> PyResult<PyArrowType<Box<dyn RecordBatchReader + Send>>> {
+        use lance::index::vector::hamming::hamming_clustering_for_ivf_partition;
+
+        let ds = self.ds.as_ref();
+        let reader = rt()
+            .block_on(
+                Some(py),
+                hamming_clustering_for_ivf_partition(
+                    ds,
+                    index_name,
+                    partition_id,
+                    hamming_threshold,
+                ),
+            )?
+            .map_err(|err| PyValueError::new_err(err.to_string()))?;
+
+        Ok(PyArrowType(reader))
+    }
+
+    /// Get partition information for an IVF_FLAT index; failures raise ``ValueError``.
+    ///
+    /// Parameters
+    /// ----------
+    /// index_name : str
+    ///     Name of the IVF_FLAT index
+    ///
+    /// Returns
+    /// -------
+    /// List[dict]
+    ///     One dict per returned partition entry, with keys 'partition_id' and 'size'
+    #[pyo3(signature = (index_name))]
+    fn get_ivf_partition_info(
+        &self,
+        py: Python<'_>,
+        index_name: &str,
+    ) -> PyResult<Vec<Py<PyDict>>> {
+        use lance::index::vector::hamming::get_ivf_partition_info;
+
+        let ds = self.ds.as_ref();
+        let result = rt()
+            .block_on(Some(py), get_ivf_partition_info(ds, index_name))?
+            .map_err(|err| PyValueError::new_err(err.to_string()))?;
+
+        let partitions: PyResult<Vec<_>> = result
+            .iter()
+            .map(|p| {
+                let dict = PyDict::new(py);
+                dict.set_item("partition_id", p.partition_id)?;
+                dict.set_item("size", p.size)?;
+                Ok(dict.into())
+            })
+            .collect();
+
+        partitions
+    }
+
+    /// Perform pairwise hamming distance clustering on sampled rows from a dataset.
+    ///
+    /// Samples rows randomly from the dataset, extracts hashes, computes pairwise Hamming
+    /// distances and clusters the results; useful for benchmarking and testing without an
+    /// IVF index. Errors from the underlying Rust call are raised as ``ValueError``.
+    ///
+    /// Parameters
+    /// ----------
+    /// column : str
+    ///     Name of the hash column (must be FixedSizeList<UInt8, 8>)
+    /// sample_size : int, optional
+    ///     Number of rows to sample (if None or >= total rows, uses all rows)
+    /// hamming_threshold : int
+    ///     Maximum hamming distance to consider as similar
+    ///
+    /// Returns
+    /// -------
+    /// pyarrow.RecordBatchReader
+    ///     A reader yielding batches with columns:
+    ///     - 'representative': uint64 - The representative row ID for each cluster
+    ///     - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    #[pyo3(signature = (column, sample_size, hamming_threshold))]
+    fn hamming_clustering_for_sample(
+        &self,
+        py: Python<'_>,
+        column: &str,
+        sample_size: Option<usize>,
+        hamming_threshold: u32,
+    ) -> PyResult<PyArrowType<Box<dyn RecordBatchReader + Send>>> {
+        use lance::index::vector::hamming::hamming_clustering_for_sample;
+
+        let ds = self.ds.as_ref();
+        let reader = rt()
+            .block_on(
+                Some(py),
+                hamming_clustering_for_sample(ds, column, sample_size, hamming_threshold),
+            )?
+            .map_err(|err| PyValueError::new_err(err.to_string()))?;
+
+        Ok(PyArrowType(reader))
+    }
+
+    /// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment.
+    ///
+    /// Reads a contiguous range of rows from a specific fragment, extracts hashes, computes
+    /// pairwise Hamming distances and clusters the results. Unlike sampling, rows are read
+    /// sequentially, which suits distributed processing where each worker handles one range of
+    /// a fragment. Errors from the underlying Rust call are raised as ``ValueError``.
+    ///
+    /// Parameters
+    /// ----------
+    /// column : str
+    ///     Name of the hash column (must be FixedSizeList<UInt8, 8>)
+    /// fragment_id : int
+    ///     The fragment ID to read from
+    /// start_row : int
+    ///     The starting row offset within the fragment
+    /// num_rows : int
+    ///     Number of rows to read from the start position
+    /// hamming_threshold : int
+    ///     Maximum hamming distance to consider as similar
+    ///
+    /// Returns
+    /// -------
+    /// pyarrow.RecordBatchReader
+    ///     A reader yielding batches with columns:
+    ///     - 'representative': uint64 - The representative row ID for each cluster
+    ///     - 'duplicates': list<uint64> - List of duplicate row IDs in each cluster
+    #[pyo3(signature = (column, fragment_id, start_row, num_rows, hamming_threshold))]
+    fn hamming_clustering_for_range(
+        &self,
+        py: Python<'_>,
+        column: &str,
+        fragment_id: usize,
+        start_row: usize,
+        num_rows: usize,
+        hamming_threshold: u32,
+    ) -> PyResult<PyArrowType<Box<dyn RecordBatchReader + Send>>> {
+        use lance::index::vector::hamming::hamming_clustering_for_range;
+
+        let ds = self.ds.as_ref();
+        let reader = rt()
+            .block_on(
+                Some(py),
+                hamming_clustering_for_range(
+                    ds,
+                    column,
+                    fragment_id,
+                    start_row,
+                    num_rows,
+                    hamming_threshold,
+                ),
+            )?
+            .map_err(|err| PyValueError::new_err(err.to_string()))?;
+
+        Ok(PyArrowType(reader))
+    }
 }
 
 #[pyclass(name = "SqlQuery", module = "_lib", subclass, skip_from_py_object)]
diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs
index 321d7157b86..4bb29246f45 100644
--- a/python/src/dataset/optimize.rs
+++ b/python/src/dataset/optimize.rs
@@ -58,6 +58,9 @@ fn parse_compaction_options(
             "batch_size" => {
                 opts.batch_size = value.extract()?;
             }
+            "io_buffer_size" => {
+                opts.io_buffer_size = value.extract()?;
+            }
             "compaction_mode" => {
                 let mode_str: Option<String> = value.extract()?;
                 if let Some(mode_str) = mode_str {
@@ -551,26 +554,34 @@ impl PyCompaction {
     ///     new version once committed.
     /// rewrites : List[RewriteResult]
     ///     The results of the compaction tasks to include in the commit.
+    /// options : dict, optional
+    ///     Compaction options to apply at commit time.
+    ///     When absent or ``None``, defaults to ``CompactionOptions::default()``.
     ///
     /// Returns
     /// -------
     /// CompactionMetrics
     #[staticmethod]
+    #[pyo3(signature = (dataset, rewrites, options = None))]
     pub fn commit(
         dataset: Bound<PyAny>,
         rewrites: Vec<PyRewriteResult>,
+        options: Option<Bound<PyDict>>,
     ) -> PyResult<PyCompactionMetrics> {
         let dataset_ref = unwrap_dataset(dataset)?;
         let dataset = dataset_ref.borrow().clone();
+        let config = dataset.ds.manifest.config.clone();
+        let opts = match options {
+            Some(ref dict) => parse_compaction_options(dict, &config)?,
+            None => CompactionOptions::default(),
+        };
         let rewrites: Vec<RewriteResult> = rewrites.into_iter().map(|r| r.0).collect();
         let mut new_ds = dataset.ds.as_ref().clone();
-        // TODO: pass compaction option from plan and execute time
-        let options: CompactionOptions = CompactionOptions::default();
         let fut = commit_compaction(
             &mut new_ds,
             rewrites,
             Arc::new(DatasetIndexRemapperOptions::default()),
-            &options,
+            &opts,
         );
         let metrics = rt()
             .block_on(None, fut)?
diff --git a/python/src/lib.rs b/python/src/lib.rs
index cf29b26c46a..3bf4eab221e 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -293,6 +293,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<mem_wal::PyLsmPointLookupPlanner>()?;
     m.add_class::<mem_wal::PyLsmVectorSearchPlanner>()?;
     m.add_wrapped(wrap_pyfunction!(mem_wal::py_evaluate_sharding_spec))?;
+    m.add_wrapped(wrap_pyfunction!(mem_wal::py_write_pk_sidecar))?;
     m.add_wrapped(wrap_pyfunction!(bfloat16_array))?;
     m.add_wrapped(wrap_pyfunction!(write_dataset))?;
     m.add_wrapped(wrap_pyfunction!(write_fragments))?;
diff --git a/python/src/mem_wal.rs b/python/src/mem_wal.rs
index 25127c95ea4..dc9718c0dce 100644
--- a/python/src/mem_wal.rs
+++ b/python/src/mem_wal.rs
@@ -51,6 +51,31 @@ pub fn py_evaluate_sharding_spec<'py>(
     result.to_pyarrow(py)
 }
 
+/// Write a primary-key dedup sidecar (`_pk_index/`) for a flushed-generation
+/// dataset already written at `gen_path`, mirroring what production flush emits.
+///
+/// Test-support only: lets Python tests stage a *faithful* flushed generation; production
+/// always writes the sidecar at flush, so a dataset without one does not otherwise occur.
+/// Raises `ValueError` if `data` is not an Arrow stream, `IOError` on read or write failure.
+#[pyfunction(name = "_write_pk_sidecar", signature = (gen_path, data, pk_columns))]
+pub fn py_write_pk_sidecar(
+    py: Python<'_>,
+    gen_path: String,
+    data: &Bound<'_, PyAny>,
+    pk_columns: Vec<String>,
+) -> PyResult<()> {
+    let reader = ArrowArrayStreamReader::from_pyarrow_bound(data)
+        .map_err(|e| PyValueError::new_err(format!("Cannot read data as Arrow: {}", e)))?;
+    let batches: Vec<RecordBatch> = reader
+        .collect::<Result<_, _>>()
+        .map_err(|e| PyIOError::new_err(format!("Failed to read batches: {}", e)))?;
+    rt().block_on(Some(py), async move {
+        let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect();
+        lance::dataset::mem_wal::scanner::write_pk_sidecar(&gen_path, &batches, &pk_refs).await
+    })?
+    .map_err(|e: lance::Error| PyIOError::new_err(e.to_string()))
+}
+
 fn sharding_spec_from_py(spec: &Bound<'_, PyAny>) -> PyResult<ShardingSpec> {
     let spec_id = get_py_value(spec, "spec_id")?.extract::<u32>()?;
     let fields_obj = get_py_value(spec, "fields")?;
diff --git a/python/src/namespace.rs b/python/src/namespace.rs
index cf5f7c41b0f..e88ff40de2c 100644
--- a/python/src/namespace.rs
+++ b/python/src/namespace.rs
@@ -392,6 +392,44 @@ impl PyDirectoryNamespace {
         pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
     }
 
+    // Table branch operations
+
+    fn create_table_branch<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.create_table_branch(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
+    fn list_table_branches<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.list_table_branches(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
+    fn delete_table_branch<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.delete_table_branch(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
     // Data manipulation operations
 
     fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<i64> {
@@ -1054,6 +1092,44 @@ impl PyRestNamespace {
         pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
     }
 
+    // Table branch operations
+
+    fn create_table_branch<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.create_table_branch(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
+    fn list_table_branches<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.list_table_branches(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
+    fn delete_table_branch<'py>(
+        &self,
+        py: Python<'py>,
+        request: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let request = depythonize(request)?;
+        let response = crate::rt()
+            .block_on(Some(py), self.inner.delete_table_branch(request))?
+            .infer_error()?;
+        pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
+    }
+
     // Data manipulation operations
 
     fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<i64> {
@@ -1472,6 +1548,30 @@ fn get_dict_with_model_dump_class(py: Python<'_>) -> PyResult<Bound<'_, PyAny>>
     Ok(class)
 }
 
+/// Convert a Python namespace exception into a lance error, preserving the
+/// namespace error identity when the exception is a `lance_namespace`
+/// `LanceNamespaceError` carrying an error `code`, so callers can react to
+/// e.g. TableNotFound the same way they do for native clients. Foreign
+/// exceptions that happen to carry an integer `code` (e.g. SystemExit) must
+/// not be reinterpreted, so the extraction is gated on the exception type.
+fn namespace_error_from_py(method_name: &'static str, e: PyErr) -> lance_core::Error {
+    Python::attach(|py| {
+        let value = e.value(py);
+        let is_namespace_error = py
+            .import("lance_namespace.errors")
+            .and_then(|module| module.getattr("LanceNamespaceError"))
+            .and_then(|class| value.is_instance(&class))
+            .unwrap_or(false);
+        if is_namespace_error
+            && let Ok(code) = value.getattr("code").and_then(|code| code.extract::<u32>())
+        {
+            return lance_namespace::error::NamespaceError::from_code(code, value.to_string())
+                .into();
+        }
+        lance_core::Error::io(format!("Python error in {}: {}", method_name, e))
+    })
+}
+
 /// Helper to call a Python namespace method with JSON serialization.
 /// For methods that take a request and return a response.
 /// Uses DictWithModelDump to pass a dict that also has model_dump() method,
@@ -1519,7 +1619,7 @@ where
     })
     .await
     .map_err(|e| lance_core::Error::io(format!("Task join error for {}: {}", method_name, e)))?
-    .map_err(|e: PyErr| lance_core::Error::io(format!("Python error in {}: {}", method_name, e)))?;
+    .map_err(|e: PyErr| namespace_error_from_py(method_name, e))?;
 
     serde_json::from_str(&response_json).map_err(|e| {
         lance_core::Error::io(format!(
diff --git a/python/uv.lock b/python/uv.lock
index 314417f5aa1..5f1fa45d755 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -1,13 +1,12 @@
 version = 1
 revision = 3
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.14'",
     "python_full_version == '3.13.*'",
     "python_full_version == '3.12.*'",
     "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-    "python_full_version < '3.10'",
+    "python_full_version < '3.11'",
 ]
 
 [[package]]
@@ -33,15 +32,15 @@ name = "aiohttp"
 version = "3.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "aiohappyeyeballs", marker = "python_full_version >= '3.10'" },
-    { name = "aiosignal", marker = "python_full_version >= '3.10'" },
-    { name = "async-timeout", marker = "python_full_version == '3.10.*'" },
-    { name = "attrs", marker = "python_full_version >= '3.10'" },
-    { name = "frozenlist", marker = "python_full_version >= '3.10'" },
-    { name = "multidict", marker = "python_full_version >= '3.10'" },
-    { name = "propcache", marker = "python_full_version >= '3.10'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" },
-    { name = "yarl", marker = "python_full_version >= '3.10'" },
+    { name = "aiohappyeyeballs" },
+    { name = "aiosignal" },
+    { name = "async-timeout", marker = "python_full_version < '3.11'" },
+    { name = "attrs" },
+    { name = "frozenlist" },
+    { name = "multidict" },
+    { name = "propcache" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+    { name = "yarl" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" }
 wheels = [
@@ -170,8 +169,8 @@ name = "aiosignal"
 version = "1.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "frozenlist", marker = "python_full_version >= '3.10'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" },
+    { name = "frozenlist" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" }
 wheels = [
@@ -235,19 +234,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/28/8a/79c76ad88b16f2fac25684f7313593738f353355eb1af2307e43efd7b1ca/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:de74a2512e2e2366d4b064c498c38672bf6ddea38acec8b1999b4e66182dd001", size = 3104663, upload-time = "2025-10-13T23:11:00.582Z" },
     { url = "https://files.pythonhosted.org/packages/20/66/9152feaa87f851a37c1a2bd74fb89d7e82e4c76447ee590bf8e6fff5e9d8/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:806ca8e20507675b2de68b3d009f76e898cc3c3e441c834ea5220866f68aac50", size = 2956440, upload-time = "2025-10-13T23:11:03.769Z" },
     { url = "https://files.pythonhosted.org/packages/ad/66/f4179ef64d5c18fe76ec93cfbff42c0f401438ef771c6766b880044d7e13/arro3_core-0.6.5-cp313-cp313t-win_amd64.whl", hash = "sha256:8f6f0cc78877ade7ad6e678a4671b191406547e7b407bc9637436869c017ed47", size = 2845345, upload-time = "2025-10-13T23:11:07.447Z" },
-    { url = "https://files.pythonhosted.org/packages/07/c2/407d6bc19813fb74cc2b087ad3e959e102b29ff81e35dcc0ad0dfb5b946c/arro3_core-0.6.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:dfac7fac3c6a302399d94644d48682a19488a5b67bd1ccbdf6c560a7ffabde6d", size = 2680237, upload-time = "2025-10-13T23:11:10.876Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/73/c67156794d7e9734f4cc03d2eca7e44a1cc014686e6b7663f5110f58581d/arro3_core-0.6.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fc70042e558d1cd5fbe917b58e8ef52701441e38ff30b1912858050f796a62c", size = 2386228, upload-time = "2025-10-13T23:11:14.02Z" },
-    { url = "https://files.pythonhosted.org/packages/79/e8/817ee1abb0cfa7e266ef00749b144553d2bb9c4679ca932ecbca9dc7dea9/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1583b29b2ba83927a33e5435e5d9d134114c45a6360a8bb4db4beda13dab4fd8", size = 2886476, upload-time = "2025-10-13T23:11:17.579Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/d6/1b9beceab797c4510abfc25ef6e657e4c940d06a9615927ce506463691dd/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a170fe53f18dda4a4647fd3b8b4a9373fc11ac42c41a4b65f55d79ad531a33e", size = 2911941, upload-time = "2025-10-13T23:11:21.131Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/ed/4fe1fb9a24698fe6189111836d22c9582cbc92fa159b24b8664e924738dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83047b4e6e18835c91c8d12c5494e6ababc7c185c5a772d3429e8f9b0c185894", size = 3150419, upload-time = "2025-10-13T23:11:24.503Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/91/d6215b782fa91493f504ae13623db889beeaf0519037c28fc6744464439a/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3d4393d281d1ef18927915a11187da27287d279f99d5325bc9afb417f76084f", size = 2777891, upload-time = "2025-10-13T23:11:28.11Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/de/0aa3504e6cbf406086de49b59cb0dcb3ab11f64acbb38602143e479831dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:f0c88d8babcf51affdd69390882e2f0ecb1890a1b8a5abfc087d003e7181eb6e", size = 2519673, upload-time = "2025-10-13T23:11:31.426Z" },
-    { url = "https://files.pythonhosted.org/packages/05/69/47bf9c9ab66bafc7056a41f6db9d2149639eea6417299e3fe6c01ef99b6c/arro3_core-0.6.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36424e1d62212466a5cacdc27d414e99bf0fdab1544cc2b7e5b81e41437e5970", size = 3026254, upload-time = "2025-10-13T23:11:36.199Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/e8/638582437ab41ba52d3c7f2a1b0a98e4a05a51e3f660985e594b4f6c18d5/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4eb4d96f7db618f100758a8b7ec1b221c8737d543073701b7ffee74bc5019d46", size = 2704582, upload-time = "2025-10-13T23:11:39.408Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/0a/7bc46ee799459cce72a2e15b0eb184170f26cac37eace0b813e855fbc4d8/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2cfe9b4b1dd663d256754f1aa7aae783a1cddd3eb5698892b9caf381431f0af7", size = 3155815, upload-time = "2025-10-13T23:11:43.304Z" },
-    { url = "https://files.pythonhosted.org/packages/99/8a/f20eff8f4ff5bd7db9b37b70ea058b37375a930a10e03d584a7597b6b740/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a3b2621505f97eb5ce80f1c6fa8c77d18d757ab48d1f11d33a805e9ccbcd6fb6", size = 3107791, upload-time = "2025-10-13T23:11:46.735Z" },
-    { url = "https://files.pythonhosted.org/packages/79/da/60c66f0cc4a6af7f54e57973190540f77b84da1218fad2a9917e17bd897b/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c1becbb96ceba0b20f3d4318dd35f3417ee9a49065813d99f52b0fa285fc569", size = 2957730, upload-time = "2025-10-13T23:11:49.875Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/8d/6e3235894196e1fd2be34e01ac2d4280dd24e6c9019e3b12603858651e91/arro3_core-0.6.5-cp39-cp39-win_amd64.whl", hash = "sha256:5459e7bd39bb9dd8c57aa06856d2bebc5c1ca782cbccab0e186c6c89530e4ca9", size = 2839298, upload-time = "2025-10-13T23:11:53.566Z" },
     { url = "https://files.pythonhosted.org/packages/10/ca/b2139dbb25f9fefb9b1cdce8a73785615de6763af6a16bf6ff96a3b630f2/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:26d5b50139f1a96727fa1760b4d70393acf5ee0fba45346ad2d4f69824d3bdc2", size = 2676788, upload-time = "2025-10-13T23:11:56.965Z" },
     { url = "https://files.pythonhosted.org/packages/34/a1/c68dde2944f493c8ccfcb91bf6da6d27a27c3674316dd09c9560f9e6ab1a/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b65b3d8d7f65f2f3c36002dc467380d7a31ea771132986dddc6341c5a9dc726f", size = 2382809, upload-time = "2025-10-13T23:12:00.175Z" },
     { url = "https://files.pythonhosted.org/packages/c6/fc/2fb81d42a3cecd632deace97dc23ac74083d60d158106440c783bae4ff01/arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c3442a79a757ed3fbd7793de180019ae3201f04237537c2e2e3f1e3dd99b31c", size = 2882818, upload-time = "2025-10-13T23:12:03.721Z" },
@@ -314,8 +300,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jmespath" },
     { name = "python-dateutil" },
-    { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "urllib3" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/49/d0/3888673417202262ddd7e6361cab8e01ee2705e39643af8445e2eb276eab/botocore-1.40.43.tar.gz", hash = "sha256:d87412dc1ea785df156f412627d3417c9f9eb45601fd0846d8fe96fe3c78b630", size = 14389164, upload-time = "2025-10-01T19:38:16.06Z" }
 wheels = [
@@ -392,17 +377,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" },
     { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" },
     { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/ca/9a0983dd5c8e9733565cf3db4df2b0a2e9a82659fd8aa2a868ac6e4a991f/charset_normalizer-3.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:70bfc5f2c318afece2f5838ea5e4c3febada0be750fcf4775641052bbba14d05", size = 207520, upload-time = "2025-08-09T07:57:11.026Z" },
-    { url = "https://files.pythonhosted.org/packages/39/c6/99271dc37243a4f925b09090493fb96c9333d7992c6187f5cfe5312008d2/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23b6b24d74478dc833444cbd927c338349d6ae852ba53a0d02a2de1fce45b96e", size = 147307, upload-time = "2025-08-09T07:57:12.4Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/69/132eab043356bba06eb333cc2cc60c6340857d0a2e4ca6dc2b51312886b3/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:34a7f768e3f985abdb42841e20e17b330ad3aaf4bb7e7aeeb73db2e70f077b99", size = 160448, upload-time = "2025-08-09T07:57:13.712Z" },
-    { url = "https://files.pythonhosted.org/packages/04/9a/914d294daa4809c57667b77470533e65def9c0be1ef8b4c1183a99170e9d/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb731e5deb0c7ef82d698b0f4c5bb724633ee2a489401594c5c88b02e6cb15f7", size = 157758, upload-time = "2025-08-09T07:57:14.979Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/a8/6f5bcf1bcf63cb45625f7c5cadca026121ff8a6c8a3256d8d8cd59302663/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:257f26fed7d7ff59921b78244f3cd93ed2af1800ff048c33f624c87475819dd7", size = 152487, upload-time = "2025-08-09T07:57:16.332Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/72/d3d0e9592f4e504f9dea08b8db270821c909558c353dc3b457ed2509f2fb/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1ef99f0456d3d46a50945c98de1774da86f8e992ab5c77865ea8b8195341fc19", size = 150054, upload-time = "2025-08-09T07:57:17.576Z" },
-    { url = "https://files.pythonhosted.org/packages/20/30/5f64fe3981677fe63fa987b80e6c01042eb5ff653ff7cec1b7bd9268e54e/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2c322db9c8c89009a990ef07c3bcc9f011a3269bc06782f916cd3d9eed7c9312", size = 161703, upload-time = "2025-08-09T07:57:20.012Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/ef/dd08b2cac9284fd59e70f7d97382c33a3d0a926e45b15fc21b3308324ffd/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:511729f456829ef86ac41ca78c63a5cb55240ed23b4b737faca0eb1abb1c41bc", size = 159096, upload-time = "2025-08-09T07:57:21.329Z" },
-    { url = "https://files.pythonhosted.org/packages/45/8c/dcef87cfc2b3f002a6478f38906f9040302c68aebe21468090e39cde1445/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:88ab34806dea0671532d3f82d82b85e8fc23d7b2dd12fa837978dad9bb392a34", size = 153852, upload-time = "2025-08-09T07:57:22.608Z" },
-    { url = "https://files.pythonhosted.org/packages/63/86/9cbd533bd37883d467fcd1bd491b3547a3532d0fbb46de2b99feeebf185e/charset_normalizer-3.4.3-cp39-cp39-win32.whl", hash = "sha256:16a8770207946ac75703458e2c743631c79c59c5890c80011d536248f8eaa432", size = 99840, upload-time = "2025-08-09T07:57:23.883Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/d6/7e805c8e5c46ff9729c49950acc4ee0aeb55efb8b3a56687658ad10c3216/charset_normalizer-3.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:d22dbedd33326a4a5190dd4fe9e9e693ef12160c77382d9e87919bce54f3d4ca", size = 107438, upload-time = "2025-08-09T07:57:25.287Z" },
     { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" },
 ]
 
@@ -420,9 +394,9 @@ name = "datafusion"
 version = "53.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" }
 wheels = [
@@ -433,42 +407,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" },
 ]
 
-[[package]]
-name = "datasets"
-version = "0.0.9"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/cd/fe/4d2874473a753d59c83335691bd9532704f2605418a0d288a1d70fa003fc/datasets-0.0.9.zip", hash = "sha256:86d54441bab87aebb2aa3bf0853aa7fb7abed8c708f9bb08a88e86a498972010", size = 4013, upload-time = "2015-08-18T00:07:40.556Z" }
-
 [[package]]
 name = "datasets"
 version = "4.1.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 dependencies = [
-    { name = "dill", marker = "python_full_version >= '3.10'" },
-    { name = "filelock", marker = "python_full_version >= '3.10'" },
-    { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.10'" },
-    { name = "huggingface-hub", marker = "python_full_version >= '3.10'" },
-    { name = "multiprocess", marker = "python_full_version >= '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", extra = ["http"] },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "packaging", marker = "python_full_version >= '3.10'" },
-    { name = "pandas", marker = "python_full_version >= '3.10'" },
-    { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "packaging" },
+    { name = "pandas" },
+    { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "pyyaml", marker = "python_full_version >= '3.10'" },
-    { name = "requests", marker = "python_full_version >= '3.10'" },
-    { name = "tqdm", marker = "python_full_version >= '3.10'" },
-    { name = "xxhash", marker = "python_full_version >= '3.10'" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/91/a4/73f8e6ef52c535e1d20d5b2ca83bfe6de399d8b8b8a61ccc8d63d60735aa/datasets-4.1.1.tar.gz", hash = "sha256:7d8d5ba8b12861d2c44bfff9c83484ebfafff1ff553371e5901a8d3aab5450e2", size = 579324, upload-time = "2025-09-18T13:14:27.108Z" }
 wheels = [
@@ -514,12 +472,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/23/32/57866cf8881288b3dfb9212720221fb890daaa534dbdc6fe3fff3979ecd1/duckdb-1.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2de258a93435c977a0ec3a74ec8f60c2f215ddc73d427ee49adc4119558facd3", size = 18421289, upload-time = "2025-09-16T10:22:21.564Z" },
     { url = "https://files.pythonhosted.org/packages/a0/83/7438fb43be451a7d4a04650aaaf662b2ff2d95895bbffe3e0e28cbe030c9/duckdb-1.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6d3659641d517dd9ed1ab66f110cdbdaa6900106f116effaf2dbedd83c38de3", size = 20426547, upload-time = "2025-09-16T10:22:23.759Z" },
     { url = "https://files.pythonhosted.org/packages/21/b2/98fb89ae81611855f35984e96f648d871f3967bb3f524b51d1372d052f0c/duckdb-1.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:07fcc612ea5f0fe6032b92bcc93693034eb00e7a23eb9146576911d5326af4f7", size = 12290467, upload-time = "2025-09-16T10:22:25.923Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/42/0f355319b3e8ee1703d0e17378dd829db391434306621f85c110134f2763/duckdb-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c97ee61c582002b654331f7fd967d6b1e83bf7fdb0772f409dfd4b6af3a70f4", size = 31292373, upload-time = "2025-09-16T10:22:28.118Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/52/091dbef5eb2ac4e60a9c6d38fcc7c7530a75fafa0f37658450e8731a265b/duckdb-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:74e3d6295355160df5d3588b880e8bcae23fdd6f573f538793a8a1abf4c2c29d", size = 17288145, upload-time = "2025-09-16T10:22:30.346Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/6c/879317d9c3ac7a2a1f0618ca536a48ebfa4b9fe202f9783e07070e168192/duckdb-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0c76425e4ffe98069dd4fc4752ab919a4125dc0d176bb676b3065fdea152c42", size = 14816258, upload-time = "2025-09-16T10:22:32.442Z" },
-    { url = "https://files.pythonhosted.org/packages/95/87/83ac8e67c0530b69fe39f91bbb7f3bd0a49b0c24216cffa9c5561fb2845c/duckdb-1.4.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c122bd7d80ab5057f53024ee3922d7612a5cdc99583fae730990964aebc3fd4", size = 18391043, upload-time = "2025-09-16T10:22:34.616Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/01/1d70bd6c594ef915c004edc0f1119d1602173dc5ce91c1eed7368f6aab34/duckdb-1.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:30689c1436bca723526be6102fe1f4f82ea6d4780fb9ca196bda7ed5ec227950", size = 20385348, upload-time = "2025-09-16T10:22:36.982Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/04/0650128cdcdc5208c4f51341a0a3f8db436ecaba51032c6065e20ea0baae/duckdb-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c55a367c1296617cff89c5e1c7153f1dc3c3b556ef70711a45b0236515f80c2", size = 12283322, upload-time = "2025-09-16T10:22:39.388Z" },
 ]
 
 [[package]]
@@ -543,29 +495,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" },
 ]
 
-[[package]]
-name = "flatbuffers"
-version = "2.0.7"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/d1/90/0532e737a11e1dc50e9e352c3ccc97338cb75991f83279c2edbc9234e022/flatbuffers-2.0.7.tar.gz", hash = "sha256:0ae7d69c5b82bf41962ca5fde9cc43033bc9501311d975fd5a25e8a7d29c1245", size = 22686, upload-time = "2022-08-23T22:50:07.903Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d7/0d/b5bfb553a6ac66d6ec2b6d7f1e814a908fba7188356ac94bb36ae3d905c3/flatbuffers-2.0.7-py2.py3-none-any.whl", hash = "sha256:71e135d533be527192819aaab757c5e3d109cb10fbb01e687f6bdb7a61ad39d1", size = 26562, upload-time = "2022-08-23T22:50:56.342Z" },
-]
-
 [[package]]
 name = "flatbuffers"
 version = "25.9.23"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
@@ -662,23 +595,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" },
     { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" },
     { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/b1/ee59496f51cd244039330015d60f13ce5a54a0f2bd8d79e4a4a375ab7469/frozenlist-1.7.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cea3dbd15aea1341ea2de490574a4a37ca080b2ae24e4b4f4b51b9057b4c3630", size = 82434, upload-time = "2025-06-09T23:02:05.195Z" },
-    { url = "https://files.pythonhosted.org/packages/75/e1/d518391ce36a6279b3fa5bc14327dde80bcb646bb50d059c6ca0756b8d05/frozenlist-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d536ee086b23fecc36c2073c371572374ff50ef4db515e4e503925361c24f71", size = 48232, upload-time = "2025-06-09T23:02:07.728Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/8d/a0d04f28b6e821a9685c22e67b5fb798a5a7b68752f104bfbc2dccf080c4/frozenlist-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dfcebf56f703cb2e346315431699f00db126d158455e513bd14089d992101e44", size = 47186, upload-time = "2025-06-09T23:02:09.243Z" },
-    { url = "https://files.pythonhosted.org/packages/93/3a/a5334c0535c8b7c78eeabda1579179e44fe3d644e07118e59a2276dedaf1/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974c5336e61d6e7eb1ea5b929cb645e882aadab0095c5a6974a111e6479f8878", size = 226617, upload-time = "2025-06-09T23:02:10.949Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/67/8258d971f519dc3f278c55069a775096cda6610a267b53f6248152b72b2f/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c70db4a0ab5ab20878432c40563573229a7ed9241506181bba12f6b7d0dc41cb", size = 224179, upload-time = "2025-06-09T23:02:12.603Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/89/8225905bf889b97c6d935dd3aeb45668461e59d415cb019619383a8a7c3b/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1137b78384eebaf70560a36b7b229f752fb64d463d38d1304939984d5cb887b6", size = 235783, upload-time = "2025-06-09T23:02:14.678Z" },
-    { url = "https://files.pythonhosted.org/packages/54/6e/ef52375aa93d4bc510d061df06205fa6dcfd94cd631dd22956b09128f0d4/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e793a9f01b3e8b5c0bc646fb59140ce0efcc580d22a3468d70766091beb81b35", size = 229210, upload-time = "2025-06-09T23:02:16.313Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/55/62c87d1a6547bfbcd645df10432c129100c5bd0fd92a384de6e3378b07c1/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74739ba8e4e38221d2c5c03d90a7e542cb8ad681915f4ca8f68d04f810ee0a87", size = 215994, upload-time = "2025-06-09T23:02:17.9Z" },
-    { url = "https://files.pythonhosted.org/packages/45/d2/263fea1f658b8ad648c7d94d18a87bca7e8c67bd6a1bbf5445b1bd5b158c/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e63344c4e929b1a01e29bc184bbb5fd82954869033765bfe8d65d09e336a677", size = 225122, upload-time = "2025-06-09T23:02:19.479Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/22/7145e35d12fb368d92124f679bea87309495e2e9ddf14c6533990cb69218/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ea2a7369eb76de2217a842f22087913cdf75f63cf1307b9024ab82dfb525938", size = 224019, upload-time = "2025-06-09T23:02:20.969Z" },
-    { url = "https://files.pythonhosted.org/packages/44/1e/7dae8c54301beb87bcafc6144b9a103bfd2c8f38078c7902984c9a0c4e5b/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:836b42f472a0e006e02499cef9352ce8097f33df43baaba3e0a28a964c26c7d2", size = 239925, upload-time = "2025-06-09T23:02:22.466Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/1e/99c93e54aa382e949a98976a73b9b20c3aae6d9d893f31bbe4991f64e3a8/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e22b9a99741294b2571667c07d9f8cceec07cb92aae5ccda39ea1b6052ed4319", size = 220881, upload-time = "2025-06-09T23:02:24.521Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/9c/ca5105fa7fb5abdfa8837581be790447ae051da75d32f25c8f81082ffc45/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:9a19e85cc503d958abe5218953df722748d87172f71b73cf3c9257a91b999890", size = 234046, upload-time = "2025-06-09T23:02:26.206Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/4d/e99014756093b4ddbb67fb8f0df11fe7a415760d69ace98e2ac6d5d43402/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f22dac33bb3ee8fe3e013aa7b91dc12f60d61d05b7fe32191ffa84c3aafe77bd", size = 235756, upload-time = "2025-06-09T23:02:27.79Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/72/a19a40bcdaa28a51add2aaa3a1a294ec357f36f27bd836a012e070c5e8a5/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ccec739a99e4ccf664ea0775149f2749b8a6418eb5b8384b4dc0a7d15d304cb", size = 222894, upload-time = "2025-06-09T23:02:29.848Z" },
-    { url = "https://files.pythonhosted.org/packages/08/49/0042469993e023a758af81db68c76907cd29e847d772334d4d201cbe9a42/frozenlist-1.7.0-cp39-cp39-win32.whl", hash = "sha256:b3950f11058310008a87757f3eee16a8e1ca97979833239439586857bc25482e", size = 39848, upload-time = "2025-06-09T23:02:31.413Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/45/827d86ee475c877f5f766fbc23fb6acb6fada9e52f1c9720e2ba3eae32da/frozenlist-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:43a82fce6769c70f2f5a06248b614a7d268080a9d20f7457ef10ecee5af82b63", size = 44102, upload-time = "2025-06-09T23:02:32.808Z" },
     { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" },
 ]
 
@@ -693,32 +609,13 @@ wheels = [
 
 [package.optional-dependencies]
 http = [
-    { name = "aiohttp", marker = "python_full_version >= '3.10'" },
-]
-
-[[package]]
-name = "gast"
-version = "0.4.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/83/4a/07c7e59cef23fb147454663c3271c21da68ba2ab141427c20548ae5a8a4d/gast-0.4.0.tar.gz", hash = "sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1", size = 13804, upload-time = "2020-08-07T21:45:23.526Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b6/48/583c032b79ae5b3daa02225a675aeb673e58d2cb698e78510feceb11958c/gast-0.4.0-py3-none-any.whl", hash = "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4", size = 9824, upload-time = "2020-08-07T21:45:21.32Z" },
+    { name = "aiohttp" },
 ]
 
 [[package]]
 name = "gast"
 version = "0.6.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/3c/14/c566f5ca00c115db7725263408ff952b8ae6d6a4e792ef9c84e77d9af7a1/gast-0.6.0.tar.gz", hash = "sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb", size = 27708, upload-time = "2024-06-27T20:31:49.527Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/a3/61/8001b38461d751cd1a0c3a6ae84346796a5758123f3ed97a1b121dfbf4f3/gast-0.6.0-py3-none-any.whl", hash = "sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54", size = 21173, upload-time = "2024-07-09T13:15:15.615Z" },
@@ -726,69 +623,61 @@ wheels = [
 
 [[package]]
 name = "geoarrow-rust-core"
-version = "0.6.1"
+version = "0.6.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "arro3-core" },
-    { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/77/2d/3e994dd76223fac0eb597a6f55647cca51bd5a4f446d09b668697f901724/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:84d972cc3dd45a797fd99588d7ee68f257e4083ebdcecad9ec773260067f71a6", size = 3570129, upload-time = "2025-12-03T18:51:07.148Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/2a/e19df203b4ffb225f39627e1bd1b89ce7b2220e39f1d6972692174820c57/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bc0f382d4ed41e85d2d89fc2c7c8c3d046681c9a5e19350ce79e0e930cf69821", size = 3333881, upload-time = "2025-11-21T01:49:28.959Z" },
-    { url = "https://files.pythonhosted.org/packages/52/98/b749a2165dfc5d9c54a1c19eb3e6a75b6d005ecde42289b25c1c355346b7/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80e719edcaf6698ed2b1aa9525bd97cf79e23a500a39b1e83566cd9a16a294d3", size = 3806366, upload-time = "2025-11-21T01:48:03.525Z" },
-    { url = "https://files.pythonhosted.org/packages/84/93/7c0e42ba7d46208fb0f851e06c05de071962170f3a3b2a2260d8a3f66e7a/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d0f3546a15503329880063aca31266b301b0b781f618f832585bcd1c9efcc876", size = 3981800, upload-time = "2025-11-21T01:48:17.789Z" },
-    { url = "https://files.pythonhosted.org/packages/de/43/9c5736569dead60b33e46b7c485e24804d950693df70dee306e153547789/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6937f3cabebf673f8b726d60d8ca160b46401de8b08c8e257be22772c12c2001", size = 5068955, upload-time = "2025-11-21T01:48:32.569Z" },
-    { url = "https://files.pythonhosted.org/packages/71/5e/f26f9bea2af96b0d070e980dcc2196d369a678e06141ed260de5ca72bcc2/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f29ba92053e8ad4bd60d72188518f033ca4abc1f34eecebeb41ee7b790612e00", size = 4104946, upload-time = "2025-11-21T01:48:45.801Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/08/473796b3e0c03b35292220de88c8efa3e74d6174e807b26a371f2523a4b0/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a5d05a312fbb76821566b1d144c64d0923fcbd790b2c7376ee11f62472b2fe", size = 3917533, upload-time = "2025-11-21T01:49:14.631Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/7a/7b62b839c3a9878a7d91b8395e0b7b04483e4bec687e073df0fbd4056583/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88fe8fd33b16a06e9b3b7638b51d24047f1d01af12cc2e3e2653140877bddef6", size = 4318837, upload-time = "2025-11-21T01:48:58.953Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/86/309c55a9c63f316e3a04949ade8847b8e5acbdd21645696911175f0e1814/geoarrow_rust_core-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:dbecc2487cc95526ac77797cd70c199e196811b0a9e877c1b61fcaca508575fa", size = 3320081, upload-time = "2025-11-21T01:49:58.861Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/ed/514cff089185d71242a62e774e2c59dda147baab65929851b66d72198d5d/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e26ca240d7a6a0fa1b4f56a9ebe07b2e14fc7c1c9507aa862bd31ef14e0521f0", size = 3572326, upload-time = "2025-12-03T18:51:08.477Z" },
-    { url = "https://files.pythonhosted.org/packages/77/21/22f8233235bd020db22b4f2bf888f9aeed08813eda7b8b421a6963bdc7e4/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46876e3528685673e08b4cbc696dca7f22fb073a83318708b0eaf640107b923b", size = 3335166, upload-time = "2025-11-21T01:49:30.632Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/eb/0c2e40a6a1bd450347a8a9fc7648ca840710bc177ff6eed3fc5da6ef981a/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5502bd12ede712d9b4725753df4db231a0aa6d3e131079bc4b6452c436e37b7", size = 3800540, upload-time = "2025-11-21T01:48:05.583Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/42/22d3b8441bb7041a6fcdb4cf0a1108e150513a52f8a407715188412bc71f/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f04dd7dd03449dba6d15f7d35c6c708637ac05f125638f56206e876756cd4c5", size = 3984840, upload-time = "2025-11-21T01:48:19.719Z" },
-    { url = "https://files.pythonhosted.org/packages/12/44/477b6b2389398dc983026a4ab7dbb7ec121284ad5fb864a1b7a4658c3881/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2afce33d0c3fa87d5d4d24d6617732e4297da3372b1746569b759f9b62aede1", size = 5067358, upload-time = "2025-11-21T01:48:34.373Z" },
-    { url = "https://files.pythonhosted.org/packages/62/50/6995e9d11462635972b2fc09c8e1e510928563ca4fb0fd2c9145cf6ef771/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e63cdb661652a9836dc86cb5995ad269817d88b80f4cce6ed236a7f80f0aba", size = 4105773, upload-time = "2025-11-21T01:48:47.461Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/21/b369208495f213db0a0e7d563358307a706cc6af0cb9c897dacf28ae06a1/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adbaf97cb770aef69df8a16437c9faa67adb2b04856faf45bcb61d5b986101dc", size = 3914659, upload-time = "2025-11-21T01:49:16.35Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/49/fccb14c6ee9bb715451e4d5bbe3d571eb59a8a1abe21b2abe0d9d48a7fac/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:202f35b301caa5154d95fd74424a1ef6449306e4f6fbfb5140270e48e94188a5", size = 4315153, upload-time = "2025-11-21T01:49:01.075Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/1c/88b16510e24a4a3332284669085673701b9fe4d6a511b4466c90655a9daf/geoarrow_rust_core-0.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:491405dfcc821a2c599e381cc9923e04a758deb1cc84fdb5794b519446c2f8a8", size = 3320510, upload-time = "2025-11-21T01:50:15.545Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/5f/1dbdbc1dde2140937cff20188cb25034b6f39e1734c14ca6510cf464bf77/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a8145a562e94419402dd0882bb62429853804c53d47dbea944f2a24abc57abd2", size = 3568115, upload-time = "2025-12-03T18:51:09.743Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/e1/b62676f89ef3b866676967989ee8dbbd3d16c77f69aa4287825703268c42/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:51040a5afcfa0cd3ab372d981375c7fe8eb652d155e3964d52ed51d14faa04e8", size = 3325336, upload-time = "2025-11-21T01:49:32.67Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/89/94e20f255712ff0eaccf9bfeac4bf51953ebcef0599cfc92f67037f8ab1a/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fbf8506848b0254b3c89b27c045be38bbef6372b21714cad45d76b0c8cb92ce", size = 3808535, upload-time = "2025-11-21T01:48:07.618Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/e4/37c7e2c9e251148be17292d0656d7d1ab35019678f6bd11090a41c270d18/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c1a0d9c14bf2f36676016c753517d9470381969c2a67859716cceae33735f3ee", size = 3978997, upload-time = "2025-11-21T01:48:21.551Z" },
-    { url = "https://files.pythonhosted.org/packages/71/27/c4ba353d9b77889136bdfd1c0cd1a04d6eade9da6e0748b06719c458afb5/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df97301782ecbaf5f2f0252011a9ff309471cde25435bdf1e17b29c263ebc16", size = 5066492, upload-time = "2025-11-21T01:48:36.142Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/81/34107fc9aacc489e41afed420202645675b41d85b46dc70d5ba222312791/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1948cfdd0e1c7d03a0c2067821dd536ab34d1e726515202e51fbd6b0d9f775f", size = 4106130, upload-time = "2025-11-21T01:48:49.144Z" },
-    { url = "https://files.pythonhosted.org/packages/92/5f/2e348b884738fb213fb3b4745955baeeaf047aecb37639e39a4dd8f12d99/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95b1611b66c386cc6c74e990df4f114bcf24956a35e18e51bf6331c079a36688", size = 3913166, upload-time = "2025-11-21T01:49:18.228Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/81/fdda8bb5f84df82bc9e000435a88be46d46dda41eb5149f624ed96b7031c/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1751357a1aaa26aeb5feb6f66873b6a2d369655039f7278dedcb692b512111cc", size = 4313573, upload-time = "2025-11-21T01:49:03.184Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/14/ca0bc7d3b158094e769ba2bbc43d203330e7e457ed67b50af97d3eac45df/geoarrow_rust_core-0.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:16fe159043a444579948864808ebec8c49ec167ec0df3cb772dfb88de268bc91", size = 3318746, upload-time = "2025-11-21T01:50:17.319Z" },
-    { url = "https://files.pythonhosted.org/packages/85/b8/94e4f8fb32ef705cf65031a24c58cdc441042a68a794b74757a6561cbc60/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6c1b692f76b613757438bf23cfe3be4a8715f0268afd8ad3ca0063c257a3be4b", size = 3568328, upload-time = "2025-12-03T18:51:11.291Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/45/a96e64f9febc3436766c5055508c4e823cce56577529d7b76c4e4f584bc4/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a2b4f9a8cfe852a0ba9a667258307db9e354b470b7e0a03edffd0b7daf9b6f5", size = 3325879, upload-time = "2025-11-21T01:49:34.941Z" },
-    { url = "https://files.pythonhosted.org/packages/58/c0/c719ce3fb4e982e28c71f65a80cf697d07d733336e6b74d7d1b8a7daf9d0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8248330f5c3e7ec5852d0a23c23b31a08395300ef9544109e2991317beddfee3", size = 3809144, upload-time = "2025-11-21T01:48:09.562Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/8e/2ab3563b2ffd13f2dd69c050a901de0a4bb325879531a66f56d30bc7337e/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:775e9fe45c06d02be59b1497c60aa4f7a7c1d460387bf5f63142faf39b8ad4ff", size = 3978886, upload-time = "2025-11-21T01:48:23.335Z" },
-    { url = "https://files.pythonhosted.org/packages/db/0a/31625caa0a32e8e9e7aaf2514a840dda0dadf8e2452710ebc10e5f469494/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94de8fb01da3f22332eab28b03570c43cc36492ce482c254fe87e851ae21285b", size = 5065429, upload-time = "2025-11-21T01:48:37.896Z" },
-    { url = "https://files.pythonhosted.org/packages/11/8d/ee247bd4ccf3b0791b8669357d440e3960d4dbd5cca940a2e226e8910c31/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c70a63d1d36687a53dc6c2933446b1435c187e4c616cd84844d89b6ba13bc4f6", size = 4105436, upload-time = "2025-11-21T01:48:50.874Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/fb/c1e92716ee5aa00d48b650f0cb43220a1bf4088c8d572dfc21d400b16723/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e505312f2761393fe5158242f3f2d77e9daa5cca63badd8d66e6d1d69fc17bf", size = 3913672, upload-time = "2025-11-21T01:49:19.873Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/6f/ef47f6070c5d5cf0d061d5f5ba95aed7e895e4720a784b84c911c0209fc0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a732e58549108df8267ab72fa6cc7c54e5a9e30b818d8d869e301a9de9d3029e", size = 4313496, upload-time = "2025-11-21T01:49:04.953Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/ac/2696b979623ea02129e342f8820c89d03fa5a253a913ad00b588d6dd2948/geoarrow_rust_core-0.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:9e1d6492b1388b9d5ae898728838ada78dbf2340d2e9dd25ad3df6ccdd058813", size = 3318780, upload-time = "2025-11-21T01:50:18.928Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/42/0cb3af24b01d3897a9eee6af5cc0676bf6b80364e0d4638e45a5fc873d35/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3748cc8e8cb2bcedaede27cefed6749d4eea93e358b49a2f0b061d8974dd1b91", size = 3560313, upload-time = "2025-12-03T18:51:12.897Z" },
-    { url = "https://files.pythonhosted.org/packages/51/bc/33f8c918e46188707ab358752b993bee9184fa62e580998c1ec4c37885c1/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b0e232fe4e239ca435d0bab638934eee87d758024c1727ee24a2b8bc4d8bc7b", size = 3321855, upload-time = "2025-12-03T18:51:00.056Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/d7/aeb2a3922670ad57f62cb591bd0309a8300ceeec6efc7f925a563c9da672/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:843444ada2c7f7670fd9df3bdebd93e5247b376d1dd20c4fb3828632847ab78e", size = 3799057, upload-time = "2025-12-03T18:50:28.982Z" },
-    { url = "https://files.pythonhosted.org/packages/76/08/606e55fc2a0e85b02e0fde7dec2014eb8f1463e8a823496d72a3095de73d/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:880641183a09ebfbca3a6357071f137d1a4b0f1ba606fb9127a01cf58faaef56", size = 3968892, upload-time = "2025-12-03T18:50:34.661Z" },
-    { url = "https://files.pythonhosted.org/packages/10/1f/e75fd5b59e9e582190c11ec73c91728d96e90608a22e0aed7365439d9534/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6bb69024257d2fd20da691d1e15bcced874d278884218b64690256982fa30cb1", size = 5049247, upload-time = "2025-12-03T18:50:40.542Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/95/2257b9b148c8c6557387e67828a5096ebc519b997a158ffb67a0987589e5/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85464a1bab81068789de5fb19684e43709d2ba6d64d5655aace7c50b35893d6d", size = 4099850, upload-time = "2025-12-03T18:50:45.341Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/07/8c8aaf8755ee7c137f0898823bd005ffb16edaa6accc0cc1a9a747d56ddc/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7eb773a101f1d9716d750bb326991885a7c4576e85d9a016a567a3b07380bf07", size = 3908308, upload-time = "2025-12-03T18:50:55.587Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/7e/b8f1933be03d9a3a6416edf29fc23d520e45f00fbde6bd8f0614ad6f8a69/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:920e6fed857acd2145a8fca7c6fad17094873f586ac5efed7049ce43a7af4ff6", size = 4307178, upload-time = "2025-12-03T18:50:50.429Z" },
-    { url = "https://files.pythonhosted.org/packages/df/95/a8ba3d7e51ec02ec954d0247c6021b36de5935a9a3845c1cf6c1348cd6e3/geoarrow_rust_core-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:9887119cc31a763c34ed8676d06434b47971517e86f8e35c640b494d05e7d5ac", size = 3316511, upload-time = "2025-12-03T18:51:18.831Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/6d/4b2f51d0e4ac683217852d79c3acef719ca116f418d9ce8f4dcc6d717716/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:420a720217b5a7ec6f7977cfe7e7a729c73381ed5e63112fdef33bd805b9cf8a", size = 3572216, upload-time = "2025-12-03T18:51:14.544Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/55/85a2948b10ad9ea347597f90355d8992745f00fedae54916205c8c9b80fb/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fb9c8c6bba4e712edf475ce3c78bf13f7b10f750256f57deb29c3222eaef033", size = 3335928, upload-time = "2025-11-21T01:49:51.601Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/98/fdd6c34ff8acd878c31e9f5fe4792f49d437e0465e0b60c24d6cdc287ed7/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9547ead76eac906b7a583ee65fa137e6b8ed34c0f128c1745a290c451726f27", size = 3808249, upload-time = "2025-11-21T01:48:11.192Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/a1/fd6741b5c1d7d48b5f6ab58a994a91c86e29d19ee7bca2636590b8ac9a54/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eaa8e8f40ca8fcd367735cb4226c5aa5171a713d75bc2caab9a03bd9f59d7bf2", size = 3984081, upload-time = "2025-11-21T01:48:25.595Z" },
-    { url = "https://files.pythonhosted.org/packages/91/1e/2b5a9b65bf19a79d212ea0fe60fa5632ec4c89bb64ee446272b47e5cd6ac/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08992719a2accbf993837a6aad615e3f2bf1954d2d9152e507dd79621c87e9d3", size = 5071749, upload-time = "2025-11-21T01:48:39.673Z" },
-    { url = "https://files.pythonhosted.org/packages/08/7a/6b37f5e52300b60854b74f4cdc9fbe613c692a15c3ae42f1952f3849bc86/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:071c0e72c4c2047326ebec8d76ce2debcdd59e187207433c3a29ac2da861ca92", size = 4107621, upload-time = "2025-11-21T01:48:52.632Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/3e/f849642ef4e1f54bcc651903f19a219c3d2be68d27f4ceb282a07ebba7cd/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49d5a9e7b73c30dd1790a3e0faf30b7a4ee393c127c5a799d543653d1d80f0c", size = 3919352, upload-time = "2025-11-21T01:49:21.495Z" },
-    { url = "https://files.pythonhosted.org/packages/84/c8/57318cb04d061788d5ba523984915c98523e9eb9b7ba4937ff3438e045ef/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:909152922ee42197b8ae846a8b6c5383c6f3ab39fe627ec8539765e3a634de68", size = 4320006, upload-time = "2025-11-21T01:49:06.588Z" },
-    { url = "https://files.pythonhosted.org/packages/13/9f/be16e191fdedbac4d9c01096327917a948625619423c666ec3db2191b4ab/geoarrow_rust_core-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:796c84184fe5e65e30df9f9f45aa8c1680f07689ea71ed1960faa7324fb67e52", size = 3321071, upload-time = "2025-11-21T01:50:20.844Z" },
+    { url = "https://files.pythonhosted.org/packages/70/a7/9de5cdcb86089ef4d9a24940838a72ef0655d5be11b46dc4ee807b0d7772/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e1dbbca927858c05ef4eaa5e13a3977a62183cfa3f17fe7b19dd2d88ecf24e91", size = 3855749, upload-time = "2026-06-11T19:24:32.965Z" },
+    { url = "https://files.pythonhosted.org/packages/54/48/da86c2bd1db71849f003f5a8eb78ce54f7a33341d5b33ddcdb480b5aafb4/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce7e126d340f335bcc108327cbf7264539e856cb6a299f59757a6ee8329f6643", size = 3710538, upload-time = "2026-06-11T19:24:34.925Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/65/7f8ecc05447a85f14643170de8a29715e7c3e732fbb7132617772d39eac7/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88eb7982c1345fc4c4b18d9895602f0148c9495fe7ac00df03a92c20c8058149", size = 4198382, upload-time = "2026-06-11T19:24:37.02Z" },
+    { url = "https://files.pythonhosted.org/packages/41/57/b11fbb277fab166d8a8940bc1151bbd1aeef537e70c55f495ff85178f827/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8368b91d4cab5cb5ad1b0f7369da4cec196d82bf73aa3823618a99c1bd4cf04", size = 4270350, upload-time = "2026-06-11T19:24:38.726Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/16/0c35e5aff4aca77d818b28d79f9ce20fe1c282ef26d6a2fcc764f3a55f26/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2955d82d0204197c8e96adbfb70f252fa5987821dd8f202e712a84bfb5b876d3", size = 5602389, upload-time = "2026-06-11T19:24:40.198Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/06/58e4d0c94f7d8897ca5e2469fe5db0dd937bfc3cd676dea43c6ce488effe/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacbc2231b03c674975d5a25ff549c367dd8c07147c41edb5461c8ebda693739", size = 4414385, upload-time = "2026-06-11T19:24:41.779Z" },
+    { url = "https://files.pythonhosted.org/packages/09/65/902e986d01d4978e752c1d0d5b15873de712321ce3f61c285f491e4149b9/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f5726fd638563d11dfefd7d17dd769e679ac1efb868178791573de19d16b41f", size = 4251263, upload-time = "2026-06-11T19:24:43.556Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/f1/b1e0f93ea5288706f08ac7c01f332eb0feaa128251f3c2c9896e5f42cba5/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df7a0319cdec5d0e4ffc3f17a171e16787e7719f85f82c8cf0035d873ec31e62", size = 4747229, upload-time = "2026-06-11T19:24:45.281Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/f3/77ebd20cb5cf5eb18c5bb0e32e07f76ec915a728ea123e075365f0b6c53c/geoarrow_rust_core-0.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:19ce5fb18025480461253d0a03f20cbb635163214b5f193b0700bc1a407dfe4d", size = 3601298, upload-time = "2026-06-11T19:24:46.721Z" },
+    { url = "https://files.pythonhosted.org/packages/02/a8/d50e482a56d9543119be40000bc405b725242b6056809bbee3a75eff2411/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d91b5249d5e1da53a79268759601c107beb69a8944dd3b5b225e9515ab63d519", size = 3856056, upload-time = "2026-06-11T19:24:48.331Z" },
+    { url = "https://files.pythonhosted.org/packages/04/e3/f4de7795959d95d88b32b85740d5d2d6b0a2e17233258f0331aee6cb7b13/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:14412f02c1e60c92d2f88bc9f92835cf6d80f1da37fe8ba462eafdb7bd570f3c", size = 3710092, upload-time = "2026-06-11T19:24:49.802Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/48/04888477c2a12fbe6a6f8898bd026facdc3a929b4e747d7b569e6d20dd58/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc5d6db2341568b1e44678ccc0ade1ca1e7660a2c186ebf8bf847acdb160f2cf", size = 4197891, upload-time = "2026-06-11T19:24:51.245Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/2d/c16b6eb6f9f2ab213dcd0cd2ac0dec2eae1e2ce5922b3fbeb7bb1ac2a865/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45f4193b9d6f6caae969d8448f3687a19f0998d757519a091df609c06ffa68a0", size = 4269771, upload-time = "2026-06-11T19:24:52.781Z" },
+    { url = "https://files.pythonhosted.org/packages/47/fd/2ee73341c37d554ce8d0b67a95525700ec32194fa785261c17262afadfc8/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf9ca054562fb4610c8e5ea140fa1bf746ccc16de505d3a5684abd2fa11f9538", size = 5601846, upload-time = "2026-06-11T19:24:54.63Z" },
+    { url = "https://files.pythonhosted.org/packages/67/05/229234ae7bf1d39306e41896f3055a2ae847707ce58f21bd0872b9a5764e/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ec9530fefb653f9a2e605cc26fc1c0d1ffa5c4923ec1037323ba9a16744f8ccc", size = 4413741, upload-time = "2026-06-11T19:24:56.015Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/5a/7875548a48231b02f909d3d8c7d74ba47867b2af3396e7aed59cd3b2b40d/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2813aceabb29567d96f29fd2d3099d6f8decd0f5f968ff81ed1a664751dc84a3", size = 4251434, upload-time = "2026-06-11T19:24:57.527Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/46/ed0370def1a950f185edda603a02276bb412a9c95ad5a052c9e919b2df78/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:49686767d1379ff3b165f9d35a73e96fc25daba786ce27cf3359c5feac880fd0", size = 4746598, upload-time = "2026-06-11T19:24:58.979Z" },
+    { url = "https://files.pythonhosted.org/packages/44/bc/3a1720be855d7d0011416b7f0a7b7e33546b0fc7320faf59b05e401adff7/geoarrow_rust_core-0.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:fd9cc8c47af736dd087575306088e73b28a720f52e5c3342968851ddd2fb5778", size = 3601329, upload-time = "2026-06-11T19:25:00.459Z" },
+    { url = "https://files.pythonhosted.org/packages/24/b2/65db3af5fcc7d64ac7ac86d7debc6a90803bb076c8f7d4599c167be79fd6/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:86aaa60e5b6d99be08f9adc9e58bd088135e1dcfebd290085228ed8a0e93e90f", size = 3848323, upload-time = "2026-06-11T19:25:02.079Z" },
+    { url = "https://files.pythonhosted.org/packages/27/9a/37bdd36d7feb9d591b9ccdc1952c6171b04dc777b999e2082b810eb1dd45/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fec148cd63e616d9a7aa00c4ab08693eeec55aca7c9d700aa6451cd8001d0e08", size = 3707679, upload-time = "2026-06-11T19:25:03.594Z" },
+    { url = "https://files.pythonhosted.org/packages/45/b7/8d2998284de21d0feb2a0935c41636f8ebf2b65723d8139026e7f9f3d5e8/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1944f3d548b6296e9fbd668602accae0ad68e49ee0f5b8df9e7ea4f474e4ae", size = 4190279, upload-time = "2026-06-11T19:25:05.21Z" },
+    { url = "https://files.pythonhosted.org/packages/25/f3/140209f53a70f261ef1459b08eea25c4edef3ad9f6ec0924033b5285ee7e/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f5c04195cbedf5d1684a50203e862d979cda0d6218aac32f607d6e3f7cd65c8", size = 4264876, upload-time = "2026-06-11T19:25:06.654Z" },
+    { url = "https://files.pythonhosted.org/packages/14/32/0097bfb92816ef91b38f7e757f65fe8456e56152ca51cd7a05b1be8a2e40/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:671c6be9cbc68295a68598fc8c6ddd875de063a795d64b2cfd10d36abd1ee324", size = 5586563, upload-time = "2026-06-11T19:25:08.376Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/86/508fe299aa44afe95399d9fa73cdbc7a451841803b8f1431e8c3d0b26ec1/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f4726fbe09d545a507993f2f76c2be7812fef3c20c994ff33c32aaa96aaa212", size = 4402886, upload-time = "2026-06-11T19:25:10.302Z" },
+    { url = "https://files.pythonhosted.org/packages/46/81/fc34afcce2b0f17424610405481f69f3c6e4d670c5c94170d71ed6719794/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0fa37a90312e7ca06921be56cee183c12c442b345fadd982480cd1f8ed2eede", size = 4247331, upload-time = "2026-06-11T19:25:11.857Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0d/af42431f80282a2f7e1f3e496c39483dd2362e11f8008c65033be9d2ba4c/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f41a8c0a9f3558d73537dcad83c88b29c2a169bcc7766dc677e8245a98a5e95", size = 4741954, upload-time = "2026-06-11T19:25:13.964Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/e5/be80aa4384f16be6a20828fd4cc67da18bd2266366f80c9bfefa481559f8/geoarrow_rust_core-0.6.3-cp312-cp312-win_amd64.whl", hash = "sha256:382f0914c75d84b87420aef7b6f11e8b5d4d58b5f5db7c8d199815e4dd282a42", size = 3599115, upload-time = "2026-06-11T19:25:15.357Z" },
+    { url = "https://files.pythonhosted.org/packages/19/52/93bbf15979ce656d09821f02f82420957fdc99ee4cd37e5e2d8c99a324da/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c11190008ed6a571b8ca4ef769198e95434dbe7c3caefa9acd5f0ceba1ed868f", size = 3848682, upload-time = "2026-06-11T19:25:16.914Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/1e/1665171a3756b1977b7240a8f518bbbdfa778dcc156e0f90d659723468fb/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1308ad09912fb67a43ff7dd7dbc685ca8a8fbd8028d3876eb187b6b082a98a7b", size = 3707868, upload-time = "2026-06-11T19:25:22.483Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/38/e344ccb72473b8756c8f2dae3a8a9339e1821884a2a50befbad45150d178/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1247b961c61656596631ca3380d405f8d0a2f60f045f8b8a3a335b1a849dc55", size = 4189835, upload-time = "2026-06-11T19:25:24.116Z" },
+    { url = "https://files.pythonhosted.org/packages/22/10/bc92b9fcdc628fa1ff7e234219701cd575b0a78da5fdf3a6c8884e5ca445/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c2cb90116255c3f74d5aee563405f3a440bd4eb75471adac13cd0c80a2564dc", size = 4265584, upload-time = "2026-06-11T19:25:25.628Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/ed/67edd70967851bef3ef9e35d8ccef242923ed69104ecb885ad3adf4de9a2/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a993d3a0964b8cf55a51bd404225dc3037b51f34b01c6bb1312611ce61f9b2d", size = 5586300, upload-time = "2026-06-11T19:25:27.32Z" },
+    { url = "https://files.pythonhosted.org/packages/76/a6/a20fba654caa314b4688ad9dceb5e99fa7956bbf92b3059baa36e06c59b3/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbd153a3348d166ecb57b2770b69b17c2df14cf303d41cd9168adba77532a31b", size = 4402375, upload-time = "2026-06-11T19:25:28.799Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/5d/c8949bb5916ff80186c854792b9ddadc9f3069db09d31311f24d82ba7096/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb5aaf3a6f104145b4c5a3188b1be589849b2599626c0e40181a18fc2e79f68", size = 4246712, upload-time = "2026-06-11T19:25:31.015Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/36/c9b7afa2929b697a164ae18f35aba517bcab85efcf19cb48ffa5ac66642b/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3b33be8308a479f3a3a6d3a664861d6b5f8b1ad8822798f5a7e5d9af0b924eb", size = 4742863, upload-time = "2026-06-11T19:25:32.468Z" },
+    { url = "https://files.pythonhosted.org/packages/57/5c/55a8d753bff924959837c39c9aa37c7813c5929570a2629ae4ece811505f/geoarrow_rust_core-0.6.3-cp313-cp313-pyemscripten_2025_0_wasm32.whl", hash = "sha256:a090191ae224e8490a95e68038db7a14df8f0326706f10c2e958621bf6c06ef5", size = 1979216, upload-time = "2026-06-11T19:25:33.905Z" },
+    { url = "https://files.pythonhosted.org/packages/71/c7/a9f93af9306fd3743a96cc61bfdd7fc9194c38026f7904c067d4b4a99f0c/geoarrow_rust_core-0.6.3-cp313-cp313-win_amd64.whl", hash = "sha256:2606d6f5afacdb49145b39d3e024efadf33f847b596c19c9b6d3030d6beb2721", size = 3599237, upload-time = "2026-06-11T19:25:35.452Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/7a/6993bd89e12d0b227b611a53c657b38e63f906dfca773accae3a1f3815a4/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:370cd1ef46bf18fa598f3038fe6f417b016da211ffe060f2b60e47dd2f684a34", size = 3854961, upload-time = "2026-06-11T19:25:37.045Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/c4/92cbcabd2a6add1b69a76a22a349fa219bdfed8026dfab4b8ec230bf9943/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4dbf733db0bc57859d1a34c4bc8c50805f19e60081496967588e43f1f606e885", size = 3708325, upload-time = "2026-06-11T19:25:38.638Z" },
+    { url = "https://files.pythonhosted.org/packages/07/b3/8fc34c5efa95cd597328876b6295fbe280d4b71df615655aaa2cd1618881/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45ac6715e790b1ca9be38ceb8ee39cdfe29395d29c83541f7a1190812290d81d", size = 4196828, upload-time = "2026-06-11T19:25:40.329Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/f2/bd2026862995ff96eb6b94d2fc56f7bf737d13f6bac9662481eaae23d079/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d14917d471dce8ee5a0976ec50b5da800bab0117bfd72bc56e23518a1dbbdb3a", size = 4265577, upload-time = "2026-06-11T19:25:41.91Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/01/73d69c5205a34e043026a73048d210f448a986ebb577deee7ceb1923fb5a/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43a371299305388663131321f0d623fc70ca4a3840f973598946b5183e5ba4e4", size = 5592303, upload-time = "2026-06-11T19:25:43.503Z" },
+    { url = "https://files.pythonhosted.org/packages/98/20/fe35466e526a5d363ebd9c9dd16985dbad7fd677b90e1f123a8180bceb44/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23eddb8dd65dfefb397762cc3c3f6bfaffb4271641bd9dc8043a9ab3aa4cd72a", size = 4409972, upload-time = "2026-06-11T19:25:45.114Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/c8/dc588827ad6e8dad75413bc1d35b5189c8a011a2be4827499a4ab9402253/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43ce7b3aaeb0e8c8ad7c37c84ceed49e10d0929a5a92042c3f6ec5ef33271de4", size = 4250885, upload-time = "2026-06-11T19:25:46.649Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/e2/a9923e4c5848ace6e3e6f09a40d3860955f7d836675affe35bc79bc27033/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c67201bd662e4732a822f91651111bc024329b3e71eba9f4eed19e58c9cf789b", size = 4742518, upload-time = "2026-06-11T19:25:48.098Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/c7/3112def9e93e88341210dd22b4d04c598fb4d0726adef2114b68157354d5/geoarrow_rust_core-0.6.3-cp314-cp314-pyemscripten_2026_0_wasm32.whl", hash = "sha256:8461e6d07a7b39ab099c9885a68d5e7983d4e83a82a42dd5b331c543683c9d6e", size = 1959191, upload-time = "2026-06-11T19:25:49.668Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/0f/de74ce2171c408e4b4a7660f69f6dfaa294797a18a209fa85b1ea79be141/geoarrow_rust_core-0.6.3-cp314-cp314-win_amd64.whl", hash = "sha256:5d2fd45d09bf700e0ca4d30b51ebcd59fb8d1a9eb4a4d7b4fc5f53a6cca59475", size = 3603948, upload-time = "2026-06-11T19:25:51.078Z" },
 ]
 
 [[package]]
@@ -797,8 +686,7 @@ version = "0.6.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "arro3-core" },
-    { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 wheels = [
@@ -842,14 +730,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/14/1ec1ba4df851b477d802285e8b770f65e6774f0d6272e4e8548c8758892c/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a10e67d95a134dbb5f657fe3436ea645c6760a4ffef44df211f7d9b8fb687e6", size = 10499137, upload-time = "2025-12-03T19:02:24.514Z" },
     { url = "https://files.pythonhosted.org/packages/a5/66/7ad618415790671664e76596c000e812e0bd39e8f347f4eb7b8e3f519a55/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:61ccbb528bbe4834849c501e5990a4a6f4b87976ca6a22df7859f16760c79590", size = 10394123, upload-time = "2025-12-03T19:02:01.248Z" },
     { url = "https://files.pythonhosted.org/packages/43/4b/4520af8c694ca0932f995c91d604837741522bd02b66414fdff4521abc98/geoarrow_rust_io-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:aa46f6beda6c267f420ea390f071fadd0161094c1db8d71ad54002c006fe7f21", size = 8989484, upload-time = "2025-12-03T19:02:40.081Z" },
-    { url = "https://files.pythonhosted.org/packages/69/87/efadbf1bb9d359f55791f7198cf9aa87f0272be6a2d373f5844f5e59cd1e/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:46e3e41b726b250b44a829ab41489e5008280acb8af8e68001230babf04bafd8", size = 9780411, upload-time = "2025-11-21T02:11:30.128Z" },
-    { url = "https://files.pythonhosted.org/packages/95/73/5e108b286b219d3a46042cfa0830e0f075f4addd01f83f7c851a933919ae/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb95364b726c34c23fb93ebc9c08b8fa1d52062a4a9c1ac614ff8761a339ba7a", size = 9316307, upload-time = "2025-11-21T02:11:21.195Z" },
-    { url = "https://files.pythonhosted.org/packages/06/76/89c387d6d4d303feef328fc9c63df76cea52963e2046f2c092b434fb04a9/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:691a67ef3a5214fb704d1a19d33a9ddf173483c3943056fb965101c19b0edd28", size = 10309182, upload-time = "2025-11-21T02:10:34.063Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/08/34ed2d76ebfb34ed6bf3312defad16b2b5246e40d59e46443a6fe19e85dd/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91c82e9cbae6759798a8e4a87adb13ea617090a5498f384fc56c44775653d7f0", size = 11291230, upload-time = "2025-11-21T02:10:57.771Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/f5/9c25512c1f31101125555367e55ff28f72f449c8f56ff06c5be9e3feb9e5/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9c2b609addc7a810eab5cd573243710d95afe8486f829edd05b311d51bbb5af", size = 13300664, upload-time = "2025-11-21T02:10:46.082Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/aa/14be165b439d3a3ffc6ced96f971b02df255e86b82c7e1f9f340d35689c3/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6abdc80e130f472f55598543a4bb9ba522d6502a5d80017a952027a9e9c1d1ce", size = 10486589, upload-time = "2025-11-21T02:11:09.681Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/df/1c36bae723561785ce47e463f6366a3c52994795a168d7c4ed5e457e9a37/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c4638a89d61629110dde474b3d410ee2e71c89d2035ab2f2557857e7eee4ea30", size = 10395106, upload-time = "2025-11-21T02:10:20.832Z" },
-    { url = "https://files.pythonhosted.org/packages/47/d4/4e9cffad7647c07a5cd1cce68c97102dd011652168e3e09a2dedc1253a5e/geoarrow_rust_io-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:4811e96b1777fcf12ac2416872407b1e4717f9a59fe5b80ce02b1e9a087d1b5e", size = 8988735, upload-time = "2025-11-21T02:11:39.164Z" },
     { url = "https://files.pythonhosted.org/packages/e6/9f/32059400bb853eafe5d37d8c4ae9e48cd9c43820287e435cc1566f42208e/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef94f84ba4efb42d63588241733e1b62bbdb4edeac5513baeb7bfb07db4f204a", size = 10303111, upload-time = "2025-11-21T02:10:36.067Z" },
     { url = "https://files.pythonhosted.org/packages/6c/a2/7db0a685eafa41e9565a3c4e441f41d2630c084f616d2669c5fe8f5805ef/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:872dd92c52b2df342d34ac42d1b710c91c58e9dd93f5c88098816f9cd9dc8a84", size = 11299498, upload-time = "2025-11-21T02:11:00.19Z" },
     { url = "https://files.pythonhosted.org/packages/13/b4/1bfbfbe828ca51b4f314d9f70514c2ff19923714aa7d51ef1b0ec8600aed/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:235a7ea94faa95a4699f6577765a5e5a88bee079828c3d9015d9d5c6c240459c", size = 13299230, upload-time = "2025-11-21T02:10:48.12Z" },
@@ -913,13 +793,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" },
     { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" },
     { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/e2/33efd823a879dc7b60c10192df1900ee5c200f8e782663a41a3b2aecd143/grpcio-1.75.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:c09fba33327c3ac11b5c33dbdd8218eef8990d78f83b1656d628831812a8c0fb", size = 5706679, upload-time = "2025-09-26T09:03:10.218Z" },
-    { url = "https://files.pythonhosted.org/packages/77/90/b80e75f8cce758425b2772742eed4e9db765a965d902ba4b7f239b2513de/grpcio-1.75.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c12121e509b9f8b0914d10054d24120237d19e870b1cd82acbb8a9b9ddd198a3", size = 6291926, upload-time = "2025-09-26T09:03:16.282Z" },
-    { url = "https://files.pythonhosted.org/packages/40/5f/e6033d8f99063350e20873a46225468b73045b9ef2c8cba73d66a87c3fd5/grpcio-1.75.1-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:73577a93e692b3474b1bfe84285d098de36705dbd838bb4d6a056d326e4dc880", size = 6950040, upload-time = "2025-09-26T09:03:18.874Z" },
-    { url = "https://files.pythonhosted.org/packages/01/12/34076c079b45af5aed40f037fffe388d7fbe90dd539ed01e4744c926d227/grpcio-1.75.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e19e7dfa0d7ca7dea22be464339e18ac608fd75d88c56770c646cdabe54bc724", size = 6465780, upload-time = "2025-09-26T09:03:21.219Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/c5/ee6fd69a9f6e7288d04da010ad7480a0566d2aac81097ff4dafbc5ffa9b6/grpcio-1.75.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e1c28f51c1cf67eccdfc1065e8e866c9ed622f09773ca60947089c117f848a1", size = 7098308, upload-time = "2025-09-26T09:03:23.875Z" },
-    { url = "https://files.pythonhosted.org/packages/78/32/f2be13f13035361768923159fe20470a7d22db2c7c692b952e21284f56e5/grpcio-1.75.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:030a6164bc2ca726052778c0cf8e3249617a34e368354f9e6107c27ad4af8c28", size = 8042268, upload-time = "2025-09-26T09:03:26.268Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/2d/1bb0572f0a2eaab100b4635c6c2cd0d37e3cda5554037e3f90b1bc428d56/grpcio-1.75.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:67697efef5a98d46d5db7b1720fa4043536f8b8e5072a5d61cfca762f287e939", size = 7491470, upload-time = "2025-09-26T09:03:28.906Z" },
 ]
 
 [[package]]
@@ -927,8 +800,7 @@ name = "h5py"
 version = "3.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5d/57/dfb3c5c3f1bf5f5ef2e59a22dec4ff1f3d7408b55bfcefcfb0ea69ef21c6/h5py-3.14.0.tar.gz", hash = "sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4", size = 424323, upload-time = "2025-06-06T14:06:15.01Z" }
@@ -941,8 +813,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/86/f9/f00de11c82c88bfc1ef22633557bfba9e271e0cb3189ad704183fc4a2644/h5py-3.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882", size = 4929422, upload-time = "2025-06-06T14:05:18.399Z" },
     { url = "https://files.pythonhosted.org/packages/0d/ce/3a21d87896bc7e3e9255e0ad5583ae31ae9e6b4b00e0bcb2a67e2b6acdbc/h5py-3.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb", size = 4700675, upload-time = "2025-06-06T14:05:37.38Z" },
     { url = "https://files.pythonhosted.org/packages/e7/ec/86f59025306dcc6deee5fda54d980d077075b8d9889aac80f158bd585f1b/h5py-3.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13", size = 4921632, upload-time = "2025-06-06T14:05:43.464Z" },
-    { url = "https://files.pythonhosted.org/packages/66/40/b423b57696514e05aa7bb06150ef96667d0e0006cc6de7ab52c71734ab51/h5py-3.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:573c33ad056ac7c1ab6d567b6db9df3ffc401045e3f605736218f96c1e0490c6", size = 4326368, upload-time = "2025-06-06T14:06:00.782Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/07/e088f89f04fdbe57ddf9de377f857158d3daa38cf5d0fb20ef9bd489e313/h5py-3.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccbe17dc187c0c64178f1a10aa274ed3a57d055117588942b8a08793cc448216", size = 4559686, upload-time = "2025-06-06T14:06:07.416Z" },
 ]
 
 [[package]]
@@ -965,14 +835,14 @@ name = "huggingface-hub"
 version = "0.35.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "filelock", marker = "python_full_version >= '3.10'" },
-    { name = "fsspec", marker = "python_full_version >= '3.10'" },
-    { name = "hf-xet", marker = "(python_full_version >= '3.10' and platform_machine == 'aarch64') or (python_full_version >= '3.10' and platform_machine == 'amd64') or (python_full_version >= '3.10' and platform_machine == 'arm64') or (python_full_version >= '3.10' and platform_machine == 'x86_64')" },
-    { name = "packaging", marker = "python_full_version >= '3.10'" },
-    { name = "pyyaml", marker = "python_full_version >= '3.10'" },
-    { name = "requests", marker = "python_full_version >= '3.10'" },
-    { name = "tqdm", marker = "python_full_version >= '3.10'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.10'" },
+    { name = "filelock" },
+    { name = "fsspec" },
+    { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" }
 wheels = [
@@ -988,18 +858,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" },
 ]
 
-[[package]]
-name = "importlib-metadata"
-version = "8.7.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "zipp", marker = "python_full_version < '3.10'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
-]
-
 [[package]]
 name = "iniconfig"
 version = "2.1.0"
@@ -1030,83 +888,51 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
 ]
 
-[[package]]
-name = "keras"
-version = "2.7.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6b/8b/065f94ba03282fa41b2d76942b87a180a9913312c4611ea7d6508fbbc114/keras-2.7.0-py2.py3-none-any.whl", hash = "sha256:0c33ae1f728064ca0d35dfba999e9c316f03623bf5688c82fb83cc74a80ea248", size = 1332171, upload-time = "2021-11-03T16:16:34.318Z" },
-]
-
 [[package]]
 name = "keras"
 version = "3.11.3"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 dependencies = [
-    { name = "absl-py", marker = "python_full_version >= '3.10'" },
-    { name = "h5py", marker = "python_full_version >= '3.10'" },
-    { name = "ml-dtypes", marker = "python_full_version >= '3.10'" },
-    { name = "namex", marker = "python_full_version >= '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "absl-py" },
+    { name = "h5py" },
+    { name = "ml-dtypes" },
+    { name = "namex" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "optree", marker = "python_full_version >= '3.10'" },
-    { name = "packaging", marker = "python_full_version >= '3.10'" },
-    { name = "rich", marker = "python_full_version >= '3.10'" },
+    { name = "optree" },
+    { name = "packaging" },
+    { name = "rich" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6a/89/646425fe9a46f9053430e1271f817c36041c6f33469950a3caafc3d2591e/keras-3.11.3.tar.gz", hash = "sha256:efda616835c31b7d916d72303ef9adec1257320bc9fd4b2b0138840fc65fb5b7", size = 1065906, upload-time = "2025-08-21T22:08:57.643Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/94/5b/4c778cc921ce4b864b238f63f8e3ff6e954ab19b80c9fa680593ad8093d4/keras-3.11.3-py3-none-any.whl", hash = "sha256:f484f050e05ee400455b05ec8c36ed35edc34de94256b6073f56cfe68f65491f", size = 1408438, upload-time = "2025-08-21T22:08:55.858Z" },
 ]
 
-[[package]]
-name = "keras-preprocessing"
-version = "1.1.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "six", marker = "python_full_version < '3.10'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/5e/f1/b44337faca48874333769a29398fe4666686733c8880aa160b9fd5dfe600/Keras_Preprocessing-1.1.2.tar.gz", hash = "sha256:add82567c50c8bc648c14195bf544a5ce7c1f76761536956c3d2978970179ef3", size = 163598, upload-time = "2020-05-14T03:53:48.526Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/79/4c/7c3275a01e12ef9368a892926ab932b33bb13d55794881e3573482b378a7/Keras_Preprocessing-1.1.2-py2.py3-none-any.whl", hash = "sha256:7b82029b130ff61cc99b55f3bd27427df4838576838c5b2f65940e4fcec99a7b", size = 42581, upload-time = "2020-05-14T03:53:47.192Z" },
-]
-
 [[package]]
 name = "lance-namespace"
-version = "0.8.0"
+version = "0.8.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "lance-namespace-urllib3-client" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/21/80/2b6eaa08c5e25915acaa6368a70211a25b5ba9d2d6006450e68a73936164/lance_namespace-0.8.0.tar.gz", hash = "sha256:c4a79ee221a3b2315c29863ad12d85fcf219a13158e26149d63e21dc4b4673a7", size = 10756, upload-time = "2026-06-01T08:47:10.183Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/af/12/f7ab93b29be3edbf5fc3610714bf2d06088e7f4524bfb38dfd6852458b08/lance_namespace-0.8.6.tar.gz", hash = "sha256:18232e721c8188145f4ec9389cc2dfbeeabf54a619d94885ea1b3375bee9f4af", size = 11529, upload-time = "2026-06-12T17:36:41.651Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/bd/7b40a08fb132fab39a6caebf832fdf6b9befc71be9413beb9be0a9d927d4/lance_namespace-0.8.0-py3-none-any.whl", hash = "sha256:782cf9e332f46bf06836722dd98b53ca8495ad98bb541501ff6876c89b67ec90", size = 12579, upload-time = "2026-06-01T08:47:10.91Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/1b/5b1668ee2dc8910965f390640359112a31157092fcf8e000b89c79b58708/lance_namespace-0.8.6-py3-none-any.whl", hash = "sha256:571eae34f9aad70e5b05020416c2860889b9ec82993ccd0eb015e7b39c3ea309", size = 13383, upload-time = "2026-06-12T17:36:43.456Z" },
 ]
 
 [[package]]
 name = "lance-namespace-urllib3-client"
-version = "0.8.0"
+version = "0.8.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydantic" },
     { name = "python-dateutil" },
     { name = "typing-extensions" },
-    { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8c/37/06fcd5a8969381e0ba953d51990af8d331bdccbc62458bf2eed30d064573/lance_namespace_urllib3_client-0.8.0.tar.gz", hash = "sha256:4f060f05ebf3c04aeaeb0d2022cbe77648a3df290f02cd2c305e5797d0fc1fdd", size = 203710, upload-time = "2026-06-01T08:47:13.404Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 228772, upload-time = "2026-06-12T17:36:42.609Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/51/43/e280727feee958f303bc58d5fa912b07734a0831f756d841654d500c2c34/lance_namespace_urllib3_client-0.8.0-py3-none-any.whl", hash = "sha256:6734e341b726e5cc96a0cd257cef27eb9d03013f2d151526ee426cef8e63e228", size = 336669, upload-time = "2026-06-01T08:47:11.88Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/90/1e27de15cd1b16785a1c7312beb0a59e75c8344a815f600f58173a565bd1/lance_namespace_urllib3_client-0.8.6-py3-none-any.whl", hash = "sha256:9d78249c3fb15aa3d15d668f78f04a275af3d08d800a7027492f37996ac4968b", size = 369950, upload-time = "2026-06-12T17:36:40.438Z" },
 ]
 
 [[package]]
@@ -1125,9 +951,6 @@ wheels = [
 name = "markdown"
 version = "3.9"
 source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "importlib-metadata", marker = "python_full_version < '3.10'" },
-]
 sdist = { url = "https://files.pythonhosted.org/packages/8d/37/02347f6d6d8279247a5837082ebc26fc0d5aaeaf75aa013fcbb433c777ab/markdown-3.9.tar.gz", hash = "sha256:d2900fe1782bd33bdbbd56859defef70c2e78fc46668f8eb9df3128138f2cb6a", size = 364585, upload-time = "2025-09-04T20:25:22.885Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/70/ae/44c4a6a4cbb496d93c6257954260fe3a6e91b7bed2240e5dad2a717f5111/markdown-3.9-py3-none-any.whl", hash = "sha256:9f4d91ed810864ea88a6f32c07ba8bee1346c0cc1f6b1f9f6c822f2a9667d280", size = 107441, upload-time = "2025-09-04T20:25:21.784Z" },
@@ -1138,7 +961,7 @@ name = "markdown-it-py"
 version = "4.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mdurl", marker = "python_full_version >= '3.10'" },
+    { name = "mdurl" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
 wheels = [
@@ -1228,17 +1051,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
     { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
     { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
-    { url = "https://files.pythonhosted.org/packages/56/23/0d8c13a44bde9154821586520840643467aee574d8ce79a17da539ee7fed/markupsafe-3.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26", size = 11623, upload-time = "2025-09-27T18:37:29.296Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/23/07a2cb9a8045d5f3f0890a8c3bc0859d7a47bfd9a560b563899bec7b72ed/markupsafe-3.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc", size = 12049, upload-time = "2025-09-27T18:37:30.234Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/e4/6be85eb81503f8e11b61c0b6369b6e077dcf0a74adbd9ebf6b349937b4e9/markupsafe-3.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c", size = 21923, upload-time = "2025-09-27T18:37:31.177Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/bc/4dc914ead3fe6ddaef035341fee0fc956949bbd27335b611829292b89ee2/markupsafe-3.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42", size = 20543, upload-time = "2025-09-27T18:37:32.168Z" },
-    { url = "https://files.pythonhosted.org/packages/89/6e/5fe81fbcfba4aef4093d5f856e5c774ec2057946052d18d168219b7bd9f9/markupsafe-3.0.3-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b", size = 20585, upload-time = "2025-09-27T18:37:33.166Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/f6/e0e5a3d3ae9c4020f696cd055f940ef86b64fe88de26f3a0308b9d3d048c/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758", size = 21387, upload-time = "2025-09-27T18:37:34.185Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/25/651753ef4dea08ea790f4fbb65146a9a44a014986996ca40102e237aa49a/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2", size = 20133, upload-time = "2025-09-27T18:37:35.138Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/0a/c3cf2b4fef5f0426e8a6d7fce3cb966a17817c568ce59d76b92a233fdbec/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d", size = 20588, upload-time = "2025-09-27T18:37:36.096Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/1b/a7782984844bd519ad4ffdbebbba2671ec5d0ebbeac34736c15fb86399e8/markupsafe-3.0.3-cp39-cp39-win32.whl", hash = "sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7", size = 14566, upload-time = "2025-09-27T18:37:37.09Z" },
-    { url = "https://files.pythonhosted.org/packages/18/1f/8d9c20e1c9440e215a44be5ab64359e207fcb4f675543f1cf9a2a7f648d0/markupsafe-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e", size = 15053, upload-time = "2025-09-27T18:37:38.054Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/d3/fe08482b5cd995033556d45041a4f4e76e7f0521112a9c9991d40d39825f/markupsafe-3.0.3-cp39-cp39-win_arm64.whl", hash = "sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8", size = 13928, upload-time = "2025-09-27T18:37:39.037Z" },
 ]
 
 [[package]]
@@ -1279,8 +1091,7 @@ name = "ml-dtypes"
 version = "0.5.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" }
@@ -1315,10 +1126,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" },
     { url = "https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" },
     { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" },
-    { url = "https://files.pythonhosted.org/packages/19/2d/c61af51173083bbf2a3b0f1a1a01d50ef1830436880027433d1b75271083/ml_dtypes-0.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5ee72568d46b9533ad54f78b1e1f3067c0534c5065120ea8ecc6f210d22748b3", size = 663552, upload-time = "2025-07-29T18:39:13.102Z" },
-    { url = "https://files.pythonhosted.org/packages/61/0e/a628f2aefd719745e8a13492375a55cedea77c0cfc917b1ce11bde435c68/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01de48de4537dc3c46e684b969a40ec36594e7eeb7c69e9a093e7239f030a28a", size = 4952704, upload-time = "2025-07-29T18:39:14.829Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/2e/5ba92f1f99d1f5f62bffec614a5b8161e55c3961257c902fa26dbe909baa/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b1a6e231b0770f2894910f1dce6d2f31d65884dbf7668f9b08d73623cdca909", size = 4923538, upload-time = "2025-07-29T18:39:16.581Z" },
-    { url = "https://files.pythonhosted.org/packages/70/3b/f801c69027866ea6e387224551185fedef62ad8e2e71181ec0d9dda905f7/ml_dtypes-0.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:a4f39b9bf6555fab9bfb536cf5fdd1c1c727e8d22312078702e9ff005354b37f", size = 206567, upload-time = "2025-07-29T18:39:18.047Z" },
 ]
 
 [[package]]
@@ -1335,7 +1142,7 @@ name = "multidict"
 version = "6.6.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version == '3.10.*'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" }
 wheels = [
@@ -1429,24 +1236,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, upload-time = "2025-08-11T12:08:12.439Z" },
     { url = "https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" },
     { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/d3/f04c5db316caee9b5b2cbba66270b358c922a959855995bedde87134287c/multidict-6.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:af7618b591bae552b40dbb6f93f5518328a949dac626ee75927bba1ecdeea9f4", size = 76977, upload-time = "2025-08-11T12:08:16.667Z" },
-    { url = "https://files.pythonhosted.org/packages/70/39/a6200417d883e510728ab3caec02d3b66ff09e1c85e0aab2ba311abfdf06/multidict-6.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b6819f83aef06f560cb15482d619d0e623ce9bf155115150a85ab11b8342a665", size = 44878, upload-time = "2025-08-11T12:08:18.157Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/7e/815be31ed35571b137d65232816f61513fcd97b2717d6a9d7800b5a0c6e0/multidict-6.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4d09384e75788861e046330308e7af54dd306aaf20eb760eb1d0de26b2bea2cb", size = 44546, upload-time = "2025-08-11T12:08:19.694Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/f1/21b5bff6a8c3e2aff56956c241941ace6b8820e1abe6b12d3c52868a773d/multidict-6.6.4-cp39-cp39-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:a59c63061f1a07b861c004e53869eb1211ffd1a4acbca330e3322efa6dd02978", size = 223020, upload-time = "2025-08-11T12:08:21.554Z" },
-    { url = "https://files.pythonhosted.org/packages/15/59/37083f1dd3439979a0ffeb1906818d978d88b4cc7f4600a9f89b1cb6713c/multidict-6.6.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350f6b0fe1ced61e778037fdc7613f4051c8baf64b1ee19371b42a3acdb016a0", size = 240528, upload-time = "2025-08-11T12:08:23.45Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/f0/f054d123c87784307a27324c829eb55bcfd2e261eb785fcabbd832c8dc4a/multidict-6.6.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0c5cbac6b55ad69cb6aa17ee9343dfbba903118fd530348c330211dc7aa756d1", size = 219540, upload-time = "2025-08-11T12:08:24.965Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/26/8f78ce17b7118149c17f238f28fba2a850b660b860f9b024a34d0191030f/multidict-6.6.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:630f70c32b8066ddfd920350bc236225814ad94dfa493fe1910ee17fe4365cbb", size = 251182, upload-time = "2025-08-11T12:08:26.511Z" },
-    { url = "https://files.pythonhosted.org/packages/00/c3/a21466322d69f6594fe22d9379200f99194d21c12a5bbf8c2a39a46b83b6/multidict-6.6.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8d4916a81697faec6cb724a273bd5457e4c6c43d82b29f9dc02c5542fd21fc9", size = 249371, upload-time = "2025-08-11T12:08:28.075Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/8e/2e673124eb05cf8dc82e9265eccde01a36bcbd3193e27799b8377123c976/multidict-6.6.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e42332cf8276bb7645d310cdecca93a16920256a5b01bebf747365f86a1675b", size = 239235, upload-time = "2025-08-11T12:08:29.937Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/2d/bdd9f05e7c89e30a4b0e4faf0681a30748f8d1310f68cfdc0e3571e75bd5/multidict-6.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f3be27440f7644ab9a13a6fc86f09cdd90b347c3c5e30c6d6d860de822d7cb53", size = 237410, upload-time = "2025-08-11T12:08:31.872Z" },
-    { url = "https://files.pythonhosted.org/packages/46/4c/3237b83f8ca9a2673bb08fc340c15da005a80f5cc49748b587c8ae83823b/multidict-6.6.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:21f216669109e02ef3e2415ede07f4f8987f00de8cdfa0cc0b3440d42534f9f0", size = 232979, upload-time = "2025-08-11T12:08:33.399Z" },
-    { url = "https://files.pythonhosted.org/packages/55/a6/a765decff625ae9bc581aed303cd1837955177dafc558859a69f56f56ba8/multidict-6.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:d9890d68c45d1aeac5178ded1d1cccf3bc8d7accf1f976f79bf63099fb16e4bd", size = 240979, upload-time = "2025-08-11T12:08:35.02Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/2d/9c75975cb0c66ea33cae1443bb265b2b3cd689bffcbc68872565f401da23/multidict-6.6.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:edfdcae97cdc5d1a89477c436b61f472c4d40971774ac4729c613b4b133163cb", size = 246849, upload-time = "2025-08-11T12:08:37.038Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/71/d21ac0843c1d8751fb5dcf8a1f436625d39d4577bc27829799d09b419af7/multidict-6.6.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0b2e886624be5773e69cf32bcb8534aecdeb38943520b240fed3d5596a430f2f", size = 241798, upload-time = "2025-08-11T12:08:38.669Z" },
-    { url = "https://files.pythonhosted.org/packages/94/3d/1d8911e53092837bd11b1c99d71de3e2a9a26f8911f864554677663242aa/multidict-6.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:be5bf4b3224948032a845d12ab0f69f208293742df96dc14c4ff9b09e508fc17", size = 235315, upload-time = "2025-08-11T12:08:40.266Z" },
-    { url = "https://files.pythonhosted.org/packages/86/c5/4b758df96376f73e936b1942c6c2dfc17e37ed9d5ff3b01a811496966ca0/multidict-6.6.4-cp39-cp39-win32.whl", hash = "sha256:10a68a9191f284fe9d501fef4efe93226e74df92ce7a24e301371293bd4918ae", size = 41434, upload-time = "2025-08-11T12:08:41.965Z" },
-    { url = "https://files.pythonhosted.org/packages/58/16/f1dfa2a0f25f2717a5e9e5fe8fd30613f7fe95e3530cec8d11f5de0b709c/multidict-6.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee25f82f53262f9ac93bd7e58e47ea1bdcc3393cef815847e397cba17e284210", size = 46186, upload-time = "2025-08-11T12:08:43.367Z" },
-    { url = "https://files.pythonhosted.org/packages/88/7d/a0568bac65438c494cb6950b29f394d875a796a237536ac724879cf710c9/multidict-6.6.4-cp39-cp39-win_arm64.whl", hash = "sha256:f9867e55590e0855bcec60d4f9a092b69476db64573c9fe17e92b0c50614c16a", size = 43115, upload-time = "2025-08-11T12:08:45.126Z" },
     { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" },
 ]
 
@@ -1455,14 +1244,12 @@ name = "multiprocess"
 version = "0.70.16"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dill", marker = "python_full_version >= '3.10'" },
+    { name = "dill" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980, upload-time = "2024-01-28T18:52:15.731Z" },
     { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", size = 134982, upload-time = "2024-01-28T18:52:17.783Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/94/8638a89f93c80df329116e6781a060506c7e91e1f4370dc831e9d17a041d/multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", size = 133497, upload-time = "2024-01-28T18:52:22.644Z" },
-    { url = "https://files.pythonhosted.org/packages/89/21/222066f6bb8d8af287923ae3bd26cf4699a9ce020228ac273caca1de8250/multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", size = 133498, upload-time = "2024-01-28T18:52:24.576Z" },
     { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" },
     { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" },
     { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" },
@@ -1479,24 +1266,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b2/bc/465daf1de06409cdd4532082806770ee0d8d7df434da79c76564d0f69741/namex-0.1.0-py3-none-any.whl", hash = "sha256:e2012a474502f1e2251267062aae3114611f07df4224b6e06334c57b0f2ce87c", size = 5905, upload-time = "2025-05-26T23:17:37.695Z" },
 ]
 
-[[package]]
-name = "networkx"
-version = "3.2.1"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928, upload-time = "2023-10-28T08:41:39.364Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2", size = 1647772, upload-time = "2023-10-28T08:41:36.945Z" },
-]
-
 [[package]]
 name = "networkx"
 version = "3.4.2"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.10.*'",
+    "python_full_version < '3.11'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
 wheels = [
@@ -1527,67 +1302,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" },
 ]
 
-[[package]]
-name = "numpy"
-version = "2.0.2"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" },
-    { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" },
-    { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" },
-    { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" },
-    { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = "2024-08-26T20:08:15.83Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" },
-    { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, upload-time = "2024-08-26T20:10:19.732Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" },
-    { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" },
-    { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" },
-    { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" },
-    { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" },
-    { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" },
-    { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" },
-    { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" },
-    { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" },
-    { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" },
-    { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = "2024-08-26T20:17:36.72Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" },
-    { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" },
-]
-
 [[package]]
 name = "numpy"
 version = "2.2.6"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.10.*'",
+    "python_full_version < '3.11'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
 wheels = [
@@ -1874,7 +1594,7 @@ name = "optree"
 version = "0.17.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version >= '3.10'" },
+    { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/56/c7/0853e0c59b135dff770615d2713b547b6b3b5cde7c10995b4a5825244612/optree-0.17.0.tar.gz", hash = "sha256:5335a5ec44479920620d72324c66563bd705ab2a698605dd4b6ee67dbcad7ecd", size = 163111, upload-time = "2025-07-25T11:26:11.586Z" }
 wheels = [
@@ -1915,11 +1635,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5b/d3/8819a2d5105a240d6793d11a61d597db91756ce84da5cee08808c6b8f61f/optree-0.17.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:875c017890a4b5d566af5593cab67fe3c4845544942af57e6bb9dea17e060297", size = 439080, upload-time = "2025-07-25T11:25:42.605Z" },
     { url = "https://files.pythonhosted.org/packages/c6/ef/9dbd34dfd1ad89feb239ca9925897a14ac94f190379a3bd991afdfd94186/optree-0.17.0-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ffa5686191139f763e13445a169765c83517164bc28e60dbedb19bed2b2655f1", size = 439422, upload-time = "2025-07-25T11:25:43.672Z" },
     { url = "https://files.pythonhosted.org/packages/86/ca/a7a7549af2951925a692df508902ed2a6a94a51bc846806d2281b1029ef9/optree-0.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:575cf48cc2190acb565bd2b26b6f9b15c4e3b60183e86031215badc9d5441345", size = 426579, upload-time = "2025-07-25T11:25:44.765Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/29/3bb53de2de3b36a51e46b6d9ada7ee1a3a312ac461cd54292a023adc807c/optree-0.17.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:537498cf7bf7a4fe71f7ffd815e72b8672aea0fac82e1513f6b6e35e8569f5aa", size = 350302, upload-time = "2025-07-25T11:25:52.016Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/3b/d17a31447ed7ef6f10bd0caf40742b016fcdeaa3abb7568307b04a0f50cf/optree-0.17.0-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3b3bb2326b550ddb048e3454fad40183b7fed74dda4351b016d20362809180af", size = 405358, upload-time = "2025-07-25T11:25:53.085Z" },
-    { url = "https://files.pythonhosted.org/packages/db/f3/b9f0a8c98fd0c7f53fa9d9a46d75bb1182aeecd7ecde6f353d3e69ec9618/optree-0.17.0-cp39-cp39-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d3d702044e5acbec2cf8349789f6b096057bd00dc8e1e1c97b990347279fda", size = 402694, upload-time = "2025-07-25T11:25:54.537Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/dd/0d9d7426fd6b5d90ad40e4d93717a955d4257d06574dfe7a1da0d24cb06c/optree-0.17.0-cp39-cp39-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a9155e82717be1dda1f3c1244e9cb5b3733d5dd3ba47702730c7816be083a5cb", size = 398857, upload-time = "2025-07-25T11:25:55.921Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/57/dacec3f8c70f4685bb07fce19cf3361037fde2b596f6f7228e1a4b39677b/optree-0.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8e825501f55360e8381718623b094579dedc485e57010e01593d72a43b43e68", size = 387849, upload-time = "2025-07-25T11:25:57.046Z" },
     { url = "https://files.pythonhosted.org/packages/ed/d7/3036d15c028c447b1bd65dcf8f66cfd775bfa4e52daa74b82fb1d3c88faf/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adde1427e0982cfc5f56939c26b4ebbd833091a176734c79fb95c78bdf833dff", size = 350952, upload-time = "2025-07-25T11:26:02.692Z" },
     { url = "https://files.pythonhosted.org/packages/71/45/e710024ef77324e745de48efd64f6270d8c209f14107a48ffef4049ac57a/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a80b7e5de5dd09b9c8b62d501e29a3850b047565c336c9d004b07ee1c01f4ae1", size = 389568, upload-time = "2025-07-25T11:26:04.094Z" },
     { url = "https://files.pythonhosted.org/packages/69/c4/94a187ed3ca71194b9da6a276790e1703c7544c8f695ac915214ae8ce934/optree-0.17.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f87f6f39015fc82d7adeee19900d246b89911319726e93cb2dbd4d1a809899bd", size = 363728, upload-time = "2025-07-25T11:26:07.959Z" },
@@ -1940,8 +1655,7 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "python-dateutil" },
     { name = "pytz" },
@@ -1996,13 +1710,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
     { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
     { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
-    { url = "https://files.pythonhosted.org/packages/56/b4/52eeb530a99e2a4c55ffcd352772b599ed4473a0f892d127f4147cf0f88e/pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2", size = 11567720, upload-time = "2025-09-29T23:33:06.209Z" },
-    { url = "https://files.pythonhosted.org/packages/48/4a/2d8b67632a021bced649ba940455ed441ca854e57d6e7658a6024587b083/pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8", size = 10810302, upload-time = "2025-09-29T23:33:35.846Z" },
-    { url = "https://files.pythonhosted.org/packages/13/e6/d2465010ee0569a245c975dc6967b801887068bc893e908239b1f4b6c1ac/pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff", size = 12154874, upload-time = "2025-09-29T23:33:49.939Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/18/aae8c0aa69a386a3255940e9317f793808ea79d0a525a97a903366bb2569/pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29", size = 12790141, upload-time = "2025-09-29T23:34:05.655Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/26/617f98de789de00c2a444fbe6301bb19e66556ac78cff933d2c98f62f2b4/pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73", size = 13208697, upload-time = "2025-09-29T23:34:21.835Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/fb/25709afa4552042bd0e15717c75e9b4a2294c3dc4f7e6ea50f03c5136600/pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9", size = 13879233, upload-time = "2025-09-29T23:34:35.079Z" },
-    { url = "https://files.pythonhosted.org/packages/98/af/7be05277859a7bc399da8ba68b88c96b27b48740b6cf49688899c6eb4176/pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa", size = 11359119, upload-time = "2025-09-29T23:34:46.339Z" },
 ]
 
 [[package]]
@@ -2091,17 +1798,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" },
     { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" },
     { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/8e/9c089f01677d1264ab8648352dcb7773f37da6ad002542760c80107da816/pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f", size = 5316478, upload-time = "2025-07-01T09:15:52.209Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/a9/5749930caf674695867eb56a581e78eb5f524b7583ff10b01b6e5048acb3/pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081", size = 4686522, upload-time = "2025-07-01T09:15:54.162Z" },
-    { url = "https://files.pythonhosted.org/packages/43/46/0b85b763eb292b691030795f9f6bb6fcaf8948c39413c81696a01c3577f7/pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4", size = 5853376, upload-time = "2025-07-03T13:11:01.066Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/c6/1a230ec0067243cbd60bc2dad5dc3ab46a8a41e21c15f5c9b52b26873069/pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc", size = 7626020, upload-time = "2025-07-03T13:11:06.479Z" },
-    { url = "https://files.pythonhosted.org/packages/63/dd/f296c27ffba447bfad76c6a0c44c1ea97a90cb9472b9304c94a732e8dbfb/pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06", size = 5956732, upload-time = "2025-07-01T09:15:56.111Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/a0/98a3630f0b57f77bae67716562513d3032ae70414fcaf02750279c389a9e/pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a", size = 6624404, upload-time = "2025-07-01T09:15:58.245Z" },
-    { url = "https://files.pythonhosted.org/packages/de/e6/83dfba5646a290edd9a21964da07674409e410579c341fc5b8f7abd81620/pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978", size = 6067760, upload-time = "2025-07-01T09:16:00.003Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/41/15ab268fe6ee9a2bc7391e2bbb20a98d3974304ab1a406a992dcb297a370/pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d", size = 6700534, upload-time = "2025-07-01T09:16:02.29Z" },
-    { url = "https://files.pythonhosted.org/packages/64/79/6d4f638b288300bed727ff29f2a3cb63db054b33518a95f27724915e3fbc/pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71", size = 6277091, upload-time = "2025-07-01T09:16:04.4Z" },
-    { url = "https://files.pythonhosted.org/packages/46/05/4106422f45a05716fd34ed21763f8ec182e8ea00af6e9cb05b93a247361a/pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada", size = 6986091, upload-time = "2025-07-01T09:16:06.342Z" },
-    { url = "https://files.pythonhosted.org/packages/63/c6/287fd55c2c12761d0591549d48885187579b7c257bef0c6660755b0b59ae/pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb", size = 2422632, upload-time = "2025-07-01T09:16:08.142Z" },
     { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" },
     { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" },
     { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" },
@@ -2250,52 +1946,13 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" },
     { url = "https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" },
     { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/39/8ea9bcfaaff16fd0b0fc901ee522e24c9ec44b4ca0229cfffb8066a06959/propcache-0.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a7fad897f14d92086d6b03fdd2eb844777b0c4d7ec5e3bac0fbae2ab0602bbe5", size = 74678, upload-time = "2025-06-09T22:55:41.227Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/85/cab84c86966e1d354cf90cdc4ba52f32f99a5bca92a1529d666d957d7686/propcache-0.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1f43837d4ca000243fd7fd6301947d7cb93360d03cd08369969450cc6b2ce3b4", size = 43829, upload-time = "2025-06-09T22:55:42.417Z" },
-    { url = "https://files.pythonhosted.org/packages/23/f7/9cb719749152d8b26d63801b3220ce2d3931312b2744d2b3a088b0ee9947/propcache-0.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:261df2e9474a5949c46e962065d88eb9b96ce0f2bd30e9d3136bcde84befd8f2", size = 43729, upload-time = "2025-06-09T22:55:43.651Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/a2/0b2b5a210ff311260002a315f6f9531b65a36064dfb804655432b2f7d3e3/propcache-0.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e514326b79e51f0a177daab1052bc164d9d9e54133797a3a58d24c9c87a3fe6d", size = 204483, upload-time = "2025-06-09T22:55:45.327Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/e0/7aff5de0c535f783b0c8be5bdb750c305c1961d69fbb136939926e155d98/propcache-0.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4a996adb6904f85894570301939afeee65f072b4fd265ed7e569e8d9058e4ec", size = 217425, upload-time = "2025-06-09T22:55:46.729Z" },
-    { url = "https://files.pythonhosted.org/packages/92/1d/65fa889eb3b2a7d6e4ed3c2b568a9cb8817547a1450b572de7bf24872800/propcache-0.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76cace5d6b2a54e55b137669b30f31aa15977eeed390c7cbfb1dafa8dfe9a701", size = 214723, upload-time = "2025-06-09T22:55:48.342Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/e2/eecf6989870988dfd731de408a6fa366e853d361a06c2133b5878ce821ad/propcache-0.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31248e44b81d59d6addbb182c4720f90b44e1efdc19f58112a3c3a1615fb47ef", size = 200166, upload-time = "2025-06-09T22:55:49.775Z" },
-    { url = "https://files.pythonhosted.org/packages/12/06/c32be4950967f18f77489268488c7cdc78cbfc65a8ba8101b15e526b83dc/propcache-0.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abb7fa19dbf88d3857363e0493b999b8011eea856b846305d8c0512dfdf8fbb1", size = 194004, upload-time = "2025-06-09T22:55:51.335Z" },
-    { url = "https://files.pythonhosted.org/packages/46/6c/17b521a6b3b7cbe277a4064ff0aa9129dd8c89f425a5a9b6b4dd51cc3ff4/propcache-0.3.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d81ac3ae39d38588ad0549e321e6f773a4e7cc68e7751524a22885d5bbadf886", size = 203075, upload-time = "2025-06-09T22:55:52.681Z" },
-    { url = "https://files.pythonhosted.org/packages/62/cb/3bdba2b736b3e45bc0e40f4370f745b3e711d439ffbffe3ae416393eece9/propcache-0.3.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:cc2782eb0f7a16462285b6f8394bbbd0e1ee5f928034e941ffc444012224171b", size = 195407, upload-time = "2025-06-09T22:55:54.048Z" },
-    { url = "https://files.pythonhosted.org/packages/29/bd/760c5c6a60a4a2c55a421bc34a25ba3919d49dee411ddb9d1493bb51d46e/propcache-0.3.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:db429c19a6c7e8a1c320e6a13c99799450f411b02251fb1b75e6217cf4a14fcb", size = 196045, upload-time = "2025-06-09T22:55:55.485Z" },
-    { url = "https://files.pythonhosted.org/packages/76/58/ced2757a46f55b8c84358d6ab8de4faf57cba831c51e823654da7144b13a/propcache-0.3.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:21d8759141a9e00a681d35a1f160892a36fb6caa715ba0b832f7747da48fb6ea", size = 208432, upload-time = "2025-06-09T22:55:56.884Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/ec/d98ea8d5a4d8fe0e372033f5254eddf3254344c0c5dc6c49ab84349e4733/propcache-0.3.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2ca6d378f09adb13837614ad2754fa8afaee330254f404299611bce41a8438cb", size = 210100, upload-time = "2025-06-09T22:55:58.498Z" },
-    { url = "https://files.pythonhosted.org/packages/56/84/b6d8a7ecf3f62d7dd09d9d10bbf89fad6837970ef868b35b5ffa0d24d9de/propcache-0.3.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:34a624af06c048946709f4278b4176470073deda88d91342665d95f7c6270fbe", size = 200712, upload-time = "2025-06-09T22:55:59.906Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/32/889f4903ddfe4a9dc61da71ee58b763758cf2d608fe1decede06e6467f8d/propcache-0.3.2-cp39-cp39-win32.whl", hash = "sha256:4ba3fef1c30f306b1c274ce0b8baaa2c3cdd91f645c48f06394068f37d3837a1", size = 38187, upload-time = "2025-06-09T22:56:01.212Z" },
-    { url = "https://files.pythonhosted.org/packages/67/74/d666795fb9ba1dc139d30de64f3b6fd1ff9c9d3d96ccfdb992cd715ce5d2/propcache-0.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:7a2368eed65fc69a7a7a40b27f22e85e7627b74216f0846b04ba5c116e191ec9", size = 42025, upload-time = "2025-06-09T22:56:02.875Z" },
     { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" },
 ]
 
-[[package]]
-name = "protobuf"
-version = "3.19.6"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/51/d1/79bfd1f481469b661a2eddab551255536401892722189433282bfb13cfb1/protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4", size = 218071, upload-time = "2022-09-29T22:07:23.03Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/3b/90f805b9e5ecacf8a216f2e5acabc2d3ad965b62803510be41804e6bfbfe/protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1", size = 913631, upload-time = "2022-09-29T21:17:39.095Z" },
-    { url = "https://files.pythonhosted.org/packages/26/ef/bd6ba3b4ff9a35944bdd325e2c9ee56f71e855757f7d43938232499f0278/protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6", size = 1055327, upload-time = "2022-09-29T21:17:41.054Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/db/8b33c9558f1f27dd74e7f9ad730c6b32efab431419af556b1659e125b041/protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39", size = 913657, upload-time = "2022-09-29T21:18:18.359Z" },
-    { url = "https://files.pythonhosted.org/packages/51/61/e80b7a04f4e1b4eecc86582335205fd876abca0abafee4a6c001f70a375e/protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0", size = 1055457, upload-time = "2022-09-29T21:18:20.212Z" },
-    { url = "https://files.pythonhosted.org/packages/32/27/1141a8232723dcb10a595cc0ce4321dcbbd5215300bf4acfc142343205bf/protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4", size = 162648, upload-time = "2022-09-29T22:07:20.303Z" },
-]
-
 [[package]]
 name = "protobuf"
 version = "6.32.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/fa/a4/cc17347aa2897568beece2e674674359f911d6fe21b0b8d6268cd42727ac/protobuf-6.32.1.tar.gz", hash = "sha256:ee2469e4a021474ab9baafea6cd070e5bf27c7d29433504ddea1a4ee5850f68d", size = 440635, upload-time = "2025-09-11T21:38:42.935Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/be/8dd0a927c559b37d7a6c8ab79034fd167dcc1f851595f2e641ad62be8643/protobuf-6.32.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:2f5b80a49e1eb7b86d85fcd23fe92df154b9730a725c3b38c4e43b9d77018bf4", size = 322874, upload-time = "2025-09-11T21:38:35.509Z" },
@@ -2333,8 +1990,7 @@ name = "pyarrow"
 version = "21.0.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.10.*'",
-    "python_full_version < '3.10'",
+    "python_full_version < '3.11'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" }
 wheels = [
@@ -2373,13 +2029,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" },
     { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" },
     { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/cc/ce4939f4b316457a083dc5718b3982801e8c33f921b3c98e7a93b7c7491f/pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3", size = 31211248, upload-time = "2025-07-18T00:56:59.7Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/c2/7a860931420d73985e2f340f06516b21740c15b28d24a0e99a900bb27d2b/pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1", size = 32676896, upload-time = "2025-07-18T00:57:03.884Z" },
-    { url = "https://files.pythonhosted.org/packages/68/a8/197f989b9a75e59b4ca0db6a13c56f19a0ad8a298c68da9cc28145e0bb97/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d", size = 41067862, upload-time = "2025-07-18T00:57:07.587Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/82/6ecfa89487b35aa21accb014b64e0a6b814cc860d5e3170287bf5135c7d8/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e", size = 42747508, upload-time = "2025-07-18T00:57:13.917Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/b7/ba252f399bbf3addc731e8643c05532cf32e74cebb5e32f8f7409bc243cf/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4", size = 43345293, upload-time = "2025-07-18T00:57:19.828Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/0a/a20819795bd702b9486f536a8eeb70a6aa64046fce32071c19ec8230dbaa/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7", size = 45060670, upload-time = "2025-07-18T00:57:24.477Z" },
-    { url = "https://files.pythonhosted.org/packages/10/15/6b30e77872012bbfe8265d42a01d5b3c17ef0ac0f2fae531ad91b6a6c02e/pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f", size = 26227521, upload-time = "2025-07-18T00:57:29.119Z" },
 ]
 
 [[package]]
@@ -2552,19 +2201,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" },
     { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" },
     { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" },
-    { url = "https://files.pythonhosted.org/packages/54/db/160dffb57ed9a3705c4cbcbff0ac03bdae45f1ca7d58ab74645550df3fbd/pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf", size = 2107999, upload-time = "2025-11-04T13:42:03.885Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/7d/88e7de946f60d9263cc84819f32513520b85c0f8322f9b8f6e4afc938383/pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5", size = 1929745, upload-time = "2025-11-04T13:42:06.075Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/c2/aef51e5b283780e85e99ff19db0f05842d2d4a8a8cd15e63b0280029b08f/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d", size = 1920220, upload-time = "2025-11-04T13:42:08.457Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/97/492ab10f9ac8695cd76b2fdb24e9e61f394051df71594e9bcc891c9f586e/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60", size = 2067296, upload-time = "2025-11-04T13:42:10.817Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/23/984149650e5269c59a2a4c41d234a9570adc68ab29981825cfaf4cfad8f4/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82", size = 2231548, upload-time = "2025-11-04T13:42:13.843Z" },
-    { url = "https://files.pythonhosted.org/packages/71/0c/85bcbb885b9732c28bec67a222dbed5ed2d77baee1f8bba2002e8cd00c5c/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5", size = 2362571, upload-time = "2025-11-04T13:42:16.208Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/4a/412d2048be12c334003e9b823a3fa3d038e46cc2d64dd8aab50b31b65499/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3", size = 2068175, upload-time = "2025-11-04T13:42:18.911Z" },
-    { url = "https://files.pythonhosted.org/packages/73/f4/c58b6a776b502d0a5540ad02e232514285513572060f0d78f7832ca3c98b/pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425", size = 2177203, upload-time = "2025-11-04T13:42:22.578Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/ae/f06ea4c7e7a9eead3d165e7623cd2ea0cb788e277e4f935af63fc98fa4e6/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504", size = 2148191, upload-time = "2025-11-04T13:42:24.89Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/57/25a11dcdc656bf5f8b05902c3c2934ac3ea296257cc4a3f79a6319e61856/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5", size = 2343907, upload-time = "2025-11-04T13:42:27.683Z" },
-    { url = "https://files.pythonhosted.org/packages/96/82/e33d5f4933d7a03327c0c43c65d575e5919d4974ffc026bc917a5f7b9f61/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3", size = 2322174, upload-time = "2025-11-04T13:42:30.776Z" },
-    { url = "https://files.pythonhosted.org/packages/81/45/4091be67ce9f469e81656f880f3506f6a5624121ec5eb3eab37d7581897d/pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460", size = 1990353, upload-time = "2025-11-04T13:42:33.111Z" },
-    { url = "https://files.pythonhosted.org/packages/44/8a/a98aede18db6e9cd5d66bcacd8a409fcf8134204cdede2e7de35c5a2c5ef/pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b", size = 2015698, upload-time = "2025-11-04T13:42:35.484Z" },
     { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" },
     { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" },
     { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" },
@@ -2605,8 +2241,7 @@ name = "pylance"
 source = { editable = "." }
 dependencies = [
     { name = "lance-namespace" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -2626,9 +2261,8 @@ geo = [
 ]
 tests = [
     { name = "boto3" },
-    { name = "datafusion", marker = "python_full_version >= '3.10'" },
-    { name = "datasets", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "datafusion" },
+    { name = "datasets" },
     { name = "duckdb" },
     { name = "ml-dtypes" },
     { name = "pandas" },
@@ -2636,8 +2270,7 @@ tests = [
     { name = "polars", extra = ["pandas", "pyarrow"] },
     { name = "psutil" },
     { name = "pytest" },
-    { name = "tensorflow", version = "2.7.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform == 'linux'" },
-    { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'" },
     { name = "tqdm" },
 ]
 torch = [
@@ -2655,8 +2288,8 @@ dev = [
 ]
 tests = [
     { name = "boto3" },
-    { name = "datafusion", marker = "python_full_version >= '3.10'" },
-    { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "datafusion" },
+    { name = "datasets" },
     { name = "duckdb" },
     { name = "ml-dtypes" },
     { name = "pandas" },
@@ -2664,19 +2297,19 @@ tests = [
     { name = "polars", extra = ["pandas", "pyarrow"] },
     { name = "psutil" },
     { name = "pytest" },
-    { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'" },
     { name = "tqdm" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "boto3", marker = "extra == 'tests'" },
-    { name = "datafusion", marker = "python_full_version >= '3.10' and extra == 'tests'", specifier = ">=53,<54" },
+    { name = "datafusion", marker = "extra == 'tests'", specifier = ">=53,<54" },
     { name = "datasets", marker = "extra == 'tests'" },
     { name = "duckdb", marker = "extra == 'tests'" },
     { name = "geoarrow-rust-core", marker = "extra == 'geo'" },
     { name = "geoarrow-rust-io", marker = "extra == 'geo'" },
-    { name = "lance-namespace", specifier = ">=0.8.0,<0.9" },
+    { name = "lance-namespace", specifier = ">=0.8.5,<0.9" },
     { name = "ml-dtypes", marker = "extra == 'tests'" },
     { name = "numpy", specifier = ">=1.22" },
     { name = "pandas", marker = "extra == 'tests'" },
@@ -2703,8 +2336,8 @@ dev = [
 ]
 tests = [
     { name = "boto3", specifier = "==1.40.43" },
-    { name = "datafusion", marker = "python_full_version >= '3.10'", specifier = "==53.0.0" },
-    { name = "datasets", marker = "python_full_version >= '3.10'", specifier = "==4.1.1" },
+    { name = "datafusion", specifier = "==53.0.0" },
+    { name = "datasets", specifier = "==4.1.1" },
     { name = "duckdb", specifier = "==1.4.0" },
     { name = "ml-dtypes", specifier = "==0.5.3" },
     { name = "pandas", specifier = "==2.3.3" },
@@ -2712,59 +2345,19 @@ tests = [
     { name = "polars", extras = ["pyarrow", "pandas"], specifier = "==1.34.0" },
     { name = "psutil", specifier = "==7.1.0" },
     { name = "pytest", specifier = "==8.4.2" },
-    { name = "tensorflow", marker = "python_full_version >= '3.10' and sys_platform == 'linux'", specifier = "==2.20.0" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'", specifier = "==2.20.0" },
     { name = "tqdm", specifier = "==4.67.1" },
 ]
 
-[[package]]
-name = "pyproj"
-version = "3.6.1"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-dependencies = [
-    { name = "certifi", marker = "python_full_version < '3.10'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/7d/84/2b39bbf888c753ea48b40d47511548c77aa03445465c35cc4c4e9649b643/pyproj-3.6.1.tar.gz", hash = "sha256:44aa7c704c2b7d8fb3d483bbf75af6cb2350d30a63b144279a09b75fead501bf", size = 225131, upload-time = "2023-09-21T02:07:51.593Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c5/32/63cf474f4a8d4804b3bdf7c16b8589f38142e8e2f8319dcea27e0bc21a87/pyproj-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab7aa4d9ff3c3acf60d4b285ccec134167a948df02347585fdd934ebad8811b4", size = 6142763, upload-time = "2023-09-21T02:07:12.844Z" },
-    { url = "https://files.pythonhosted.org/packages/18/86/2e7cb9de40492f1bafbf11f4c9072edc394509a40b5e4c52f8139546f039/pyproj-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4bc0472302919e59114aa140fd7213c2370d848a7249d09704f10f5b062031fe", size = 4877123, upload-time = "2023-09-21T02:10:37.905Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/c5/928d5a26995dbefbebd7507d982141cd9153bc7e4392b334fff722c4af12/pyproj-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5279586013b8d6582e22b6f9e30c49796966770389a9d5b85e25a4223286cd3f", size = 6190576, upload-time = "2023-09-21T02:17:08.637Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/2b/b60cf73b0720abca313bfffef34e34f7f7dae23852b2853cf0368d49426b/pyproj-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fafd1f3eb421694857f254a9bdbacd1eb22fc6c24ca74b136679f376f97d35", size = 8328075, upload-time = "2023-09-21T02:07:15.353Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/a8/7193f46032636be917bc775506ae987aad72c931b1f691b775ca812a2917/pyproj-3.6.1-cp310-cp310-win32.whl", hash = "sha256:c41e80ddee130450dcb8829af7118f1ab69eaf8169c4bf0ee8d52b72f098dc2f", size = 5635713, upload-time = "2023-09-21T02:07:17.548Z" },
-    { url = "https://files.pythonhosted.org/packages/89/8f/27350c8fba71a37cd0d316f100fbd96bf139cc2b5ff1ab0dcbc7ac64010a/pyproj-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:db3aedd458e7f7f21d8176f0a1d924f1ae06d725228302b872885a1c34f3119e", size = 6087932, upload-time = "2023-09-21T02:07:19.793Z" },
-    { url = "https://files.pythonhosted.org/packages/84/a6/a300c1b14b2112e966e9f90b18f9c13b586bdcf417207cee913ae9005da3/pyproj-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebfbdbd0936e178091309f6cd4fcb4decd9eab12aa513cdd9add89efa3ec2882", size = 6147442, upload-time = "2023-09-21T02:07:21.879Z" },
-    { url = "https://files.pythonhosted.org/packages/30/bd/b9bd3761f08754e8dbb34c5a647db2099b348ab5da338e90980caf280e37/pyproj-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:447db19c7efad70ff161e5e46a54ab9cc2399acebb656b6ccf63e4bc4a04b97a", size = 4880331, upload-time = "2023-09-21T02:10:40.828Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/0a/d82aeeb605b5d6870bc72307c3b5e044e632eb7720df8885e144f51a8eac/pyproj-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7e13c40183884ec7f94eb8e0f622f08f1d5716150b8d7a134de48c6110fee85", size = 6192425, upload-time = "2023-09-21T02:17:09.049Z" },
-    { url = "https://files.pythonhosted.org/packages/64/90/dfe5c00de1ca4dbb82606e79790659d4ed7f0ed8d372bccb3baca2a5abe0/pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65ad699e0c830e2b8565afe42bd58cc972b47d829b2e0e48ad9638386d994915", size = 8571478, upload-time = "2023-09-21T02:07:23.771Z" },
-    { url = "https://files.pythonhosted.org/packages/14/6d/ae373629a1723f0db80d7b8c93598b00d9ecb930ed9ebf4f35826a33e97c/pyproj-3.6.1-cp311-cp311-win32.whl", hash = "sha256:8b8acc31fb8702c54625f4d5a2a6543557bec3c28a0ef638778b7ab1d1772132", size = 5634575, upload-time = "2023-09-21T02:07:26.535Z" },
-    { url = "https://files.pythonhosted.org/packages/79/95/eb68113c5b5737c342bde1bab92705dabe69c16299c5a122616e50f1fbd6/pyproj-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:38a3361941eb72b82bd9a18f60c78b0df8408416f9340521df442cebfc4306e2", size = 6088494, upload-time = "2023-09-21T02:07:28.75Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/64/93232511a7906a492b1b7dfdfc17f4e95982d76a24ef4f86d18cfe7ae2c9/pyproj-3.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1e9fbaf920f0f9b4ee62aab832be3ae3968f33f24e2e3f7fbb8c6728ef1d9746", size = 6135280, upload-time = "2023-09-21T02:07:30.911Z" },
-    { url = "https://files.pythonhosted.org/packages/10/f2/b550b1f65cc7e51c9116b220b50aade60c439103432a3fd5b12efbc77e15/pyproj-3.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d227a865356f225591b6732430b1d1781e946893789a609bb34f59d09b8b0f8", size = 4880030, upload-time = "2023-09-21T02:10:43.067Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/4b/2f8f6f94643b9fe2083338eff294feda84d916409b5840b7a402d2be93f8/pyproj-3.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83039e5ae04e5afc974f7d25ee0870a80a6bd6b7957c3aca5613ccbe0d3e72bf", size = 6184439, upload-time = "2023-09-21T02:17:43.499Z" },
-    { url = "https://files.pythonhosted.org/packages/19/9b/c57569132174786aa3f72275ac306956859a639dad0ce8d95c8411ce8209/pyproj-3.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb059ba3bced6f6725961ba758649261d85ed6ce670d3e3b0a26e81cf1aa8d", size = 8660747, upload-time = "2023-09-21T02:07:32.586Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/ab/1c2159ec757677c5a6b8803f6be45c2b550dc42c84ec4a228dc219849bbb/pyproj-3.6.1-cp312-cp312-win32.whl", hash = "sha256:2d6ff73cc6dbbce3766b6c0bce70ce070193105d8de17aa2470009463682a8eb", size = 5626805, upload-time = "2023-09-21T02:07:35.28Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/f3/2f32fe143cd7ba1d4d68f1b6dce9ca402d909cbd5a5830e3a8fa3d1acbbf/pyproj-3.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:7a27151ddad8e1439ba70c9b4b2b617b290c39395fa9ddb7411ebb0eb86d6fb0", size = 6079779, upload-time = "2023-09-21T02:07:37.486Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/50/d369bbe62d7a0d1e2cb40bc211da86a3f6e0f3c99f872957a72c3d5492d6/pyproj-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ba1f9b03d04d8cab24d6375609070580a26ce76eaed54631f03bab00a9c737b", size = 6144755, upload-time = "2023-09-21T02:07:39.611Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/c2/8d4f61065dfed965e53badd41201ad86a05af0c1bbc75dffb12ef0f5a7dd/pyproj-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18faa54a3ca475bfe6255156f2f2874e9a1c8917b0004eee9f664b86ccc513d3", size = 4879187, upload-time = "2023-09-21T02:10:45.519Z" },
-    { url = "https://files.pythonhosted.org/packages/31/38/2cf8777cb2d5622a78195e690281b7029098795fde4751aec8128238b8bb/pyproj-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd43bd9a9b9239805f406fd82ba6b106bf4838d9ef37c167d3ed70383943ade1", size = 6192339, upload-time = "2023-09-21T02:17:09.942Z" },
-    { url = "https://files.pythonhosted.org/packages/97/0a/b1525be9680369cc06dd288e12c59d24d5798b4afcdcf1b0915836e1caa6/pyproj-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50100b2726a3ca946906cbaa789dd0749f213abf0cbb877e6de72ca7aa50e1ae", size = 8332638, upload-time = "2023-09-21T02:07:41.777Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/e8/e826e0a962f36bd925a933829cf6ef218efe2055db5ea292be40974a929d/pyproj-3.6.1-cp39-cp39-win32.whl", hash = "sha256:9274880263256f6292ff644ca92c46d96aa7e57a75c6df3f11d636ce845a1877", size = 5638159, upload-time = "2023-09-21T02:07:43.49Z" },
-    { url = "https://files.pythonhosted.org/packages/43/d0/cbe29a4dcf38ee7e72bf695d0d3f2bee21b4f22ee6cf579ad974de9edfc8/pyproj-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:36b64c2cb6ea1cc091f329c5bd34f9c01bb5da8c8e4492c709bda6a09f96808f", size = 6090565, upload-time = "2023-09-21T02:07:45.735Z" },
-    { url = "https://files.pythonhosted.org/packages/43/28/e8d2ca71dd56c27cbe668e4226963d61956cded222a2e839e6fec1ab6d82/pyproj-3.6.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fd93c1a0c6c4aedc77c0fe275a9f2aba4d59b8acf88cebfc19fe3c430cfabf4f", size = 6034252, upload-time = "2023-09-21T02:07:47.906Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/39/1ce27cb86f51a1f5aed3a1617802a6131b59ea78492141d1fbe36722595e/pyproj-3.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6420ea8e7d2a88cb148b124429fba8cd2e0fae700a2d96eab7083c0928a85110", size = 6386263, upload-time = "2023-09-21T02:07:49.586Z" },
-]
-
 [[package]]
 name = "pyproj"
 version = "3.7.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.10.*'",
+    "python_full_version < '3.11'",
 ]
 dependencies = [
-    { name = "certifi", marker = "python_full_version == '3.10.*'" },
+    { name = "certifi", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/67/10/a8480ea27ea4bbe896c168808854d00f2a9b49f95c0319ddcbba693c8a90/pyproj-3.7.1.tar.gz", hash = "sha256:60d72facd7b6b79853f19744779abcd3f804c4e0d4fa8815469db20c9f640a47", size = 226339, upload-time = "2025-02-16T04:28:46.621Z" }
 wheels = [
@@ -3000,15 +2593,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
     { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
     { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" },
-    { url = "https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" },
-    { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" },
 ]
 
 [[package]]
@@ -3016,10 +2600,10 @@ name = "requests"
 version = "2.33.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "certifi", marker = "python_full_version >= '3.10'" },
-    { name = "charset-normalizer", marker = "python_full_version >= '3.10'" },
-    { name = "idna", marker = "python_full_version >= '3.10'" },
-    { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "idna" },
+    { name = "urllib3" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" }
 wheels = [
@@ -3031,8 +2615,8 @@ name = "rich"
 version = "14.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py", marker = "python_full_version >= '3.10'" },
-    { name = "pygments", marker = "python_full_version >= '3.10'" },
+    { name = "markdown-it-py" },
+    { name = "pygments" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" }
 wheels = [
@@ -3114,13 +2698,11 @@ dependencies = [
     { name = "absl-py" },
     { name = "grpcio" },
     { name = "markdown" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "packaging" },
     { name = "pillow" },
-    { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "protobuf" },
     { name = "setuptools" },
     { name = "tensorboard-data-server" },
     { name = "werkzeug" },
@@ -3138,74 +2720,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" },
 ]
 
-[[package]]
-name = "tensorflow"
-version = "2.7.4"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-dependencies = [
-    { name = "absl-py", marker = "python_full_version < '3.10'" },
-    { name = "astunparse", marker = "python_full_version < '3.10'" },
-    { name = "flatbuffers", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "gast", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "google-pasta", marker = "python_full_version < '3.10'" },
-    { name = "grpcio", marker = "python_full_version < '3.10'" },
-    { name = "h5py", marker = "python_full_version < '3.10'" },
-    { name = "keras", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "keras-preprocessing", marker = "python_full_version < '3.10'" },
-    { name = "libclang", marker = "python_full_version < '3.10'" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "opt-einsum", marker = "python_full_version < '3.10'" },
-    { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "six", marker = "python_full_version < '3.10'" },
-    { name = "tensorboard", marker = "python_full_version < '3.10'" },
-    { name = "tensorflow-estimator", marker = "python_full_version < '3.10'" },
-    { name = "tensorflow-io-gcs-filesystem", marker = "python_full_version < '3.10'" },
-    { name = "termcolor", marker = "python_full_version < '3.10'" },
-    { name = "typing-extensions", marker = "python_full_version < '3.10'" },
-    { name = "wheel", marker = "python_full_version < '3.10'" },
-    { name = "wrapt", marker = "python_full_version < '3.10'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/5e/31/d49a3dff9c4ca6e6c09c2c5fea95f58cf59cc3cd4f0d557069c7dccd6f57/tensorflow-2.7.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c4597635dd71fc6809b7fffcb462524d73e2ade09da61844059e6a2fead71140", size = 496066688, upload-time = "2022-09-02T19:11:01.631Z" },
-]
-
 [[package]]
 name = "tensorflow"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 dependencies = [
-    { name = "absl-py", marker = "python_full_version >= '3.10'" },
-    { name = "astunparse", marker = "python_full_version >= '3.10'" },
-    { name = "flatbuffers", version = "25.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "gast", version = "0.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "google-pasta", marker = "python_full_version >= '3.10'" },
-    { name = "grpcio", marker = "python_full_version >= '3.10'" },
-    { name = "h5py", marker = "python_full_version >= '3.10'" },
-    { name = "keras", version = "3.11.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "libclang", marker = "python_full_version >= '3.10'" },
-    { name = "ml-dtypes", marker = "python_full_version >= '3.10'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "absl-py" },
+    { name = "astunparse" },
+    { name = "flatbuffers" },
+    { name = "gast" },
+    { name = "google-pasta" },
+    { name = "grpcio" },
+    { name = "h5py" },
+    { name = "keras" },
+    { name = "libclang" },
+    { name = "ml-dtypes" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "opt-einsum", marker = "python_full_version >= '3.10'" },
-    { name = "packaging", marker = "python_full_version >= '3.10'" },
-    { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "requests", marker = "python_full_version >= '3.10'" },
-    { name = "setuptools", marker = "python_full_version >= '3.10'" },
-    { name = "six", marker = "python_full_version >= '3.10'" },
-    { name = "tensorboard", marker = "python_full_version >= '3.10'" },
-    { name = "termcolor", marker = "python_full_version >= '3.10'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.10'" },
-    { name = "wrapt", marker = "python_full_version >= '3.10'" },
+    { name = "opt-einsum" },
+    { name = "packaging" },
+    { name = "protobuf" },
+    { name = "requests" },
+    { name = "setuptools" },
+    { name = "six" },
+    { name = "tensorboard" },
+    { name = "termcolor" },
+    { name = "typing-extensions" },
+    { name = "wrapt" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ff/07/ea91ac67a9fd36d3372099f5a3e69860ded544f877f5f2117802388f4212/tensorflow-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02a0293d94f5c8b7125b66abf622cc4854a33ae9d618a0d41309f95e091bbaea", size = 259307122, upload-time = "2025-08-13T16:50:47.909Z" },
@@ -3216,31 +2757,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547, upload-time = "2025-08-13T16:52:46.396Z" },
     { url = "https://files.pythonhosted.org/packages/ea/4c/c1aa90c5cc92e9f7f9c78421e121ef25bae7d378f8d1d4cbad46c6308836/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c88e05a07f1ead4977b4894b3ecd4d8075c40191065afc4fd9355c9db3d926", size = 259663776, upload-time = "2025-08-13T16:53:24.507Z" },
     { url = "https://files.pythonhosted.org/packages/43/fb/8be8547c128613d82a2b006004026d86ed0bd672e913029a98153af4ffab/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa3729b0126f75a99882b89fb7d536515721eda8014a63e259e780ba0a37372", size = 620815537, upload-time = "2025-08-13T16:53:42.577Z" },
-    { url = "https://files.pythonhosted.org/packages/83/ff/a26d49895586207b2704403366ef976dcaa6ed07514699dae9a4fc3fa1a9/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bc33759249c98eabcee9debd24e74506bbe29ac139e050cf0c74aa9888ebdf", size = 259307564, upload-time = "2025-08-13T16:54:17.691Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/fe/f3d738dc7c93ed5f67f9ace8dd3ed66971dab7c5a47f2d1c504ef0d0cf1d/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0deb5c583dfc53b54fd158a194ce0087b406bb6518af400ca3809735e4548ec3", size = 620427169, upload-time = "2025-08-13T16:54:33.431Z" },
-]
-
-[[package]]
-name = "tensorflow-estimator"
-version = "2.7.0"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/db/de/3a71ad41b87f9dd424e3aec3b0794a60f169fa7e9a9a1e3dd44290b86dd6/tensorflow_estimator-2.7.0-py2.py3-none-any.whl", hash = "sha256:325b5a224864379242b7b76c6987ca544239be82579d33e68ec7c2bda57abc9d", size = 463110, upload-time = "2021-10-29T23:02:47.14Z" },
-]
-
-[[package]]
-name = "tensorflow-io-gcs-filesystem"
-version = "0.37.1"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e2/19/9095c69e22c879cb3896321e676c69273a549a3148c4f62aa4bc5ebdb20f/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70", size = 4842078, upload-time = "2024-07-01T23:44:18.977Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/48/47b7d25572961a48b1de3729b7a11e835b888e41e0203cca82df95d23b91/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27", size = 5085736, upload-time = "2024-07-01T23:44:21.034Z" },
-    { url = "https://files.pythonhosted.org/packages/de/bf/ba597d3884c77d05a78050f3c178933d69e3f80200a261df6eaa920656cd/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda", size = 4842079, upload-time = "2024-07-01T23:44:26.825Z" },
-    { url = "https://files.pythonhosted.org/packages/66/7f/e36ae148c2f03d61ca1bff24bc13a0fef6d6825c966abef73fc6f880a23b/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556", size = 5085736, upload-time = "2024-07-01T23:44:28.618Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/46/962f47af08bd39fc9feb280d3192825431a91a078c856d17a78ae4884eb1/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f", size = 4842077, upload-time = "2024-07-01T23:44:33.86Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/9b/790d290c232bce9b691391cf16e95a96e469669c56abfb1d9d0f35fa437c/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c", size = 5085733, upload-time = "2024-07-01T23:44:36.663Z" },
-    { url = "https://files.pythonhosted.org/packages/66/5f/334a011caa1eb97689274d1141df8e6b7a25e389f0390bdcd90235de9783/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed", size = 4842075, upload-time = "2024-07-01T23:44:42.094Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/cb/7dcee55fc5a7d7d8a862e12519322851cd5fe5b086f946fd71e4ae1ef281/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95", size = 5087496, upload-time = "2024-07-01T23:44:43.797Z" },
 ]
 
 [[package]]
@@ -3299,8 +2815,7 @@ dependencies = [
     { name = "filelock" },
     { name = "fsspec" },
     { name = "jinja2" },
-    { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
-    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
@@ -3342,10 +2857,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" },
     { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" },
     { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/b0/a321f27270049baa12f5c3fb0d6ceea005634787e3af9a8d75dce8306b0a/torch-2.8.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:da6afa31c13b669d4ba49d8a2169f0db2c3ec6bec4af898aa714f401d4c38904", size = 102059214, upload-time = "2025-08-06T14:55:33.433Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/dd/1630cb51b10d3d2e97db95e5a84c32def81fc26b005bce6fc880b0e6db81/torch-2.8.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:06fcee8000e5c62a9f3e52a688b9c5abb7c6228d0e56e3452983416025c41381", size = 888024302, upload-time = "2025-08-06T14:57:28.23Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/dc/1f1f621afe15e3c496e1e8f94f8903f75f87e7d642d5a985e92210cc208d/torch-2.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:5128fe752a355d9308e56af1ad28b15266fe2da5948660fad44de9e3a9e36e8c", size = 241249338, upload-time = "2025-08-06T14:57:05.669Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" },
 ]
 
 [[package]]
@@ -3365,7 +2876,6 @@ name = "triton"
 version = "3.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "importlib-metadata", marker = "python_full_version < '3.10'" },
     { name = "setuptools" },
 ]
 wheels = [
@@ -3374,7 +2884,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" },
     { url = "https://files.pythonhosted.org/packages/30/7b/0a685684ed5322d2af0bddefed7906674f67974aa88b0fae6e82e3b766f6/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00be2964616f4c619193cb0d1b29a99bd4b001d7dc333816073f92cf2a8ccdeb", size = 155569223, upload-time = "2025-07-30T19:58:44.017Z" },
     { url = "https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780, upload-time = "2025-07-30T19:58:51.171Z" },
-    { url = "https://files.pythonhosted.org/packages/12/34/1251beb5a3cb93f3950ebe68732752014646003ef6eb11eb5f1a37ca78cd/triton-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e5c1442eaeabae2e2452ae765801bd53cd4ce873cab0d1bdd59a32ab2d9397", size = 155430799, upload-time = "2025-07-30T19:58:57.664Z" },
 ]
 
 [[package]]
@@ -3407,29 +2916,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
 ]
 
-[[package]]
-name = "urllib3"
-version = "1.26.20"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" },
-]
-
 [[package]]
 name = "urllib3"
 version = "2.5.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version == '3.13.*'",
-    "python_full_version == '3.12.*'",
-    "python_full_version == '3.11.*'",
-    "python_full_version == '3.10.*'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
@@ -3486,10 +2976,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" },
     { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" },
     { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" },
-    { url = "https://files.pythonhosted.org/packages/43/46/dd0791943613885f62619f18ee6107e6133237a6b6ed8a9ecfac339d0b4f/wrapt-1.17.3-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7e18f01b0c3e4a07fe6dfdb00e29049ba17eadbc5e7609a2a3a4af83ab7d710a", size = 81745, upload-time = "2025-08-12T05:52:49.62Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/ec/bb2d19bd1a614cc4f438abac13ae26c57186197920432d2a915183b15a8b/wrapt-1.17.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f5f51a6466667a5a356e6381d362d259125b57f059103dd9fdc8c0cf1d14139", size = 82833, upload-time = "2025-08-12T05:52:27.738Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/eb/66579aea6ad36f07617fedca8e282e49c7c9bab64c63b446cfe4f7f47a49/wrapt-1.17.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:59923aa12d0157f6b82d686c3fd8e1166fa8cdfb3e17b42ce3b6147ff81528df", size = 81889, upload-time = "2025-08-12T05:52:29.023Z" },
-    { url = "https://files.pythonhosted.org/packages/04/9c/a56b5ac0e2473bdc3fb11b22dd69ff423154d63861cf77911cdde5e38fd2/wrapt-1.17.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46acc57b331e0b3bcb3e1ca3b421d65637915cfcd65eb783cb2f78a511193f9b", size = 81344, upload-time = "2025-08-12T05:52:50.869Z" },
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
@@ -3604,21 +3090,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" },
     { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" },
     { url = "https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" },
-    { url = "https://files.pythonhosted.org/packages/03/ff/1b4bb3f397552116c1df6266c1b83a21aeeb26061ab1f462984b499a3870/xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a", size = 32844, upload-time = "2025-10-02T14:36:39.157Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/db/27146d0bee4346a9a31f7b498a81fc02747f6f1e6c52a2e7989504278051/xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7", size = 30806, upload-time = "2025-10-02T14:36:40.621Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/2b/4896188df564908817a75de19bf7f2384b99a75af2d528f9c49326f76458/xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2", size = 193448, upload-time = "2025-10-02T14:36:41.797Z" },
-    { url = "https://files.pythonhosted.org/packages/51/c5/be8953f62e772340319a826ce1e07489935600089756cf83b628cd36ebe3/xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d", size = 212547, upload-time = "2025-10-02T14:36:43.581Z" },
-    { url = "https://files.pythonhosted.org/packages/51/1a/1e9f0b911d1cf00dd537c074ae3fae15b535a7f0d9e7edd42a9d2c4f78ce/xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec", size = 211309, upload-time = "2025-10-02T14:36:45.307Z" },
-    { url = "https://files.pythonhosted.org/packages/63/88/b284c6a128d88dc47f201957f926e707db79fb7415a87072e15c0e490de0/xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222", size = 444480, upload-time = "2025-10-02T14:36:47.226Z" },
-    { url = "https://files.pythonhosted.org/packages/87/e4/798293a2bf9e4fac5f6d53ce59cba4739930778dfc6c7c73f40044ab0e6e/xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919", size = 192957, upload-time = "2025-10-02T14:36:48.968Z" },
-    { url = "https://files.pythonhosted.org/packages/78/55/bfd0d7db447a927897469048b953caececa3532e743b940dd1f5c1032d24/xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6", size = 209850, upload-time = "2025-10-02T14:36:50.258Z" },
-    { url = "https://files.pythonhosted.org/packages/31/06/d08ef9a792bfebfd2fb2bcbf04a541ad283bef74749ead6f089a0809d288/xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360", size = 197342, upload-time = "2025-10-02T14:36:51.651Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/1a/aebf90797c94e9ca407c28e23f54d71f7149d91a93406a08a09e44d06994/xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4", size = 209757, upload-time = "2025-10-02T14:36:53.009Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/80/799eec3d0a144dc3edf8c19b4f139c27fb923c50b34352796089ca206429/xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86", size = 412773, upload-time = "2025-10-02T14:36:54.691Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/f9/09df7545699de09219a205123b8463ce9ea83f48acc7aeeba0269507f9d3/xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796", size = 190357, upload-time = "2025-10-02T14:36:56.363Z" },
-    { url = "https://files.pythonhosted.org/packages/07/40/2f8327f94e64a3f34d6ce3347c55207c322abbc80ae486ea45df4c62e7b3/xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d", size = 30585, upload-time = "2025-10-02T14:36:57.664Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/c8/2ecbc6799be9c02e8bf7b5a66cd94832b6ac13d59808746f0d402481c6ad/xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802", size = 31512, upload-time = "2025-10-02T14:36:58.837Z" },
-    { url = "https://files.pythonhosted.org/packages/19/94/1d5459a9c587c94d7b8bcc710bd08bbfa145cbd814ebde41b48494362a21/xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6", size = 27878, upload-time = "2025-10-02T14:37:00.201Z" },
     { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" },
     { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" },
     { url = "https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" },
@@ -3631,9 +3102,9 @@ name = "yarl"
 version = "1.20.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "idna", marker = "python_full_version >= '3.10'" },
-    { name = "multidict", marker = "python_full_version >= '3.10'" },
-    { name = "propcache", marker = "python_full_version >= '3.10'" },
+    { name = "idna" },
+    { name = "multidict" },
+    { name = "propcache" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" }
 wheels = [
@@ -3722,31 +3193,5 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" },
     { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" },
     { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" },
-    { url = "https://files.pythonhosted.org/packages/01/75/0d37402d208d025afa6b5b8eb80e466d267d3fd1927db8e317d29a94a4cb/yarl-1.20.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e42ba79e2efb6845ebab49c7bf20306c4edf74a0b20fc6b2ccdd1a219d12fad3", size = 134259, upload-time = "2025-06-10T00:45:29.882Z" },
-    { url = "https://files.pythonhosted.org/packages/73/84/1fb6c85ae0cf9901046f07d0ac9eb162f7ce6d95db541130aa542ed377e6/yarl-1.20.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:41493b9b7c312ac448b7f0a42a089dffe1d6e6e981a2d76205801a023ed26a2b", size = 91269, upload-time = "2025-06-10T00:45:32.917Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/9c/eae746b24c4ea29a5accba9a06c197a70fa38a49c7df244e0d3951108861/yarl-1.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f5a5928ff5eb13408c62a968ac90d43f8322fd56d87008b8f9dabf3c0f6ee983", size = 89995, upload-time = "2025-06-10T00:45:35.066Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/30/693e71003ec4bc1daf2e4cf7c478c417d0985e0a8e8f00b2230d517876fc/yarl-1.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30c41ad5d717b3961b2dd785593b67d386b73feca30522048d37298fee981805", size = 325253, upload-time = "2025-06-10T00:45:37.052Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/a2/5264dbebf90763139aeb0b0b3154763239398400f754ae19a0518b654117/yarl-1.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:59febc3969b0781682b469d4aca1a5cab7505a4f7b85acf6db01fa500fa3f6ba", size = 320897, upload-time = "2025-06-10T00:45:39.962Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/17/77c7a89b3c05856489777e922f41db79ab4faf58621886df40d812c7facd/yarl-1.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2b6fb3622b7e5bf7a6e5b679a69326b4279e805ed1699d749739a61d242449e", size = 340696, upload-time = "2025-06-10T00:45:41.915Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/55/28409330b8ef5f2f681f5b478150496ec9cf3309b149dab7ec8ab5cfa3f0/yarl-1.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:749d73611db8d26a6281086f859ea7ec08f9c4c56cec864e52028c8b328db723", size = 335064, upload-time = "2025-06-10T00:45:43.893Z" },
-    { url = "https://files.pythonhosted.org/packages/85/58/cb0257cbd4002828ff735f44d3c5b6966c4fd1fc8cc1cd3cd8a143fbc513/yarl-1.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9427925776096e664c39e131447aa20ec738bdd77c049c48ea5200db2237e000", size = 327256, upload-time = "2025-06-10T00:45:46.393Z" },
-    { url = "https://files.pythonhosted.org/packages/53/f6/c77960370cfa46f6fb3d6a5a79a49d3abfdb9ef92556badc2dcd2748bc2a/yarl-1.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff70f32aa316393eaf8222d518ce9118148eddb8a53073c2403863b41033eed5", size = 316389, upload-time = "2025-06-10T00:45:48.358Z" },
-    { url = "https://files.pythonhosted.org/packages/64/ab/be0b10b8e029553c10905b6b00c64ecad3ebc8ace44b02293a62579343f6/yarl-1.20.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c7ddf7a09f38667aea38801da8b8d6bfe81df767d9dfc8c88eb45827b195cd1c", size = 340481, upload-time = "2025-06-10T00:45:50.663Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/c3/3f327bd3905a4916029bf5feb7f86dcf864c7704f099715f62155fb386b2/yarl-1.20.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:57edc88517d7fc62b174fcfb2e939fbc486a68315d648d7e74d07fac42cec240", size = 336941, upload-time = "2025-06-10T00:45:52.554Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/42/040bdd5d3b3bb02b4a6ace4ed4075e02f85df964d6e6cb321795d2a6496a/yarl-1.20.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:dab096ce479d5894d62c26ff4f699ec9072269d514b4edd630a393223f45a0ee", size = 339936, upload-time = "2025-06-10T00:45:54.919Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/1c/911867b8e8c7463b84dfdc275e0d99b04b66ad5132b503f184fe76be8ea4/yarl-1.20.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14a85f3bd2d7bb255be7183e5d7d6e70add151a98edf56a770d6140f5d5f4010", size = 360163, upload-time = "2025-06-10T00:45:56.87Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/31/8c389f6c6ca0379b57b2da87f1f126c834777b4931c5ee8427dd65d0ff6b/yarl-1.20.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2c89b5c792685dd9cd3fa9761c1b9f46fc240c2a3265483acc1565769996a3f8", size = 359108, upload-time = "2025-06-10T00:45:58.869Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/09/ae4a649fb3964324c70a3e2b61f45e566d9ffc0affd2b974cbf628957673/yarl-1.20.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:69e9b141de5511021942a6866990aea6d111c9042235de90e08f94cf972ca03d", size = 351875, upload-time = "2025-06-10T00:46:01.45Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/43/bbb4ed4c34d5bb62b48bf957f68cd43f736f79059d4f85225ab1ef80f4b9/yarl-1.20.1-cp39-cp39-win32.whl", hash = "sha256:b5f307337819cdfdbb40193cad84978a029f847b0a357fbe49f712063cfc4f06", size = 82293, upload-time = "2025-06-10T00:46:03.763Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/cd/ce185848a7dba68ea69e932674b5c1a42a1852123584bccc5443120f857c/yarl-1.20.1-cp39-cp39-win_amd64.whl", hash = "sha256:eae7bfe2069f9c1c5b05fc7fe5d612e5bbc089a39309904ee8b829e322dcad00", size = 87385, upload-time = "2025-06-10T00:46:05.655Z" },
     { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" },
 ]
-
-[[package]]
-name = "zipp"
-version = "3.23.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" },
-]
diff --git a/rust/examples/Cargo.toml b/rust/examples/Cargo.toml
index a4e760f8cbe..80eff457140 100644
--- a/rust/examples/Cargo.toml
+++ b/rust/examples/Cargo.toml
@@ -49,6 +49,6 @@ tokio = { workspace = true }
 all_asserts = "2.3.1"
 env_logger = "0.11.7"
 hf-hub = "0.4.2"
-parquet = "58.0.0"
+parquet = { version = "58.0.0", default-features = false, features = ["arrow", "async"] }
 tokenizers = "0.15.2"
 rand.workspace = true
diff --git a/rust/lance-arrow/src/ipc.rs b/rust/lance-arrow/src/ipc.rs
index 1c6364c4525..8b6e5cf41fe 100644
--- a/rust/lance-arrow/src/ipc.rs
+++ b/rust/lance-arrow/src/ipc.rs
@@ -270,7 +270,7 @@ pub fn read_ipc_stream_single_at(
 /// Modern IPC streams have an 8-byte prefix `[continuation: 4][size: 4]`.
 /// Legacy streams have a 4-byte prefix `[size: 4]`. Returns `(prefix_len, meta_size)`.
 fn parse_ipc_message_prefix(buf: &Buffer) -> Result<(usize, usize), ArrowError> {
-    let has_continuation = buf.len() >= 4 && buf[..4] == [0xff; 4];
+    let has_continuation = buf.len() >= 4 && buf[..4] == IPC_CONTINUATION;
     if has_continuation {
         if buf.len() < 8 {
             return Err(ArrowError::ParseError(
@@ -358,6 +358,134 @@ pub fn read_ipc_stream_single(data: &Bytes) -> Result<RecordBatch, ArrowError> {
     }
 }
 
+// ---------------------------------------------------------------------------
+// Aligned IPC sections
+// ---------------------------------------------------------------------------
+
+/// Byte alignment that each IPC section's stream start is padded to.
+///
+/// When several IPC streams are concatenated into one larger blob (e.g. a
+/// cache entry), a section that starts at an arbitrary offset would leave its
+/// array data misaligned. [`FileDecoder`] with `require_alignment = false`
+/// then silently copies each buffer into a freshly aligned allocation on
+/// every read, defeating zero-copy. Padding each section start to a 64-byte
+/// boundary keeps the decoded buffers borrowed directly from the input.
+pub const IPC_SECTION_ALIGNMENT: usize = 64;
+
+/// Number of zero-padding bytes needed to advance `pos` to the next
+/// [`IPC_SECTION_ALIGNMENT`] boundary.
+fn section_padding(pos: usize) -> usize {
+    (IPC_SECTION_ALIGNMENT - (pos % IPC_SECTION_ALIGNMENT)) % IPC_SECTION_ALIGNMENT
+}
+
+/// A [`Write`] adapter that counts the bytes written through it.
+struct CountingWriter<'a> {
+    inner: &'a mut dyn Write,
+    count: usize,
+}
+
+impl Write for CountingWriter<'_> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        let n = self.inner.write(buf)?;
+        self.count += n;
+        Ok(n)
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+}
+
+/// Write zero padding so the next byte lands on an [`IPC_SECTION_ALIGNMENT`]
+/// boundary, advancing `pos` past it.
+fn write_section_padding(writer: &mut dyn Write, pos: &mut usize) -> Result<(), ArrowError> {
+    let pad = section_padding(*pos);
+    if pad > 0 {
+        const ZEROS: [u8; IPC_SECTION_ALIGNMENT] = [0u8; IPC_SECTION_ALIGNMENT];
+        writer
+            .write_all(&ZEROS[..pad])
+            .map_err(|e| ArrowError::IoError(e.to_string(), e))?;
+        *pos += pad;
+    }
+    Ok(())
+}
+
+/// Write `batch` as a 64-byte-aligned single-batch Arrow IPC section.
+///
+/// `pos` is the absolute byte offset of `writer` within the enclosing blob.
+/// Zero padding is written first so the IPC stream begins on an
+/// [`IPC_SECTION_ALIGNMENT`] boundary, then the stream itself. `pos` is
+/// advanced past both the padding and the stream so the caller can write
+/// further aligned sections.
+///
+/// Paired with [`read_ipc_section_at`]. For the decoded buffers to be borrowed
+/// zero-copy, the blob must ultimately be read back from a buffer whose base
+/// address is at least 64-byte aligned.
+pub fn write_ipc_section(
+    writer: &mut dyn Write,
+    pos: &mut usize,
+    batch: &RecordBatch,
+) -> Result<(), ArrowError> {
+    write_section_padding(writer, pos)?;
+
+    let mut counting = CountingWriter {
+        inner: writer,
+        count: 0,
+    };
+    write_ipc_stream(batch, &mut counting)?;
+    *pos += counting.count;
+    Ok(())
+}
+
+/// Read a single [`RecordBatch`] from an aligned IPC section at `offset`.
+///
+/// Skips the alignment padding written by [`write_ipc_section`], then reads
+/// the stream, advancing `offset` past the section (padding + stream + EOS).
+///
+/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base
+/// address is at least 64-byte aligned (see [`write_ipc_section`]).
+pub fn read_ipc_section_at(data: &Bytes, offset: &mut usize) -> Result<RecordBatch, ArrowError> {
+    *offset += section_padding(*offset);
+    read_ipc_stream_single_at(data, offset)
+}
+
+/// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC section.
+///
+/// Like [`write_ipc_section`] but emits every batch from `iter` into one IPC
+/// stream (schema + N batches + EOS). `iter` must yield at least one batch.
+/// Paired with [`read_ipc_section_batches_at`].
+pub fn write_ipc_section_batches<I>(
+    writer: &mut dyn Write,
+    pos: &mut usize,
+    iter: I,
+) -> Result<(), ArrowError>
+where
+    I: IntoIterator<Item = RecordBatch>,
+{
+    write_section_padding(writer, pos)?;
+
+    let mut counting = CountingWriter {
+        inner: writer,
+        count: 0,
+    };
+    write_ipc_stream_batches(iter, &mut counting)?;
+    *pos += counting.count;
+    Ok(())
+}
+
+/// Read all [`RecordBatch`]es from an aligned multi-batch IPC section at
+/// `offset`, advancing `offset` past the section (padding + stream + EOS).
+///
+/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base
+/// address is at least 64-byte aligned (see [`write_ipc_section_batches`]).
+pub fn read_ipc_section_batches_at(
+    data: &Bytes,
+    offset: &mut usize,
+) -> Result<Vec<RecordBatch>, ArrowError> {
+    *offset += section_padding(*offset);
+    read_ipc_stream_at(data, offset)
+}
+
 #[cfg(test)]
 mod tests {
     use arrow_array::{ArrayRef, record_batch};
@@ -403,4 +531,90 @@ mod tests {
             assert_col_zero_copy(batch.column(1));
         }
     }
+
+    /// Copy `payload` into a [`Bytes`] with a 64-byte-aligned base address, like a
+    /// backend reading into an aligned buffer; `Bytes::from(vec)` alone only aligns for `u8`.
+    fn aligned_bytes(payload: &[u8]) -> Bytes {
+        let mut backing = vec![0u8; payload.len() + IPC_SECTION_ALIGNMENT];
+        let start = section_padding(backing.as_ptr() as usize);
+        let window = start..start + payload.len();
+        backing[window.clone()].copy_from_slice(payload);
+        Bytes::from(backing).slice(window)
+    }
+
+    #[test]
+    fn test_aligned_ipc_sections_are_zero_copy() {
+        // A LargeBinary column exercises the i64-offset buffer whose 8-byte
+        // alignment requirement triggers a realigning memcpy when misaligned.
+        let blocks = arrow_array::LargeBinaryArray::from_vec(vec![&b"hello"[..], b"world"]);
+        let section_a = RecordBatch::try_from_iter([("a", Arc::new(blocks) as ArrayRef)]).unwrap();
+        let section_b = record_batch!(("b", Int64, [10i64, 20, 30, 40, 50])).unwrap();
+
+        let mut buf = Vec::new();
+        // Arbitrary, deliberately non-64-aligned preamble so the first section
+        // must be padded rather than landing at offset 0 by luck.
+        buf.extend_from_slice(&[0xABu8; 7]);
+        let mut pos = buf.len();
+        // The first section's stream begins after padding the 7-byte preamble
+        // up to the next 64-byte boundary.
+        assert_eq!(7 + section_padding(7), IPC_SECTION_ALIGNMENT);
+        write_ipc_section(&mut buf, &mut pos, &section_a).unwrap();
+        write_ipc_section(&mut buf, &mut pos, &section_b).unwrap();
+
+        let data = aligned_bytes(&buf);
+        assert_eq!(
+            section_padding(data.as_ptr() as usize),
+            0,
+            "base not aligned"
+        );
+
+        let mut offset = 7;
+        let read_a = read_ipc_section_at(&data, &mut offset).unwrap();
+        let read_b = read_ipc_section_at(&data, &mut offset).unwrap();
+        assert_eq!(read_a, section_a);
+        assert_eq!(read_b, section_b);
+
+        let data_base = data.as_ptr() as usize;
+        let data_end = data_base + data.len();
+        for batch in [&read_a, &read_b] {
+            for buffer in batch.column(0).to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    (data_base..data_end).contains(&ptr),
+                    "section buffer at {ptr:#x} was realigned out of the input \
+                     [{data_base:#x}..{data_end:#x}) — misaligned section",
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_aligned_multi_batch_section_roundtrip_zero_copy() {
+        // A multi-batch section (e.g. IVF SQ storage chunks) must round-trip
+        // every batch and decode the first batch's buffers zero-copy.
+        let b1 = record_batch!(("v", Int64, [1i64, 2, 3])).unwrap();
+        let b2 = record_batch!(("v", Int64, [4i64, 5])).unwrap();
+        let b3 = record_batch!(("v", Int64, [6i64])).unwrap();
+
+        let mut buf = vec![0xCDu8; 5];
+        let mut pos = buf.len();
+        write_ipc_section_batches(&mut buf, &mut pos, [b1.clone(), b2.clone(), b3.clone()])
+            .unwrap();
+
+        let data = aligned_bytes(&buf);
+        let mut offset = 5;
+        let read = read_ipc_section_batches_at(&data, &mut offset).unwrap();
+        assert_eq!(read, vec![b1, b2, b3]);
+        assert_eq!(offset, buf.len(), "offset should land at section end");
+
+        let data_base = data.as_ptr() as usize;
+        let data_end = data_base + data.len();
+        for buffer in read[0].column(0).to_data().buffers() {
+            let ptr = buffer.as_ptr() as usize;
+            assert!(
+                (data_base..data_end).contains(&ptr),
+                "first batch buffer at {ptr:#x} was realigned out of the input",
+            );
+        }
+    }
 }
diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs
index b993cf00745..34a67600543 100644
--- a/rust/lance-arrow/src/lib.rs
+++ b/rust/lance-arrow/src/lib.rs
@@ -52,6 +52,8 @@ pub const BLOB_V2_EXT_NAME: &str = "lance.blob.v2";
 /// Metadata key for overriding the dedicated blob size threshold (in bytes)
 pub const BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY: &str =
     "lance-encoding:blob-dedicated-size-threshold";
+/// Metadata key for overriding the inline blob size threshold (in bytes)
+pub const BLOB_INLINE_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-inline-size-threshold";
 
 type Result<T> = std::result::Result<T, ArrowError>;
 
diff --git a/rust/lance-core/src/cache/backend.rs b/rust/lance-core/src/cache/backend.rs
index 237254c464f..9307868f399 100644
--- a/rust/lance-core/src/cache/backend.rs
+++ b/rust/lance-core/src/cache/backend.rs
@@ -22,6 +22,9 @@ use super::CacheCodec;
 /// A type-erased cache entry.
 pub type CacheEntry = Arc<dyn Any + Send + Sync>;
 
+/// Iterator over cache keys currently known to a backend.
+pub type CacheKeyIterator<'a> = Box<dyn Iterator<Item = InternalCacheKey> + Send + 'a>;
+
 /// Structured cache key passed to [`CacheBackend`] methods.
 ///
 /// CacheBackend impls receive these ready-made from [`LanceCache`](super::LanceCache)
@@ -116,6 +119,15 @@ pub trait CacheBackend: Send + Sync + std::fmt::Debug {
     /// Remove all entries.
     async fn clear(&self);
 
+    /// Return an iterator over cache keys currently known to this backend.
+    ///
+    /// Backends that cannot enumerate keys cheaply or accurately should return
+    /// `None`. An empty iterator means key inventory is supported and the
+    /// cache currently has no entries.
+    async fn keys(&self) -> Option<CacheKeyIterator<'_>> {
+        None
+    }
+
     /// Number of entries currently stored (may flush pending operations).
     async fn num_entries(&self) -> usize;
 
diff --git a/rust/lance-core/src/cache/codec.rs b/rust/lance-core/src/cache/codec.rs
index 34e5264bb28..bba54840829 100644
--- a/rust/lance-core/src/cache/codec.rs
+++ b/rust/lance-core/src/cache/codec.rs
@@ -5,12 +5,184 @@
 //!
 //! Implement [`CacheCodecImpl`] on concrete types, then use
 //! [`CacheCodec::from_impl`] to produce a type-erased codec for the cache.
+//!
+//! # Wire format
+//!
+//! Every serialized entry begins with a small hand-framed **envelope** so the
+//! reader can validate it before trusting the body:
+//!
+//! ```text
+//! [magic: 4B = b"LCE1"]
+//! [envelope_version: u8]
+//! [type_id_len: u16 LE][type_id: utf8]   # stable, author-assigned
+//! [type_version: u32 LE]                 # per-type body schema version
+//! <body, written by the type's CacheCodecImpl::serialize>
+//! ```
+//!
+//! The envelope is deliberately *not* protobuf: it is the most
+//! stability-critical part, must parse robustly against arbitrary bytes
+//! (including data written by older, pre-stabilization builds), and never
+//! changes shape. Bodies use protobuf headers, where field-number evolution
+//! pays off.
+//!
+//! # Decode outcome
+//!
+//! Deserialization never propagates a parse failure as a hard error into the
+//! cache path. Anything the reader cannot confidently interpret — absent or
+//! wrong magic, an unknown `envelope_version`, a `type_id` mismatch, an
+//! unsupported `type_version`, or a body decode error — becomes
+//! [`CacheDecode::Miss`]. A backend turns `Miss` into a normal cache miss and
+//! recomputes the value. This is what lets data written by an older format
+//! self-heal: it simply fails the magic check and is regenerated.
 
+use std::io::Write;
 use std::sync::Arc;
 
 use bytes::Bytes;
 
-use crate::Result;
+use crate::{Error, Result};
+
+use super::{CacheEntryReader, CacheEntryWriter};
+
+// ---------------------------------------------------------------------------
+// Envelope
+// ---------------------------------------------------------------------------
+
+/// Magic bytes that prefix every stabilized cache entry.
+///
+/// An ASCII tag (`0x4C 0x43 0x45 0x31`) chosen so it cannot collide with any
+/// pre-stabilization blob: those began with either a small little-endian
+/// length (tens of bytes) or a small tag byte, never these values.
+///
+/// Exported so backends can cheaply identify Lance cache entries (e.g. when
+/// scanning a persistent store at startup) without hardcoding the bytes —
+/// prefer [`has_cache_envelope`] over comparing against this directly.
+pub const MAGIC: [u8; 4] = *b"LCE1";
+
+/// Reports whether `data` starts with the cache-entry [`MAGIC`] bytes.
+///
+/// This only inspects the leading bytes, so it is a cheap way for a backend to
+/// spot Lance cache entries without running [`CacheCodec::deserialize`]. It is
+/// not a validity guarantee: an entry that passes can still decode to a
+/// [`Miss`](CacheDecode::Miss), for example when its `type_id` does not match.
+pub fn has_cache_envelope(data: &[u8]) -> bool {
+    data.starts_with(&MAGIC)
+}
+
+/// Version of the envelope framing itself. Bumped only if the outer frame
+/// (magic/version/type_id/type_version layout) ever changes — expected never.
+const ENVELOPE_VERSION: u8 = 1;
+
+/// Parsed envelope borrowed from the input bytes.
+struct ParsedEnvelope<'a> {
+    type_id: &'a str,
+    type_version: u32,
+    /// Offset of the first body byte within the input.
+    body_offset: usize,
+}
+
+/// Validate the envelope that prefixes `data` and extract its fields.
+///
+/// Yields `None` whenever the bytes are not an envelope this build can read:
+/// missing or wrong magic, an unknown `envelope_version`, truncated framing, or
+/// a `type_id` that is not valid UTF-8. Callers map `None` to [`CacheDecode::Miss`].
+fn parse_envelope(data: &Bytes) -> Option<ParsedEnvelope<'_>> {
+    let input: &[u8] = data.as_ref();
+
+    // Each step peels one field off the front of `rest`; a short read or a
+    // mismatch bails out with `None`.
+    let rest = input.strip_prefix(&MAGIC[..])?;
+
+    let (&envelope_version, rest) = rest.split_first()?;
+    if envelope_version != ENVELOPE_VERSION {
+        return None;
+    }
+
+    // type_id is length-prefixed with a little-endian u16.
+    let len_field: [u8; 2] = rest.get(..2)?.try_into().ok()?;
+    let type_id_len = usize::from(u16::from_le_bytes(len_field));
+    let rest = &rest[2..];
+
+    let type_id = std::str::from_utf8(rest.get(..type_id_len)?).ok()?;
+    let rest = &rest[type_id_len..];
+
+    let version_field: [u8; 4] = rest.get(..4)?.try_into().ok()?;
+    let rest = &rest[4..];
+
+    Some(ParsedEnvelope {
+        type_id,
+        type_version: u32::from_le_bytes(version_field),
+        // Everything consumed so far is envelope; the body starts right after.
+        body_offset: input.len() - rest.len(),
+    })
+}
+
+/// Emit the envelope for `type_id`/`type_version` into `writer`, returning the
+/// envelope's size in bytes (which is the offset where the body starts).
+fn write_envelope(writer: &mut dyn Write, type_id: &str, type_version: u32) -> Result<usize> {
+    let id_bytes = type_id.as_bytes();
+    let Ok(id_len) = u16::try_from(id_bytes.len()) else {
+        return Err(Error::io(format!(
+            "cache codec type_id too long ({} bytes, max {})",
+            id_bytes.len(),
+            u16::MAX
+        )));
+    };
+    writer.write_all(&MAGIC)?;
+    writer.write_all(&[ENVELOPE_VERSION])?;
+    writer.write_all(&id_len.to_le_bytes())?;
+    writer.write_all(id_bytes)?;
+    writer.write_all(&type_version.to_le_bytes())?;
+
+    Ok(MAGIC.len() + 1 + 2 + id_bytes.len() + 4)
+}
+
+// ---------------------------------------------------------------------------
+// CacheDecode — first-class cache-miss outcome
+// ---------------------------------------------------------------------------
+
+/// Why a cache entry could not be decoded into the expected type.
+///
+/// Carried by [`CacheDecode::Miss`] so backends can emit targeted metrics
+/// (e.g. distinguish "evicting due to a stale format" from "type collision")
+/// without re-parsing. Every reason maps to the same behavior — recompute via
+/// the loader — so callers that don't care can ignore it.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CacheMissReason {
+    /// Absent or wrong magic, unknown `envelope_version`, truncated framing, or
+    /// a non-utf8 `type_id`. Typically an entry written by a pre-stabilization
+    /// or otherwise foreign build.
+    InvalidEnvelope,
+    /// Well-formed envelope, but its `type_id` names a different entry type than
+    /// the codec reading it.
+    TypeMismatch,
+    /// Written by a newer build whose `type_version` this build does not
+    /// understand and must not attempt to interpret.
+    VersionTooNew,
+    /// Envelope validated, but the body failed to decode (truncation, a
+    /// malformed protobuf header, an IPC error, etc.).
+    BodyError,
+}
+
+/// Outcome of deserializing a cache entry.
+///
+/// A `Miss` carries the [`CacheMissReason`] saying why the bytes could not be confidently
+/// decoded into `T`. A backend treats it like a never-present key: recompute via the loader.
+#[derive(Debug)]
+pub enum CacheDecode<T> {
+    Hit(T),
+    Miss(CacheMissReason),
+}
+
+impl<T> CacheDecode<T> {
+    /// Returns the decoded value on a hit, or `None` on a miss (the reason is dropped).
+    pub fn hit(self) -> Option<T> {
+        match self {
+            Self::Hit(v) => Some(v),
+            Self::Miss(_) => None,
+        }
+    }
+}
 
 // ---------------------------------------------------------------------------
 // CacheCodecImpl — trait for serializable cache entry types
@@ -18,31 +190,40 @@ use crate::Result;
 
 /// Serialization trait for cache entries.
 ///
-/// **Experimental**: the serialized format is not stable and may change
-/// between releases without notice.
+/// **Experimental**: the serialized format is not yet covered by a stability
+/// guarantee and may change between releases. When it does stabilize, the
+/// rules are: `TYPE_ID`, protobuf field numbers, and enum values are
+/// append-only forever; format changes that protobuf cannot express
+/// transparently bump [`CURRENT_VERSION`](Self::CURRENT_VERSION).
 ///
-/// Implement this on concrete types that need to survive serialization
-/// through a persistent cache backend. Then wire it into a [`CacheKey`](super::CacheKey)
-/// via [`CacheCodec::from_impl`]:
+/// Implement this on concrete types that need to survive serialization through
+/// a persistent cache backend, then wire it into a
+/// [`CacheKey`](super::CacheKey) via [`CacheCodec::from_impl`].
 ///
-/// ```ignore
-/// impl CacheCodecImpl for MyData {
-///     fn serialize(&self, w: &mut dyn Write) -> Result<()> { /* ... */ }
-///     fn deserialize(data: &Bytes) -> Result<Self> { /* ... */ }
-/// }
-///
-/// impl CacheKey for MyDataKey {
-///     type ValueType = MyData;
-///     fn codec() -> Option<CacheCodec> {
-///         Some(CacheCodec::from_impl::<MyData>())
-///     }
-///     // ...
-/// }
-/// ```
+/// The envelope (magic/version/type_id/type_version) is written and validated
+/// by the [`CacheCodec`] wrapper. [`serialize`](Self::serialize) writes only
+/// the body — a header followed by sections in a fixed, version-keyed order —
+/// and [`deserialize`](Self::deserialize) reads them back in that same order.
+/// The read sequence mirroring the write sequence for each `type_version` is
+/// the invariant the implementor owns.
 pub trait CacheCodecImpl: Send + Sync {
-    fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()>;
+    /// Stable identity for this entry type. **Must not change once shipped.**
+    /// This is a deliberate author-assigned string, not `std::any::type_name`
+    /// (which is not stable across compiler versions).
+    const TYPE_ID: &'static str;
+
+    /// Body schema version this build writes. Bump when the body layout
+    /// changes in a way protobuf field additions cannot express transparently
+    /// (adding/removing/reordering sections, a raw-blob encoding change, etc.).
+    const CURRENT_VERSION: u32;
+
+    /// Write the body: a header, then sections in a fixed order.
+    fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()>;
 
-    fn deserialize(data: &Bytes) -> Result<Self>
+    /// Reconstruct from the body. Branch on
+    /// [`reader.version()`](CacheEntryReader::version) for backward compat;
+    /// sections are read in write order.
+    fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result<Self>
     where
         Self: Sized;
 }
@@ -55,25 +236,31 @@ pub(crate) type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
 
 /// Type-erased codec for serializing and deserializing cache entries.
 ///
-/// `CacheCodec` is two plain function pointers — it is `Copy` and has no
-/// heap allocation. Construct one via [`CacheCodec::from_impl`] for types
-/// that implement [`CacheCodecImpl`], or [`CacheCodec::new`] for custom
-/// cases (e.g. when the orphan rule prevents a direct impl).
+/// `CacheCodec` carries the entry's stable `type_id`/`version` plus two plain
+/// function pointers — it is `Copy` and has no heap allocation. Construct one
+/// via [`CacheCodec::from_impl`] for types that implement [`CacheCodecImpl`],
+/// or [`CacheCodec::new`] for custom cases (e.g. when the orphan rule prevents
+/// a direct impl).
 #[derive(Copy, Clone)]
 pub struct CacheCodec {
-    pub(crate) serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>,
-    pub(crate) deserialize: fn(&Bytes) -> Result<ArcAny>,
+    type_id: &'static str,
+    version: u32,
+    serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>,
+    deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result<ArcAny>,
 }
 
 impl std::fmt::Debug for CacheCodec {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CacheCodec").finish_non_exhaustive()
+        f.debug_struct("CacheCodec")
+            .field("type_id", &self.type_id)
+            .field("version", &self.version)
+            .finish_non_exhaustive()
     }
 }
 
 fn serialize_via_impl<T: CacheCodecImpl + 'static>(
     any: &ArcAny,
-    writer: &mut dyn std::io::Write,
+    writer: &mut CacheEntryWriter<'_>,
 ) -> Result<()> {
     let val = any
         .downcast_ref::<T>()
@@ -81,44 +268,278 @@ fn serialize_via_impl<T: CacheCodecImpl + 'static>(
     val.serialize(writer)
 }
 
-fn deserialize_via_impl<T: CacheCodecImpl + 'static>(data: &Bytes) -> Result<ArcAny> {
-    let val = T::deserialize(data)?;
+fn deserialize_via_impl<T: CacheCodecImpl + 'static>(
+    reader: &mut CacheEntryReader<'_>,
+) -> Result<ArcAny> {
+    let val = T::deserialize(reader)?;
     Ok(Arc::new(val) as ArcAny)
 }
 
 impl CacheCodec {
-    /// Create a `CacheCodec` from plain function pointers.
+    /// Create a `CacheCodec` from explicit body function pointers.
     ///
     /// Prefer [`from_impl`](Self::from_impl) when the value type implements
     /// [`CacheCodecImpl`]. Use this for types where a direct impl isn't
-    /// possible (e.g. orphan rule prevents it).
+    /// possible (e.g. the orphan rule prevents it). `type_id` and `version`
+    /// play the same role as the corresponding [`CacheCodecImpl`] constants.
     pub fn new(
-        serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>,
-        deserialize: fn(&Bytes) -> Result<ArcAny>,
+        type_id: &'static str,
+        version: u32,
+        serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>,
+        deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result<ArcAny>,
     ) -> Self {
         Self {
-            serialize,
-            deserialize,
+            type_id,
+            version,
+            serialize_body,
+            deserialize_body,
         }
     }
 
     /// Create a `CacheCodec` from a [`CacheCodecImpl`] implementation.
-    ///
-    /// For **sized** types stored directly in the cache. The codec
-    /// downcasts `&dyn Any` to `&T` for serialization and returns `Arc<T>`
-    /// from deserialization.
     pub fn from_impl<T: CacheCodecImpl + 'static>() -> Self {
         Self {
-            serialize: serialize_via_impl::<T>,
-            deserialize: deserialize_via_impl::<T>,
+            type_id: T::TYPE_ID,
+            version: T::CURRENT_VERSION,
+            serialize_body: serialize_via_impl::<T>,
+            deserialize_body: deserialize_via_impl::<T>,
         }
     }
 
-    pub fn serialize(&self, value: &ArcAny, writer: &mut dyn std::io::Write) -> Result<()> {
-        (self.serialize)(value, writer)
+    /// Serialize `value` into `writer`: envelope first, then the body.
+    pub fn serialize(&self, value: &ArcAny, writer: &mut dyn Write) -> Result<()> {
+        let body_offset = write_envelope(writer, self.type_id, self.version)?;
+        let mut entry_writer = CacheEntryWriter::with_pos(writer, body_offset);
+        (self.serialize_body)(value, &mut entry_writer)
+    }
+
+    /// Deserialize an entry from `data`.
+    ///
+    /// Never fails: any failure to interpret the bytes becomes a
+    /// [`CacheDecode::Miss`] with the reason why (see [`CacheMissReason`]).
+    /// Reading from an in-memory [`Bytes`] cannot do I/O, so there is no fault
+    /// channel — a miss is the only non-`Hit` outcome.
+    pub fn deserialize(&self, data: &Bytes) -> CacheDecode<ArcAny> {
+        let Some(envelope) = parse_envelope(data) else {
+            log::debug!("cache entry rejected: missing or invalid envelope");
+            return CacheDecode::Miss(CacheMissReason::InvalidEnvelope);
+        };
+
+        if envelope.type_id != self.type_id {
+            log::debug!(
+                "cache entry type_id mismatch: got {:?}, expected {:?}",
+                envelope.type_id,
+                self.type_id
+            );
+            return CacheDecode::Miss(CacheMissReason::TypeMismatch);
+        }
+
+        // A version newer than this build writes was produced by a newer build
+        // whose body layout we cannot assume to understand. Older/equal versions
+        // are the impl's responsibility to handle (branching on reader.version()).
+        if envelope.type_version > self.version {
+            log::debug!(
+                "cache entry {:?} has unsupported type_version {} (this build writes {})",
+                self.type_id,
+                envelope.type_version,
+                self.version
+            );
+            return CacheDecode::Miss(CacheMissReason::VersionTooNew);
+        }
+
+        let mut reader = CacheEntryReader::new(data, envelope.body_offset, envelope.type_version);
+        match (self.deserialize_body)(&mut reader) {
+            Ok(value) => CacheDecode::Hit(value),
+            Err(e) => {
+                log::debug!(
+                    "cache entry {:?} v{} failed to decode: {e}",
+                    self.type_id,
+                    envelope.type_version
+                );
+                CacheDecode::Miss(CacheMissReason::BodyError)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A trivial codec used to exercise the envelope and miss semantics
+    /// without pulling in arrow-backed payloads.
+    #[derive(Debug, PartialEq)]
+    struct Widget {
+        n: u32,
+    }
+
+    impl CacheCodecImpl for Widget {
+        const TYPE_ID: &'static str = "test.Widget";
+        const CURRENT_VERSION: u32 = 1;
+
+        fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()> {
+            writer.write_raw(&self.n.to_le_bytes())
+        }
+
+        fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result<Self> {
+            let bytes = reader.read_raw()?;
+            let n = u32::from_le_bytes(
+                bytes
+                    .as_ref()
+                    .try_into()
+                    .map_err(|_| Error::io("bad widget".to_string()))?,
+            );
+            Ok(Self { n })
+        }
+    }
+
+    fn serialize_widget(widget: &Widget) -> Bytes {
+        let codec = CacheCodec::from_impl::<Widget>();
+        let any: ArcAny = Arc::new(Widget { n: widget.n });
+        let mut buf = Vec::new();
+        codec.serialize(&any, &mut buf).unwrap();
+        Bytes::from(buf)
+    }
+
+    /// The miss reason, or `None` if the decode was a hit.
+    fn miss_reason(data: &Bytes) -> Option<CacheMissReason> {
+        match deserialize_widget(data) {
+            CacheDecode::Hit(_) => None,
+            CacheDecode::Miss(reason) => Some(reason),
+        }
     }
 
-    pub fn deserialize(&self, data: &Bytes) -> Result<ArcAny> {
-        (self.deserialize)(data)
+    fn deserialize_widget(data: &Bytes) -> CacheDecode<Widget> {
+        let codec = CacheCodec::from_impl::<Widget>();
+        match codec.deserialize(data) {
+            CacheDecode::Hit(any) => {
+                CacheDecode::Hit(Arc::try_unwrap(any.downcast::<Widget>().unwrap()).unwrap())
+            }
+            CacheDecode::Miss(reason) => CacheDecode::Miss(reason),
+        }
+    }
+
+    #[test]
+    fn envelope_roundtrip_hits() {
+        let bytes = serialize_widget(&Widget { n: 0xDEADBEEF });
+        // Sanity: the entry starts with the magic.
+        assert_eq!(&bytes[..4], b"LCE1");
+        let decoded = deserialize_widget(&bytes).hit().unwrap();
+        assert_eq!(decoded, Widget { n: 0xDEADBEEF });
+    }
+
+    #[test]
+    fn has_cache_envelope_detects_magic() {
+        let bytes = serialize_widget(&Widget { n: 1 });
+        assert!(has_cache_envelope(&bytes));
+        assert!(has_cache_envelope(&MAGIC)); // exactly the magic, nothing after
+        assert!(!has_cache_envelope(b"LCE")); // too short
+        assert!(!has_cache_envelope(b"JUNK and more"));
+        assert!(!has_cache_envelope(&[]));
+    }
+
+    #[test]
+    fn wrong_magic_is_miss() {
+        let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec();
+        bytes[0] = b'X';
+        assert_eq!(
+            miss_reason(&Bytes::from(bytes)),
+            Some(CacheMissReason::InvalidEnvelope)
+        );
+    }
+
+    #[test]
+    fn pre_stabilization_blob_is_miss() {
+        // An old unstable blob led with a small u64 LE length prefix (a JSON
+        // header of tens of bytes) — no magic. It must self-heal to a miss.
+        let mut blob = Vec::new();
+        blob.extend_from_slice(&(42u64).to_le_bytes());
+        blob.extend_from_slice(&[0u8; 42]);
+        assert_eq!(
+            miss_reason(&Bytes::from(blob)),
+            Some(CacheMissReason::InvalidEnvelope)
+        );
+
+        // A different unstable shape led with a small u8 tag (0/1/2).
+        assert_eq!(
+            miss_reason(&Bytes::from(vec![0u8, 1, 2, 3])),
+            Some(CacheMissReason::InvalidEnvelope)
+        );
+    }
+
+    #[test]
+    fn unknown_envelope_version_is_miss() {
+        let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec();
+        bytes[4] = 0xFF; // envelope_version byte
+        assert_eq!(
+            miss_reason(&Bytes::from(bytes)),
+            Some(CacheMissReason::InvalidEnvelope)
+        );
+    }
+
+    #[test]
+    fn type_id_mismatch_is_miss() {
+        // Hand-build an envelope with a foreign type_id but valid framing.
+        let mut buf = Vec::new();
+        write_envelope(&mut buf, "some.OtherType", 1).unwrap();
+        buf.extend_from_slice(&(4u64).to_le_bytes());
+        buf.extend_from_slice(&99u32.to_le_bytes());
+        assert_eq!(
+            miss_reason(&Bytes::from(buf)),
+            Some(CacheMissReason::TypeMismatch)
+        );
+    }
+
+    #[test]
+    fn unsupported_future_type_version_is_miss() {
+        // An entry written by a newer build (higher type_version) must miss
+        // rather than be misread by this build.
+        let mut buf = Vec::new();
+        write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION + 1).unwrap();
+        lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &9u32.to_le_bytes()).unwrap();
+        assert_eq!(
+            miss_reason(&Bytes::from(buf)),
+            Some(CacheMissReason::VersionTooNew)
+        );
+    }
+
+    #[test]
+    fn truncated_envelope_is_miss() {
+        let bytes = serialize_widget(&Widget { n: 7 });
+        for cut in [0, 1, 4, 5, 7, 9] {
+            assert_eq!(
+                miss_reason(&bytes.slice(..cut.min(bytes.len()))),
+                Some(CacheMissReason::InvalidEnvelope),
+                "truncating to {cut} bytes should miss as InvalidEnvelope"
+            );
+        }
+    }
+
+    #[test]
+    fn body_decode_error_is_miss() {
+        // Valid envelope, but the body is too short for the widget.
+        let mut buf = Vec::new();
+        write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION).unwrap();
+        buf.extend_from_slice(&(1u64).to_le_bytes());
+        buf.push(0u8);
+        assert_eq!(
+            miss_reason(&Bytes::from(buf)),
+            Some(CacheMissReason::BodyError)
+        );
+    }
+
+    #[test]
+    fn reader_exposes_envelope_version() {
+        // type_version travels through the envelope to reader.version().
+        let mut buf = Vec::new();
+        write_envelope(&mut buf, Widget::TYPE_ID, 7).unwrap();
+        let body_off = buf.len();
+        // A widget body so the codec can decode it.
+        lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &5u32.to_le_bytes()).unwrap();
+        let data = Bytes::from(buf);
+
+        let mut r = CacheEntryReader::new(&data, body_off, 7);
+        assert_eq!(r.version(), 7);
+        assert_eq!(r.read_raw().unwrap().as_ref(), 5u32.to_le_bytes());
     }
 }
diff --git a/rust/lance-core/src/cache/entry_io.rs b/rust/lance-core/src/cache/entry_io.rs
new file mode 100644
index 00000000000..fe91b11ca7d
--- /dev/null
+++ b/rust/lance-core/src/cache/entry_io.rs
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Streaming readers/writers for cache entry bodies.
+//!
+//! [`CacheCodecImpl`](super::CacheCodecImpl) bodies are written and read
+//! through these wrappers. They keep serialization streaming (no buffering of
+//! the whole entry) and reads zero-copy (sections borrow from the input
+//! [`Bytes`]), while tracking the byte position needed to keep Arrow IPC
+//! sections 64-byte aligned (see [`lance_arrow::ipc`]).
+//!
+//! Body layout primitives:
+//!
+//! ```text
+//! HEADER    : [header_len: u32 LE][header proto bytes]
+//! ARROW_IPC : [pad to 64B][self-delimiting IPC stream]
+//! RAW_BLOB  : [len: u64 LE][bytes]
+//! ```
+
+use std::io::Write;
+
+use arrow_array::RecordBatch;
+use bytes::Bytes;
+use prost::Message;
+
+use crate::{Error, Result};
+
+/// Writes a cache entry body: a header followed by sections, streaming
+/// directly to the underlying writer.
+///
+/// The envelope is written by the [`CacheCodec`](super::CacheCodec) wrapper
+/// before this writer is handed to
+/// [`CacheCodecImpl::serialize`](super::CacheCodecImpl::serialize).
+pub struct CacheEntryWriter<'a> {
+    writer: &'a mut dyn Write,
+    /// Absolute byte offset within the entry, used to align IPC sections.
+    pos: usize,
+}
+
+impl<'a> CacheEntryWriter<'a> {
+    /// Create a writer positioned at the start of an entry (offset 0).
+    ///
+    /// Use this for nested serialization into a standalone buffer. The
+    /// envelope-aware entry point is [`CacheCodec::serialize`](super::CacheCodec::serialize).
+    pub fn new(writer: &'a mut dyn Write) -> Self {
+        Self { writer, pos: 0 }
+    }
+
+    /// Create a writer whose section alignment accounts for `pos` bytes
+    /// already written ahead of the body (i.e. the envelope).
+    pub(crate) fn with_pos(writer: &'a mut dyn Write, pos: usize) -> Self {
+        Self { writer, pos }
+    }
+
+    /// Write a single discriminant byte (e.g. a variant tag).
+    pub fn write_u8(&mut self, value: u8) -> Result<()> {
+        self.writer.write_all(&[value])?;
+        self.pos += 1;
+        Ok(())
+    }
+
+    /// Write a protobuf header as `[len: u32 LE][bytes]`.
+    pub fn write_header<P: Message>(&mut self, header: &P) -> Result<()> {
+        let bytes = header.encode_to_vec();
+        let len = u32::try_from(bytes.len())
+            .map_err(|_| Error::io(format!("cache header too large: {} bytes", bytes.len())))?;
+        self.writer.write_all(&len.to_le_bytes())?;
+        self.writer.write_all(&bytes)?;
+        self.pos += 4 + bytes.len();
+        Ok(())
+    }
+
+    /// Write `batch` as a 64-byte-aligned Arrow IPC section.
+    pub fn write_ipc(&mut self, batch: &RecordBatch) -> Result<()> {
+        lance_arrow::ipc::write_ipc_section(self.writer, &mut self.pos, batch)
+            .map_err(|e| Error::io(e.to_string()))
+    }
+
+    /// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC
+    /// section. The iterator must yield at least one batch.
+    pub fn write_ipc_batches<I>(&mut self, batches: I) -> Result<()>
+    where
+        I: IntoIterator<Item = RecordBatch>,
+    {
+        lance_arrow::ipc::write_ipc_section_batches(self.writer, &mut self.pos, batches)
+            .map_err(|e| Error::io(e.to_string()))
+    }
+
+    /// Write a raw blob as `[len: u64 LE][bytes]`.
+    ///
+    /// Only for byte payloads that already have their own stable, portable
+    /// encoding (e.g. a roaring bitmap, a varint-packed stream).
+    pub fn write_raw(&mut self, bytes: &[u8]) -> Result<()> {
+        lance_arrow::ipc::write_len_prefixed_bytes(self.writer, bytes)
+            .map_err(|e| Error::io(e.to_string()))?;
+        self.pos += 8 + bytes.len();
+        Ok(())
+    }
+
+    /// The underlying writer, for a payload that carries its own framing.
+    ///
+    /// Use this only when the codec writes a self-delimiting or whole-body
+    /// payload — e.g. streaming a roaring bitmap as the entire body, where the
+    /// length prefix of [`write_raw`](Self::write_raw) would be redundant and
+    /// buffering to measure that length would force an extra copy. For
+    /// structured bodies prefer [`write_header`](Self::write_header) /
+    /// [`write_ipc`](Self::write_ipc) / [`write_raw`](Self::write_raw), which
+    /// give you versioning and 64-byte IPC alignment.
+    ///
+    /// Bytes written through this do **not** advance the section-alignment
+    /// position, so it must not be interleaved with [`write_ipc`](Self::write_ipc).
+    pub fn raw_writer(&mut self) -> &mut dyn Write {
+        self.writer
+    }
+}
+
+/// Reads a cache entry body, tracking an offset into the input and exposing
+/// the entry's `type_version` so implementors can branch for backward compat.
+///
+/// All reads are zero-copy: returned [`Bytes`] and the buffers behind decoded
+/// [`RecordBatch`]es borrow from the input allocation.
+pub struct CacheEntryReader<'a> {
+    data: &'a Bytes,
+    offset: usize,
+    version: u32,
+}
+
+impl<'a> CacheEntryReader<'a> {
+    /// Create a reader over `data`, starting at body byte `offset`, for an
+    /// entry written at `version`.
+    pub fn new(data: &'a Bytes, offset: usize, version: u32) -> Self {
+        Self {
+            data,
+            offset,
+            version,
+        }
+    }
+
+    /// The `type_version` from the envelope. Branch on this for backward compat.
+    pub fn version(&self) -> u32 {
+        self.version
+    }
+
+    /// Consume one tag byte, the counterpart of [`CacheEntryWriter::write_u8`].
+    pub fn read_u8(&mut self) -> Result<u8> {
+        // The cursor only advances once a byte is actually available.
+        let Some(&tag) = self.data.get(self.offset) else {
+            return Err(Error::io("cache entry: truncated, missing tag byte".to_string()));
+        };
+        self.offset += 1;
+        Ok(tag)
+    }
+
+    /// Read a protobuf header written by [`CacheEntryWriter::write_header`].
+    pub fn read_header<P: Message + Default>(&mut self) -> Result<P> {
+        let bytes = self.data.as_ref();
+        let len_end = self
+            .offset
+            .checked_add(4)
+            .filter(|&e| e <= bytes.len())
+            .ok_or_else(|| Error::io("cache header: truncated length prefix".to_string()))?;
+        let len = u32::from_le_bytes(bytes[self.offset..len_end].try_into().unwrap()) as usize;
+        let data_end = len_end
+            .checked_add(len)
+            .filter(|&e| e <= bytes.len())
+            .ok_or_else(|| Error::io("cache header: truncated body".to_string()))?;
+        let msg = P::decode(&bytes[len_end..data_end])
+            .map_err(|e| Error::io(format!("cache header decode failed: {e}")))?;
+        self.offset = data_end;
+        Ok(msg)
+    }
+
+    /// Read one [`RecordBatch`] from a 64-byte-aligned IPC section.
+    pub fn read_ipc(&mut self) -> Result<RecordBatch> {
+        lance_arrow::ipc::read_ipc_section_at(self.data, &mut self.offset)
+            .map_err(|e| Error::io(e.to_string()))
+    }
+
+    /// Read all [`RecordBatch`]es from a 64-byte-aligned multi-batch IPC
+    /// section written by [`CacheEntryWriter::write_ipc_batches`].
+    pub fn read_ipc_batches(&mut self) -> Result<Vec<RecordBatch>> {
+        lance_arrow::ipc::read_ipc_section_batches_at(self.data, &mut self.offset)
+            .map_err(|e| Error::io(e.to_string()))
+    }
+
+    /// Read a raw blob written by [`CacheEntryWriter::write_raw`], zero-copy.
+    pub fn read_raw(&mut self) -> Result<Bytes> {
+        lance_arrow::ipc::read_len_prefixed_bytes_at(self.data, &mut self.offset)
+            .map_err(|e| Error::io(e.to_string()))
+    }
+
+    /// The not-yet-consumed body bytes as a zero-copy slice (empty when the
+    /// offset is at or past the end of the input, rather than panicking).
+    ///
+    /// For a payload that carries its own framing and is parsed with the
+    /// codec's own cursor — the read counterpart of
+    /// [`CacheEntryWriter::raw_writer`]. Prefer [`read_header`](Self::read_header)
+    /// / [`read_ipc`](Self::read_ipc) / [`read_raw`](Self::read_raw) otherwise.
+    pub fn body(&self) -> Bytes {
+        self.data.slice(self.offset.min(self.data.len())..)
+    }
+}
diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs
index f62837fe3cc..07038c6e9d5 100644
--- a/rust/lance-core/src/cache/mod.rs
+++ b/rust/lance-core/src/cache/mod.rs
@@ -47,10 +47,14 @@
 
 pub mod backend;
 pub mod codec;
+mod entry_io;
 mod moka;
 
-pub use backend::{CacheBackend, CacheEntry, InternalCacheKey};
-pub use codec::{CacheCodec, CacheCodecImpl};
+pub use backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey};
+pub use codec::{
+    CacheCodec, CacheCodecImpl, CacheDecode, CacheMissReason, MAGIC, has_cache_envelope,
+};
+pub use entry_io::{CacheEntryReader, CacheEntryWriter};
 pub use moka::MokaCacheBackend;
 
 use std::borrow::Cow;
@@ -245,6 +249,40 @@ impl LanceCache {
         self.cache.size_bytes().await
     }
 
+    /// List the keys held by the backend that start with this cache's prefix.
+    ///
+    /// Yields `None` if the backend cannot enumerate its keys. Meant for
+    /// diagnostics: the listing may be only weakly consistent with cache
+    /// mutations happening concurrently.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use std::{borrow::Cow, sync::Arc};
+    /// # use lance_core::cache::{CacheKey, LanceCache};
+    /// # struct MyKey;
+    /// # impl CacheKey for MyKey {
+    /// #     type ValueType = Vec<i32>;
+    /// #     fn key(&self) -> Cow<'_, str> { Cow::Borrowed("my-key") }
+    /// #     fn type_name() -> &'static str { "VecI32" }
+    /// # }
+    /// # async fn example() {
+    /// let cache = LanceCache::with_capacity(1024);
+    /// cache.insert_with_key(&MyKey, Arc::new(vec![1, 2, 3])).await;
+    ///
+    /// let mut keys = cache.keys().await.expect("Moka supports key inventory");
+    /// assert_eq!(keys.next().unwrap().key(), "my-key");
+    /// # }
+    /// ```
+    pub async fn keys(&self) -> Option<CacheKeyIterator<'_>> {
+        // `None` from the backend (no key inventory) short-circuits here.
+        let backend_keys = self.cache.keys().await?;
+        let prefix = &self.prefix;
+        Some(Box::new(
+            backend_keys.filter(move |key| key.starts_with(prefix)),
+        ))
+    }
+
     // -- Sized insert/get (internal, shared by sized and unsized paths) --------
 
     async fn insert_with_id<T: DeepSizeOf + Send + Sync + 'static>(
@@ -557,7 +595,7 @@ impl CacheStats {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::collections::HashMap;
+    use std::collections::{BTreeSet, HashMap};
     use std::marker::PhantomData;
 
     struct TestKey<T: 'static> {
@@ -609,6 +647,18 @@ mod tests {
         }
     }
 
+    /// Collapse keys into `(prefix, key, type_name)` tuples, gathered in a set
+    /// so tests can compare listings without depending on iteration order.
+    fn key_fields(keys: &[InternalCacheKey]) -> BTreeSet<(String, String, &'static str)> {
+        let mut fields = BTreeSet::new();
+        for entry in keys {
+            let prefix = entry.prefix().to_string();
+            let name = entry.key().to_string();
+            fields.insert((prefix, name, entry.type_name()));
+        }
+        fields
+    }
+
     #[tokio::test]
     async fn test_cache_bytes() {
         let item = Arc::new(vec![1, 2, 3]);
@@ -718,6 +768,99 @@ mod tests {
         assert_eq!(base.stats().await.hits, 1);
     }
 
+    #[tokio::test]
+    async fn test_cache_keys_with_prefixes() {
+        let base = LanceCache::with_capacity(1000);
+        let prefixed = base.with_key_prefix("ns");
+        let nested = prefixed.with_key_prefix("index");
+        let other = base.with_key_prefix("ns-other");
+
+        base.insert_with_key(&TestKey::new("root"), Arc::new(vec![0]))
+            .await;
+        prefixed
+            .insert_with_key(&TestKey::new("child"), Arc::new(vec![1]))
+            .await;
+        nested
+            .insert_with_key(&TestKey::new("nested"), Arc::new(vec![2]))
+            .await;
+        other
+            .insert_with_key(&TestKey::new("other"), Arc::new(vec![3]))
+            .await;
+
+        let base_keys = base.keys().await.unwrap().collect::<Vec<_>>();
+        assert_eq!(
+            key_fields(&base_keys),
+            BTreeSet::from([
+                (
+                    "".to_string(),
+                    "root".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+                (
+                    "ns/".to_string(),
+                    "child".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+                (
+                    "ns/index/".to_string(),
+                    "nested".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+                (
+                    "ns-other/".to_string(),
+                    "other".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+            ])
+        );
+
+        let prefixed_keys = prefixed.keys().await.unwrap().collect::<Vec<_>>();
+        assert_eq!(
+            key_fields(&prefixed_keys),
+            BTreeSet::from([
+                (
+                    "ns/".to_string(),
+                    "child".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+                (
+                    "ns/index/".to_string(),
+                    "nested".to_string(),
+                    TestKey::<Vec<i32>>::type_name()
+                ),
+            ])
+        );
+    }
+
+    #[tokio::test]
+    async fn test_cache_keys_reflect_invalidation_and_clear() {
+        let base = LanceCache::with_capacity(1000);
+        let prefixed = base.with_key_prefix("ns");
+        let other = base.with_key_prefix("other");
+
+        prefixed
+            .insert_with_key(&TestKey::new("child"), Arc::new(vec![1]))
+            .await;
+        other
+            .insert_with_key(&TestKey::new("other"), Arc::new(vec![2]))
+            .await;
+        assert_eq!(base.keys().await.unwrap().count(), 2);
+
+        prefixed.invalidate_prefix("").await;
+        let keys = base.keys().await.unwrap().collect::<Vec<_>>();
+        assert_eq!(
+            key_fields(&keys),
+            BTreeSet::from([(
+                "other/".to_string(),
+                "other".to_string(),
+                TestKey::<Vec<i32>>::type_name()
+            )])
+        );
+
+        base.clear().await;
+        assert_eq!(base.keys().await.unwrap().count(), 0);
+    }
+
     #[tokio::test]
     async fn test_cache_get_or_insert() {
         let cache = LanceCache::with_capacity(1000);
@@ -833,6 +976,7 @@ mod tests {
                 .await
                 .is_none()
         );
+        assert!(cache.keys().await.is_none());
     }
 
     #[tokio::test]
diff --git a/rust/lance-core/src/cache/moka.rs b/rust/lance-core/src/cache/moka.rs
index 6be7760458a..a3956c1720c 100644
--- a/rust/lance-core/src/cache/moka.rs
+++ b/rust/lance-core/src/cache/moka.rs
@@ -11,7 +11,7 @@ use futures::Future;
 use crate::Result;
 
 use super::CacheCodec;
-use super::backend::{CacheBackend, CacheEntry, InternalCacheKey};
+use super::backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey};
 
 /// Internal record stored in the moka cache.
 #[derive(Clone, Debug)]
@@ -123,6 +123,13 @@ impl CacheBackend for MokaCacheBackend {
         self.cache.run_pending_tasks().await;
     }
 
+    async fn keys(&self) -> Option<CacheKeyIterator<'_>> {
+        // Run moka's pending maintenance tasks before listing, as `num_entries` does.
+        self.cache.run_pending_tasks().await;
+        let owned_keys = self.cache.iter().map(|(key, _)| (*key).clone());
+        Some(Box::new(owned_keys))
+    }
+
     async fn num_entries(&self) -> usize {
         self.cache.run_pending_tasks().await;
         self.cache.entry_count() as usize
diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs
index 628f9cf9a90..8837037c308 100644
--- a/rust/lance-core/src/datatypes.rs
+++ b/rust/lance-core/src/datatypes.rs
@@ -25,6 +25,7 @@ pub use field::{
 pub use schema::{
     BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema,
     escape_field_path_for_project, format_field_path, parse_field_path,
+    validate_fixed_size_list_dimensions,
 };
 
 pub static BLOB_DESC_FIELDS: LazyLock<Fields> = LazyLock::new(|| {
diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs
index 4c2665a3640..9f06d421949 100644
--- a/rust/lance-core/src/datatypes/field.rs
+++ b/rust/lance-core/src/datatypes/field.rs
@@ -575,6 +575,18 @@ impl Field {
         }
     }
 
+    /// Switch every blob v2 field in this subtree to its descriptor view.
+    ///
+    /// A blob v2 field is converted via `unloaded_mut` and its children are
+    /// not visited; any other field forwards the call to each child.
+    pub fn unload_blobs_recursive(&mut self) {
+        if !self.is_blob_v2() {
+            self.children.iter_mut().for_each(Self::unload_blobs_recursive);
+            return;
+        }
+        self.unloaded_mut();
+    }
+
     pub fn project(&self, path_components: &[&str]) -> Result<Self> {
         let mut f = Self {
             name: self.name.clone(),
@@ -1864,6 +1876,54 @@ mod tests {
         assert_eq!(field.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type);
     }
 
+    #[test]
+    fn unload_blobs_recursive_only_unloads_blob_v2() {
+        let legacy_metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]);
+        let blob_v2_metadata =
+            HashMap::from([(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]);
+
+        let mut field: Field = ArrowField::new(
+            "parent",
+            DataType::Struct(Fields::from(vec![
+                ArrowField::new("legacy_blob", DataType::LargeBinary, true)
+                    .with_metadata(legacy_metadata),
+                ArrowField::new(
+                    "blob_v2",
+                    DataType::Struct(
+                        vec![
+                            ArrowField::new("data", DataType::LargeBinary, true),
+                            ArrowField::new("uri", DataType::Utf8, true),
+                        ]
+                        .into(),
+                    ),
+                    true,
+                )
+                .with_metadata(blob_v2_metadata),
+            ])),
+            true,
+        )
+        .try_into()
+        .unwrap();
+
+        field.unload_blobs_recursive();
+
+        let legacy_blob = field
+            .children
+            .iter()
+            .find(|f| f.name == "legacy_blob")
+            .unwrap();
+        assert_eq!(
+            legacy_blob.logical_type,
+            LogicalType::try_from(&DataType::LargeBinary).unwrap()
+        );
+        assert_eq!(legacy_blob.children.len(), 0);
+        assert!(legacy_blob.metadata.contains_key(BLOB_META_KEY));
+
+        let blob_v2 = field.children.iter().find(|f| f.name == "blob_v2").unwrap();
+        assert_eq!(blob_v2.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type);
+        assert_eq!(blob_v2.children.len(), 5);
+    }
+
     #[test]
     fn project_by_field_accepts_blob_descriptor_projection() {
         let metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]);
diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs
index f959c37672f..d13eb476359 100644
--- a/rust/lance-core/src/datatypes/schema.rs
+++ b/rust/lance-core/src/datatypes/schema.rs
@@ -11,7 +11,7 @@ use std::{
 
 use crate::deepsize::DeepSizeOf;
 use arrow_array::RecordBatch;
-use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
+use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
 use lance_arrow::*;
 
 use super::field::{Field, OnTypeMismatch, SchemaCompareOptions};
@@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> {
     }
 }
 
+/// Reject `FixedSizeList` types whose dimension is not a positive integer.
+///
+/// A fixed-size list's row count is the number of child items divided by the
+/// dimension, so a zero dimension panics with a divide-by-zero further down
+/// the write path (see issue #5102). A `FixedSizeList` of a `FixedSizeList`
+/// over a primitive collapses into a single leaf field, which hides the inner
+/// list from the pre-order field walk; every nesting level is checked here.
+///
+/// Shared by [`Schema::validate`] on the write path and the decoder's
+/// field-scheduler builders on the read path.
+pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> {
+    let mut current = data_type;
+    while let DataType::FixedSizeList(inner, dimension) = current {
+        if *dimension <= 0 {
+            return Err(Error::schema(format!(
+                "Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer"
+            )));
+        }
+        current = inner.data_type();
+    }
+    Ok(())
+}
+
 impl Schema {
     /// The unenforced primary key fields in the schema, ordered by position.
     ///
@@ -346,6 +369,10 @@ impl Schema {
                     field.id, self
                 )));
             }
+            // The row count of a fixed-size list is derived by dividing the
+            // number of items by the dimension, so a zero dimension would
+            // panic with a divide-by-zero further down the write path.
+            validate_fixed_size_list_dimensions(&field.name, &field.data_type())?;
         }
 
         Ok(())
@@ -2825,6 +2852,67 @@ mod tests {
         assert!(paths.contains(&"name".to_string()));
     }
 
+    #[test]
+    fn test_validate_rejects_zero_dimension_fixed_size_list() {
+        // A zero dimension divides-by-zero further down the write path (#5102)
+        let fsl = |dimension: i32| {
+            ArrowDataType::FixedSizeList(
+                Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)),
+                dimension,
+            )
+        };
+
+        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]);
+        let err = Schema::try_from(&arrow_schema).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("dimension must be a positive integer"),
+            "unexpected error: {}",
+            err
+        );
+
+        // Nested inside a struct is rejected too
+        let arrow_schema = ArrowSchema::new(vec![ArrowField::new(
+            "outer",
+            ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new(
+                "vec",
+                fsl(0),
+                true,
+            )])),
+            true,
+        )]);
+        let err = Schema::try_from(&arrow_schema).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("dimension must be a positive integer"),
+            "unexpected error: {}",
+            err
+        );
+
+        // A zero-dimension FixedSizeList nested inside a positive-dimension
+        // FixedSizeList collapses into a single leaf field, so the inner
+        // dimension is not visited by the pre-order field walk and must still
+        // be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4).
+        let nested =
+            ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4);
+        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]);
+        let err = Schema::try_from(&arrow_schema).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("dimension must be a positive integer"),
+            "unexpected error: {}",
+            err
+        );
+
+        // A positive dimension still validates, including nested lists
+        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]);
+        assert!(Schema::try_from(&arrow_schema).is_ok());
+        let nested_ok =
+            ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4);
+        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]);
+        assert!(Schema::try_from(&arrow_schema).is_ok());
+    }
+
     #[test]
     fn test_schema_unenforced_clustering_key() {
         use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION;
diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs
index 8f16744b158..c202329838c 100644
--- a/rust/lance-core/src/utils.rs
+++ b/rust/lance-core/src/utils.rs
@@ -12,6 +12,7 @@ pub mod cpu;
 pub mod deletion;
 pub mod futures;
 pub mod hash;
+pub mod io_stats;
 pub mod parse;
 pub mod path;
 pub mod tempfile;
diff --git a/rust/lance-core/src/utils/io_stats.rs b/rust/lance-core/src/utils/io_stats.rs
new file mode 100644
index 00000000000..e2169d71ae3
--- /dev/null
+++ b/rust/lance-core/src/utils/io_stats.rs
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::ops::Range;
+
+/// A sink that records I/O requests as they are submitted to storage.
+///
+/// This lives in `lance-core` so that the encoding layer (`lance-encoding`) and
+/// the I/O layer (`lance-io`) can both refer to it without depending on one
+/// another.  It lets a caller attach a lightweight counter to a file reader and
+/// measure the exact bytes/IOPS performed for a bounded scope (e.g. a single
+/// query); see `lance_io::scheduler::IoStats` for the concrete implementation.
+///
+/// # When to use this
+///
+/// Lance also exposes two *process-wide, cumulative* I/O accounting facilities:
+/// the global scheduler counters (`lance_io::scheduler::iops_counter` /
+/// `bytes_read_counter`) and the object-store `IOTracker` wrapper used in tests.
+/// Both aggregate every read in the process and cannot attribute I/O to a single
+/// bounded scope.  Prefer an `IoStatsRecorder` when you need the *exact* I/O of
+/// one operation (e.g. a single query): attach it to a reader with
+/// `with_io_stats`, then read the snapshot when the scope ends.  It re-uses the
+/// reader's cached metadata, so measuring costs no extra file opens and does not
+/// disturb the global counters.
+pub trait IoStatsRecorder: std::fmt::Debug + Send + Sync {
+    /// Record one completed request, given the byte ranges as actually
+    /// submitted to storage (i.e. after any coalescing/splitting), so the
+    /// counts reflect physical I/O.
+    fn record_request(&self, ranges: &[Range<u64>]);
+}
diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs
index 79650f6775e..a0da34ba2bb 100644
--- a/rust/lance-datafusion/src/expr.rs
+++ b/rust/lance-datafusion/src/expr.rs
@@ -17,6 +17,18 @@ const MS_PER_DAY: i64 = 86400000;
 // will always yield "x = 7_u64" regardless of the type of the column "x".  As a result, we
 // need to do that literal coercion ourselves.
 pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option<ScalarValue> {
+    // A dictionary target coerces the value to the dictionary's value type and
+    // re-wraps it as a dictionary literal. Only an untyped `ScalarValue::Null`
+    // keeps its untyped form, matching the behavior for all other targets; a
+    // *typed* null (e.g. `Utf8(None)`) is coerced and wrapped like any other
+    // value so it produces a `Dictionary(..)` literal that matches the column.
+    if let DataType::Dictionary(key_type, value_type) = ty {
+        if matches!(value, ScalarValue::Null) {
+            return Some(value.clone());
+        }
+        let inner = safe_coerce_scalar(value, value_type)?;
+        return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner)));
+    }
     match value {
         ScalarValue::Int8(val) => match ty {
             DataType::Int8 => Some(value.clone()),
@@ -436,6 +448,9 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option<ScalarVa
             DataType::BinaryView => Some(value.clone()),
             _ => None,
         },
+        // A dictionary-encoded literal (e.g. produced by DataFusion's dictionary
+        // cast in the scalar-index path) coerces by unwrapping its underlying value.
+        ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty),
         _ => None,
     }
 }
@@ -775,4 +790,97 @@ mod tests {
             Some(ScalarValue::BinaryView(Some(vec![1, 2, 3])))
         );
     }
+
+    #[test]
+    fn test_dictionary_coerce() {
+        let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8));
+
+        // A string literal coerces to a dictionary target by wrapping the
+        // coerced value in a dictionary scalar.
+        assert_eq!(
+            safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty),
+            Some(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(ScalarValue::Utf8(Some("com".to_string()))),
+            ))
+        );
+
+        // The inner value is coerced through to the dictionary value type, so a
+        // LargeUtf8 literal lands as a Utf8 value inside the dictionary.
+        assert_eq!(
+            safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty),
+            Some(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(ScalarValue::Utf8(Some("com".to_string()))),
+            ))
+        );
+
+        // A dictionary literal round-trips back to its value type.
+        assert_eq!(
+            safe_coerce_scalar(
+                &ScalarValue::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(ScalarValue::Utf8(Some("com".to_string()))),
+                ),
+                &DataType::Utf8,
+            ),
+            Some(ScalarValue::Utf8(Some("com".to_string())))
+        );
+
+        // A dictionary literal coerces to a dictionary target, adopting the
+        // target's key type.
+        assert_eq!(
+            safe_coerce_scalar(
+                &ScalarValue::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(ScalarValue::Utf8(Some("com".to_string()))),
+                ),
+                &dict_ty,
+            ),
+            Some(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(ScalarValue::Utf8(Some("com".to_string()))),
+            ))
+        );
+
+        // An untyped null keeps its untyped form for a dictionary target, just
+        // like for every other target type.
+        assert_eq!(
+            safe_coerce_scalar(&ScalarValue::Null, &dict_ty),
+            Some(ScalarValue::Null)
+        );
+
+        // A *typed* null (e.g. an API-built `Utf8(None)` literal, or an IN value
+        // already typed as Utf8) is still wrapped in the dictionary type so it
+        // matches the dictionary column. Returning a bare `Utf8(None)` here would
+        // leave `resolve_value` with a literal whose type does not line up with
+        // the column, breaking planning/evaluation the same way non-null strings
+        // used to break.
+        assert_eq!(
+            safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty),
+            Some(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(ScalarValue::Utf8(None)),
+            ))
+        );
+
+        // The inner null is coerced through to the dictionary value type as well,
+        // so a LargeUtf8 typed null lands as a Utf8 null inside the dictionary.
+        assert_eq!(
+            safe_coerce_scalar(&ScalarValue::LargeUtf8(None), &dict_ty),
+            Some(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(ScalarValue::Utf8(None)),
+            ))
+        );
+
+        // A value that cannot be coerced to the dictionary value type fails.
+        assert_eq!(
+            safe_coerce_scalar(
+                &ScalarValue::Utf8(Some("com".to_string())),
+                &DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)),
+            ),
+            None
+        );
+    }
 }
diff --git a/rust/lance-datafusion/src/logical_expr.rs b/rust/lance-datafusion/src/logical_expr.rs
index ab0936d31da..0eed438dae7 100644
--- a/rust/lance-datafusion/src/logical_expr.rs
+++ b/rust/lance-datafusion/src/logical_expr.rs
@@ -463,4 +463,58 @@ mod tests {
             _ => unreachable!("Expected BinaryExpr"),
         }
     }
+
+    #[test]
+    fn test_resolve_typed_null_against_dictionary_column() {
+        // Literal wrapped in the `etld` column's dictionary type (Int16 keys).
+        let dict_literal = |inner: ScalarValue| {
+            let wrapped = ScalarValue::Dictionary(Box::new(DataType::Int16), Box::new(inner));
+            Expr::Literal(wrapped, None)
+        };
+        // Plain Utf8 literal; `None` yields a typed null.
+        let utf8_literal = |value: Option<&str>| {
+            Expr::Literal(ScalarValue::Utf8(value.map(str::to_string)), None)
+        };
+        // Reference to the `etld` column.
+        let etld_column = || Expr::Column("etld".to_string().into());
+
+        // A dictionary-encoded string column, e.g. a categorical field.
+        let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8));
+        let arrow_schema = ArrowSchema::new(vec![Field::new("etld", dict_ty, true)]);
+        let schema = Schema::try_from(&arrow_schema).unwrap();
+
+        // `etld = <typed null>` built directly via the API instead of through SQL
+        // parsing. The typed null must come back wrapped in the dictionary type,
+        // not as a bare `Utf8(None)` literal sitting next to a `Dictionary(...)` column.
+        let equality = Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(etld_column()),
+            op: Operator::Eq,
+            right: Box::new(utf8_literal(None)),
+        });
+        // The right-hand side must now be the dictionary-wrapped null.
+        match resolve_expr(&equality, &schema).unwrap() {
+            Expr::BinaryExpr(resolved) => {
+                assert_eq!(*resolved.right, dict_literal(ScalarValue::Utf8(None)))
+            }
+            other => unreachable!("Expected BinaryExpr, got {other:?}"),
+        }
+
+        // `etld IN ('a', <typed null>)`: a typed value mixed with a typed null, both
+        // already typed as Utf8. Every list element must be wrapped in the
+        // dictionary type.
+        let membership = Expr::in_list(
+            etld_column(),
+            vec![utf8_literal(Some("a")), utf8_literal(None)],
+            false,
+        );
+        let expected = Expr::in_list(
+            etld_column(),
+            vec![
+                dict_literal(ScalarValue::Utf8(Some("a".to_string()))),
+                dict_literal(ScalarValue::Utf8(None)),
+            ],
+            false,
+        );
+        assert_eq!(resolve_expr(&membership, &schema).unwrap(), expected);
+    }
 }
diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml
index eae1e3086b6..83b5aba3689 100644
--- a/rust/lance-datagen/Cargo.toml
+++ b/rust/lance-datagen/Cargo.toml
@@ -21,7 +21,6 @@ hex = "0.4.3"
 rand = { workspace = true }
 rand_distr = { workspace = true }
 rand_xoshiro = { workspace = true }
-random_word = { version = "0.5", features = ["en"] }
 
 [dev-dependencies]
 criterion = { workspace = true }
diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs
index 3756e354bea..39da4734619 100644
--- a/rust/lance-datagen/src/generator.rs
+++ b/rust/lance-datagen/src/generator.rs
@@ -21,7 +21,6 @@ use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, Sc
 use futures::{StreamExt, stream::BoxStream};
 use rand::{Rng, RngCore, SeedableRng, distr::Uniform};
 use rand_distr::Zipf;
-use random_word;
 
 use self::array::rand_with_distribution;
 
@@ -1172,24 +1171,223 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
     }
 }
 
-// Common English stop words placed at the front to be sampled more frequently
+// Common English stop words placed at the front to be sampled more frequently.
 const STOP_WORDS: &[&str] = &[
     "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
     "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
     "they", "this", "to", "was", "will", "with",
 ];
 
+const ENGLISH_WORDS: &[&str] = &[
+    "ability",
+    "able",
+    "about",
+    "above",
+    "accept",
+    "access",
+    "account",
+    "across",
+    "action",
+    "active",
+    "activity",
+    "actual",
+    "address",
+    "adjust",
+    "admin",
+    "advance",
+    "agent",
+    "align",
+    "allow",
+    "amount",
+    "analysis",
+    "answer",
+    "application",
+    "archive",
+    "array",
+    "asset",
+    "async",
+    "attribute",
+    "available",
+    "balance",
+    "batch",
+    "binary",
+    "bitmap",
+    "block",
+    "branch",
+    "buffer",
+    "build",
+    "cache",
+    "capacity",
+    "catalog",
+    "change",
+    "chunk",
+    "client",
+    "cluster",
+    "column",
+    "commit",
+    "common",
+    "compare",
+    "compile",
+    "compute",
+    "condition",
+    "config",
+    "connect",
+    "content",
+    "context",
+    "control",
+    "convert",
+    "copy",
+    "core",
+    "count",
+    "create",
+    "current",
+    "cursor",
+    "data",
+    "dataset",
+    "decode",
+    "default",
+    "delete",
+    "delta",
+    "depend",
+    "derive",
+    "design",
+    "detail",
+    "detect",
+    "device",
+    "direct",
+    "display",
+    "document",
+    "domain",
+    "drive",
+    "dynamic",
+    "encode",
+    "engine",
+    "error",
+    "event",
+    "example",
+    "execute",
+    "expand",
+    "expect",
+    "export",
+    "extend",
+    "feature",
+    "field",
+    "filter",
+    "final",
+    "finish",
+    "format",
+    "fragment",
+    "future",
+    "generate",
+    "global",
+    "group",
+    "handle",
+    "header",
+    "index",
+    "input",
+    "insert",
+    "inspect",
+    "instance",
+    "integer",
+    "internal",
+    "item",
+    "join",
+    "kernel",
+    "large",
+    "layer",
+    "layout",
+    "length",
+    "level",
+    "limit",
+    "linear",
+    "local",
+    "logical",
+    "lookup",
+    "manage",
+    "manifest",
+    "memory",
+    "merge",
+    "metric",
+    "model",
+    "module",
+    "namespace",
+    "native",
+    "node",
+    "normal",
+    "number",
+    "object",
+    "offset",
+    "option",
+    "output",
+    "package",
+    "page",
+    "parallel",
+    "parse",
+    "partition",
+    "pattern",
+    "physical",
+    "plan",
+    "policy",
+    "prefix",
+    "prepare",
+    "primary",
+    "process",
+    "profile",
+    "project",
+    "property",
+    "query",
+    "range",
+    "reader",
+    "record",
+    "region",
+    "registry",
+    "request",
+    "resolve",
+    "resource",
+    "result",
+    "return",
+    "row",
+    "runtime",
+    "scalar",
+    "scan",
+    "schema",
+    "search",
+    "segment",
+    "select",
+    "session",
+    "setting",
+    "source",
+    "stable",
+    "stage",
+    "state",
+    "static",
+    "storage",
+    "stream",
+    "string",
+    "struct",
+    "table",
+    "target",
+    "task",
+    "thread",
+    "token",
+    "trace",
+    "transform",
+    "type",
+    "update",
+    "upload",
+    "value",
+    "vector",
+    "version",
+    "view",
+    "write",
+    "writer",
+];
+
 /// Word list with stop words at the front for Zipf sampling, computed once.
 static SENTENCE_WORDS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
-    let all_words = random_word::all(random_word::Lang::En);
-    let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len());
+    let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len());
     words.extend(STOP_WORDS.iter().copied());
-    words.extend(
-        all_words
-            .iter()
-            .filter(|w| !STOP_WORDS.contains(w))
-            .copied(),
-    );
+    words.extend(ENGLISH_WORDS.iter().copied());
     words
 });
 
@@ -1279,7 +1477,7 @@ struct RandomWordGenerator {
 
 impl RandomWordGenerator {
     pub fn new(is_large: bool) -> Self {
-        let words = random_word::all(random_word::Lang::En);
+        let words = ENGLISH_WORDS;
         Self { words, is_large }
     }
 }
@@ -3190,9 +3388,9 @@ mod tests {
         assert_eq!(
             *genn.generate(RowCount::from(3), &mut rng).unwrap(),
             arrow_array::BinaryArray::from_iter_values([
-                vec![174, 178],
-                vec![64, 122, 207, 248],
-                vec![124, 3, 58]
+                vec![111, 9, 80],
+                vec![86, 118, 13, 209],
+                vec![68, 33, 202]
             ])
         );
     }
diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs
index 59886d337d1..a30d5ed93a9 100644
--- a/rust/lance-encoding/src/decoder.rs
+++ b/rust/lance-encoding/src/decoder.rs
@@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream};
 use futures::{FutureExt, StreamExt};
 use lance_arrow::DataTypeExt;
 use lance_core::cache::LanceCache;
-use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema};
+use lance_core::datatypes::{
+    BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions,
+};
 use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt};
 use lance_core::utils::parse::parse_env_as_bool;
 use log::{debug, trace, warn};
@@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy {
         column_infos: &mut ColumnInfoIter,
     ) -> Result<Box<dyn StructuralFieldScheduler>> {
         let data_type = field.data_type();
+        validate_fixed_size_list_dimensions(&field.name, &data_type)?;
         if Self::is_structural_primitive(&data_type) {
             let column_info = column_infos.expect_next()?;
             let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new(
@@ -832,6 +835,7 @@ impl CoreFieldDecoderStrategy {
         buffers: FileBuffers,
     ) -> Result<Box<dyn crate::previous::decoder::FieldScheduler>> {
         let data_type = field.data_type();
+        validate_fixed_size_list_dimensions(&field.name, &data_type)?;
         if Self::is_primitive_legacy(&data_type) {
             let column_info = column_infos.expect_next()?;
             let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?;
@@ -2887,6 +2891,52 @@ pub async fn decode_batch(
 mod tests {
     use super::*;
 
+    #[test]
+    fn test_read_zero_dimension_fsl_errors_instead_of_panicking() {
+        // Old writers (before #5102) could persist a schema that declares a
+        // zero-dimension FixedSizeList; this simulates reading such a column.
+        // The field-scheduler factories build the read plan and run the
+        // dimension guard before touching any column data, so an empty column
+        // iterator is enough to reach the guard. Each read path must report a
+        // clean error rather than panic with a divide-by-zero.
+        use arrow_schema::Field as ArrowField;
+
+        // Both read paths are expected to fail with the dimension-guard message,
+        // so the check is shared.
+        fn assert_dimension_error(err: impl std::fmt::Display) {
+            let message = err.to_string();
+            assert!(
+                message.contains("dimension must be a positive integer"),
+                "unexpected error: {}",
+                err
+            );
+        }
+
+        // A FixedSizeList of Float32 items whose declared dimension is zero.
+        let item = Arc::new(ArrowField::new("item", DataType::Float32, true));
+        let zero_dim = DataType::FixedSizeList(item, 0);
+        let arrow_field = ArrowField::new("vec", zero_dim, true);
+        let field = Field::try_from(&arrow_field).unwrap();
+        let strategy = CoreFieldDecoderStrategy::default();
+
+        // Structural read path.
+        let mut structural_columns = ColumnInfoIter::new(vec![], &[]);
+        let structural_err = strategy
+            .create_structural_field_scheduler(&field, &mut structural_columns)
+            .unwrap_err();
+        assert_dimension_error(structural_err);
+
+        // Legacy read path; no file buffers are needed to reach the guard.
+        let mut legacy_columns = ColumnInfoIter::new(vec![], &[]);
+        let no_buffers = FileBuffers {
+            positions_and_sizes: &[],
+        };
+        let legacy_err = strategy
+            .create_legacy_field_scheduler(&field, &mut legacy_columns, no_buffers)
+            .unwrap_err();
+        assert_dimension_error(legacy_err);
+    }
+
     #[test]
     fn test_coalesce_indices_to_ranges_with_single_index() {
         let indices = vec![1];
diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs
index 064e3b59745..9b506359e55 100644
--- a/rust/lance-encoding/src/encodings/logical/primitive.rs
+++ b/rust/lance-encoding/src/encodings/logical/primitive.rs
@@ -3701,12 +3701,7 @@ struct SerializedFullZip {
 //
 // If we directly record the size in bytes with 12 bits we would be limited to
 // 4KiB which is too small.  Since we know each mini-block consists of 8 byte
-// words we can store the # of words instead which gives us 32KiB.  We want
-// at least 24KiB so we can handle even the worst case of
-// - 4Ki values compressed into an 8186 byte buffer
-// - 4 bytes to describe rep & def lengths
-// - 16KiB of rep & def buffer (this will almost never happen but life is easier if we
-//   plan for it)
+// words we can store the # of words instead which gives us 32KiB.
 //
 // Second, each chunk in a mini-block is aligned to 8 bytes.  This allows multi-byte
 // values like offsets to be stored in a mini-block and safely read back out.  It also
@@ -3906,9 +3901,9 @@ impl PrimitiveStructuralEncoder {
     // 0xA)  All blocks except the last must have power-of-two number of values.
     // This not only makes metadata smaller but it makes decoding easier since
     // batch sizes are typically a power of 2.  4 bits would allow us to express
-    // up to 16Ki values but we restrict this further to 4Ki values.
+    // up to 32Ki values.
     //
-    // This means blocks can have 1 to 4Ki values and 8 - 32Ki bytes.
+    // This means blocks can have 1 to 32Ki values and 8 - 32Ki bytes.
     //
     // All metadata words are serialized (as little endian) into a single buffer
     // of metadata values.
@@ -4007,7 +4002,13 @@ impl PrimitiveStructuralEncoder {
                 }
             } else {
                 for &buffer_size in &chunk.buffer_sizes {
-                    data_buffer.extend_from_slice(&(buffer_size as u16).to_le_bytes());
+                    let buffer_size = u16::try_from(buffer_size).map_err(|_| {
+                        Error::internal(format!(
+                            "Mini-block buffer size ({} bytes) too large for 16-bit metadata",
+                            buffer_size
+                        ))
+                    })?;
+                    data_buffer.extend_from_slice(&buffer_size.to_le_bytes());
                 }
             }
 
@@ -4041,15 +4042,28 @@ impl PrimitiveStructuralEncoder {
 
             let chunk_bytes = data_buffer.len() - start_pos;
             let max_chunk_size = if support_large_chunk {
-                4 * 1024 * 1024 * 1024 // 4GB limit with u32 metadata
+                1_u64 << 31 // 28 bits of 8-byte words in u32 metadata
             } else {
                 32 * 1024 // 32KiB limit with u16 metadata
             };
-            assert!(chunk_bytes <= max_chunk_size);
-            assert!(chunk_bytes > 0);
-            assert_eq!(chunk_bytes % 8, 0);
-            // 4Ki values max
-            assert!(chunk.log_num_values <= 12);
+            // Reject empty chunks as well as chunks too large for the metadata size bits.
+            if chunk_bytes == 0 || chunk_bytes as u64 > max_chunk_size {
+                return Err(Error::internal(format!(
+                    "Mini-block chunk size {chunk_bytes} must be in 1..={max_chunk_size} bytes"
+                )));
+            }
+            if chunk_bytes % MINIBLOCK_ALIGNMENT != 0 {
+                return Err(Error::internal(format!(
+                    "Mini-block chunk size {} bytes is not aligned to {} bytes",
+                    chunk_bytes, MINIBLOCK_ALIGNMENT
+                )));
+            }
+            if chunk.log_num_values > 15 {
+                return Err(Error::internal(format!(
+                    "Mini-block log_num_values {} exceeds the 4-bit metadata limit",
+                    chunk.log_num_values
+                )));
+            }
             // We subtract 1 here from chunk_bytes because we want to be able to express
             // a size of 32KiB and not (32Ki - 8)B which is what we'd get otherwise with
             // 0xFFF
@@ -5768,8 +5782,9 @@ mod tests {
     use super::{
         ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor,
         FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipReadSource,
-        FullZipRepIndexDetails, FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor,
-        PreambleAction, StructuralPageScheduler, VariableFullZipDecoder,
+        FullZipRepIndexDetails, FullZipScheduler, MiniBlockChunk, MiniBlockCompressed,
+        MiniBlockRepIndex, PerValueDecompressor, PreambleAction, StructuralPageScheduler,
+        VariableFullZipDecoder,
     };
     use crate::buffer::LanceBuffer;
     use crate::compression::DefaultDecompressionStrategy;
@@ -6967,7 +6982,7 @@ mod tests {
     #[tokio::test]
     async fn test_binary_large_minichunk_size_over_max_miniblock_values() {
         let mut string_data = Vec::new();
-        // 128kb/chunk / 6 bytes (t_9999) = 21845 > max 4096 items per chunk
+        // 128kb/chunk / 6 bytes (t_9999) = 21845 items per chunk
         for i in 0..10000 {
             string_data.push(Some(format!("t_{}", i)));
         }
@@ -7566,6 +7581,36 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_v2_1_miniblock_serializes_log_num_values_15() {
+        // 32_769 values: one chunk of 2^15 values, then a final one-value chunk
+        // whose log_num_values is stored as 0. Every chunk has one 8-byte buffer.
+        let chunk = |log_num_values: u8| MiniBlockChunk {
+            buffer_sizes: vec![8],
+            log_num_values,
+        };
+
+        let miniblocks = MiniBlockCompressed {
+            data: vec![LanceBuffer::from(vec![1_u8; 16])],
+            chunks: vec![chunk(15), chunk(0)],
+            num_values: 32_769,
+        };
+
+        let serialized =
+            PrimitiveStructuralEncoder::serialize_miniblocks(miniblocks, None, None, false)
+                .unwrap();
+
+        // The low 4 bits of each u16 metadata word carry log_num_values.
+        let chunk_metadata = serialized.metadata.borrow_to_typed_slice::<u16>();
+        assert_eq!(chunk_metadata.len(), 2);
+        let log_num_values_bits = chunk_metadata[0] & 0x0F;
+        assert_eq!(
+            log_num_values_bits,
+            15,
+            "V2.1 metadata should use all 4 bits for log_num_values"
+        );
+    }
+
     async fn encode_first_page(
         field: arrow_schema::Field,
         array: ArrayRef,
diff --git a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs
index de3227b2a39..1cf3b9bf581 100644
--- a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs
+++ b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs
@@ -19,13 +19,14 @@ use lance_core::Result;
 pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6;
 
 const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096;
+const MAX_CONFIGURABLE_MINIBLOCK_VALUES: u64 = 32768;
 
 fn parse_max_miniblock_values() -> u64 {
     let val = std::env::var("LANCE_MINIBLOCK_MAX_VALUES")
         .ok()
         .and_then(|v| v.parse().ok())
         .unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES);
-    val.clamp(1, DEFAULT_MAX_MINIBLOCK_VALUES)
+    val.clamp(1, MAX_CONFIGURABLE_MINIBLOCK_VALUES)
 }
 
 pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock<u64> =
@@ -58,9 +59,9 @@ pub struct MiniBlockCompressed {
 /// and contain a power-of-two number of values (except for the last chunk)
 ///
 /// By default we limit a chunk to 4Ki values and slightly less than
-/// 8KiB of compressed data.  This means that even in the extreme case
-/// where we have 4 bytes of rep/def then we will have at most 24KiB of
-/// data (values, repetition, and definition) per mini-block.
+/// 8KiB of compressed value data.  The byte budget remains the primary
+/// constraint, so only encodings that compress many values into that
+/// budget can use larger value counts when explicitly configured.
 ///
 /// The maximum number of values per chunk can be configured via the
 /// `LANCE_MINIBLOCK_MAX_VALUES` environment variable.  This is only
@@ -77,8 +78,8 @@ pub struct MiniBlockChunk {
     // then this should be 0 (the number of values will be calculated by subtracting the
     // size of all other chunks from the total size of the page)
     //
-    // For example, 1 would mean there are 2 values in the chunk and 12 would mean there
-    // are 4Ki values in the chunk.
+    // For example, 1 would mean there are 2 values in the chunk and 15 would mean there
+    // are 32Ki values in the chunk.
     //
     // This must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096)
     pub log_num_values: u8,
@@ -135,6 +136,14 @@ mod tests {
         unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
     }
 
+    #[test]
+    #[serial]
+    fn test_parse_can_raise_to_32k() {
+        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "32768") };
+        assert_eq!(parse_max_miniblock_values(), 32768);
+        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
+    }
+
     #[test]
     #[serial]
     fn test_parse_clamps_zero_to_one() {
@@ -147,7 +156,10 @@ mod tests {
     #[serial]
     fn test_parse_clamps_above_max() {
         unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "99999") };
-        assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES);
+        assert_eq!(
+            parse_max_miniblock_values(),
+            MAX_CONFIGURABLE_MINIBLOCK_VALUES
+        );
         unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
     }
 
diff --git a/rust/lance-encoding/src/lib.rs b/rust/lance-encoding/src/lib.rs
index cb4062d3220..a58e0a14c59 100644
--- a/rust/lance-encoding/src/lib.rs
+++ b/rust/lance-encoding/src/lib.rs
@@ -86,6 +86,22 @@ pub trait EncodingsIo: std::fmt::Debug + Send + Sync {
     fn with_bypass_backpressure(&self) -> Option<Arc<dyn EncodingsIo>> {
         None
     }
+
+    /// Returns a version of this I/O service that additionally records the I/O it
+    /// performs into `stats`, on top of any global accounting.  This is the seam
+    /// used to measure exact per-scope (e.g. per-query) I/O without re-opening
+    /// files: wrap a reader's I/O service, perform the reads, then inspect the
+    /// recorder.
+    ///
+    /// Returns `None` if this implementation does not support per-scope I/O
+    /// statistics (e.g. in-memory or test schedulers); the caller should then keep
+    /// using `self`, and nothing is recorded.  The default implementation does so.
+    fn with_io_stats(
+        &self,
+        _stats: Arc<dyn lance_core::utils::io_stats::IoStatsRecorder>,
+    ) -> Option<Arc<dyn EncodingsIo>> {
+        None
+    }
 }
 
 /// An implementation of EncodingsIo that serves data from an in-memory buffer
diff --git a/rust/lance-file/src/io.rs b/rust/lance-file/src/io.rs
index c09e9d8d372..1a8edf92b08 100644
--- a/rust/lance-file/src/io.rs
+++ b/rust/lance-file/src/io.rs
@@ -38,6 +38,16 @@ impl EncodingsIo for LanceEncodingsIo {
         }))
     }
 
+    fn with_io_stats(
+        &self,
+        stats: Arc<dyn lance_core::utils::io_stats::IoStatsRecorder>,
+    ) -> Option<Arc<dyn EncodingsIo>> {
+        Some(Arc::new(Self {
+            scheduler: self.scheduler.with_io_stats(stats),
+            read_chunk_size: self.read_chunk_size,
+        }))
+    }
+
     fn submit_request(
         &self,
         ranges: Vec<std::ops::Range<u64>>,
diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs
index 9e4e4c449a4..c454f73819e 100644
--- a/rust/lance-file/src/reader.rs
+++ b/rust/lance-file/src/reader.rs
@@ -470,6 +470,23 @@ impl FileReader {
         }
     }
 
+    /// Clones this reader so that the I/O it performs is also reported to
+    /// `stats`, in addition to the scheduler's global accounting.
+    ///
+    /// The clone shares all cached metadata with `self`: no file is re-opened
+    /// and only a few `Arc` clones are made.  When the underlying I/O service
+    /// has no per-scope statistics support (e.g. an in-memory scheduler), the
+    /// result is an ordinary, uninstrumented clone.
+    pub fn with_io_stats(
+        &self,
+        stats: Arc<dyn lance_core::utils::io_stats::IoStatsRecorder>,
+    ) -> Self {
+        // `None` means the I/O service cannot record per-scope statistics.
+        self.scheduler
+            .with_io_stats(stats)
+            .map_or_else(|| self.clone(), |scheduler| self.with_scheduler(scheduler))
+    }
+
     pub fn num_rows(&self) -> u64 {
         self.num_rows
     }
diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs
index 14a4c82bde6..12bd50df6fe 100644
--- a/rust/lance-file/src/writer.rs
+++ b/rust/lance-file/src/writer.rs
@@ -633,14 +633,11 @@ impl FileWriter {
     async fn write_global_buffers(&mut self) -> Result<Vec<(u64, u64)>> {
         let schema = self.schema.as_mut().ok_or(Error::invalid_input("No schema provided on writer open and no data provided.  Schema is unknown and file cannot be created"))?;
         schema.metadata = std::mem::take(&mut self.schema_metadata);
-        // Use descriptor layout for blob v2 in the footer to avoid exposing logical child fields.
-        //
-        // TODO(xuanwo): this doesn't work on nested struct, need better solution like fields_per_order_mut?
-        schema.fields.iter_mut().for_each(|f| {
-            if f.is_blob_v2() {
-                f.unloaded_mut();
-            }
-        });
+        // Use descriptor layout for blob v2 fields in the footer to avoid exposing logical child fields.
+        schema
+            .fields
+            .iter_mut()
+            .for_each(|f| f.unload_blobs_recursive());
 
         let file_descriptor = Self::make_file_descriptor(schema, self.rows_written)?;
         let file_descriptor_bytes = file_descriptor.encode_to_vec();
diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
index e3947b57856..85de43c0f9b 100644
--- a/rust/lance-index/Cargo.toml
+++ b/rust/lance-index/Cargo.toml
@@ -56,6 +56,7 @@ object_store.workspace = true
 prost.workspace = true
 prost-types.workspace = true
 rand.workspace = true
+regex-syntax.workspace = true
 roaring.workspace = true
 rayon.workspace = true
 serde_json.workspace = true
diff --git a/rust/lance-index/benches/rq.rs b/rust/lance-index/benches/rq.rs
index 4a7364d1313..72e0c49820d 100644
--- a/rust/lance-index/benches/rq.rs
+++ b/rust/lance-index/benches/rq.rs
@@ -17,11 +17,16 @@ use lance_datagen::array::rand_type;
 use lance_datagen::{BatchGeneratorBuilder, RowCount};
 use lance_index::vector::bq::RQRotationType;
 use lance_index::vector::bq::builder::RabitQuantizer;
+use lance_index::vector::bq::ex_dot::{
+    blocked_ex_code_bytes, ex_dot_kernel, pack_blocked_row, packed_ex_code_value,
+};
 use lance_index::vector::bq::storage::*;
 use lance_index::vector::bq::transform::{ADD_FACTORS_COLUMN, SCALE_FACTORS_COLUMN};
 use lance_index::vector::quantizer::{Quantization, QuantizerStorage};
 use lance_index::vector::storage::{DistCalculator, VectorStore};
 use lance_linalg::distance::DistanceType;
+use rand::rngs::SmallRng;
+use rand::{Rng, SeedableRng};
 
 const DIM: usize = 128;
 const TOTAL: usize = 16 * 1000;
@@ -119,16 +124,526 @@ fn compute_distances(c: &mut Criterion) {
     }
 }
 
-#[cfg(target_os = "linux")]
-criterion_group!(
-    name=benches;
-    config = Criterion::default().measurement_time(Duration::from_secs(10));
-    targets = construct_dist_table, compute_distances);
+/// The table-gather ex distance used before the dedicated ex-dot kernels,
+/// kept here as the baseline: per dim, extract the packed code and gather
+/// `query[d] * code` from a `dim * 2^ex_bits` table.
+fn gather_ex_distance(row_codes: &[u8], dim: usize, ex_bits: u8, ex_dist_table: &[f32]) -> f32 {
+    let entries_per_dim = 1usize << ex_bits;
+    (0..dim)
+        .map(|dim_idx| {
+            let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize;
+            ex_dist_table[dim_idx * entries_per_dim + code]
+        })
+        .sum()
+}
+
+fn ex_dot_kernels(c: &mut Criterion) {
+    for ex_dim in [1536usize, 2048] {
+        ex_dot_kernels_for_dim(c, ex_dim);
+    }
+}
+
+fn ex_dot_kernels_for_dim(c: &mut Criterion, ex_dim: usize) {
+    const NUM_ROWS: usize = 1024;
+
+    let mut rng = SmallRng::seed_from_u64(42);
+    let query = (0..ex_dim)
+        .map(|_| rng.random_range(-1.0f32..1.0))
+        .collect::<Vec<_>>();
+
+    for ex_bits in 1..=8u8 {
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+        let values = (0..NUM_ROWS * ex_dim)
+            .map(|_| rng.random_range(0..=max_code))
+            .collect::<Vec<_>>();
+
+        // The gather baseline reads the legacy sequential layout it shipped
+        // with; the kernel reads the blocked layout.
+        let seq_code_len = (ex_dim * ex_bits as usize).div_ceil(8);
+        let mut seq_codes = vec![0u8; NUM_ROWS * seq_code_len];
+        for (row, row_values) in seq_codes
+            .chunks_exact_mut(seq_code_len)
+            .zip(values.chunks_exact(ex_dim))
+        {
+            for (dim, &value) in row_values.iter().enumerate() {
+                let bit_offset = dim * ex_bits as usize;
+                let bits = (value as u16) << (bit_offset % 8);
+                row[bit_offset / 8] |= bits as u8;
+                if bits >> 8 != 0 {
+                    row[bit_offset / 8 + 1] |= (bits >> 8) as u8;
+                }
+            }
+        }
+
+        let kernel_code_len = blocked_ex_code_bytes(ex_dim, ex_bits);
+        let mut kernel_codes = vec![0u8; NUM_ROWS * kernel_code_len];
+        for (row, row_values) in kernel_codes
+            .chunks_exact_mut(kernel_code_len)
+            .zip(values.chunks_exact(ex_dim))
+        {
+            pack_blocked_row(row_values, ex_bits, row);
+        }
+
+        // ex_dim is block-aligned here, so the kernels read the query as-is.
+        let ex_query = &query;
+        let kernel = ex_dot_kernel(ex_bits);
+        c.bench_function(
+            format!("RQ ex_dot kernel: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}").as_str(),
+            |b| {
+                b.iter(|| {
+                    let mut sum = 0.0f32;
+                    for row in kernel_codes.chunks_exact(kernel_code_len) {
+                        sum += kernel(ex_query, row);
+                    }
+                    black_box(sum)
+                })
+            },
+        );
+
+        let entries_per_dim = 1usize << ex_bits;
+        let mut ex_dist_table = vec![0.0f32; ex_dim * entries_per_dim];
+        for (dim, table) in ex_dist_table.chunks_exact_mut(entries_per_dim).enumerate() {
+            for (code, value) in table.iter_mut().enumerate() {
+                *value = query[dim] * code as f32;
+            }
+        }
+        c.bench_function(
+            format!("RQ ex_dot table-gather: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}")
+                .as_str(),
+            |b| {
+                b.iter(|| {
+                    let mut sum = 0.0f32;
+                    for row in seq_codes.chunks_exact(seq_code_len) {
+                        sum += gather_ex_distance(row, ex_dim, ex_bits, &ex_dist_table);
+                    }
+                    black_box(sum)
+                })
+            },
+        );
+    }
+}
+
+/// Storage load cost per format: blocked-format ex codes are aliased as-is,
+/// legacy sequential ex codes are repacked row by row.
+fn ex_code_storage_load(c: &mut Criterion) {
+    use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array};
+    use lance_arrow::FixedSizeListArrayExt;
+    use lance_index::vector::bq::ex_dot::repack_sequential_row;
+    use lance_index::vector::bq::rabit_ex_code_bytes;
+    use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN};
+    use std::sync::Arc;
+
+    const LOAD_DIM: usize = 1536;
+    const LOAD_ROWS: usize = 8192;
+    const NUM_BITS: u8 = 4; // ex_bits=3, a bit-plane width
+
+    let ex_bits = NUM_BITS - 1;
+    let mut rng = SmallRng::seed_from_u64(7);
+    let metadata = RabitQuantizationMetadata {
+        rotate_mat: None,
+        rotate_mat_position: None,
+        fast_rotation_signs: None,
+        rotation_type: RQRotationType::Fast,
+        code_dim: LOAD_DIM as u32,
+        num_bits: NUM_BITS,
+        packed: true,
+        query_estimator: RabitQueryEstimator::RawQuery,
+    };
+    let code_len = LOAD_DIM / 8;
+    let binary_codes = (0..LOAD_ROWS * code_len)
+        .map(|_| rng.random_range(0..=u8::MAX))
+        .collect::<Vec<_>>();
+    let seq_code_len = rabit_ex_code_bytes(LOAD_DIM, ex_bits).unwrap();
+    let seq_codes = (0..LOAD_ROWS * seq_code_len)
+        .map(|_| rng.random_range(0..=u8::MAX))
+        .collect::<Vec<_>>();
+    let blocked_code_len = blocked_ex_code_bytes(LOAD_DIM, ex_bits);
+    let mut blocked_codes = vec![0u8; LOAD_ROWS * blocked_code_len];
+    for (seq_row, blocked_row) in seq_codes
+        .chunks_exact(seq_code_len)
+        .zip(blocked_codes.chunks_exact_mut(blocked_code_len))
+    {
+        repack_sequential_row(seq_row, LOAD_DIM, ex_bits, blocked_row);
+    }
+
+    let make_batch = |ex_column: &str, ex_values: Vec<u8>, ex_code_len: usize| {
+        arrow_array::RecordBatch::try_from_iter(vec![
+            (
+                ROW_ID,
+                Arc::new(UInt64Array::from_iter_values(0..LOAD_ROWS as u64)) as ArrayRef,
+            ),
+            (
+                RABIT_CODE_COLUMN,
+                Arc::new(
+                    FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(binary_codes.clone()),
+                        code_len as i32,
+                    )
+                    .unwrap(),
+                ) as ArrayRef,
+            ),
+            (
+                ADD_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef,
+            ),
+            (
+                SCALE_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef,
+            ),
+            (
+                ex_column,
+                Arc::new(
+                    FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(ex_values),
+                        ex_code_len as i32,
+                    )
+                    .unwrap(),
+                ) as ArrayRef,
+            ),
+            (
+                EX_ADD_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef,
+            ),
+            (
+                EX_SCALE_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef,
+            ),
+        ])
+        .unwrap()
+    };
+
+    let blocked_batch = make_batch(
+        RABIT_BLOCKED_EX_CODE_COLUMN,
+        blocked_codes,
+        blocked_code_len,
+    );
+    c.bench_function(
+        format!("RQ storage load (blocked ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}")
+            .as_str(),
+        |b| {
+            b.iter(|| {
+                black_box(
+                    RabitQuantizationStorage::try_from_batch(
+                        blocked_batch.clone(),
+                        &metadata,
+                        DistanceType::L2,
+                        None,
+                    )
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    let legacy_batch = make_batch(RABIT_EX_CODE_COLUMN, seq_codes, seq_code_len);
+    c.bench_function(
+        format!("RQ storage load (legacy ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}")
+            .as_str(),
+        |b| {
+            b.iter(|| {
+                black_box(
+                    RabitQuantizationStorage::try_from_batch(
+                        legacy_batch.clone(),
+                        &metadata,
+                        DistanceType::L2,
+                        None,
+                    )
+                    .unwrap(),
+                )
+            })
+        },
+    );
+}
+
+/// Bulk-scoring cost of the ex stage: the quantized ex-FastScan LUT path
+/// (inside `distance_all`) vs the exact per-row ex-dot kernel. The
+/// binary-only run isolates the shared binary stage so the ex cost is the
+/// difference from the full run.
+fn ex_bulk_paths(c: &mut Criterion) {
+    use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array};
+    use lance_arrow::FixedSizeListArrayExt;
+    use lance_index::vector::ApproxMode;
+    use lance_index::vector::bq::ex_dot::pad_query_into;
+    use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN};
+    use lance_index::vector::storage::DistanceCalculatorOptions;
+    use std::sync::Arc;
+
+    const BULK_DIM: usize = 1536;
+    const BULK_ROWS: usize = 16384;
+
+    let mut rng = SmallRng::seed_from_u64(13);
+    for num_bits in [3u8, 5, 9] {
+        let ex_bits = num_bits - 1;
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+
+        let rq = RabitQuantizer::new_with_rotation::<Float32Type>(
+            num_bits,
+            BULK_DIM as i32,
+            RQRotationType::Fast,
+        );
+        let metadata = rq.metadata(None);
+
+        let code_len = BULK_DIM / 8;
+        let binary_codes = (0..BULK_ROWS * code_len)
+            .map(|_| rng.random_range(0..=u8::MAX))
+            .collect::<Vec<_>>();
+        let ex_code_len = blocked_ex_code_bytes(BULK_DIM, ex_bits);
+        let mut ex_codes = vec![0u8; BULK_ROWS * ex_code_len];
+        let values = (0..BULK_DIM)
+            .map(|_| rng.random_range(0..=max_code))
+            .collect::<Vec<_>>();
+        for row in ex_codes.chunks_exact_mut(ex_code_len) {
+            pack_blocked_row(&values, ex_bits, row);
+        }
+
+        // No error factors: `distance_all` takes the FastScan ex bulk branch.
+        let batch = arrow_array::RecordBatch::try_from_iter(vec![
+            (
+                ROW_ID,
+                Arc::new(UInt64Array::from_iter_values(0..BULK_ROWS as u64)) as ArrayRef,
+            ),
+            (
+                RABIT_CODE_COLUMN,
+                Arc::new(
+                    FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(binary_codes),
+                        code_len as i32,
+                    )
+                    .unwrap(),
+                ) as ArrayRef,
+            ),
+            (
+                ADD_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef,
+            ),
+            (
+                SCALE_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef,
+            ),
+            (
+                RABIT_BLOCKED_EX_CODE_COLUMN,
+                Arc::new(
+                    FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(ex_codes.clone()),
+                        ex_code_len as i32,
+                    )
+                    .unwrap(),
+                ) as ArrayRef,
+            ),
+            (
+                EX_ADD_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef,
+            ),
+            (
+                EX_SCALE_FACTORS_COLUMN,
+                Arc::new(Float32Array::from(vec![1.0f32; BULK_ROWS])) as ArrayRef,
+            ),
+        ])
+        .unwrap();
+        let storage =
+            RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None)
+                .unwrap();
+
+        let query: ArrayRef = Arc::new(Float32Array::from(
+            (0..BULK_DIM)
+                .map(|_| rng.random_range(-1.0f32..1.0))
+                .collect::<Vec<_>>(),
+        ));
+
+        for (label, approx_mode) in [
+            ("full distance_all (binary + ex LUT)", ApproxMode::Normal),
+            ("binary-only distance_all (fast mode)", ApproxMode::Fast),
+        ] {
+            let mut f32_scratch = Vec::new();
+            let calc = storage.dist_calculator_with_scratch(
+                query.clone(),
+                0.0,
+                None,
+                &mut f32_scratch,
+                DistanceCalculatorOptions { approx_mode },
+            );
+            let mut dists = Vec::new();
+            let mut u16_scratch = Vec::new();
+            let mut u8_scratch = Vec::new();
+            let mut u32_scratch = Vec::new();
+            c.bench_function(
+                format!("RQ bulk {label}: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}")
+                    .as_str(),
+                |b| {
+                    b.iter(|| {
+                        calc.distance_all_with_scratch(
+                            0,
+                            &mut dists,
+                            &mut u16_scratch,
+                            &mut u8_scratch,
+                            &mut u32_scratch,
+                        );
+                        black_box(dists.len())
+                    })
+                },
+            );
+        }
+
+        let kernel = ex_dot_kernel(ex_bits);
+        let mut ex_query = vec![0.0f32; BULK_DIM];
+        pad_query_into(
+            query
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .unwrap()
+                .values(),
+            &mut ex_query,
+        );
+        c.bench_function(
+            format!(
+                "RQ bulk ex kernel loop: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}"
+            )
+            .as_str(),
+            |b| {
+                b.iter(|| {
+                    let mut sum = 0.0f32;
+                    for row in ex_codes.chunks_exact(ex_code_len) {
+                        sum += kernel(&ex_query, row);
+                    }
+                    black_box(sum)
+                })
+            },
+        );
+    }
+}
+
+/// Top-k accumulation through the gated raw-query multi-bit path: binary
+/// FastScan, the per-row lower-bound pruning scan, and the exact rerank of
+/// the surviving rows. Error factors are present so the gating is enabled.
+fn heap_topk(c: &mut Criterion) {
+    use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array};
+    use lance_arrow::FixedSizeListArrayExt;
+    use lance_index::vector::ApproxMode;
+    use lance_index::vector::bq::transform::{
+        ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN,
+    };
+    use lance_index::vector::storage::DistanceCalculatorOptions;
+    use std::collections::BinaryHeap;
+    use std::sync::Arc;
+
+    const TOPK_DIM: usize = 1536;
+    const TOPK_ROWS: usize = 4096;
+    const TOPK_K: usize = 10;
+    const NUM_BITS: u8 = 5;
+    let ex_bits = NUM_BITS - 1;
+
+    let mut rng = SmallRng::seed_from_u64(99);
+    let rq = RabitQuantizer::new_with_rotation::<Float32Type>(
+        NUM_BITS,
+        TOPK_DIM as i32,
+        RQRotationType::Fast,
+    );
+    let metadata = rq.metadata(None);
+
+    let code_len = TOPK_DIM / 8;
+    let binary_codes = (0..TOPK_ROWS * code_len)
+        .map(|_| rng.random())
+        .collect::<Vec<u8>>();
+    let ex_code_len = blocked_ex_code_bytes(TOPK_DIM, ex_bits);
+    let ex_codes = (0..TOPK_ROWS * ex_code_len)
+        .map(|_| rng.random())
+        .collect::<Vec<u8>>();
+    // Factor magnitudes chosen so the lower bounds spread mostly with the add
+    // factors; once the heap is full the threshold prunes the vast majority
+    // of rows, like a production multi-partition scan.
+    let mut rand_factors = |low: f32, high: f32| {
+        Arc::new(Float32Array::from(
+            (0..TOPK_ROWS)
+                .map(|_| rng.random_range(low..high))
+                .collect::<Vec<_>>(),
+        )) as ArrayRef
+    };
+    let batch = arrow_array::RecordBatch::try_from_iter(vec![
+        (
+            ROW_ID,
+            Arc::new(UInt64Array::from_iter_values(0..TOPK_ROWS as u64)) as ArrayRef,
+        ),
+        (
+            RABIT_CODE_COLUMN,
+            Arc::new(
+                FixedSizeListArray::try_new_from_values(
+                    UInt8Array::from(binary_codes),
+                    code_len as i32,
+                )
+                .unwrap(),
+            ) as ArrayRef,
+        ),
+        (ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)),
+        (SCALE_FACTORS_COLUMN, rand_factors(0.0005, 0.0015)),
+        (ERROR_FACTORS_COLUMN, rand_factors(0.0, 0.01)),
+        (
+            RABIT_BLOCKED_EX_CODE_COLUMN,
+            Arc::new(
+                FixedSizeListArray::try_new_from_values(
+                    UInt8Array::from(ex_codes),
+                    ex_code_len as i32,
+                )
+                .unwrap(),
+            ) as ArrayRef,
+        ),
+        (EX_ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)),
+        (EX_SCALE_FACTORS_COLUMN, rand_factors(0.00003, 0.0001)),
+    ])
+    .unwrap();
+    let storage =
+        RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None).unwrap();
+    let query: ArrayRef = Arc::new(Float32Array::from(
+        (0..TOPK_DIM)
+            .map(|_| rng.random_range(-1.0f32..1.0))
+            .collect::<Vec<_>>(),
+    ));
+
+    for (label, approx_mode) in [
+        ("normal", ApproxMode::Normal),
+        ("accurate", ApproxMode::Accurate),
+    ] {
+        let mut f32_scratch = Vec::new();
+        let calc = storage.dist_calculator_with_scratch(
+            query.clone(),
+            1.0,
+            None,
+            &mut f32_scratch,
+            DistanceCalculatorOptions { approx_mode },
+        );
+        let mut heap = BinaryHeap::with_capacity(TOPK_K + 1);
+        let mut dists = Vec::new();
+        let mut u16_scratch = Vec::new();
+        let mut u8_scratch = Vec::new();
+        let mut u32_scratch = Vec::new();
+        c.bench_function(
+            format!(
+                "RQ heap topk ({label}): num_bits={NUM_BITS}, DIM={TOPK_DIM}, rows={TOPK_ROWS}, k={TOPK_K}"
+            )
+            .as_str(),
+            |b| {
+                b.iter(|| {
+                    heap.clear();
+                    calc.accumulate_topk_with_scratch(
+                        TOPK_K,
+                        None,
+                        None,
+                        |id| id as u64,
+                        &mut heap,
+                        &mut dists,
+                        &mut u16_scratch,
+                        &mut u8_scratch,
+                        &mut u32_scratch,
+                    );
+                    black_box(heap.len())
+                })
+            },
+        );
+    }
+}
 
-#[cfg(not(target_os = "linux"))]
 criterion_group!(
     name=benches;
     config = Criterion::default().measurement_time(Duration::from_secs(10));
-    targets = construct_dist_table, compute_distances);
+    targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths, heap_topk);
 
 criterion_main!(benches);
diff --git a/rust/lance-index/build.rs b/rust/lance-index/build.rs
index 0617de8c806..b47744f7b5a 100644
--- a/rust/lance-index/build.rs
+++ b/rust/lance-index/build.rs
@@ -6,6 +6,9 @@ use std::io::Result;
 
 fn main() -> Result<()> {
     println!("cargo:rerun-if-changed=protos");
+    // Cache-entry protos are library-internal serialization, not part of the
+    // on-disk format spec, so they live in `protos-cache/`, not the shared `protos/`.
+    println!("cargo:rerun-if-changed=protos-cache");
 
     #[cfg(feature = "protoc")]
     // Use vendored protobuf compiler if requested.
@@ -17,8 +20,12 @@ fn main() -> Result<()> {
     prost_build.protoc_arg("--experimental_allow_proto3_optional");
     prost_build.enable_type_names();
     prost_build.compile_protos(
-        &["./protos/index.proto", "./protos/index_old.proto"],
-        &["./protos"],
+        &[
+            "./protos/index.proto",
+            "./protos/index_old.proto",
+            "./protos-cache/cache.proto",
+        ],
+        &["./protos", "./protos-cache"],
     )?;
 
     let rust_toolchain = env::var("RUSTUP_TOOLCHAIN")
diff --git a/rust/lance-index/protos-cache/cache.proto b/rust/lance-index/protos-cache/cache.proto
new file mode 100644
index 00000000000..b24a27055d7
--- /dev/null
+++ b/rust/lance-index/protos-cache/cache.proto
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+// Protobuf headers for serialized index cache entries.
+//
+// These messages describe the *cache* serialization format, not the on-disk
+// Lance format spec, so they live with the library (lance-index) rather than in
+// the top-level `protos/` spec folder.
+//
+// Field numbers and enum values are append-only across all messages here: never
+// renumber or reuse them. A change the proto cannot express transparently
+// (adding/removing/reordering the IPC/raw sections that follow a header) must
+// bump the relevant codec's `CURRENT_VERSION` instead.
+
+syntax = "proto3";
+
+package lance.index.cache;
+
+// ---------------------------------------------------------------------------
+// Full-text search (FTS) posting lists
+// ---------------------------------------------------------------------------
+
+// Header for a serialized `CompressedPostingList` cache entry.
+message CompressedPostingHeader {
+  float max_score = 1;
+  uint32 length = 2;
+  PostingTailCodec posting_tail_codec = 3;
+  PositionStorage position_storage = 4;
+  // Only meaningful when position_storage == POSITION_STORAGE_SHARED.
+  PositionStreamCodec position_stream_codec = 5;
+}
+
+// Header for a serialized `PlainPostingList` cache entry. Followed by an Arrow
+// IPC section of (row_ids: UInt64, frequencies: Float32), then — when
+// position_storage == POSITION_STORAGE_LEGACY — an IPC section of the per-doc
+// position list. Plain postings never carry a shared position stream.
+message PlainPostingHeader {
+  // Absent when the posting has no precomputed block-max score (the in-memory
+  // `max_score` is `None`); present otherwise.
+  optional float max_score = 1;
+  // POSITION_STORAGE_NONE or POSITION_STORAGE_LEGACY only.
+  PositionStorage position_storage = 2;
+}
+
+// Header for a serialized standalone `Positions` cache entry. Followed by the
+// position sections framed per `position_storage`, which is never
+// POSITION_STORAGE_NONE for a standalone entry.
+message PositionsHeader {
+  PositionStorage position_storage = 1;
+  // Only meaningful when position_storage == POSITION_STORAGE_SHARED.
+  PositionStreamCodec position_stream_codec = 2;
+}
+
+// Header for a serialized `PostingListGroup`: a member count followed by that
+// many `PostingList` bodies written inline. Each member body is
+// self-delimiting, so members need no length prefixes, and writing them inline
+// keeps their Arrow IPC sections 64-byte aligned within the group entry.
+message PostingListGroupHeader {
+  uint32 count = 1;
+}
+
+// Tail-block encoding of a compressed posting list.
+enum PostingTailCodec {
+  POSTING_TAIL_CODEC_FIXED32 = 0;
+  POSTING_TAIL_CODEC_VARINT_DELTA = 1;
+}
+
+// Encoding of a shared position stream's byte buffer.
+enum PositionStreamCodec {
+  POSITION_STREAM_CODEC_VARINT_DOC_DELTA = 0;
+  POSITION_STREAM_CODEC_PACKED_DELTA = 1;
+}
+
+// Which (if any) positions accompany the posting list, and how they are framed
+// in the sections after the header.
+enum PositionStorage {
+  POSITION_STORAGE_NONE = 0;
+  // Legacy per-doc positions as a single Arrow IPC section.
+  POSITION_STORAGE_LEGACY = 1;
+  // Shared stream: an Arrow IPC section of block offsets, then a raw blob of
+  // the (codec-encoded) position bytes.
+  POSITION_STORAGE_SHARED = 2;
+}
+
+// ---------------------------------------------------------------------------
+// Scalar indices
+// ---------------------------------------------------------------------------
+
+// Header for a serialized `BTreeIndexState` cache entry, followed by a single
+// Arrow IPC section holding the page-lookup batch.
+message BTreeIndexHeader {
+  uint64 batch_size = 1;
+  // Whether an explicit page-range -> file mapping is present. Distinguishes a
+  // non-range-partitioned index (false) from a range-partitioned one whose map
+  // happens to be empty (true with no entries).
+  bool has_ranges_to_files = 2;
+  repeated RangeToFile ranges_to_files = 3;
+}
+
+// One entry of a `BTreeIndexState` page-range -> file mapping. The range is
+// inclusive on both ends (a `RangeInclusive<u32>`).
+message RangeToFile {
+  uint32 start = 1;
+  uint32 end = 2;
+  uint32 page_offset = 3;
+  string path = 4;
+}
+
+// ---------------------------------------------------------------------------
+// Vector indices (IVF partitions)
+// ---------------------------------------------------------------------------
+
+// Headers for serialized IVF partition cache entries (`PartitionEntry<S, Q>`).
+//
+// Each header is followed by 64-byte-aligned Arrow IPC sections in a fixed,
+// version-keyed order (sub-index, then any quantizer-specific arrays, then the
+// quantizer storage batches).
+
+// Distance metric a quantizer's storage was built for.
+enum DistanceType {
+  DISTANCE_TYPE_L2 = 0;
+  DISTANCE_TYPE_COSINE = 1;
+  DISTANCE_TYPE_DOT = 2;
+  DISTANCE_TYPE_HAMMING = 3;
+}
+
+// Rotation applied by a RabitQ quantizer.
+enum RotationType {
+  ROTATION_TYPE_MATRIX = 0;
+  ROTATION_TYPE_FAST = 1;
+}
+
+// Estimator a RabitQ quantizer uses at query time.
+enum RabitQueryEstimator {
+  RABIT_QUERY_ESTIMATOR_RESIDUAL_QUERY = 0;
+  RABIT_QUERY_ESTIMATOR_RAW_QUERY = 1;
+}
+
+// Product quantizer. Sections: sub-index IPC, codebook IPC, storage IPC.
+message PqPartitionHeader {
+  DistanceType distance_type = 1;
+  uint32 nbits = 2;
+  uint64 num_sub_vectors = 3;
+  uint64 dimension = 4;
+  bool transposed = 5;
+}
+
+// Flat (float) and flat-binary quantizers. Sections: sub-index IPC, storage IPC.
+message FlatPartitionHeader {
+  DistanceType distance_type = 1;
+  uint64 dim = 2;
+}
+
+// Scalar quantizer. Sections: sub-index IPC, storage IPC (possibly multi-batch).
+message SqPartitionHeader {
+  DistanceType distance_type = 1;
+  uint32 num_bits = 2;
+  uint64 dim = 3;
+  double bounds_start = 4;
+  double bounds_end = 5;
+}
+
+// Header for a serialized IVF index state (`IvfIndexState<Q>`), followed by
+// three raw blobs: the IVF model protobuf, the quantizer's extra-metadata
+// buffer (may be empty), and the auxiliary IVF model protobuf.
+message IvfStateHeader {
+  string index_file_path = 1;
+  string uuid = 2;
+  string distance_type = 3;
+  repeated string sub_index_metadata = 4;
+  string sub_index_type = 5;
+  string quantization_type = 6;
+  // Per-quantizer `Q::Metadata` as JSON. Kept as a string because the metadata
+  // type is generic over the quantizer; the proto envelope still provides
+  // additive evolution for the surrounding fields.
+  string quantizer_metadata_json = 7;
+  string cache_key_prefix = 8;
+  uint64 index_file_size = 9;
+  uint64 aux_file_size = 10;
+}
+
+// RabitQ quantizer. Sections: sub-index IPC, rotate-matrix IPC (Matrix rotation
+// only), storage IPC.
+message RabitPartitionHeader {
+  DistanceType distance_type = 1;
+  uint32 num_bits = 2;
+  uint32 code_dim = 3;
+  RotationType rotation_type = 4;
+  // Fast-rotation sign vector; present only when rotation_type ==
+  // ROTATION_TYPE_FAST (the Matrix case stores its rotation as an IPC section).
+  optional bytes fast_rotation_signs = 5;
+  // Estimator the RabitQ storage uses at query time (residual vs raw query).
+  RabitQueryEstimator query_estimator = 6;
+}
diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs
index 888070a3c1f..20e1c2692d9 100644
--- a/rust/lance-index/src/lib.rs
+++ b/rust/lance-index/src/lib.rs
@@ -68,6 +68,13 @@ pub mod pbold {
     include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
 }
 
+/// Protobuf headers for serialized index cache entries (FTS posting lists,
+/// scalar indices, and IVF vector partitions).
+pub mod cache_pb {
+    #![allow(clippy::use_self)]
+    include!(concat!(env!("OUT_DIR"), "/lance.index.cache.rs"));
+}
+
 /// Generic methods common across all types of secondary indices
 ///
 #[async_trait]
@@ -312,6 +319,7 @@ impl IndexType {
             Self::IvfFlat => 4096,
             Self::IvfSq => 8192,
             Self::IvfPq => 8192,
+            Self::IvfRq => 4096,
             Self::IvfHnswFlat => 1 << 20,
             Self::IvfHnswSq => 1 << 20,
             Self::IvfHnswPq => 1 << 20,
@@ -382,6 +390,11 @@ mod tests {
         assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION);
     }
 
+    #[test]
+    fn test_ivf_rq_target_partition_size() {
+        assert_eq!(IndexType::IvfRq.target_partition_size(), 4096);
+    }
+
     #[test]
     fn test_index_type_try_from_i32_covers_all_variants() {
         let all = [
diff --git a/rust/lance-index/src/metrics.rs b/rust/lance-index/src/metrics.rs
index 9e2161ae8f9..37e2c43d196 100644
--- a/rust/lance-index/src/metrics.rs
+++ b/rust/lance-index/src/metrics.rs
@@ -43,6 +43,19 @@ pub trait MetricsCollector: Send + Sync {
     ///
     /// The goal is to provide some visibility into the compute cost of the search
     fn record_comparisons(&self, num_comparisons: usize);
+
+    /// Returns an optional sink for recording exact I/O statistics (bytes read,
+    /// IOPS, and requests) performed on behalf of this collector.
+    ///
+    /// Index implementations that read from a
+    /// [`lance_io::scheduler::ScanScheduler`] can attach the returned handle to
+    /// their file readers so the I/O performed for a single query is measured
+    /// and attributed here. The default returns `None`, meaning the caller does
+    /// not want I/O measured (and index implementations should then take their
+    /// normal, uninstrumented read path).
+    fn io_stats(&self) -> Option<lance_io::scheduler::IoStats> {
+        None
+    }
 }
 
 /// A no-op metrics collector that does nothing
diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs
index eaebbd1b338..a74318fc5c9 100644
--- a/rust/lance-index/src/scalar.rs
+++ b/rust/lance-index/src/scalar.rs
@@ -8,6 +8,7 @@ use arrow_array::{BooleanArray, ListArray, RecordBatch, UInt64Array};
 use arrow_schema::{Field, Schema};
 use async_trait::async_trait;
 use bytes::Bytes;
+use datafusion::functions::regex::regexplike::RegexpLikeFunc;
 use datafusion::functions::string::contains::ContainsFunc;
 use datafusion::functions_nested::array_has;
 use datafusion::physical_plan::SendableRecordBatchStream;
@@ -288,6 +289,22 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf {
     /// This is often useful when remapping or updating
     async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<IndexFile>;
 
+    /// Copy an index file from this store to a new name in another store, leaving the source intact
+    async fn copy_index_file_to(
+        &self,
+        name: &str,
+        new_name: &str,
+        dest_store: &dyn IndexStore,
+    ) -> Result<IndexFile> {
+        if name == new_name {
+            self.copy_index_file(name, dest_store).await
+        } else {
+            Err(Error::not_supported(format!(
+                "copying index file {name} to {new_name} is not supported by this index store"
+            )))
+        }
+    }
+
     /// Rename an index file
     async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<IndexFile>;
 
@@ -633,9 +650,15 @@ impl AnyQuery for LabelListQuery {
 pub enum TextQuery {
     /// Retrieve all row ids where the text contains the given string
     StringContains(String),
-    // TODO: In the future we should be able to do string-insensitive contains
-    // as well as partial matches (e.g. LIKE 'foo%') and potentially even
-    // some regular expressions
+    /// Retrieve all row ids whose text matches the given regular expression.
+    ///
+    /// The pattern is a full regular expression (as accepted by `regexp_like`).
+    /// The index returns a candidate superset that the scan rechecks, so any
+    /// pattern is sound; patterns with no usable trigram structure simply fall
+    /// back to rechecking every row.
+    Regex(String),
+    // TODO: In the future we should be able to do case-insensitive contains
+    // as well as partial matches (e.g. LIKE 'foo%').
 }
 
 impl AnyQuery for TextQuery {
@@ -656,6 +679,17 @@ impl AnyQuery for TextQuery {
                     Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None),
                 ],
             }),
+            // `regexp_like` returns Boolean directly, so the reconstructed
+            // expression can be used as-is for the recheck filter (no IsNotNull
+            // wrapper, unlike `regexp_match`). It is the semantic equivalent of
+            // the original predicate for the "does it match" question.
+            Self::Regex(pattern) => Expr::ScalarFunction(ScalarFunction {
+                func: Arc::new(RegexpLikeFunc::new().into()),
+                args: vec![
+                    Expr::Column(Column::new_unqualified(col)),
+                    Expr::Literal(ScalarValue::Utf8(Some(pattern.clone())), None),
+                ],
+            }),
         }
     }
 
@@ -935,6 +969,15 @@ impl OldIndexDataFilter {
             Self::RowIds(valid_row_ids) => *addrs &= valid_row_ids,
         }
     }
+
+    /// True if this filter would keep no rows at all (its keep-set is empty),
+    /// letting a segment merge skip reading the source segment entirely.
+    pub fn keeps_nothing(&self) -> bool {
+        match self {
+            Self::Fragments { to_keep, .. } => to_keep.is_empty(),
+            Self::RowIds(valid_row_ids) => valid_row_ids.is_empty(),
+        }
+    }
 }
 
 impl UpdateCriteria {
diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs
index 4a212713e1f..8a7fea074c3 100644
--- a/rust/lance-index/src/scalar/bitmap.rs
+++ b/rust/lance-index/src/scalar/bitmap.rs
@@ -17,14 +17,13 @@ use bytes::Bytes;
 use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion_common::ScalarValue;
 use futures::{StreamExt, TryStreamExt, stream};
-use lance_arrow::ipc::{
-    read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream,
-    write_len_prefixed_bytes,
-};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::{
     Error, ROW_ID, Result,
-    cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache},
+    cache::{
+        CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache,
+        WeakLanceCache,
+    },
     error::LanceOptionExt,
     utils::tokio::get_num_compute_intensive_cpus,
 };
@@ -201,6 +200,32 @@ impl BitmapIndexState {
             frag_reuse_index,
         )))
     }
+
+    /// Build a state directly from its parts, for codec tests in sibling
+    /// modules (e.g. the label-list index, which nests a bitmap state).
+    #[cfg(test)]
+    pub(crate) fn new_for_test(
+        index_map: BTreeMap<OrderableScalarValue, usize>,
+        null_map: RowAddrTreeMap,
+        value_type: DataType,
+    ) -> Result<Self> {
+        Ok(Self {
+            lookup_batch: build_lookup_batch(&index_map, &value_type)?,
+            null_map: Arc::new(null_map),
+            value_type,
+            index_map: Arc::new(index_map),
+        })
+    }
+
+    #[cfg(test)]
+    pub(crate) fn lookup_batch(&self) -> &RecordBatch {
+        &self.lookup_batch
+    }
+
+    #[cfg(test)]
+    pub(crate) fn null_map(&self) -> &RowAddrTreeMap {
+        &self.null_map
+    }
 }
 
 fn build_lookup_batch(
@@ -240,25 +265,27 @@ fn parse_lookup_batch(batch: &RecordBatch) -> Result<BTreeMap<OrderableScalarVal
 }
 
 impl CacheCodecImpl for BitmapIndexState {
+    const TYPE_ID: &'static str = "lance.scalar.BitmapIndexState";
+    const CURRENT_VERSION: u32 = 1;
+
     /// Wire format:
     /// ```text
-    /// [u64 null_map_len][null_map bytes]
-    /// [arrow IPC stream: (keys: <value_type>, offsets: UInt64)]
+    /// RAW_BLOB  : null_map (roaring tree map, portable encoding)
+    /// ARROW_IPC : (keys: <value_type>, offsets: UInt64)
     /// ```
-    /// The value type is recovered from the IPC stream schema.
-    fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> {
+    /// The value type is recovered from the IPC section schema.
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         let mut null_bytes = Vec::with_capacity(self.null_map.serialized_size());
         self.null_map.serialize_into(&mut null_bytes)?;
-        write_len_prefixed_bytes(writer, &null_bytes)?;
-        write_ipc_stream(&self.lookup_batch, writer)?;
+        w.write_raw(&null_bytes)?;
+        w.write_ipc(&self.lookup_batch)?;
         Ok(())
     }
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let null_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let null_bytes = r.read_raw()?;
         let null_map = Arc::new(RowAddrTreeMap::deserialize_from(null_bytes.as_ref())?);
-        let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?;
+        let lookup_batch = r.read_ipc()?;
         let value_type = lookup_batch.schema().field(0).data_type().clone();
         let index_map = Arc::new(parse_lookup_batch(&lookup_batch)?);
         Ok(Self {
@@ -1246,6 +1273,19 @@ pub async fn merge_bitmap_indices(
             )));
         }
 
+        // A segment whose filter keeps nothing contributes no postings; skip the
+        // state load entirely. (Remapping for deferred compaction happens inside
+        // `load_bitmap`, so the loaded postings already reference live fragments.)
+        if old_data_filters[idx]
+            .as_ref()
+            .is_some_and(|f| f.keeps_nothing())
+        {
+            progress
+                .stage_progress("merge_bitmap_segments", (idx + 1) as u64)
+                .await?;
+            continue;
+        }
+
         let mut state = source_index.load_bitmap_index_state().await?;
         if let Some(old_data_filter) = &old_data_filters[idx] {
             state.retain(|_, postings| {
@@ -1449,8 +1489,12 @@ mod tests {
 
     fn assert_state_roundtrips(state: &BitmapIndexState) {
         let mut buf = Vec::new();
-        state.serialize(&mut buf).unwrap();
-        let restored = BitmapIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap();
+        state
+            .serialize(&mut CacheEntryWriter::new(&mut buf))
+            .unwrap();
+        let data = bytes::Bytes::from(buf);
+        let mut reader = CacheEntryReader::new(&data, 0, BitmapIndexState::CURRENT_VERSION);
+        let restored = BitmapIndexState::deserialize(&mut reader).unwrap();
         assert_eq!(restored.lookup_batch, state.lookup_batch);
         assert_eq!(&*restored.null_map, &*state.null_map);
         assert_eq!(restored.value_type, state.value_type);
@@ -1484,6 +1528,53 @@ mod tests {
         assert_state_roundtrips(&empty_state);
     }
 
+    /// The lookup batch must decode zero-copy through the full envelope-bearing
+    /// [`CacheCodec`] even though the envelope pushes the IPC section to a
+    /// non-aligned starting offset.
+    #[test]
+    fn test_bitmap_index_state_lookup_is_zero_copy() {
+        const ALIGN: usize = 64;
+        let mut index_map = BTreeMap::new();
+        for k in 0..32i32 {
+            index_map.insert(
+                OrderableScalarValue(ScalarValue::Int32(Some(k))),
+                k as usize,
+            );
+        }
+        let state = BitmapIndexState {
+            lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(),
+            null_map: Arc::new(RowAddrTreeMap::new()),
+            value_type: DataType::Int32,
+            index_map: Arc::new(index_map),
+        };
+
+        let codec = CacheCodec::from_impl::<BitmapIndexState>();
+        let any: Arc<dyn std::any::Any + Send + Sync> = Arc::new(state);
+        let mut buf = Vec::new();
+        codec.serialize(&any, &mut buf).unwrap();
+
+        // Model a backend reading into an ALIGN-byte-aligned buffer.
+        let mut v = vec![0u8; buf.len() + ALIGN];
+        let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+        v[pad..pad + buf.len()].copy_from_slice(&buf);
+        let data = bytes::Bytes::from(v).slice(pad..pad + buf.len());
+
+        let restored = codec.deserialize(&data).hit().unwrap();
+        let restored = restored.downcast::<BitmapIndexState>().unwrap();
+
+        let base = data.as_ptr() as usize;
+        let end = base + data.len();
+        for col in restored.lookup_batch.columns() {
+            for buffer in col.to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    (base..end).contains(&ptr),
+                    "lookup batch buffer was realigned out of the input — misaligned IPC section",
+                );
+            }
+        }
+    }
+
     #[tokio::test]
     async fn test_bitmap_lazy_loading_and_cache() {
         // Create a temporary directory for the index
diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs
index e8e5c42a248..ab3f6c58075 100644
--- a/rust/lance-index/src/scalar/btree.rs
+++ b/rust/lance-index/src/scalar/btree.rs
@@ -15,6 +15,7 @@ use super::{
     OldIndexDataFilter, SargableQuery, ScalarIndex, ScalarIndexParams, SearchResult,
     compute_next_prefix,
 };
+use crate::cache_pb::{BTreeIndexHeader, RangeToFile};
 use crate::{Index, IndexType};
 use crate::{
     frag_reuse::FragReuseIndex,
@@ -45,18 +46,23 @@ use datafusion::physical_plan::{
     sorts::sort_preserving_merge::SortPreservingMergeExec, stream::RecordBatchStreamAdapter,
     union::UnionExec,
 };
-use datafusion_common::{DataFusionError, ScalarValue};
-use datafusion_physical_expr::{PhysicalSortExpr, expressions::Column};
+use datafusion_common::{DFSchema, DataFusionError, ScalarValue};
+use datafusion_expr::execution_props::ExecutionProps;
+use datafusion_physical_expr::{
+    PhysicalExpr, PhysicalSortExpr, create_physical_expr, expressions::Column,
+};
 use futures::{
     FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt,
     future::BoxFuture,
     stream::{self},
 };
-use lance_arrow::ipc::{read_ipc_stream_single_at, write_ipc_stream};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::{
     Error, ROW_ID, Result,
-    cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache},
+    cache::{
+        CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache,
+        WeakLanceCache,
+    },
     error::LanceOptionExt,
     utils::{
         tokio::get_num_compute_intensive_cpus,
@@ -589,7 +595,7 @@ impl Ord for OrderableScalarValue {
                 }
             }
             (Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"),
-            (Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(),
+            (Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())),
             (Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)),
             (Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"),
             // What would a btree of unions even look like?  May not be possible.
@@ -1402,106 +1408,58 @@ impl BTreeIndexState {
 }
 
 impl CacheCodecImpl for BTreeIndexState {
-    /// Wire format (no stability guarantees yet — the cache is rebuilt from
-    /// source on any version mismatch):
+    const TYPE_ID: &'static str = "lance.scalar.BTreeIndexState";
+    const CURRENT_VERSION: u32 = 1;
+
+    /// Wire format:
     /// ```text
-    /// u64 batch_size (LE)
-    /// u8  has_ranges (0 = None, 1 = Some)
-    /// if has_ranges:
-    ///   u32 entry_count (LE)
-    ///   per entry: u32 start | u32 end | u32 offset | u32 path_len | path bytes
-    /// lookup batch (Arrow IPC stream)
+    /// HEADER    : BTreeIndexHeader proto (batch_size + page-range mapping)
+    /// ARROW_IPC : page-lookup batch
     /// ```
-    fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> {
-        writer.write_all(&self.batch_size.to_le_bytes())?;
-        match &self.ranges_to_files {
-            None => writer.write_all(&[0u8])?,
-            Some(ranges) => {
-                writer.write_all(&[1u8])?;
-                let count = u32::try_from(ranges.len()).map_err(|_| {
-                    Error::io("BTreeIndexState: ranges_to_files exceeds u32::MAX entries")
-                })?;
-                writer.write_all(&count.to_le_bytes())?;
-                for (range, (path, page_offset)) in ranges.iter() {
-                    writer.write_all(&range.start().to_le_bytes())?;
-                    writer.write_all(&range.end().to_le_bytes())?;
-                    writer.write_all(&page_offset.to_le_bytes())?;
-                    let path_len = u32::try_from(path.len()).map_err(|_| {
-                        Error::io("BTreeIndexState: ranges_to_files path exceeds u32::MAX bytes")
-                    })?;
-                    writer.write_all(&path_len.to_le_bytes())?;
-                    writer.write_all(path.as_bytes())?;
-                }
-            }
-        }
-        write_ipc_stream(&self.lookup_batch, writer)?;
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let ranges_to_files = match &self.ranges_to_files {
+            None => Vec::new(),
+            Some(ranges) => ranges
+                .iter()
+                .map(|(range, (path, page_offset))| RangeToFile {
+                    start: *range.start(),
+                    end: *range.end(),
+                    page_offset: *page_offset,
+                    path: path.clone(),
+                })
+                .collect(),
+        };
+        let header = BTreeIndexHeader {
+            batch_size: self.batch_size,
+            has_ranges_to_files: self.ranges_to_files.is_some(),
+            ranges_to_files,
+        };
+        w.write_header(&header)?;
+        w.write_ipc(&self.lookup_batch)?;
         Ok(())
     }
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let batch_size = read_u64_le(data, &mut offset)?;
-        let has_ranges = read_u8(data, &mut offset)?;
-        let ranges_to_files = match has_ranges {
-            0 => None,
-            1 => {
-                let count = read_u32_le(data, &mut offset)? as usize;
-                let mut entries = Vec::with_capacity(count);
-                for _ in 0..count {
-                    let start = read_u32_le(data, &mut offset)?;
-                    let end = read_u32_le(data, &mut offset)?;
-                    let page_offset = read_u32_le(data, &mut offset)?;
-                    let path_len = read_u32_le(data, &mut offset)? as usize;
-                    let path = read_bytes(data, &mut offset, path_len)?;
-                    let path = std::str::from_utf8(&path)
-                        .map_err(|e| Error::io(format!("BTreeIndexState path: {e}")))?
-                        .to_string();
-                    entries.push((start..=end, (path, page_offset)));
-                }
-                Some(Arc::new(entries.into_iter().collect()))
-            }
-            other => {
-                return Err(Error::io(format!(
-                    "BTreeIndexState: invalid has_ranges tag {other}"
-                )));
-            }
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: BTreeIndexHeader = r.read_header()?;
+        let ranges_to_files = if header.has_ranges_to_files {
+            let map: RangeInclusiveMap<u32, (String, u32)> = header
+                .ranges_to_files
+                .into_iter()
+                .map(|entry| (entry.start..=entry.end, (entry.path, entry.page_offset)))
+                .collect();
+            Some(Arc::new(map))
+        } else {
+            None
         };
-        let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?;
+        let lookup_batch = r.read_ipc()?;
         Ok(Self {
             lookup_batch,
-            batch_size,
+            batch_size: header.batch_size,
             ranges_to_files,
         })
     }
 }
 
-fn read_bytes(data: &bytes::Bytes, offset: &mut usize, len: usize) -> Result<bytes::Bytes> {
-    if data.len() < *offset + len {
-        return Err(Error::io(format!(
-            "BTreeIndexState: short read of {len} bytes at offset {offset} (have {})",
-            data.len()
-        )));
-    }
-    let slice = data.slice(*offset..*offset + len);
-    *offset += len;
-    Ok(slice)
-}
-
-fn read_u8(data: &bytes::Bytes, offset: &mut usize) -> Result<u8> {
-    let bytes = read_bytes(data, offset, 1)?;
-    Ok(bytes[0])
-}
-
-fn read_u32_le(data: &bytes::Bytes, offset: &mut usize) -> Result<u32> {
-    let bytes = read_bytes(data, offset, 4)?;
-    Ok(u32::from_le_bytes(bytes.as_ref().try_into().unwrap()))
-}
-
-fn read_u64_le(data: &bytes::Bytes, offset: &mut usize) -> Result<u64> {
-    let bytes = read_bytes(data, offset, 8)?;
-    Ok(u64::from_le_bytes(bytes.as_ref().try_into().unwrap()))
-}
-
 /// Cache key for a [`BTreeIndexState`]. The cache it is used with is already
 /// namespaced per-index, so the key string is a constant.
 struct BTreeIndexStateKey;
@@ -1595,6 +1553,66 @@ impl BTreeIndex {
         }
     }
 
+    /// For each key in `keys`, whether this index contains it — a batched
+    /// existence check returning a mask aligned to `keys`.
+    ///
+    /// The per-key sibling of `search(Equals(..))`, but one call replaces N
+    /// probes: keys are grouped by page using the same page resolution as
+    /// [`ScalarIndex::search`] (`pages_eq`), each touched page is loaded once
+    /// (session-cached), and membership is tested against the page's values via
+    /// `FlatIndex::contains_values`. Avoids the per-key `SearchResult` /
+    /// `RowAddrTreeMap` allocation when the caller only wants a yes/no.
+    ///
+    /// Intended for primary-key dedup, where keys are non-null; a null key maps
+    /// to `false`.
+    pub async fn contains_keys(
+        &self,
+        keys: &[ScalarValue],
+        metrics: &dyn MetricsCollector,
+    ) -> Result<Vec<bool>> {
+        // Group each key (by input position) under every page whose value range
+        // could hold it. Mirrors `search`'s page selection so the two agree.
+        let mut by_page: HashMap<u32, Vec<(usize, OrderableScalarValue)>> = HashMap::new();
+        for (idx, key) in keys.iter().enumerate() {
+            if key.is_null() {
+                continue;
+            }
+            let ov = OrderableScalarValue(key.clone());
+            for matches in self.page_lookup.pages_eq(&ov)? {
+                by_page
+                    .entry(matches.page_id())
+                    .or_default()
+                    .push((idx, ov.clone()));
+            }
+        }
+
+        let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone());
+        let page_tasks = by_page.into_iter().map(|(page_number, entries)| {
+            let index_reader = index_reader.clone();
+            async move {
+                let page = self.lookup_page(page_number, index_reader, metrics).await?;
+                let needles: Vec<OrderableScalarValue> =
+                    entries.iter().map(|(_, ov)| ov.clone()).collect();
+                let present = page.contains_values(&needles)?;
+                Result::Ok((entries, present))
+            }
+        });
+
+        let mut result = vec![false; keys.len()];
+        let page_results: Vec<_> = stream::iter(page_tasks)
+            .buffer_unordered(get_num_compute_intensive_cpus())
+            .try_collect()
+            .await?;
+        for (entries, present) in page_results {
+            for (idx, ov) in entries {
+                if present.contains(&ov) {
+                    result[idx] = true;
+                }
+            }
+        }
+        Ok(result)
+    }
+
     async fn lookup_page(
         &self,
         page_number: u32,
@@ -1628,11 +1646,28 @@ impl BTreeIndex {
         FlatIndex::try_new(serialized_page)
     }
 
+    /// Compile a sargable predicate into a physical expr against the per-page
+    /// schema ([values, ids]). Built once in `search` and shared across pages so
+    /// a large IN-list is not re-materialized for every page.
+    fn compile_predicate(&self, query: &SargableQuery) -> Result<Arc<dyn PhysicalExpr>> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(BTREE_VALUES_COLUMN, self.data_type.clone(), true),
+            Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false),
+        ]));
+        let df_schema = DFSchema::try_from(schema)?;
+        Ok(create_physical_expr(
+            &query.to_expr(BTREE_VALUES_COLUMN.to_string()),
+            &df_schema,
+            &ExecutionProps::default(),
+        )?)
+    }
+
     async fn search_page(
         &self,
         query: &SargableQuery,
         matches: Matches,
         index_reader: LazyIndexReader,
+        prebuilt: Option<&Arc<dyn PhysicalExpr>>,
         metrics: &dyn MetricsCollector,
     ) -> Result<NullableRowAddrSet> {
         let subindex = self
@@ -1640,13 +1675,12 @@ impl BTreeIndex {
             .await?;
 
         match matches {
-            Matches::Some(_) => {
-                // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the
-                // values that might be in the page.  E.g. if we are searching for X IN [5, 3, 7] and five is in pages
-                // 1 and 2 and three is in page 2 and seven is in pages 8 and 9, then when searching page 2 we only need
-                // to search for X IN [5, 3]
-                subindex.search(query, metrics)
-            }
+            // For an IsIn query the predicate is compiled once (see `search`) and
+            // reused here, instead of rebuilding the whole IN-list per page.
+            Matches::Some(_) => match prebuilt {
+                Some(expr) => subindex.search_prebuilt(expr, metrics),
+                None => subindex.search(query, metrics),
+            },
             Matches::All(_) => Ok(match query {
                 // This means we hit an all-null page so just grab all row ids as true
                 SargableQuery::IsNull() => subindex.all_ignore_nulls(),
@@ -1809,7 +1843,7 @@ impl BTreeIndex {
         if old_data_filters.len() != segments.len() {
             return Err(Error::invalid_input(format!(
                 "BTree merge: expected one old-data filter per source segment \
-                 ({} segments) but got {}",
+                 (segments={}, filters={})",
                 segments.len(),
                 old_data_filters.len()
             )));
@@ -1837,13 +1871,19 @@ impl BTreeIndex {
 
         let mut inputs: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(segments.len() + 1);
         for (segment, old_data_filter) in segments.iter().zip(old_data_filters) {
+            if old_data_filter.as_ref().is_some_and(|f| f.keeps_nothing()) {
+                continue;
+            }
             let stream = segment.data_stream().await?;
+            let stream = match segment.frag_reuse_index.clone() {
+                Some(frag_reuse_index) => remap_row_ids(stream, frag_reuse_index),
+                None => stream,
+            };
             let stream = match old_data_filter.clone() {
                 Some(filter) => filter_row_ids(stream, filter),
                 None => stream,
             };
-            let exec = Arc::new(OneShotExec::new(stream));
-            inputs.push(exec);
+            inputs.push(Arc::new(OneShotExec::new(stream)));
         }
         inputs.push(Arc::new(OneShotExec::new(new_data)));
 
@@ -1898,6 +1938,18 @@ fn filter_row_ids(
     Box::pin(RecordBatchStreamAdapter::new(schema, filtered))
 }
 
+fn remap_row_ids(
+    stream: SendableRecordBatchStream,
+    frag_reuse_index: Arc<FragReuseIndex>,
+) -> SendableRecordBatchStream {
+    let schema = stream.schema();
+    let remapped = stream.map(move |batch_result| {
+        let batch = batch_result?;
+        Ok(frag_reuse_index.remap_row_ids_record_batch(batch, 1)?)
+    });
+    Box::pin(RecordBatchStreamAdapter::new(schema, remapped))
+}
+
 fn wrap_bound(bound: &Bound<ScalarValue>) -> Bound<OrderableScalarValue> {
     match bound {
         Bound::Unbounded => Bound::Unbounded,
@@ -2113,13 +2165,27 @@ impl ScalarIndex for BTreeIndex {
             }
         }
 
+        // Compile every IsIn predicate once and reuse it across all pages;
+        // rebuilding the full IN-list per page is O(pages * values) and dominates
+        // the lookup for sets with many values.
+        let prebuilt = match query {
+            SargableQuery::IsIn(_) => Some(self.compile_predicate(query)?),
+            _ => None,
+        };
+
         let lazy_index_reader =
             LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone());
         let page_tasks = pages
             .into_iter()
             .map(|page_index| {
-                self.search_page(query, page_index, lazy_index_reader.clone(), metrics)
-                    .boxed()
+                self.search_page(
+                    query,
+                    page_index,
+                    lazy_index_reader.clone(),
+                    prebuilt.as_ref(),
+                    metrics,
+                )
+                .boxed()
             })
             .collect::<Vec<_>>();
         debug!("Searching {} btree pages", page_tasks.len());
@@ -3295,7 +3361,23 @@ mod tests {
     };
     use crate::scalar::registry::ScalarIndexPlugin;
     use arrow_array::RecordBatch;
-    use lance_core::cache::{CacheCodecImpl, CacheKey};
+    use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey};
+
+    /// Serialize a `BTreeIndexState` body (no envelope) for tests.
+    fn serialize_state(state: &BTreeIndexState) -> Vec<u8> {
+        let mut buf = Vec::new();
+        state
+            .serialize(&mut CacheEntryWriter::new(&mut buf))
+            .unwrap();
+        buf
+    }
+
+    /// Deserialize a `BTreeIndexState` body (no envelope) for tests.
+    fn deserialize_state(buf: Vec<u8>) -> lance_core::Result<BTreeIndexState> {
+        let data = bytes::Bytes::from(buf);
+        let mut reader = CacheEntryReader::new(&data, 0, BTreeIndexState::CURRENT_VERSION);
+        BTreeIndexState::deserialize(&mut reader)
+    }
     use rangemap::RangeInclusiveMap;
 
     lance_testing::define_stage_event_progress!(
@@ -3319,6 +3401,37 @@ mod tests {
         assert!(size_of_many_i32 > 128 * 4);
     }
 
+    #[test]
+    fn test_orderable_dictionary_cmp() {
+        use arrow_schema::DataType;
+        use std::cmp::Ordering;
+
+        let dict = |s: &str, key: DataType| {
+            OrderableScalarValue(ScalarValue::Dictionary(
+                Box::new(key),
+                Box::new(ScalarValue::Utf8(Some(s.to_string()))),
+            ))
+        };
+
+        // Dictionary scalars are ordered by their underlying value, regardless
+        // of the key type. This is exercised when loading a scalar index built
+        // on a dictionary-encoded column into a BTreeMap.
+        assert_eq!(
+            dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)),
+            Ordering::Less
+        );
+        assert_eq!(
+            dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)),
+            Ordering::Equal
+        );
+
+        // A non-null dictionary value sorts after null.
+        assert_eq!(
+            dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)),
+            Ordering::Greater
+        );
+    }
+
     #[tokio::test]
     async fn test_null_ids() {
         let tmpdir = TempObjDir::default();
@@ -3436,6 +3549,86 @@ mod tests {
         }
     }
 
+    #[tokio::test]
+    async fn test_contains_keys_matches_search() {
+        let tmpdir = TempObjDir::default();
+        let test_store = Arc::new(LanceIndexStore::new(
+            Arc::new(ObjectStore::local()),
+            tmpdir.clone(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+
+        // 1000 distinct Int32 values [0, 1000), spread across many small pages
+        // (batch_size 64) so the keys below exercise multi-page grouping.
+        let data = gen_batch()
+            .col("value", array::step::<Int32Type>())
+            .col("_rowid", array::step::<UInt64Type>())
+            .into_df_exec(RowCount::from(100), BatchCount::from(10));
+        let schema = data.schema();
+        let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap());
+        let plan = Arc::new(SortExec::new([sort_expr].into(), data));
+        let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap();
+        let stream = break_stream(stream, 64);
+        let stream = stream.map_err(DataFusionError::from);
+        let stream =
+            Box::pin(RecordBatchStreamAdapter::new(schema, stream)) as SendableRecordBatchStream;
+
+        train_btree_index(stream, test_store.as_ref(), 64, None, None)
+            .await
+            .unwrap();
+        let index = BTreeIndex::load(test_store, None, &LanceCache::no_cache())
+            .await
+            .unwrap();
+
+        // Present (range ends, mid, and adjacent values that straddle page
+        // boundaries), interleaved with absent (just below, just above, far above).
+        let keys: Vec<i32> = vec![0, 999, 500, 1, 998, -1, 1000, 1500, 250, 251, 7, 64, 63, 65];
+        let scalar_keys: Vec<ScalarValue> =
+            keys.iter().map(|k| ScalarValue::Int32(Some(*k))).collect();
+
+        let batched = index
+            .contains_keys(&scalar_keys, &NoOpMetricsCollector)
+            .await
+            .unwrap();
+
+        // Oracle: the per-key Equals search the batched path replaces.
+        let mut oracle = Vec::with_capacity(keys.len());
+        for k in &scalar_keys {
+            let result = index
+                .search(&SargableQuery::Equals(k.clone()), &NoOpMetricsCollector)
+                .await
+                .unwrap();
+            oracle.push(!result.row_addrs().is_empty());
+        }
+        assert_eq!(
+            batched, oracle,
+            "contains_keys must agree with per-key Equals search; keys={keys:?}"
+        );
+
+        // And both must match ground truth: [0, 1000) present, others absent.
+        let expected: Vec<bool> = keys.iter().map(|k| (0..1000).contains(k)).collect();
+        assert_eq!(batched, expected);
+
+        // Empty input → empty mask.
+        assert!(
+            index
+                .contains_keys(&[], &NoOpMetricsCollector)
+                .await
+                .unwrap()
+                .is_empty()
+        );
+
+        // A null key maps to false (and must not panic).
+        let with_null = vec![ScalarValue::Int32(Some(5)), ScalarValue::Int32(None)];
+        assert_eq!(
+            index
+                .contains_keys(&with_null, &NoOpMetricsCollector)
+                .await
+                .unwrap(),
+            vec![true, false]
+        );
+    }
+
     #[tokio::test]
     async fn test_page_cache() {
         let tmpdir = TempObjDir::default();
@@ -5897,9 +6090,7 @@ mod tests {
     }
 
     fn assert_state_roundtrips(state: &BTreeIndexState) {
-        let mut buf = Vec::new();
-        state.serialize(&mut buf).unwrap();
-        let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap();
+        let restored = deserialize_state(serialize_state(state)).unwrap();
         assert_eq!(restored.lookup_batch, state.lookup_batch);
         assert_eq!(restored.batch_size, state.batch_size);
         assert_eq!(restored.ranges_to_files, state.ranges_to_files);
@@ -5968,9 +6159,7 @@ mod tests {
             batch_size: index.batch_size,
             ranges_to_files: index.ranges_to_files.clone(),
         };
-        let mut buf = Vec::new();
-        state.serialize(&mut buf).unwrap();
-        let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap();
+        let restored = deserialize_state(serialize_state(&state)).unwrap();
         let reconstructed = restored
             .reconstruct(test_store.clone(), &LanceCache::no_cache(), None)
             .unwrap();
@@ -6006,18 +6195,57 @@ mod tests {
         assert_eq!(expected, actual);
     }
 
+    /// The lookup batch must decode zero-copy through the full envelope even
+    /// though the proto header pushes the IPC section to a non-aligned offset.
     #[test]
-    fn test_btree_index_state_rejects_invalid_has_ranges_tag() {
-        // u64 batch_size (any) then a bad has_ranges tag.
+    fn test_btree_index_state_lookup_is_zero_copy() {
+        use lance_core::cache::CacheCodec;
+        const ALIGN: usize = 64;
+
+        let ranges: RangeInclusiveMap<u32, (String, u32)> =
+            [(0..=99, ("part_0_page_file.lance".to_string(), 0))]
+                .into_iter()
+                .collect();
+        let state = BTreeIndexState {
+            lookup_batch: sample_lookup_batch(),
+            batch_size: 8192,
+            ranges_to_files: Some(Arc::new(ranges)),
+        };
+
+        let codec = CacheCodec::from_impl::<BTreeIndexState>();
+        let any: Arc<dyn std::any::Any + Send + Sync> = Arc::new(state);
         let mut buf = Vec::new();
-        buf.extend_from_slice(&1000u64.to_le_bytes());
-        buf.push(7u8);
-        let err = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap_err();
-        let msg = err.to_string();
-        assert!(
-            msg.contains("has_ranges") && msg.contains("7"),
-            "expected error to mention the bad has_ranges tag, got: {msg}"
-        );
+        codec.serialize(&any, &mut buf).unwrap();
+
+        let mut v = vec![0u8; buf.len() + ALIGN];
+        let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+        v[pad..pad + buf.len()].copy_from_slice(&buf);
+        let data = bytes::Bytes::from(v).slice(pad..pad + buf.len());
+
+        let restored = codec.deserialize(&data).hit().unwrap();
+        let restored = restored.downcast::<BTreeIndexState>().unwrap();
+
+        let base = data.as_ptr() as usize;
+        let end = base + data.len();
+        for col in restored.lookup_batch.columns() {
+            for buffer in col.to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    (base..end).contains(&ptr),
+                    "lookup batch buffer was realigned out of the input — misaligned IPC section",
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_btree_index_state_rejects_truncated_header() {
+        // A header length prefix that overruns the buffer must error rather
+        // than panic or silently misread it.
+        let mut buf = Vec::new();
+        buf.extend_from_slice(&100u32.to_le_bytes()); // claims a 100-byte header
+        buf.extend_from_slice(&[0u8; 4]); // but only 4 bytes follow
+        assert!(deserialize_state(buf).is_err());
     }
 
     #[tokio::test]
diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs
index 212ef6490be..744f6a3cb3c 100644
--- a/rust/lance-index/src/scalar/btree/flat.rs
+++ b/rust/lance-index/src/scalar/btree/flat.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::{ops::Bound, sync::Arc};
 
 use arrow_array::Array;
@@ -11,19 +11,20 @@ use arrow_array::{
 
 use datafusion_common::DFSchema;
 use datafusion_expr::execution_props::ExecutionProps;
-use datafusion_physical_expr::create_physical_expr;
+use datafusion_physical_expr::{PhysicalExpr, create_physical_expr};
 use lance_arrow::RecordBatchExt;
-use lance_arrow::ipc::{read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream};
 use lance_core::Result;
-use lance_core::cache::CacheCodecImpl;
+use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::utils::address::RowAddress;
 use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps};
 use roaring::RoaringBitmap;
 use tracing::instrument;
 
+use datafusion_common::ScalarValue;
+
 use crate::metrics::MetricsCollector;
-use crate::scalar::btree::BTREE_VALUES_COLUMN;
+use crate::scalar::btree::{BTREE_VALUES_COLUMN, OrderableScalarValue};
 use crate::scalar::{AnyQuery, SargableQuery};
 
 const VALUES_COL_IDX: usize = 0;
@@ -83,6 +84,46 @@ impl FlatIndex {
         self.data.column(IDS_COL_IDX)
     }
 
+    fn values(&self) -> &ArrayRef {
+        self.data.column(VALUES_COL_IDX)
+    }
+
+    /// Which of `needles` are present in this page.
+    ///
+    /// Batched existence sibling of [`Self::search`]: it runs the same `IsIn`
+    /// predicate over the page's `values` column, but returns the matched
+    /// *values* rather than row addresses — so the caller can map each result
+    /// back to the input key it asked about. The page scan stays vectorized;
+    /// only the (small) matched subset is lifted into `ScalarValue`.
+    ///
+    /// Nulls: a null `values` entry never matches a (non-null) primary-key
+    /// needle, so it is simply absent from the result.
+    pub(crate) fn contains_values(
+        &self,
+        needles: &[OrderableScalarValue],
+    ) -> Result<BTreeSet<OrderableScalarValue>> {
+        if needles.is_empty() {
+            return Ok(BTreeSet::new());
+        }
+        let query = SargableQuery::IsIn(needles.iter().map(|v| v.0.clone()).collect());
+        let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string());
+        let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?;
+        let predicate = expr.evaluate(&self.data)?;
+        let predicate = predicate.into_array(self.data.num_rows())?;
+        let predicate = predicate
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .expect("Predicate should return boolean array");
+        let matched = arrow_select::filter::filter(self.values(), predicate)?;
+        (0..matched.len())
+            .map(|i| {
+                Ok(OrderableScalarValue(ScalarValue::try_from_array(
+                    &matched, i,
+                )?))
+            })
+            .collect()
+    }
+
     pub fn all(&self) -> NullableRowAddrSet {
         // Some rows will be in both sets but that is ok, null trumps true
         NullableRowAddrSet::new(self.all_addrs_map.clone(), self.null_addrs_map.clone())
@@ -196,7 +237,22 @@ impl FlatIndex {
         // No shortcut possible, need to actually evaluate the query
         let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string());
         let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?;
+        self.eval_expr(&expr)
+    }
 
+    /// Evaluate a predicate compiled once by the caller. Lets a large IsIn that
+    /// spans many pages build the physical expr a single time instead of
+    /// rebuilding the whole IN-list per page (the dominant cost of a big lookup).
+    pub fn search_prebuilt(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        metrics: &dyn MetricsCollector,
+    ) -> Result<NullableRowAddrSet> {
+        metrics.record_comparisons(self.data.num_rows());
+        self.eval_expr(expr)
+    }
+
+    fn eval_expr(&self, expr: &Arc<dyn PhysicalExpr>) -> Result<NullableRowAddrSet> {
         let predicate = expr.evaluate(&self.data)?;
         let predicate = predicate.into_array(self.data.num_rows())?;
         let predicate = predicate
@@ -236,32 +292,38 @@ impl FlatIndex {
 }
 
 impl CacheCodecImpl for FlatIndex {
-    fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> {
+    const TYPE_ID: &'static str = "lance.scalar.FlatIndex";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         // Format:
-        // [len-prefixed all_addrs_map][len-prefixed null_addrs_map][batch IPC stream]
-        writer.write_all(&(self.all_addrs_map.serialized_size() as u64).to_le_bytes())?;
-        self.all_addrs_map.serialize_into(&mut *writer)?;
+        // RAW_BLOB  : all_addrs_map (roaring tree map)
+        // RAW_BLOB  : null_addrs_map (roaring tree map)
+        // ARROW_IPC : data batch
+        let mut all_addrs_bytes = Vec::with_capacity(self.all_addrs_map.serialized_size());
+        self.all_addrs_map.serialize_into(&mut all_addrs_bytes)?;
+        w.write_raw(&all_addrs_bytes)?;
 
-        writer.write_all(&(self.null_addrs_map.serialized_size() as u64).to_le_bytes())?;
-        self.null_addrs_map.serialize_into(&mut *writer)?;
+        let mut null_addrs_bytes = Vec::with_capacity(self.null_addrs_map.serialized_size());
+        self.null_addrs_map.serialize_into(&mut null_addrs_bytes)?;
+        w.write_raw(&null_addrs_bytes)?;
 
-        write_ipc_stream(self.data.as_ref(), writer)?;
+        w.write_ipc(self.data.as_ref())?;
 
         Ok(())
     }
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self>
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self>
     where
         Self: Sized,
     {
-        let mut offset = 0;
-        let all_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+        let all_addrs_bytes = r.read_raw()?;
         let all_addrs_map = RowAddrTreeMap::deserialize_from(all_addrs_bytes.as_ref())?;
 
-        let null_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+        let null_addrs_bytes = r.read_raw()?;
         let null_addrs_map = RowAddrTreeMap::deserialize_from(null_addrs_bytes.as_ref())?;
 
-        let batch = read_ipc_stream_single_at(data, &mut offset)?;
+        let batch = r.read_ipc()?;
 
         let df_schema = DFSchema::try_from(batch.schema())?;
 
@@ -309,8 +371,12 @@ mod tests {
 
     fn assert_roundtrips(index: &FlatIndex) {
         let mut buf = Vec::new();
-        index.serialize(&mut buf).unwrap();
-        let restored = FlatIndex::deserialize(&bytes::Bytes::from(buf)).unwrap();
+        index
+            .serialize(&mut CacheEntryWriter::new(&mut buf))
+            .unwrap();
+        let data = bytes::Bytes::from(buf);
+        let mut reader = CacheEntryReader::new(&data, 0, FlatIndex::CURRENT_VERSION);
+        let restored = FlatIndex::deserialize(&mut reader).unwrap();
 
         assert_eq!(restored.data, index.data);
         assert_eq!(restored.all_addrs_map, index.all_addrs_map);
@@ -335,6 +401,41 @@ mod tests {
         assert_roundtrips(&FlatIndex::try_new(empty).unwrap());
     }
 
+    /// The data batch must decode zero-copy through the full envelope-bearing
+    /// [`CacheCodec`], even though the two roaring blobs and the envelope push
+    /// the IPC section to a non-aligned starting offset.
+    #[test]
+    fn test_flat_index_data_is_zero_copy() {
+        use lance_core::cache::CacheCodec;
+        const ALIGN: usize = 64;
+
+        let index = example_index();
+        let codec = CacheCodec::from_impl::<FlatIndex>();
+        let any: Arc<dyn std::any::Any + Send + Sync> = Arc::new(index);
+        let mut buf = Vec::new();
+        codec.serialize(&any, &mut buf).unwrap();
+
+        let mut v = vec![0u8; buf.len() + ALIGN];
+        let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+        v[pad..pad + buf.len()].copy_from_slice(&buf);
+        let data = bytes::Bytes::from(v).slice(pad..pad + buf.len());
+
+        let restored = codec.deserialize(&data).hit().unwrap();
+        let restored = restored.downcast::<FlatIndex>().unwrap();
+
+        let base = data.as_ptr() as usize;
+        let end = base + data.len();
+        for col in restored.data.columns() {
+            for buffer in col.to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    (base..end).contains(&ptr),
+                    "data batch buffer was realigned out of the input — misaligned IPC section",
+                );
+            }
+        }
+    }
+
     #[tokio::test]
     async fn test_equality() {
         check_index(&SargableQuery::Equals(ScalarValue::from(100)), &[0]).await;
diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs
index 38a29e9c43c..053da5ae5e7 100644
--- a/rust/lance-index/src/scalar/expression.rs
+++ b/rust/lance-index/src/scalar/expression.rs
@@ -179,6 +179,18 @@ impl MultiQueryParser {
     pub fn add(&mut self, other: Box<dyn ScalarQueryParser>) {
         self.parsers.push(other);
     }
+
+    /// First parser whose `is_valid_reference` accepts `expr`, plus the `DataType` it returns.
+    pub fn select(
+        &self,
+        expr: &Expr,
+        data_type: &DataType,
+    ) -> Option<(&dyn ScalarQueryParser, DataType)> {
+        self.parsers.iter().find_map(|p| {
+            p.is_valid_reference(expr, data_type)
+                .map(|dt| (p.as_ref(), dt))
+        })
+    }
 }
 
 impl ScalarQueryParser for MultiQueryParser {
@@ -781,20 +793,28 @@ impl ScalarQueryParser for LabelListQueryParser {
     }
 }
 
-/// A parser for indices that handle string contains queries
+/// A parser for indices that handle string `contains` queries, and -- when
+/// `supports_regex` is set -- `regexp_like` / `regexp_match` queries.
 #[derive(Debug, Clone)]
 pub struct TextQueryParser {
     index_name: String,
     index_type: String,
     needs_recheck: bool,
+    supports_regex: bool,
 }
 
 impl TextQueryParser {
-    pub fn new(index_name: String, index_type: String, needs_recheck: bool) -> Self {
+    pub fn new(
+        index_name: String,
+        index_type: String,
+        needs_recheck: bool,
+        supports_regex: bool,
+    ) -> Self {
         Self {
             index_name,
             index_type,
             needs_recheck,
+            supports_regex,
         }
     }
 }
@@ -837,31 +857,156 @@ impl ScalarQueryParser for TextQueryParser {
         func: &ScalarUDF,
         args: &[Expr],
     ) -> Option<IndexedExpression> {
-        if args.len() != 2 {
+        // The first argument is the indexed column; the second is the substring
+        // / pattern. `contains` takes exactly two arguments; the regex functions
+        // optionally take a third flags argument.
+        if args.len() < 2 {
             return None;
         }
-        let scalar = maybe_scalar(&args[1], data_type)?;
-        match scalar {
-            ScalarValue::Utf8(Some(scalar_str)) | ScalarValue::LargeUtf8(Some(scalar_str)) => {
-                if func.name() == "contains" {
-                    let query = TextQuery::StringContains(scalar_str);
-                    Some(IndexedExpression::index_query_with_recheck(
-                        column.to_string(),
-                        self.index_name.clone(),
-                        self.index_type.clone(),
-                        Arc::new(query),
-                        self.needs_recheck,
-                    ))
-                } else {
+        // A non-string pattern cannot be handled.
+        let (ScalarValue::Utf8(Some(pattern)) | ScalarValue::LargeUtf8(Some(pattern))) =
+            maybe_scalar(&args[1], data_type)?
+        else {
+            return None;
+        };
+
+        let query = match func.name() {
+            "contains" if args.len() == 2 => TextQuery::StringContains(pattern),
+            "regexp_like" | "regexp_match" if self.supports_regex => {
+                let pattern = match args.get(2) {
+                    Some(flags_expr) => apply_regex_flags(&pattern, flags_expr)?,
+                    None => pattern,
+                };
+                // If the pattern yields no usable trigram (e.g. `a.b`), leave it
+                // to a full scan instead of routing it to the index, which could
+                // only answer with an unsupported "recheck everything" result.
+                if !crate::scalar::ngram::regex_can_use_index(&pattern) {
+                    return None;
+                }
+                TextQuery::Regex(pattern)
+            }
+            _ => return None,
+        };
+
+        Some(IndexedExpression::index_query_with_recheck(
+            column.to_string(),
+            self.index_name.clone(),
+            self.index_type.clone(),
+            Arc::new(query),
+            self.needs_recheck,
+        ))
+    }
+
+    fn visit_like(
+        &self,
+        column: &str,
+        like: &Like,
+        pattern: &ScalarValue,
+    ) -> Option<IndexedExpression> {
+        // Infix LIKE is accelerated only by the ngram index (via its regex
+        // machinery); a plain-literal `regexp_like(col, 'foo')` is rewritten to
+        // `col LIKE '%foo%'` before it reaches the index, so it lands here too.
+        // ILIKE is skipped (case folding differs from the index's normalization)
+        // and so is NOT LIKE (positive-pattern candidates would miss its rows).
+        if !self.supports_regex || like.case_insensitive || like.negated {
+            return None;
+        }
+        let pattern_str = match pattern {
+            ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => s.as_str(),
+            _ => return None,
+        };
+        // Translate the LIKE pattern into a loose regex used only for candidate
+        // generation; the original LIKE stays as the recheck filter, so the
+        // regex only needs to be a sound superset.
+        let regex = like_to_regex(pattern_str, like.escape_char)?;
+        if !crate::scalar::ngram::regex_can_use_index(&regex) {
+            return None;
+        }
+        Some(IndexedExpression {
+            scalar_query: Some(ScalarIndexExpr::Query(ScalarIndexSearch {
+                column: column.to_string(),
+                index_name: self.index_name.clone(),
+                index_type: self.index_type.clone(),
+                query: Arc::new(TextQuery::Regex(regex)),
+                needs_recheck: self.needs_recheck,
+                fragment_bitmap: None,
+            })),
+            refine_expr: Some(Expr::Like(like.clone())),
+        })
+    }
+}
+
+/// Translate a LIKE pattern into a regular expression used purely for ngram
+/// candidate generation: `%` becomes `.*`, `_` becomes `.`, and literal
+/// characters are regex-escaped. Returns `None` when no literal run is long
+/// enough to yield a trigram (the index could not help, so a full scan is left
+/// to handle it).
+fn like_to_regex(pattern: &str, escape: Option<char>) -> Option<String> {
+    let mut regex = String::new();
+    let mut run = 0usize;
+    let mut longest_run = 0usize;
+    let mut chars = pattern.chars();
+    while let Some(c) = chars.next() {
+        let literal = if Some(c) == escape {
+            // The next character is escaped, i.e. a literal.
+            chars.next()
+        } else {
+            match c {
+                '%' => {
+                    regex.push_str(".*");
+                    run = 0;
                     None
                 }
+                '_' => {
+                    regex.push('.');
+                    run = 0;
+                    None
+                }
+                other => Some(other),
             }
-            _ => {
-                // If the scalar is not a string, we cannot handle it
-                None
+        };
+        if let Some(lit) = literal {
+            if regex_syntax::is_meta_character(lit) {
+                regex.push('\\');
+            }
+            regex.push(lit);
+            // Only runs of alphanumeric characters can produce a trigram.
+            if lit.is_alphanumeric() {
+                run += 1;
+                longest_run = longest_run.max(run);
+            } else {
+                run = 0;
             }
         }
     }
+    (longest_run >= 3).then_some(regex)
+}
+
+/// Fold the supported `regexp_like` / `regexp_match` flags into an inline prefix
+/// on the pattern (e.g. flags `"i"` -> `"(?i)pattern"`). Returns `None` for a
+/// non-literal flags argument or an unrecognized flag, so the caller leaves the
+/// predicate to a full recheck rather than risk changing its semantics.
+fn apply_regex_flags(pattern: &str, flags_expr: &Expr) -> Option<String> {
+    let (Expr::Literal(ScalarValue::Utf8(Some(flags)), _)
+    | Expr::Literal(ScalarValue::LargeUtf8(Some(flags)), _)) = flags_expr
+    else {
+        return None;
+    };
+    let mut inline = String::new();
+    for flag in flags.chars() {
+        // Only flags expressible as an inline `(?...)` group in the regex crate
+        // (which the recheck uses) are safe to fold.
+        if ['i', 's', 'm', 'x'].contains(&flag) {
+            inline.push(flag);
+        } else {
+            return None;
+        }
+    }
+    if inline.is_empty() {
+        Some(pattern.to_string())
+    } else {
+        Some(format!("(?{inline}){pattern}"))
+    }
 }
 
 /// A parser for indices that handle queries with the contains_tokens function
@@ -1452,8 +1597,8 @@ fn maybe_indexed_column<'b>(
 ) -> Option<(String, DataType, &'b dyn ScalarQueryParser)> {
     // First try to extract the full nested column path for get_field expressions
     if let Some(nested_path) = extract_nested_column_path(expr)
-        && let Some((data_type, parser)) = index_info.get_index(&nested_path)
-        && let Some(data_type) = parser.is_valid_reference(expr, data_type)
+        && let Some((data_type, multi)) = index_info.get_index(&nested_path)
+        && let Some((parser, data_type)) = multi.select(expr, data_type)
     {
         return Some((nested_path, data_type, parser));
     }
@@ -1461,12 +1606,9 @@ fn maybe_indexed_column<'b>(
     match expr {
         Expr::Column(col) => {
             let col = col.name.as_str();
-            let (data_type, parser) = index_info.get_index(col)?;
-            if let Some(data_type) = parser.is_valid_reference(expr, data_type) {
-                Some((col.to_string(), data_type, parser))
-            } else {
-                None
-            }
+            let (data_type, multi) = index_info.get_index(col)?;
+            let (parser, data_type) = multi.select(expr, data_type)?;
+            Some((col.to_string(), data_type, parser))
         }
         Expr::ScalarFunction(udf) => {
             if udf.args.is_empty() {
@@ -1474,12 +1616,9 @@ fn maybe_indexed_column<'b>(
             }
             // For non-get_field functions, fall back to old behavior
             let col = maybe_column(&udf.args[0])?;
-            let (data_type, parser) = index_info.get_index(col)?;
-            if let Some(data_type) = parser.is_valid_reference(expr, data_type) {
-                Some((col.to_string(), data_type, parser))
-            } else {
-                None
-            }
+            let (data_type, multi) = index_info.get_index(col)?;
+            let (parser, data_type) = multi.select(expr, data_type)?;
+            Some((col.to_string(), data_type, parser))
         }
         _ => None,
     }
@@ -1813,7 +1952,18 @@ fn visit_node(
         Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)),
         Expr::IsTrue(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, true)),
         Expr::IsNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, false)),
-        Expr::IsNotNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, true)),
+        Expr::IsNotNull(expr) => {
+            // `regexp_match(col, pat)` returns a list and is coerced to
+            // `IsNotNull(regexp_match(...))` before it reaches here. Unwrap that
+            // so the regex acceleration applies; everything else is a genuine
+            // IS NOT NULL check.
+            if let Expr::ScalarFunction(scalar_fn) = expr.as_ref()
+                && scalar_fn.func.name() == "regexp_match"
+            {
+                return Ok(visit_scalar_fn(scalar_fn, index_info));
+            }
+            Ok(visit_is_null(expr.as_ref(), index_info, true))
+        }
         Expr::Not(expr) => visit_not(expr.as_ref(), index_info, depth),
         Expr::BinaryExpr(binary_expr) => visit_binary_expr(binary_expr, index_info, depth),
         Expr::ScalarFunction(scalar_fn) => Ok(visit_scalar_fn(scalar_fn, index_info)),
@@ -1833,7 +1983,7 @@ fn visit_node(
 pub trait IndexInformationProvider {
     /// Check if an index exists for `col` and, if so, return the data type of col
     /// as well as a query parser that can parse queries for that column
-    fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)>;
+    fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)>;
 
     /// The set of fragments covered by `(column, index_name)`.
     ///
@@ -2015,11 +2165,18 @@ mod tests {
 
     struct ColInfo {
         data_type: DataType,
-        parser: Box<dyn ScalarQueryParser>,
+        parser: Box<MultiQueryParser>,
     }
 
     impl ColInfo {
         fn new(data_type: DataType, parser: Box<dyn ScalarQueryParser>) -> Self {
+            Self {
+                data_type,
+                parser: Box::new(MultiQueryParser::single(parser)),
+            }
+        }
+
+        fn with_multi(data_type: DataType, parser: Box<MultiQueryParser>) -> Self {
             Self { data_type, parser }
         }
     }
@@ -2041,7 +2198,7 @@ mod tests {
     }
 
     impl IndexInformationProvider for MockIndexInfoProvider {
-        fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> {
+        fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> {
             self.indexed_columns
                 .get(col)
                 .map(|col_info| (&col_info.data_type, col_info.parser.as_ref()))
@@ -2690,6 +2847,59 @@ mod tests {
         assert!(matches!(negated.upper, NullableRowAddrMask::BlockList(_)));
     }
 
+    #[test]
+    fn test_like_to_regex() {
+        // `%` -> `.*`, `_` -> `.`, with a literal run of at least three chars.
+        assert_eq!(like_to_regex("%foo%", None).as_deref(), Some(".*foo.*"));
+        assert_eq!(like_to_regex("foo%bar", None).as_deref(), Some("foo.*bar"));
+        assert_eq!(like_to_regex("foo_bar", None).as_deref(), Some("foo.bar"));
+        assert_eq!(like_to_regex("foobar", None).as_deref(), Some("foobar"));
+
+        // Regex metacharacters in the literal portion are escaped.
+        assert_eq!(
+            like_to_regex("%a.bcd%", None).as_deref(),
+            Some(".*a\\.bcd.*")
+        );
+
+        // No literal run of three alphanumeric characters -> no index help.
+        assert_eq!(like_to_regex("%ab%", None), None);
+        assert_eq!(like_to_regex("%a%b%c%", None), None);
+        assert_eq!(like_to_regex("%", None), None);
+
+        // The escape character makes the following character a literal.
+        assert_eq!(
+            like_to_regex(r"%foo\%bar%", Some('\\')).as_deref(),
+            Some(".*foo%bar.*")
+        );
+    }
+
+    #[test]
+    fn test_apply_regex_flags() {
+        fn flags(s: &str) -> Expr {
+            Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None)
+        }
+
+        // Empty flags leave the pattern untouched (no inline group emitted).
+        assert_eq!(apply_regex_flags("foo", &flags("")).as_deref(), Some("foo"));
+        // Supported flags are folded into an inline `(?...)` prefix.
+        assert_eq!(
+            apply_regex_flags("foo", &flags("i")).as_deref(),
+            Some("(?i)foo")
+        );
+        assert_eq!(
+            apply_regex_flags("foo", &flags("is")).as_deref(),
+            Some("(?is)foo")
+        );
+        // An unrecognized flag bails out so the caller leaves the predicate to a
+        // full recheck rather than risk changing its semantics.
+        assert_eq!(apply_regex_flags("foo", &flags("g")), None);
+        // A non-string (hence non-literal-flags) argument cannot be folded.
+        assert_eq!(
+            apply_regex_flags("foo", &Expr::Literal(ScalarValue::Int32(Some(1)), None)),
+            None
+        );
+    }
+
     #[test]
     fn test_extract_like_leading_prefix() {
         // Simple prefix patterns (no recheck needed)
@@ -3157,4 +3367,75 @@ mod tests {
         assert_eq!(round_tripped.upper, RowAddrMask::from_allowed(upper_addrs));
         assert_eq!(round_tripped_frags, fragments_covered);
     }
+
+    /// Regression test: when two JSON indices target different paths on the same
+    /// column, a query against one path must be routed to its own index instead
+    /// of being intercepted by whichever parser was registered first.
+    #[test]
+    fn test_multi_json_indices_route_by_path() {
+        // Build a MultiQueryParser containing two JSON sub-parsers: one for
+        // path "$.a" and one for path "$.b".
+        let mut multi = MultiQueryParser::single(Box::new(JsonQueryParser::new(
+            "$.a".to_string(),
+            Box::new(SargableQueryParser::new(
+                "json_a_idx".to_string(),
+                "Json".to_string(),
+                false,
+            )),
+        )));
+        multi.add(Box::new(JsonQueryParser::new(
+            "$.b".to_string(),
+            Box::new(SargableQueryParser::new(
+                "json_b_idx".to_string(),
+                "Json".to_string(),
+                false,
+            )),
+        )));
+
+        let index_info = MockIndexInfoProvider::new(vec![(
+            "json",
+            ColInfo::with_multi(DataType::LargeBinary, Box::new(multi)),
+        )]);
+
+        // Query against path "$.b" must hit the "$.b" index.
+        let expected_b = IndexedExpression::index_query(
+            "json".to_string(),
+            "json_b_idx".to_string(),
+            "Json".to_string(),
+            Arc::new(JsonQuery::new(
+                Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some(
+                    "foo".to_string(),
+                )))),
+                "$.b".to_string(),
+            )),
+        );
+        check(
+            &index_info,
+            "json_extract(json, '$.b') = 'foo'",
+            Some(expected_b),
+            false,
+        );
+
+        // Query against path "$.a" must hit the "$.a" index.
+        let expected_a = IndexedExpression::index_query(
+            "json".to_string(),
+            "json_a_idx".to_string(),
+            "Json".to_string(),
+            Arc::new(JsonQuery::new(
+                Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some(
+                    "foo".to_string(),
+                )))),
+                "$.a".to_string(),
+            )),
+        );
+        check(
+            &index_info,
+            "json_extract(json, '$.a') = 'foo'",
+            Some(expected_a),
+            false,
+        );
+
+        // Query against an unindexed path must not bind to either index.
+        check_no_index(&index_info, "json_extract(json, '$.c') = 'foo'");
+    }
 }
diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs
index 9677f7471ea..cdf19f0304c 100644
--- a/rust/lance-index/src/scalar/fmindex.rs
+++ b/rust/lance-index/src/scalar/fmindex.rs
@@ -1352,6 +1352,12 @@ impl ScalarIndex for FMIndexScalarIndex {
                     Default::default(),
                 )))
             }
+            // Regex queries are routed only to the ngram index (the FM-index's
+            // query parser advertises `supports_regex = false`), so this is
+            // unreachable in practice; reject it explicitly rather than silently.
+            TextQuery::Regex(_) => Err(Error::invalid_input(
+                "FMIndex does not support regular expression queries",
+            )),
         }
     }
     fn can_remap(&self) -> bool {
@@ -1370,8 +1376,7 @@ impl ScalarIndex for FMIndexScalarIndex {
         dest: &dyn IndexStore,
         _old_data_filter: Option<OldIndexDataFilter>,
     ) -> Result<CreatedIndex> {
-        let texts = collect_texts(new_data).await?;
-        let files = write_partitioned_fmindex(&texts, dest).await?;
+        let files = write_partitioned_fmindex_stream(new_data, dest).await?;
         Ok(CreatedIndex {
             index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(),
             index_version: FMINDEX_INDEX_VERSION,
@@ -1390,8 +1395,14 @@ impl ScalarIndex for FMIndexScalarIndex {
 
 // ── Helpers ──────────────────────────────────────────────────────────────────
 
-async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result<Vec<(u64, Vec<u8>)>> {
-    let mut texts = Vec::new();
+async fn write_partitioned_fmindex_stream(
+    mut stream: SendableRecordBatchStream,
+    store: &dyn IndexStore,
+) -> Result<Vec<IndexFile>> {
+    let mut files = Vec::new();
+    let mut partition = Vec::with_capacity(PARTITION_SIZE);
+    let mut partition_id = 0;
+
     while let Some(batch) = stream.next().await {
         let batch = batch?;
         // Prefer _rowaddr (global row address) over _rowid to ensure stable,
@@ -1409,24 +1420,85 @@ async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result<Vec<(u64
             .unwrap_or_else(|| batch.column(0));
         for i in 0..batch.num_rows() {
             let rid = row_addrs.value(i);
-            if let Some(bytes) = extract_text_bytes(value_col.as_ref(), i)? {
-                let sanitized: Vec<u8> = bytes
-                    .iter()
-                    .map(|&b| {
-                        if b == SENTINEL_BYTE || b == 0x00 {
-                            b' '
-                        } else {
-                            b
-                        }
-                    })
-                    .collect();
-                texts.push((rid, sanitized));
+            if let Some(bytes) = extract_sanitized_text_bytes(value_col.as_ref(), i)? {
+                partition.push((rid, bytes));
+                if partition.len() == PARTITION_SIZE {
+                    files.push(write_fmindex_partition(&partition, store, partition_id).await?);
+                    partition.clear();
+                    partition_id += 1;
+                }
             }
         }
     }
-    Ok(texts)
+
+    if !partition.is_empty() {
+        files.push(write_fmindex_partition(&partition, store, partition_id).await?);
+    } else if files.is_empty() {
+        files.push(write_empty_fmindex_partition(store).await?);
+    }
+
+    Ok(files)
+}
+
+/// Returns a copy of `bytes` with reserved byte values blanked out.
+///
+/// Every `SENTINEL_BYTE` and every NUL byte is replaced by an ASCII space;
+/// all other bytes are copied unchanged, so the length is preserved.
+fn sanitize_text_bytes(bytes: &[u8]) -> Vec<u8> {
+    let mut sanitized = Vec::with_capacity(bytes.len());
+    for &byte in bytes {
+        let reserved = byte == SENTINEL_BYTE || byte == 0x00;
+        sanitized.push(if reserved { b' ' } else { byte });
+    }
+    sanitized
 }
 
+/// Reads the value at `index` from a string or binary Arrow array and returns
+/// its bytes after passing them through `sanitize_text_bytes`.
+///
+/// Returns `Ok(None)` when the slot is null, and an invalid-input error when
+/// the array is not `Utf8`, `LargeUtf8`, `Binary`, or `LargeBinary`.
+fn extract_sanitized_text_bytes(
+    array: &dyn arrow_array::Array,
+    index: usize,
+) -> Result<Option<Vec<u8>>> {
+    if array.is_null(index) {
+        return Ok(None);
+    }
+
+    let any = array.as_any();
+    // Each arm only borrows the raw bytes; sanitizing happens once, below.
+    // The `unwrap()`s rely on the concrete array type matching `data_type()`.
+    let raw: &[u8] = match array.data_type() {
+        DataType::Utf8 => any
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap()
+            .value(index)
+            .as_bytes(),
+        DataType::LargeUtf8 => any
+            .downcast_ref::<arrow_array::LargeStringArray>()
+            .unwrap()
+            .value(index)
+            .as_bytes(),
+        DataType::Binary => any
+            .downcast_ref::<arrow_array::BinaryArray>()
+            .unwrap()
+            .value(index),
+        DataType::LargeBinary => any
+            .downcast_ref::<arrow_array::LargeBinaryArray>()
+            .unwrap()
+            .value(index),
+        other => {
+            return Err(Error::invalid_input(format!(
+                "Fm does not support data type: {:?}",
+                other
+            )));
+        }
+    };
+    Ok(Some(sanitize_text_bytes(raw)))
+}
+
+#[cfg(test)]
 fn extract_text_bytes(array: &dyn arrow_array::Array, index: usize) -> Result<Option<Vec<u8>>> {
     if array.is_null(index) {
         return Ok(None);
@@ -1568,25 +1640,36 @@ async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) ->
     writer.finish_with_metadata(metadata).await
 }
 
+#[cfg(test)]
 async fn write_partitioned_fmindex(
     texts: &[(u64, Vec<u8>)],
     store: &dyn IndexStore,
 ) -> Result<Vec<IndexFile>> {
-    let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect();
-    if refs.is_empty() {
-        let fm = FMIndex::build(&[])?;
-        return Ok(vec![
-            write_fmindex(&fm, store, &fmindex_partition_path(0)).await?,
-        ]);
+    if texts.is_empty() {
+        return Ok(vec![write_empty_fmindex_partition(store).await?]);
     }
     let mut files = Vec::new();
-    for (pid, chunk) in refs.chunks(PARTITION_SIZE).enumerate() {
-        let fm = FMIndex::build(chunk)?;
-        files.push(write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?);
+    for (pid, chunk) in texts.chunks(PARTITION_SIZE).enumerate() {
+        files.push(write_fmindex_partition(chunk, store, pid as u64).await?);
     }
     Ok(files)
 }
 
+async fn write_fmindex_partition(
+    texts: &[(u64, Vec<u8>)],
+    store: &dyn IndexStore,
+    partition_id: u64,
+) -> Result<IndexFile> {
+    let borrowed: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, &t[..])).collect();
+    let index = FMIndex::build(&borrowed)?; // `?` propagates build errors before any write
+    write_fmindex(&index, store, &fmindex_partition_path(partition_id)).await
+}
+
+/// Writes partition 0 as an FM-index built from an empty document list.
+async fn write_empty_fmindex_partition(store: &dyn IndexStore) -> Result<IndexFile> {
+    write_fmindex_partition(&[], store, 0).await
+}
+
 // ── Plugin ───────────────────────────────────────────────────────────────────
 
 #[derive(Debug, Default)]
@@ -1623,8 +1706,7 @@ impl ScalarIndexPlugin for FMIndexPlugin {
         _fids: Option<Vec<u32>>,
         _progress: Arc<dyn crate::progress::IndexBuildProgress>,
     ) -> Result<CreatedIndex> {
-        let texts = collect_texts(data).await?;
-        let files = write_partitioned_fmindex(&texts, store).await?;
+        let files = write_partitioned_fmindex_stream(data, store).await?;
         Ok(CreatedIndex {
             index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(),
             index_version: FMINDEX_INDEX_VERSION,
@@ -1645,6 +1727,9 @@ impl ScalarIndexPlugin for FMIndexPlugin {
         Some(Box::new(TextQueryParser::new(
             index_name,
             self.name().to_string(),
+            // needs_recheck: the FM-index returns exact substring matches.
+            false,
+            // supports_regex: regex acceleration is only implemented for ngram.
             false,
         )))
     }
@@ -1672,7 +1757,10 @@ impl ScalarIndexPlugin for FMIndexPlugin {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use lance_core::cache::LanceCache;
+    use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, UInt64Array};
+    use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+    use futures::stream;
+    use lance_core::{ROW_ADDR, cache::LanceCache};
     use lance_io::object_store::ObjectStore;
     use object_store::path::Path;
     use std::sync::Arc;
@@ -1885,11 +1973,10 @@ mod tests {
 
     #[test]
     fn test_sentinel_sanitization() {
-        // Text containing \xFF should be sanitized to space
+        // Text containing \xFF should be sanitized to space during training.
         let texts: Vec<(u64, &[u8])> = vec![(0, b"hello\xFFworld")];
         let fm = FMIndex::build(&texts).unwrap();
-        // The \xFF is replaced with space during collect_texts, but here we test build directly
-        // which doesn't sanitize. The search should still work.
+        // Build itself does not sanitize, but search should still work.
         let r = fm.search(b"hello");
         assert!(r.contains(0));
     }
@@ -2061,11 +2148,6 @@ mod tests {
 
     #[tokio::test(flavor = "multi_thread")]
     async fn test_plugin_train_and_load() {
-        use arrow_array::{StringArray, UInt64Array};
-        use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
-        use futures::stream;
-        use lance_core::ROW_ADDR;
-
         let docs = vec!["hello world", "hello rust", "goodbye world"];
         let row_addrs: Vec<u64> = vec![0, 1, 2];
         let schema = Arc::new(arrow_schema::Schema::new(vec![
@@ -2128,6 +2210,88 @@ mod tests {
         }
     }
 
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_plugin_train_streams_multiple_partitions() {
+        // Builds `len` rows of the document "x" with row addresses `start..start + len`.
+        fn training_batch(
+            schema: Arc<arrow_schema::Schema>,
+            start: usize,
+            len: usize,
+        ) -> RecordBatch {
+            let row_addrs: Vec<u64> = (start..start + len).map(|addr| addr as u64).collect();
+            RecordBatch::try_new(
+                schema,
+                vec![
+                    Arc::new(StringArray::from(vec!["x"; len])),
+                    Arc::new(UInt64Array::from(row_addrs)),
+                ],
+            )
+            .unwrap()
+        }
+
+        // The first batch stops three rows short of a full partition, so the
+        // partition fills up in the middle of the second batch and the last
+        // five rows end up in a second partition.
+        let total_rows = PARTITION_SIZE + 5;
+        let first_batch_rows = PARTITION_SIZE - 3;
+        let second_batch_rows = total_rows - first_batch_rows;
+        let schema = Arc::new(arrow_schema::Schema::new(vec![
+            arrow_schema::Field::new(
+                crate::scalar::registry::VALUE_COLUMN_NAME,
+                DataType::Utf8,
+                false,
+            ),
+            arrow_schema::Field::new(ROW_ADDR, DataType::UInt64, false),
+        ]));
+        let batches = vec![
+            Ok(training_batch(schema.clone(), 0, first_batch_rows)),
+            Ok(training_batch(schema.clone(), first_batch_rows, second_batch_rows)),
+        ];
+
+        let tempdir = tempfile::tempdir().unwrap();
+        let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap();
+        let store = Arc::new(LanceIndexStore::new(
+            Arc::new(ObjectStore::local()),
+            index_dir,
+            Arc::new(LanceCache::no_cache()),
+        ));
+
+        let stream = RecordBatchStreamAdapter::new(schema, stream::iter(batches));
+        let req = FMIndexPlugin
+            .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false))
+            .unwrap();
+        let created = FMIndexPlugin
+            .train_index(
+                Box::pin(stream),
+                store.as_ref(),
+                req,
+                None,
+                Arc::new(crate::progress::NoopIndexBuildProgress),
+            )
+            .await
+            .unwrap();
+
+        // One full partition plus the five-row remainder.
+        assert_eq!(created.files.len(), 2);
+
+        let index = FMIndexPlugin
+            .load_index(store, &created.index_details, None, &LanceCache::no_cache())
+            .await
+            .unwrap();
+        let hits = index
+            .search(
+                &TextQuery::StringContains("x".to_string()),
+                &crate::metrics::NoOpMetricsCollector,
+            )
+            .await
+            .unwrap();
+        // Every document contains "x", so the rows of both partitions must match.
+        let SearchResult::Exact(matched) = hits else {
+            panic!("expected exact result");
+        };
+        assert_eq!(matched.len(), Some(total_rows as u64));
+    }
+
     #[test]
     fn test_build_wavelet_batch() {
         let texts: Vec<(u64, &[u8])> = vec![(0, b"hello world"), (1, b"test data")];
@@ -2139,8 +2303,6 @@ mod tests {
 
     #[test]
     fn test_extract_text_bytes_types() {
-        use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray};
-
         let utf8 = StringArray::from(vec!["hello"]);
         assert_eq!(
             extract_text_bytes(&utf8, 0).unwrap(),
@@ -2158,6 +2320,11 @@ mod tests {
             extract_text_bytes(&binary, 0).unwrap(),
             Some(b"bytes".to_vec())
         );
+        let binary_with_sentinels = BinaryArray::from(vec![b"a\xFFb\0c" as &[u8]]);
+        assert_eq!(
+            extract_sanitized_text_bytes(&binary_with_sentinels, 0).unwrap(),
+            Some(b"a b c".to_vec())
+        );
 
         let large_binary = LargeBinaryArray::from(vec![b"large" as &[u8]]);
         assert_eq!(
diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs
index 24b1eb50203..93932f35332 100644
--- a/rust/lance-index/src/scalar/inverted/builder.rs
+++ b/rust/lance-index/src/scalar/inverted/builder.rs
@@ -407,11 +407,21 @@ impl InvertedIndexBuilder {
     ) -> Result<Vec<IndexFile>> {
         let partition_id = self.next_partition_id() | self.fragment_mask.unwrap_or(0);
         builder.set_id(partition_id);
-        let files = builder.write(dest_store).await?;
+        let files = builder
+            .write_to(dest_store, self.partition_write_target())
+            .await?;
         self.new_partitions.push(partition_id);
         Ok(files)
     }
 
+    /// Picks `Staged` when a fragment mask is set and `Final` otherwise.
+    fn partition_write_target(&self) -> PartitionWriteTarget {
+        match self.fragment_mask {
+            Some(_) => PartitionWriteTarget::Staged,
+            None => PartitionWriteTarget::Final,
+        }
+    }
+
     fn next_partition_id(&self) -> u64 {
         self.partitions
             .iter()
@@ -523,7 +533,11 @@ impl InvertedIndexBuilder {
             if let Some(builder) = merged_tail_partitions {
                 self.new_partitions.push(builder.id());
                 let mut builder = builder;
-                files.extend(builder.write(dest_store.as_ref()).await?);
+                files.extend(
+                    builder
+                        .write_to(dest_store.as_ref(), self.partition_write_target())
+                        .await?,
+                );
             }
             log::info!("wait workers indexing elapsed: {:?}", start.elapsed());
             Result::Ok(files)
@@ -550,12 +564,16 @@ impl InvertedIndexBuilder {
             .await?;
             let mut builder = part.into_builder().await?;
             builder.remap(mapping).await?;
-            files.extend(builder.write(dest_store).await?);
+            files.extend(
+                builder
+                    .write_to(dest_store, self.partition_write_target())
+                    .await?,
+            );
         }
         if self.fragment_mask.is_none() {
             files.push(self.write_metadata(dest_store, &self.partitions).await?);
         } else {
-            // in distributed mode, the part_temp_metadata is written by the worker
+            // in distributed mode, the staged partition metadata is written by the worker
             for &partition_id in &self.partitions {
                 files.push(self.write_part_metadata(dest_store, partition_id).await?);
             }
@@ -709,26 +727,35 @@ impl InvertedIndexBuilder {
             .await?;
         let mut copied = 0;
         let mut files = Vec::new();
+        let target = self.partition_write_target();
         for part in self.partitions.iter() {
             files.push(
                 self.src_store
                     .as_ref()
                     .expect("existing partitions require a source store")
-                    .copy_index_file(&token_file_path(*part), dest_store)
+                    .copy_index_file_to(
+                        &token_file_path(*part),
+                        &target.token_path(*part),
+                        dest_store,
+                    )
                     .await?,
             );
             files.push(
                 self.src_store
                     .as_ref()
                     .expect("existing partitions require a source store")
-                    .copy_index_file(&posting_file_path(*part), dest_store)
+                    .copy_index_file_to(
+                        &posting_file_path(*part),
+                        &target.posting_path(*part),
+                        dest_store,
+                    )
                     .await?,
             );
             files.push(
                 self.src_store
                     .as_ref()
                     .expect("existing partitions require a source store")
-                    .copy_index_file(&doc_file_path(*part), dest_store)
+                    .copy_index_file_to(&doc_file_path(*part), &target.doc_path(*part), dest_store)
                     .await?,
             );
             copied += 1;
@@ -986,11 +1013,22 @@ impl InnerBuilder {
     }
 
     pub async fn write(&mut self, store: &dyn IndexStore) -> Result<Vec<IndexFile>> {
+        self.write_to(store, PartitionWriteTarget::Final).await
+    }
+
+    async fn write_to(
+        &mut self,
+        store: &dyn IndexStore,
+        target: PartitionWriteTarget,
+    ) -> Result<Vec<IndexFile>> {
         let docs = Arc::new(std::mem::take(&mut self.docs));
         let files = vec![
-            self.write_posting_lists(store, docs.clone()).await?,
-            self.write_tokens(store).await?,
-            self.write_docs(store, docs).await?,
+            self.write_posting_lists(store, docs.clone(), &target.posting_path(self.id))
+                .await?,
+            self.write_tokens(store, &target.token_path(self.id))
+                .await?,
+            self.write_docs(store, docs, &target.doc_path(self.id))
+                .await?,
         ];
         Ok(files)
     }
@@ -1000,11 +1038,12 @@ impl InnerBuilder {
         &mut self,
         store: &dyn IndexStore,
         docs: Arc<DocSet>,
+        path: &str,
     ) -> Result<IndexFile> {
         let id = self.id;
         let mut writer = store
             .new_index_file(
-                &posting_file_path(self.id),
+                path,
                 inverted_list_schema_for_version(self.with_position, self.format_version),
             )
             .await?;
@@ -1090,29 +1129,57 @@ impl InnerBuilder {
     }
 
     #[instrument(level = "debug", skip_all)]
-    async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result<IndexFile> {
+    async fn write_tokens(&mut self, store: &dyn IndexStore, path: &str) -> Result<IndexFile> {
         log::info!("writing tokens of partition {}", self.id);
         let tokens = std::mem::take(&mut self.tokens);
         let batch = tokens.to_batch(self.token_set_format)?;
-        let mut writer = store
-            .new_index_file(&token_file_path(self.id), batch.schema())
-            .await?;
+        let mut writer = store.new_index_file(path, batch.schema()).await?;
         writer.write_record_batch(batch).await?;
         writer.finish().await
     }
 
     #[instrument(level = "debug", skip_all)]
-    async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc<DocSet>) -> Result<IndexFile> {
+    async fn write_docs(
+        &mut self,
+        store: &dyn IndexStore,
+        docs: Arc<DocSet>,
+        path: &str,
+    ) -> Result<IndexFile> {
         log::info!("writing docs of partition {}", self.id);
         let batch = docs.to_batch()?;
-        let mut writer = store
-            .new_index_file(&doc_file_path(self.id), batch.schema())
-            .await?;
+        let mut writer = store.new_index_file(path, batch.schema()).await?;
         writer.write_record_batch(batch).await?;
         writer.finish().await
     }
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum PartitionWriteTarget {
+    Final,  // files are named by `partition_file_path`: `part_{id}_{suffix}`
+    Staged, // files are named by `staged_partition_file_path`, under `STAGED_PARTITION_DIR`
+}
+
+impl PartitionWriteTarget {
+    fn file_path(self, partition_id: u64, suffix: &str) -> String {
+        match self {
+            Self::Final => partition_file_path(partition_id, suffix),
+            Self::Staged => staged_partition_file_path(partition_id, suffix),
+        }
+    }
+
+    fn token_path(self, partition_id: u64) -> String {
+        self.file_path(partition_id, TOKENS_FILE)
+    }
+
+    fn posting_path(self, partition_id: u64) -> String {
+        self.file_path(partition_id, INVERT_LIST_FILE)
+    }
+
+    fn doc_path(self, partition_id: u64) -> String {
+        self.file_path(partition_id, DOCS_FILE)
+    }
+}
+
 struct IndexWorker {
     tokenizer: Box<dyn LanceTokenizer>,
     dest_store: Arc<dyn IndexStore>,
@@ -1430,8 +1497,13 @@ impl IndexWorker {
         );
         let written_partition_id = builder.id();
         let mut builder = builder;
+        let target = if self.fragment_mask.is_some() {
+            PartitionWriteTarget::Staged
+        } else {
+            PartitionWriteTarget::Final
+        };
         let files = builder
-            .write(self.dest_store.as_ref())
+            .write_to(self.dest_store.as_ref(), target)
             .await
             .map_err(|err| {
                 Error::execution(format!(
@@ -1782,14 +1854,23 @@ pub(crate) fn doc_file_path(partition_id: u64) -> String {
 }
 
 pub(crate) fn part_metadata_file_path(partition_id: u64) -> String {
-    format!("part_{}_{}", partition_id, METADATA_FILE)
+    staged_partition_file_path(partition_id, METADATA_FILE)
 }
 
 const PARTITION_FILE_SUFFIXES: [&str; 3] = [TOKENS_FILE, INVERT_LIST_FILE, DOCS_FILE];
-// Each remapped file is renamed twice: first to a temp path (phase 1), then to
-// its final path (phase 2). Keep in sync with the two rename loops below in
-// `merge_metadata_files`.
-const PARTITION_FILE_RENAME_PHASES: u64 = 2;
+const STAGED_PARTITION_DIR: &str = "staging";
+
+/// File name of one partition file: `part_{partition_id}_{suffix}`.
+fn partition_file_path(partition_id: u64, suffix: &str) -> String {
+    format!("part_{}_{}", partition_id, suffix)
+}
+
+/// Staged location of a partition file: the name from `partition_file_path`
+/// prefixed with `STAGED_PARTITION_DIR` and a `/` separator.
+fn staged_partition_file_path(partition_id: u64, suffix: &str) -> String {
+    let file_name = partition_file_path(partition_id, suffix);
+    [STAGED_PARTITION_DIR, file_name.as_str()].join("/")
+}
 
 pub async fn merge_index_files(
     object_store: &ObjectStore,
@@ -1797,33 +1878,65 @@ pub async fn merge_index_files(
     store: Arc<dyn IndexStore>,
     progress: Arc<dyn IndexBuildProgress>,
 ) -> Result<()> {
-    // List all partition metadata files in the index directory
-    let part_metadata_files = list_metadata_files(object_store, index_dir).await?;
+    let metadata_path = index_dir.clone().join(METADATA_FILE);
+    if object_store.exists(&metadata_path).await? {
+        return Ok(());
+    }
+
+    // List all staged partition metadata files in the index directory
+    let index_files = list_index_files(object_store, index_dir).await?;
+    let part_metadata_files = metadata_files(&index_files);
+    if part_metadata_files.is_empty() {
+        return Err(Error::invalid_input_source(
+            format!(
+                "No partition metadata files found in index directory: {}",
+                index_dir
+            )
+            .into(),
+        ));
+    }
 
     // Call merge_metadata_files function for inverted index
     merge_metadata_files(store, &part_metadata_files, progress).await
 }
 
-/// List and filter metadata files from the index directory
-/// Returns partition metadata files
-async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result<Vec<String>> {
-    // List all partition metadata files in the index directory
-    let mut part_metadata_files = Vec::new();
-    let mut list_stream = object_store.list(Some(index_dir.clone()));
+async fn list_index_files(object_store: &ObjectStore, index_dir: &Path) -> Result<Vec<String>> {
+    let mut index_files = Vec::new();
+    let mut list_stream = object_store.read_dir_all(index_dir, None);
 
     while let Some(item) = list_stream.next().await {
         match item {
             Ok(meta) => {
-                let file_name = meta.location.filename().unwrap_or_default();
-                // Filter files matching the pattern part_*_metadata.lance
-                if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") {
-                    part_metadata_files.push(file_name.to_string());
-                }
+                let location = meta.location.as_ref().trim_start_matches('/');
+                let index_dir = index_dir.as_ref().trim_start_matches('/');
+                let relative_path = location
+                    .strip_prefix(index_dir)
+                    .map(|s| s.trim_start_matches('/').to_string())
+                    .unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string());
+                index_files.push(relative_path);
             }
             Err(err) => return Err(err),
         }
     }
 
+    Ok(index_files)
+}
+
+/// Selects staged partition metadata: `{STAGED_PARTITION_DIR}/part_*_metadata.lance`.
+fn metadata_files(index_files: &[String]) -> Vec<String> {
+    // Build the prefix once rather than allocating it again for every entry.
+    let staged_prefix = format!("{}/part_", STAGED_PARTITION_DIR);
+    index_files
+        .iter()
+        .filter(|name| name.starts_with(&staged_prefix) && name.ends_with("_metadata.lance"))
+        .cloned()
+        .collect()
+}
+
+#[cfg(test)]
+async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result<Vec<String>> {
+    let index_files = list_index_files(object_store, index_dir).await?;
+    let part_metadata_files = metadata_files(&index_files);
     if part_metadata_files.is_empty() {
         return Err(Error::invalid_input_source(
             format!(
@@ -1914,89 +2027,35 @@ async fn merge_metadata_files(
     progress.stage_complete("read_partition_metadata").await?;
 
     // Create ID mapping: sorted original IDs -> 0,1,2...
-    let mut sorted_ids = all_partitions.clone();
+    let mut sorted_ids = all_partitions;
     sorted_ids.sort();
     sorted_ids.dedup();
 
-    let id_mapping: HashMap<u64, u64> = sorted_ids
+    let id_mapping: Vec<(u64, u64)> = sorted_ids
         .iter()
         .enumerate()
         .map(|(new_id, &old_id)| (old_id, new_id as u64))
         .collect();
 
-    // Safe rename partition files using temporary files to avoid overwrite
-    let timestamp = std::time::SystemTime::now()
-        .duration_since(std::time::UNIX_EPOCH)
-        .unwrap()
-        .as_secs();
-
-    let changed_partition_count = id_mapping
-        .iter()
-        .filter(|(old_id, new_id)| old_id != new_id)
-        .count() as u64;
-    let total_renames = changed_partition_count
-        * PARTITION_FILE_SUFFIXES.len() as u64
-        * PARTITION_FILE_RENAME_PHASES;
+    let total_copies = id_mapping.len() as u64 * PARTITION_FILE_SUFFIXES.len() as u64;
     progress
-        .stage_start("remap_partition_files", Some(total_renames), "files")
+        .stage_start("remap_partition_files", Some(total_copies), "files")
         .await?;
 
-    // Phase 1: Move files to temporary locations
-    let mut temp_files: Vec<(String, String, String)> = Vec::new(); // (temp_path, old_path, final_path)
-    let mut renamed_files = 0u64;
+    let mut copied_files = 0u64;
 
-    for (&old_id, &new_id) in &id_mapping {
-        if old_id != new_id {
-            for suffix in PARTITION_FILE_SUFFIXES {
-                let old_path = format!("part_{}_{}", old_id, suffix);
-                let new_path = format!("part_{}_{}", new_id, suffix);
-                let temp_path = format!("temp_{}_{}", timestamp, old_path);
-
-                // Move to temporary location first to avoid overwrite
-                if let Err(e) = store.rename_index_file(&old_path, &temp_path).await {
-                    // Rollback phase 1: restore files from temp locations
-                    for (temp_name, old_name, _) in temp_files.iter().rev() {
-                        let _ = store.rename_index_file(temp_name, old_name).await;
-                    }
-                    return Err(Error::index(format!(
-                        "Failed to move {} to temp {}: {}",
-                        old_path, temp_path, e
-                    )));
-                }
-                temp_files.push((temp_path, old_path, new_path));
-                renamed_files += 1;
-                progress
-                    .stage_progress("remap_partition_files", renamed_files)
-                    .await?;
-            }
-        }
-    }
-
-    // Phase 2: Move from temporary to final locations
-    let mut completed_renames: Vec<(String, String)> = Vec::new(); // (final_path, temp_path)
-
-    for (temp_path, _old_path, final_path) in &temp_files {
-        if let Err(e) = store.rename_index_file(temp_path, final_path).await {
-            // Rollback phase 2: restore completed renames and remaining temps
-            for (final_name, temp_name) in completed_renames.iter().rev() {
-                let _ = store.rename_index_file(final_name, temp_name).await;
-            }
-            // Restore remaining temp files to original locations
-            for (temp_name, orig_name, _) in temp_files.iter() {
-                if !completed_renames.iter().any(|(_, t)| t == temp_name) {
-                    let _ = store.rename_index_file(temp_name, orig_name).await;
-                }
-            }
-            return Err(Error::index(format!(
-                "Failed to rename {} to {}: {}",
-                temp_path, final_path, e
-            )));
+    for &(old_id, new_id) in &id_mapping {
+        for suffix in PARTITION_FILE_SUFFIXES {
+            let staged_path = staged_partition_file_path(old_id, suffix);
+            let final_path = partition_file_path(new_id, suffix);
+            store
+                .copy_index_file_to(&staged_path, &final_path, store.as_ref())
+                .await?;
+            copied_files += 1;
+            progress
+                .stage_progress("remap_partition_files", copied_files)
+                .await?;
         }
-        completed_renames.push((final_path.clone(), temp_path.clone()));
-        renamed_files += 1;
-        progress
-            .stage_progress("remap_partition_files", renamed_files)
-            .await?;
     }
     progress.stage_complete("remap_partition_files").await?;
 
@@ -2023,10 +2082,15 @@ async fn merge_metadata_files(
     progress.stage_progress("write_merged_metadata", 1).await?;
     progress.stage_complete("write_merged_metadata").await?;
 
-    // Cleanup partition metadata files
+    // Cleanup staged partition metadata files
     for file_name in part_metadata_files {
-        if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") {
-            let _ = store.delete_index_file(file_name).await;
+        let _ = store.delete_index_file(file_name).await;
+    }
+    for &(old_id, _) in &id_mapping {
+        for suffix in PARTITION_FILE_SUFFIXES {
+            let _ = store
+                .delete_index_file(&staged_partition_file_path(old_id, suffix))
+                .await;
         }
     }
 
@@ -2246,6 +2310,234 @@ mod tests {
         }
     }
 
+    /// Test store that forwards to `inner` but fails every `rename_index_file` call.
+    #[derive(Debug, Clone)]
+    struct NoRenameStore {
+        inner: Arc<dyn IndexStore>,
+        final_delete_count: Option<Arc<AtomicUsize>>,
+    }
+
+    impl NoRenameStore {
+        fn new(inner: Arc<dyn IndexStore>) -> Self {
+            Self {
+                inner,
+                final_delete_count: None,
+            }
+        }
+
+        fn with_final_delete_tracking(inner: Arc<dyn IndexStore>) -> Self {
+            Self {
+                final_delete_count: Some(Arc::new(AtomicUsize::new(0))),
+                ..Self::new(inner)
+            }
+        }
+
+        fn final_delete_count(&self) -> usize {
+            self.final_delete_count
+                .as_ref()
+                .map_or(0, |counter| counter.load(Ordering::SeqCst))
+        }
+
+        /// Strip one `NoRenameStore` layer so `inner` is handed the store it wraps.
+        fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore {
+            match dest_store.as_any().downcast_ref::<Self>() {
+                Some(wrapper) => wrapper.inner.as_ref(),
+                None => dest_store,
+            }
+        }
+    }
+
+    impl DeepSizeOf for NoRenameStore {
+        fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize {
+            self.inner.deep_size_of_children(context) // `final_delete_count` is not measured
+        }
+    }
+
+    #[async_trait]
+    impl IndexStore for NoRenameStore {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn clone_arc(&self) -> Arc<dyn IndexStore> {
+            Arc::new(self.clone())
+        }
+
+        fn io_parallelism(&self) -> usize {
+            self.inner.io_parallelism()
+        }
+
+        async fn new_index_file(
+            &self,
+            name: &str,
+            schema: Arc<Schema>,
+        ) -> Result<Box<dyn IndexWriter>> {
+            self.inner.new_index_file(name, schema).await
+        }
+
+        async fn open_index_file(&self, name: &str) -> Result<Arc<dyn IndexReader>> {
+            self.inner.open_index_file(name).await
+        }
+
+        async fn copy_index_file(
+            &self,
+            name: &str,
+            dest_store: &dyn IndexStore,
+        ) -> Result<IndexFile> {
+            // Unwrap a `NoRenameStore` destination before delegating to `inner`.
+            let dest = Self::unwrap_dest_store(dest_store);
+            self.inner.copy_index_file(name, dest).await
+        }
+
+        async fn copy_index_file_to(
+            &self,
+            name: &str,
+            new_name: &str,
+            dest_store: &dyn IndexStore,
+        ) -> Result<IndexFile> {
+            // Same destination unwrapping as in `copy_index_file`.
+            let dest = Self::unwrap_dest_store(dest_store);
+            self.inner.copy_index_file_to(name, new_name, dest).await
+        }
+
+        async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<IndexFile> {
+            Err(Error::internal(format!(
+                "merge_index_files should not rename partition file {name} to {new_name}"
+            )))
+        }
+
+        async fn delete_index_file(&self, name: &str) -> Result<()> {
+            // Count only `part_`-prefixed names, and only when tracking is enabled.
+            let tracker = self.final_delete_count.as_ref();
+            if let Some(counter) = tracker.filter(|_| name.starts_with("part_")) {
+                counter.fetch_add(1, Ordering::SeqCst);
+            }
+            self.inner.delete_index_file(name).await
+        }
+
+        async fn list_files_with_sizes(&self) -> Result<Vec<IndexFile>> {
+            self.inner.list_files_with_sizes().await
+        }
+    }
+
+    /// Test store that forwards to `inner` but makes the `METADATA_FILE` write fail.
+    #[derive(Debug)]
+    struct FailMetadataStore {
+        inner: Arc<dyn IndexStore>,
+    }
+
+    impl FailMetadataStore {
+        fn new(inner: Arc<dyn IndexStore>) -> Self {
+            Self { inner }
+        }
+
+        fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore {
+            match dest_store.as_any().downcast_ref::<Self>() {
+                Some(wrapper) => wrapper.inner.as_ref(),
+                None => dest_store,
+            }
+        }
+    }
+
+    impl DeepSizeOf for FailMetadataStore {
+        fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize {
+            self.inner.deep_size_of_children(context) // size is delegated to the wrapped store
+        }
+    }
+
+    #[async_trait]
+    impl IndexStore for FailMetadataStore {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn clone_arc(&self) -> Arc<dyn IndexStore> {
+            // This type has no `Clone` derive, so rebuild it around the shared `inner`.
+            let duplicate = Self::new(Arc::clone(&self.inner));
+            Arc::new(duplicate)
+        }
+
+        fn io_parallelism(&self) -> usize {
+            self.inner.io_parallelism()
+        }
+
+        async fn new_index_file(
+            &self,
+            name: &str,
+            schema: Arc<Schema>,
+        ) -> Result<Box<dyn IndexWriter>> {
+            let writer = self.inner.new_index_file(name, schema).await?;
+            // Every other file keeps its real writer; only `METADATA_FILE` is sabotaged.
+            if name != METADATA_FILE {
+                return Ok(writer);
+            }
+            Ok(Box::new(FailFinishWriter { inner: writer }))
+        }
+
+        async fn open_index_file(&self, name: &str) -> Result<Arc<dyn IndexReader>> {
+            self.inner.open_index_file(name).await
+        }
+
+        async fn copy_index_file(
+            &self,
+            name: &str,
+            dest_store: &dyn IndexStore,
+        ) -> Result<IndexFile> {
+            // Unwrap a `FailMetadataStore` destination before delegating to `inner`.
+            let dest = Self::unwrap_dest_store(dest_store);
+            self.inner.copy_index_file(name, dest).await
+        }
+
+        async fn copy_index_file_to(
+            &self,
+            name: &str,
+            new_name: &str,
+            dest_store: &dyn IndexStore,
+        ) -> Result<IndexFile> {
+            // Same destination unwrapping as in `copy_index_file`.
+            let dest = Self::unwrap_dest_store(dest_store);
+            self.inner.copy_index_file_to(name, new_name, dest).await
+        }
+
+        async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<IndexFile> {
+            self.inner.rename_index_file(name, new_name).await
+        }
+
+        async fn delete_index_file(&self, name: &str) -> Result<()> {
+            self.inner.delete_index_file(name).await
+        }
+
+        async fn list_files_with_sizes(&self) -> Result<Vec<IndexFile>> {
+            self.inner.list_files_with_sizes().await
+        }
+    }
+
+    struct FailFinishWriter {
+        inner: Box<dyn IndexWriter>, // receives the batch and global-buffer writes
+    }
+
+    #[async_trait]
+    impl IndexWriter for FailFinishWriter {
+        async fn write_record_batch(&mut self, batch: RecordBatch) -> Result<u64> {
+            self.inner.write_record_batch(batch).await
+        }
+
+        async fn add_global_buffer(&mut self, data: Bytes) -> Result<u32> {
+            self.inner.add_global_buffer(data).await
+        }
+
+        async fn finish(&mut self) -> Result<IndexFile> {
+            Err(Error::internal("injected metadata write failure")) // `inner` is never finished
+        }
+
+        async fn finish_with_metadata(
+            &mut self,
+            _metadata: HashMap<String, String>,
+        ) -> Result<IndexFile> {
+            Err(Error::internal("injected metadata write failure")) // same injected failure
+        }
+    }
+
     #[derive(Debug)]
     struct CountingWriter {
         path: String,
@@ -2412,12 +2704,446 @@ mod tests {
 
         let store = CountingStore::new();
         let docs = Arc::new(std::mem::take(&mut builder.docs));
-        builder.write_posting_lists(&store, docs).await?;
+        builder
+            .write_posting_lists(&store, docs, &posting_file_path(0))
+            .await?;
 
         assert_eq!(store.write_count(), 1);
         Ok(())
     }
 
+    /// Write a one-row index file at `path` to serve as a partition marker.
+    ///
+    /// The file has a single non-nullable `UInt64` column named `partition_id`
+    /// whose only value is `partition_id`.
+    async fn write_partition_file_marker(
+        store: &dyn IndexStore,
+        path: &str,
+        partition_id: u64,
+    ) -> Result<()> {
+        let marker_field = Field::new("partition_id", DataType::UInt64, false);
+        let schema = Arc::new(Schema::new(vec![marker_field]));
+        let markers = UInt64Array::from(vec![partition_id]);
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(markers)])?;
+        let mut writer = store.new_index_file(path, schema).await?;
+        writer.write_record_batch(batch).await?;
+        // The `IndexFile` returned by `finish` is not needed here.
+        writer.finish().await?;
+        Ok(())
+    }
+
+    /// Write marker files at the token, posting and doc paths of `partition_id`.
+    async fn write_partition_files(
+        store: &dyn IndexStore,
+        partition_id: u64,
+        target: PartitionWriteTarget,
+    ) -> Result<()> {
+        let id = partition_id;
+        write_partition_file_marker(store, &target.token_path(id), id).await?;
+        write_partition_file_marker(store, &target.posting_path(id), id).await?;
+        write_partition_file_marker(store, &target.doc_path(id), id).await
+    }
+
+    async fn read_partition_file_marker(store: &dyn IndexStore, path: &str) -> Result<u64> {
+        let file = store.open_index_file(path).await?;
+        let first_row = file.read_range(0..1, None).await?;
+        let markers = first_row.column(0).as_primitive::<datatypes::UInt64Type>();
+        Ok(markers.value(0)) // the marker is the value at row 0 of column 0
+    }
+
+    /// Assert that the files at `token_file_path`, `posting_file_path` and
+    /// `doc_file_path` for `partition_id` each hold `expected_marker`, checking
+    /// them in that order.
+    async fn assert_partition_file_markers(
+        store: &dyn IndexStore,
+        partition_id: u64,
+        expected_marker: u64,
+    ) -> Result<()> {
+        let marker_paths = [
+            token_file_path(partition_id),
+            posting_file_path(partition_id),
+            doc_file_path(partition_id),
+        ];
+        for path in &marker_paths {
+            let marker = read_partition_file_marker(store, path).await?;
+            assert_eq!(marker, expected_marker);
+        }
+        Ok(())
+    }
+
+    /// Merging staged partitions with sparse, unsorted ids must copy them to
+    /// dense final ids without any rename, then remove the partition metadata
+    /// and staged files.
+    #[tokio::test]
+    async fn test_merge_index_files_remaps_staged_partitions_without_rename() -> Result<()> {
+        let index_dir = TempDir::default();
+        let object_store = Arc::new(ObjectStore::local());
+        let base_store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            index_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let store = Arc::new(NoRenameStore::new(base_store.clone()));
+        // Unsorted on purpose; the last id also has bits set above the low 32.
+        let partitions = vec![5_u64, 1_u64, (17_u64 << 32) | 2];
+        let metadata_builder = InvertedIndexBuilder::from_existing_index(
+            InvertedIndexParams::default(),
+            None,
+            Vec::new(),
+            TokenSetFormat::default(),
+            None,
+            RoaringBitmap::new(),
+        );
+
+        // Stage each partition: its data files plus its partition metadata.
+        for &id in &partitions {
+            write_partition_files(base_store.as_ref(), id, PartitionWriteTarget::Staged).await?;
+            metadata_builder
+                .write_part_metadata(base_store.as_ref(), id)
+                .await?;
+        }
+
+        // `store` fails every rename, so a successful merge used copies only.
+        merge_index_files(
+            object_store.as_ref(),
+            &index_dir.obj_path(),
+            store,
+            noop_progress(),
+        )
+        .await?;
+
+        let merged_reader = base_store.open_index_file(METADATA_FILE).await?;
+        let merged_metadata = &merged_reader.schema().metadata;
+        let written_partitions: Vec<u64> = serde_json::from_str(
+            merged_metadata
+                .get("partitions")
+                .expect("partitions missing from metadata"),
+        )?;
+        // The merged metadata must list dense ids `0..n`.
+        let mut old_ids = partitions.clone();
+        old_ids.sort_unstable();
+        old_ids.dedup();
+        let dense_ids: Vec<u64> = (0..old_ids.len() as u64).collect();
+        assert_eq!(written_partitions, dense_ids);
+
+        // Final id `i` holds the marker of the i-th smallest old id, and nothing
+        // is left behind for that old id.
+        for (new_id, old_id) in (0_u64..).zip(old_ids.iter().copied()) {
+            assert_partition_file_markers(base_store.as_ref(), new_id, old_id).await?;
+            let part_metadata = part_metadata_file_path(old_id);
+            assert!(
+                base_store.open_index_file(&part_metadata).await.is_err(),
+                "partition metadata should be cleaned up after final metadata is written"
+            );
+            for suffix in PARTITION_FILE_SUFFIXES {
+                let staged_path = staged_partition_file_path(old_id, suffix);
+                assert!(
+                    base_store.open_index_file(&staged_path).await.is_err(),
+                    "staged partition files should be cleaned up after final metadata is written"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Final partition files that already exist must be overwritten from
+    /// staging, without any `part_`-prefixed delete being issued by the merge.
+    #[tokio::test]
+    async fn test_merge_index_files_rewrites_partial_final_files_from_staging() -> Result<()> {
+        let index_dir = TempDir::default();
+        let object_store = Arc::new(ObjectStore::local());
+        let base_store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            index_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let tracking = NoRenameStore::with_final_delete_tracking(base_store.clone());
+        let store = Arc::new(tracking);
+        let partitions = vec![1_u64, 5_u64];
+        let metadata_builder = InvertedIndexBuilder::from_existing_index(
+            InvertedIndexParams::default(),
+            None,
+            Vec::new(),
+            TokenSetFormat::default(),
+            None,
+            RoaringBitmap::new(),
+        );
+
+        for &id in &partitions {
+            write_partition_files(base_store.as_ref(), id, PartitionWriteTarget::Staged).await?;
+            metadata_builder
+                .write_part_metadata(base_store.as_ref(), id)
+                .await?;
+        }
+
+        // Pre-populate the final files of id 1 with a bogus marker (999) that the
+        // merge has to replace.
+        for suffix in PARTITION_FILE_SUFFIXES {
+            let stale_final = partition_file_path(1, suffix);
+            write_partition_file_marker(base_store.as_ref(), &stale_final, 999).await?;
+        }
+
+        merge_index_files(
+            object_store.as_ref(),
+            &index_dir.obj_path(),
+            store.clone(),
+            noop_progress(),
+        )
+        .await?;
+
+        // Staged ids 1 and 5 land on final ids 0 and 1, so final id 1 must now
+        // hold marker 5 instead of 999.
+        assert_partition_file_markers(base_store.as_ref(), 0, 1).await?;
+        assert_partition_file_markers(base_store.as_ref(), 1, 5).await?;
+        assert_eq!(
+            store.final_delete_count(),
+            0,
+            "merge should overwrite final partition files without deleting them first"
+        );
+
+        Ok(())
+    }
+
+    /// Building from an existing index with a fragment mask must copy the
+    /// existing partitions into staging on the destination (not to its root) and
+    /// leave the source untouched; a later merge then finalizes them.
+    #[tokio::test]
+    async fn test_distributed_from_existing_copies_existing_partitions_to_staging_and_finalizes()
+    -> Result<()> {
+        let object_store = Arc::new(ObjectStore::local());
+        let source_dir = TempDir::default();
+        let dest_dir = TempDir::default();
+        let source_store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            source_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let dest_store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            dest_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let merge_store = Arc::new(NoRenameStore::new(dest_store.clone()));
+        // Both ids carry the same fragment mask in their high 32 bits.
+        let fragment_mask = 7_u64 << 32;
+        let partitions = vec![fragment_mask | 5, fragment_mask | 1];
+
+        // The source index holds its partitions at the final (non-staged) paths.
+        for &id in &partitions {
+            write_partition_files(source_store.as_ref(), id, PartitionWriteTarget::Final).await?;
+        }
+
+        // Write to a separate destination store, passing the fragment mask and
+        // the source's partition list to the builder.
+        let builder = InvertedIndexBuilder::from_existing_index(
+            InvertedIndexParams::default(),
+            Some(source_store.clone()),
+            partitions.clone(),
+            TokenSetFormat::default(),
+            Some(fragment_mask),
+            RoaringBitmap::new(),
+        );
+        builder.write(dest_store.as_ref()).await?;
+
+        // Source files keep their markers; the destination has staged copies and
+        // partition metadata, but nothing at the root partition paths.
+        for &id in &partitions {
+            assert_partition_file_markers(source_store.as_ref(), id, id).await?;
+            for suffix in PARTITION_FILE_SUFFIXES {
+                let staged_path = staged_partition_file_path(id, suffix);
+                let marker = read_partition_file_marker(dest_store.as_ref(), &staged_path).await?;
+                assert_eq!(marker, id);
+                let root_path = partition_file_path(id, suffix);
+                assert!(
+                    dest_store.open_index_file(&root_path).await.is_err(),
+                    "distributed existing partition should be staged instead of copied to root"
+                );
+            }
+            let part_metadata = part_metadata_file_path(id);
+            dest_store.open_index_file(&part_metadata).await?;
+        }
+
+        // `merge_store` fails every rename, so finalizing must rely on copies
+        // from staging.
+        merge_index_files(
+            object_store.as_ref(),
+            &dest_dir.obj_path(),
+            merge_store,
+            noop_progress(),
+        )
+        .await?;
+
+        // Final ids follow the ascending order of the old ids, and staging is
+        // emptied for each of them.
+        let mut old_ids = partitions.clone();
+        old_ids.sort_unstable();
+        for (new_id, old_id) in (0_u64..).zip(old_ids.iter().copied()) {
+            assert_partition_file_markers(dest_store.as_ref(), new_id, old_id).await?;
+            for suffix in PARTITION_FILE_SUFFIXES {
+                let staged_path = staged_partition_file_path(old_id, suffix);
+                assert!(
+                    dest_store.open_index_file(&staged_path).await.is_err(),
+                    "staged partition files should be cleaned after final metadata is written"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// If writing the merged metadata fails, the merge must return that error
+    /// and leave both the partition metadata and the staged partition files
+    /// readable.
+    #[tokio::test]
+    async fn test_merge_index_files_keeps_staging_when_final_metadata_write_fails() -> Result<()> {
+        let index_dir = TempDir::default();
+        let object_store = Arc::new(ObjectStore::local());
+        let base_store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            index_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let failing_store = Arc::new(FailMetadataStore::new(base_store.clone()));
+        let partitions = vec![1_u64, 5_u64];
+        let metadata_builder = InvertedIndexBuilder::from_existing_index(
+            InvertedIndexParams::default(),
+            None,
+            Vec::new(),
+            TokenSetFormat::default(),
+            None,
+            RoaringBitmap::new(),
+        );
+
+        // Stage each partition: its data files plus its partition metadata.
+        for &id in &partitions {
+            write_partition_files(base_store.as_ref(), id, PartitionWriteTarget::Staged).await?;
+            metadata_builder
+                .write_part_metadata(base_store.as_ref(), id)
+                .await?;
+        }
+
+        // `failing_store` hands out a writer for `METADATA_FILE` whose finish
+        // calls always fail.
+        let err = merge_index_files(
+            object_store.as_ref(),
+            &index_dir.obj_path(),
+            failing_store,
+            noop_progress(),
+        )
+        .await
+        .unwrap_err();
+        assert!(
+            err.to_string().contains("metadata write failure"),
+            "expected injected metadata failure, got: {err}"
+        );
+
+        // Nothing staged may have been cleaned up: metadata still opens and each
+        // staged file still holds its own partition id.
+        for &id in &partitions {
+            let part_metadata = part_metadata_file_path(id);
+            base_store.open_index_file(&part_metadata).await?;
+            for suffix in PARTITION_FILE_SUFFIXES {
+                let staged_path = staged_partition_file_path(id, suffix);
+                let marker = read_partition_file_marker(base_store.as_ref(), &staged_path).await?;
+                assert_eq!(marker, id);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// A build with a fragment mask must write partition metadata and partition
+    /// data under staging, leaving the root partition paths unwritten.
+    #[tokio::test]
+    async fn test_distributed_build_writes_partition_data_to_staging() -> Result<()> {
+        let index_dir = TempDir::default();
+        let object_store = ObjectStore::local();
+        let store = Arc::new(LanceIndexStore::new(
+            object_store.into(),
+            index_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+
+        let fragment_mask = 7_u64 << 32;
+        let batch = make_doc_batch("hello world", fragment_mask);
+        let adapter = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)]));
+        let stream = Box::pin(adapter);
+        let mut builder = InvertedIndexBuilder::new_with_fragment_mask(
+            InvertedIndexParams::default(),
+            Some(fragment_mask),
+        );
+        builder.update(stream, store.as_ref(), None).await?;
+
+        // Exactly one partition metadata file is expected, and it is staged.
+        let part_metadata_files =
+            list_metadata_files(&ObjectStore::local(), &index_dir.obj_path()).await?;
+        assert_eq!(part_metadata_files.len(), 1);
+        assert!(
+            part_metadata_files[0].starts_with("staging/part_"),
+            "partition metadata should be written to staging"
+        );
+        // Recover the generated partition id from that file's schema metadata.
+        let reader = store.open_index_file(&part_metadata_files[0]).await?;
+        let partition_ids: Vec<u64> = serde_json::from_str(
+            reader
+                .schema()
+                .metadata
+                .get("partitions")
+                .expect("partitions missing from metadata"),
+        )?;
+        assert_eq!(partition_ids.len(), 1);
+        let partition_id = partition_ids[0];
+
+        // Data is staged; neither root path for this partition may exist.
+        let staged_tokens = staged_partition_file_path(partition_id, TOKENS_FILE);
+        store.open_index_file(&staged_tokens).await?;
+        let root_metadata = partition_file_path(partition_id, METADATA_FILE);
+        assert!(
+            store.open_index_file(&root_metadata).await.is_err(),
+            "distributed build-only metadata should not be written to root partition metadata paths"
+        );
+        let root_tokens = token_file_path(partition_id);
+        assert!(
+            store.open_index_file(&root_tokens).await.is_err(),
+            "distributed build-only data should not be written to final partition paths"
+        );
+
+        Ok(())
+    }
+
+    /// With merged metadata already written by `write_metadata` and no partition
+    /// metadata present, `merge_index_files` must still return `Ok`.
+    #[tokio::test]
+    async fn test_merge_index_files_is_noop_when_metadata_exists() -> Result<()> {
+        let index_dir = TempDir::default();
+        let object_store = Arc::new(ObjectStore::local());
+        let store: Arc<dyn IndexStore> = Arc::new(LanceIndexStore::new(
+            object_store.clone(),
+            index_dir.obj_path(),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let builder = InvertedIndexBuilder::from_existing_index(
+            InvertedIndexParams::default(),
+            None,
+            vec![42],
+            TokenSetFormat::default(),
+            None,
+            RoaringBitmap::new(),
+        );
+        builder.write_metadata(store.as_ref(), &[42]).await?;
+
+        merge_index_files(
+            object_store.as_ref(),
+            &index_dir.obj_path(),
+            store,
+            noop_progress(),
+        )
+        .await?;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_build_only_path_writes_partitions_as_is() -> Result<()> {
         let src_dir = TempDir::default();
@@ -2856,7 +3582,6 @@ mod tests {
                 }
             })
             .collect::<Vec<_>>();
-
         let read_start = tags
             .iter()
             .position(|e| e == "start:read_partition_metadata")
@@ -2894,8 +3619,8 @@ mod tests {
         );
         assert_eq!(
             remap_progress.last().copied().unwrap_or_default(),
-            12,
-            "expected remap_partition_files progress to cover both rename phases"
+            6,
+            "expected remap_partition_files progress to cover staged-to-final copies"
         );
         assert!(
             tags.iter().any(|e| e == "progress:write_merged_metadata"),
diff --git a/rust/lance-index/src/scalar/inverted/cache_codec.rs b/rust/lance-index/src/scalar/inverted/cache_codec.rs
index 74cfc98ef7b..a676455d5c9 100644
--- a/rust/lance-index/src/scalar/inverted/cache_codec.rs
+++ b/rust/lance-index/src/scalar/inverted/cache_codec.rs
@@ -4,16 +4,24 @@
 //! Cache codec impls for FTS index entries.
 //!
 //! Serializes [`PostingList`] and [`Positions`] cache values for persistent
-//! cache backends. The format is a small variant tag plus a JSON header for
-//! scalar metadata, with Arrow-backed payload sections written as zero-copy
-//! Arrow IPC streams via [`lance_arrow::ipc`]. The raw byte buffer inside
-//! [`SharedPositionStream`] is written via [`write_len_prefixed_bytes`] and
-//! read back via [`read_len_prefixed_bytes_at`] -- both zero-copy slices into
-//! the input `Bytes` allocation.
+//! cache backends, behind the stabilized envelope written by
+//! [`CacheCodec`](lance_core::cache::CacheCodec).
 //!
-//! This is the FTS counterpart of `partition_serde.rs` for vector indices.
+//! Every variant uses a protobuf header (see `protos-cache/cache.proto`, with the
+//! tail/position codecs and position-storage kind as proto enums) followed by
+//! 64-byte-aligned Arrow IPC sections and, where applicable, raw blobs:
+//!
+//! - the compressed posting list: an IPC section for `blocks`, then the
+//!   position sections (legacy IPC, or shared block-offsets IPC + a raw blob of
+//!   the [`SharedPositionStream`] byte buffer, which has its own portable
+//!   encoding);
+//! - the plain posting list: an IPC section of `(row_ids, frequencies)`, then
+//!   an optional legacy position IPC section;
+//! - the standalone [`Positions`] codec: the position sections alone.
+//!
+//! All sections read back zero-copy via [`lance_arrow::ipc`]. This is the FTS
+//! counterpart of `partition_serde.rs` for vector indices.
 
-use std::io::Write;
 use std::sync::Arc;
 
 use arrow_array::cast::AsArray;
@@ -22,14 +30,14 @@ use arrow_array::{
     Array, Float32Array, LargeBinaryArray, ListArray, RecordBatch, UInt32Array, UInt64Array,
 };
 use arrow_schema::{DataType, Field, Schema};
-use bytes::Bytes;
-use lance_arrow::ipc::{
-    read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream,
-    write_len_prefixed_bytes,
-};
-use lance_core::cache::CacheCodecImpl;
+use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter};
 use lance_core::{Error, Result};
-use serde::{Deserialize, Serialize};
+
+use crate::cache_pb::{
+    CompressedPostingHeader, PlainPostingHeader, PositionStorage as PbPositionStorage,
+    PositionStreamCodec as PbPositionStreamCodec, PositionsHeader, PostingListGroupHeader,
+    PostingTailCodec as PbPostingTailCodec,
+};
 
 use super::index::{
     CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec,
@@ -43,86 +51,43 @@ use super::index::{
 const POSTING_VARIANT_PLAIN: u8 = 0;
 const POSTING_VARIANT_COMPRESSED: u8 = 1;
 
-const POSITIONS_TAG_NONE: u8 = 0;
-const POSITIONS_TAG_LEGACY: u8 = 1;
-const POSITIONS_TAG_SHARED: u8 = 2;
-
-const POSTING_TAIL_CODEC_FIXED32: u8 = 0;
-const POSTING_TAIL_CODEC_VARINT_DELTA: u8 = 1;
-
-const POSITION_STREAM_CODEC_VARINT_DOC_DELTA: u8 = 0;
-const POSITION_STREAM_CODEC_PACKED_DELTA: u8 = 1;
-
 // ---------------------------------------------------------------------------
-// Codec enum byte mappings
+// Codec enum mappings
 // ---------------------------------------------------------------------------
 
-fn posting_tail_codec_to_u8(c: PostingTailCodec) -> u8 {
-    match c {
-        PostingTailCodec::Fixed32 => POSTING_TAIL_CODEC_FIXED32,
-        PostingTailCodec::VarintDelta => POSTING_TAIL_CODEC_VARINT_DELTA,
-    }
-}
+// Posting lists carry their discriminants as protobuf enums in the header;
+// these map to/from the in-memory Rust enums.
 
-fn u8_to_posting_tail_codec(v: u8) -> Result<PostingTailCodec> {
-    match v {
-        POSTING_TAIL_CODEC_FIXED32 => Ok(PostingTailCodec::Fixed32),
-        POSTING_TAIL_CODEC_VARINT_DELTA => Ok(PostingTailCodec::VarintDelta),
-        _ => Err(Error::io(format!("unknown posting tail codec: {v}"))),
+fn posting_tail_codec_to_proto(c: PostingTailCodec) -> PbPostingTailCodec {
+    match c {
+        PostingTailCodec::Fixed32 => PbPostingTailCodec::Fixed32,
+        PostingTailCodec::VarintDelta => PbPostingTailCodec::VarintDelta,
     }
 }
 
-fn position_stream_codec_to_u8(c: PositionStreamCodec) -> u8 {
+fn proto_to_posting_tail_codec(c: PbPostingTailCodec) -> PostingTailCodec {
     match c {
-        PositionStreamCodec::VarintDocDelta => POSITION_STREAM_CODEC_VARINT_DOC_DELTA,
-        PositionStreamCodec::PackedDelta => POSITION_STREAM_CODEC_PACKED_DELTA,
+        PbPostingTailCodec::Fixed32 => PostingTailCodec::Fixed32,
+        PbPostingTailCodec::VarintDelta => PostingTailCodec::VarintDelta,
     }
 }
 
-fn u8_to_position_stream_codec(v: u8) -> Result<PositionStreamCodec> {
-    match v {
-        POSITION_STREAM_CODEC_VARINT_DOC_DELTA => Ok(PositionStreamCodec::VarintDocDelta),
-        POSITION_STREAM_CODEC_PACKED_DELTA => Ok(PositionStreamCodec::PackedDelta),
-        _ => Err(Error::io(format!("unknown position stream codec: {v}"))),
+fn position_stream_codec_to_proto(c: PositionStreamCodec) -> PbPositionStreamCodec {
+    match c {
+        PositionStreamCodec::VarintDocDelta => PbPositionStreamCodec::VarintDocDelta,
+        PositionStreamCodec::PackedDelta => PbPositionStreamCodec::PackedDelta,
     }
 }
 
-// ---------------------------------------------------------------------------
-// Header / tag I/O helpers (mirrors partition_serde.rs)
-// ---------------------------------------------------------------------------
-
-fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> {
-    let bytes = serde_json::to_vec(header)?;
-    write_len_prefixed_bytes(writer, &bytes)?;
-    Ok(())
-}
-
-fn read_json_header<T: serde::de::DeserializeOwned>(data: &Bytes, offset: &mut usize) -> Result<T> {
-    let bytes = read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
-    serde_json::from_slice(&bytes)
-        .map_err(|e| Error::io(format!("failed to deserialize cache header: {e}")))
-}
-
-fn write_u8(writer: &mut dyn Write, value: u8) -> Result<()> {
-    writer
-        .write_all(&[value])
-        .map_err(|e| Error::io(format!("failed to write tag byte: {e}")))
-}
-
-fn read_u8(data: &Bytes, offset: &mut usize) -> Result<u8> {
-    let bytes = data.as_ref();
-    if *offset >= bytes.len() {
-        return Err(Error::io(
-            "truncated cache entry: missing tag byte".to_string(),
-        ));
+fn proto_to_position_stream_codec(c: PbPositionStreamCodec) -> PositionStreamCodec {
+    match c {
+        PbPositionStreamCodec::VarintDocDelta => PositionStreamCodec::VarintDocDelta,
+        PbPositionStreamCodec::PackedDelta => PositionStreamCodec::PackedDelta,
     }
-    let v = bytes[*offset];
-    *offset += 1;
-    Ok(v)
 }
 
 // ---------------------------------------------------------------------------
-// Position storage serde (shared by PostingList variants and Positions)
+// Position storage sections (shared by PostingList variants and Positions)
 // ---------------------------------------------------------------------------
 
 const POSITION_LIST_COLUMN: &str = "position_list";
@@ -131,33 +96,36 @@ const ROW_IDS_COLUMN: &str = "row_ids";
 const FREQUENCIES_COLUMN: &str = "frequencies";
 const BLOCKS_COLUMN: &str = "blocks";
 
-#[derive(Serialize, Deserialize)]
-struct SharedPositionsHeader {
-    codec: u8,
+fn legacy_positions_batch(list: &ListArray) -> Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        POSITION_LIST_COLUMN,
+        list.data_type().clone(),
+        list.is_nullable(),
+    )]));
+    Ok(RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?)
+}
+
+fn read_legacy_positions(r: &mut CacheEntryReader<'_>) -> Result<ListArray> {
+    let batch = r.read_ipc()?;
+    Ok(batch
+        .column(0)
+        .as_any()
+        .downcast_ref::<ListArray>()
+        .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))?
+        .clone())
 }
 
-fn write_position_storage(
-    writer: &mut dyn Write,
+/// Write the position sections (the bytes after the header) for `storage`. The
+/// caller's header proto carries the storage kind and shared-stream codec.
+fn write_position_sections(
+    w: &mut CacheEntryWriter<'_>,
     storage: &CompressedPositionStorage,
 ) -> Result<()> {
     match storage {
         CompressedPositionStorage::LegacyPerDoc(list) => {
-            write_u8(writer, POSITIONS_TAG_LEGACY)?;
-            let schema = Arc::new(Schema::new(vec![Field::new(
-                POSITION_LIST_COLUMN,
-                list.data_type().clone(),
-                list.is_nullable(),
-            )]));
-            let batch = RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?;
-            write_ipc_stream(&batch, writer)?;
+            w.write_ipc(&legacy_positions_batch(list)?)?;
         }
         CompressedPositionStorage::SharedStream(stream) => {
-            write_u8(writer, POSITIONS_TAG_SHARED)?;
-            let header = SharedPositionsHeader {
-                codec: position_stream_codec_to_u8(stream.codec()),
-            };
-            write_json_header(writer, &header)?;
-
             let offsets = UInt32Array::from(stream.block_offsets().to_vec());
             let schema = Arc::new(Schema::new(vec![Field::new(
                 BLOCK_OFFSETS_COLUMN,
@@ -165,55 +133,42 @@ fn write_position_storage(
                 false,
             )]));
             let batch = RecordBatch::try_new(schema, vec![Arc::new(offsets)])?;
-            write_ipc_stream(&batch, writer)?;
-
-            write_len_prefixed_bytes(writer, stream.bytes())?;
+            w.write_ipc(&batch)?;
+            w.write_raw(stream.bytes())?;
         }
     }
     Ok(())
 }
 
-fn read_position_storage(
-    data: &Bytes,
-    offset: &mut usize,
-    tag: u8,
-) -> Result<CompressedPositionStorage> {
-    match tag {
-        POSITIONS_TAG_LEGACY => {
-            let batch =
-                read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
-            let list = batch
-                .column(0)
-                .as_any()
-                .downcast_ref::<ListArray>()
-                .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))?
-                .clone();
-            Ok(CompressedPositionStorage::LegacyPerDoc(list))
-        }
-        POSITIONS_TAG_SHARED => {
-            let header: SharedPositionsHeader = read_json_header(data, offset)?;
-            let codec = u8_to_position_stream_codec(header.codec)?;
-
-            let batch =
-                read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
+/// Read the position sections for the given `storage` kind and (for shared
+/// streams) `stream_codec`. Returns `None` only when `storage` is
+/// [`PbPositionStorage::None`].
+fn read_position_sections(
+    r: &mut CacheEntryReader<'_>,
+    storage: PbPositionStorage,
+    stream_codec: PositionStreamCodec,
+) -> Result<Option<CompressedPositionStorage>> {
+    match storage {
+        PbPositionStorage::None => Ok(None),
+        PbPositionStorage::Legacy => Ok(Some(CompressedPositionStorage::LegacyPerDoc(
+            read_legacy_positions(r)?,
+        ))),
+        PbPositionStorage::Shared => {
+            let batch = r.read_ipc()?;
             let block_offsets = batch
                 .column(0)
                 .as_primitive_opt::<UInt32Type>()
                 .ok_or_else(|| Error::io("block_offsets column is not UInt32".to_string()))?
                 .values()
                 .to_vec();
-
-            // Zero copy: read_len_prefixed_bytes_at returns a Bytes slice
-            // backed by the same allocation as `data`, and SharedPositionStream
-            // now stores its byte buffer as Bytes -- no copy on read.
-            let bytes =
-                read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
-
-            Ok(CompressedPositionStorage::SharedStream(
-                SharedPositionStream::new(codec, block_offsets, bytes),
-            ))
+            // Zero copy: read_raw returns a Bytes slice backed by the same
+            // allocation as the input, and SharedPositionStream stores its byte
+            // buffer as Bytes -- no copy on read.
+            let bytes = r.read_raw()?;
+            Ok(Some(CompressedPositionStorage::SharedStream(
+                SharedPositionStream::new(stream_codec, block_offsets, bytes),
+            )))
         }
-        other => Err(Error::io(format!("unknown positions tag: {other}"))),
     }
 }
 
@@ -221,50 +176,45 @@ fn read_position_storage(
 // PostingList codec
 // ---------------------------------------------------------------------------
 
-#[derive(Serialize, Deserialize)]
-struct PlainPostingHeader {
-    max_score: Option<f32>,
-}
-
-#[derive(Serialize, Deserialize)]
-struct CompressedPostingHeader {
-    max_score: f32,
-    length: u32,
-    posting_tail_codec: u8,
-}
-
 impl CacheCodecImpl for PostingList {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
+    const TYPE_ID: &'static str = "lance.fts.PostingList";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         match self {
             Self::Plain(plain) => {
-                write_u8(writer, POSTING_VARIANT_PLAIN)?;
-                serialize_plain(writer, plain)
+                w.write_u8(POSTING_VARIANT_PLAIN)?;
+                serialize_plain(w, plain)
             }
             Self::Compressed(compressed) => {
-                write_u8(writer, POSTING_VARIANT_COMPRESSED)?;
-                serialize_compressed(writer, compressed)
+                w.write_u8(POSTING_VARIANT_COMPRESSED)?;
+                serialize_compressed(w, compressed)
             }
         }
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let variant = read_u8(data, &mut offset)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let variant = r.read_u8()?;
         match variant {
-            POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(data, &mut offset)?)),
-            POSTING_VARIANT_COMPRESSED => {
-                Ok(Self::Compressed(deserialize_compressed(data, &mut offset)?))
-            }
+            POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(r)?)),
+            POSTING_VARIANT_COMPRESSED => Ok(Self::Compressed(deserialize_compressed(r)?)),
             other => Err(Error::io(format!("unknown PostingList variant: {other}"))),
         }
     }
 }
 
-fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<()> {
+fn serialize_plain(w: &mut CacheEntryWriter<'_>, plain: &PlainPostingList) -> Result<()> {
+    // Plain postings carry only per-doc legacy positions (or none).
+    let position_storage = if plain.positions.is_some() {
+        PbPositionStorage::Legacy
+    } else {
+        PbPositionStorage::None
+    };
     let header = PlainPostingHeader {
         max_score: plain.max_score,
+        position_storage: position_storage as i32,
     };
-    write_json_header(writer, &header)?;
+    w.write_header(&header)?;
 
     let row_ids = UInt64Array::new(plain.row_ids.clone(), None);
     let frequencies = Float32Array::new(plain.frequencies.clone(), None);
@@ -273,26 +223,18 @@ fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<(
         Field::new(FREQUENCIES_COLUMN, DataType::Float32, false),
     ]));
     let batch = RecordBatch::try_new(schema, vec![Arc::new(row_ids), Arc::new(frequencies)])?;
-    write_ipc_stream(&batch, writer)?;
-
-    match &plain.positions {
-        Some(list) => {
-            // Plain postings can only carry per-doc legacy positions; reuse
-            // the shared encoder.
-            write_position_storage(
-                writer,
-                &CompressedPositionStorage::LegacyPerDoc(list.clone()),
-            )?;
-        }
-        None => write_u8(writer, POSITIONS_TAG_NONE)?,
+    w.write_ipc(&batch)?;
+
+    if let Some(list) = &plain.positions {
+        w.write_ipc(&legacy_positions_batch(list)?)?;
     }
     Ok(())
 }
 
-fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result<PlainPostingList> {
-    let header: PlainPostingHeader = read_json_header(data, offset)?;
+fn deserialize_plain(r: &mut CacheEntryReader<'_>) -> Result<PlainPostingList> {
+    let header: PlainPostingHeader = r.read_header()?;
 
-    let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
+    let batch = r.read_ipc()?;
     let row_ids = batch
         .column(0)
         .as_primitive_opt::<UInt64Type>()
@@ -306,19 +248,13 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result<PlainPostingLis
         .values()
         .clone();
 
-    let positions_tag = read_u8(data, offset)?;
-    let positions = match positions_tag {
-        POSITIONS_TAG_NONE => None,
-        POSITIONS_TAG_LEGACY => match read_position_storage(data, offset, positions_tag)? {
-            CompressedPositionStorage::LegacyPerDoc(list) => Some(list),
-            CompressedPositionStorage::SharedStream(_) => {
-                unreachable!("shared stream tag was read as legacy variant (this is a bug)")
-            }
-        },
-        other => {
-            return Err(Error::io(format!(
-                "Plain posting list cannot have positions tag {other}"
-            )));
+    let positions = match header.position_storage() {
+        PbPositionStorage::None => None,
+        PbPositionStorage::Legacy => Some(read_legacy_positions(r)?),
+        PbPositionStorage::Shared => {
+            return Err(Error::io(
+                "Plain posting list cannot have a shared position stream".to_string(),
+            ));
         }
     };
 
@@ -330,13 +266,33 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result<PlainPostingLis
     ))
 }
 
-fn serialize_compressed(writer: &mut dyn Write, posting: &CompressedPostingList) -> Result<()> {
+/// The compressed posting list is serialized with a protobuf header followed
+/// by 64-byte-aligned Arrow IPC sections (for the `blocks`, and for legacy
+/// positions or shared position block-offsets) and a raw blob (for the shared
+/// position byte stream, which already has its own portable encoding).
+fn serialize_compressed(
+    w: &mut CacheEntryWriter<'_>,
+    posting: &CompressedPostingList,
+) -> Result<()> {
+    let (position_storage, position_stream_codec) = match &posting.positions {
+        None => (PbPositionStorage::None, PbPositionStreamCodec::default()),
+        Some(CompressedPositionStorage::LegacyPerDoc(_)) => {
+            (PbPositionStorage::Legacy, PbPositionStreamCodec::default())
+        }
+        Some(CompressedPositionStorage::SharedStream(stream)) => (
+            PbPositionStorage::Shared,
+            position_stream_codec_to_proto(stream.codec()),
+        ),
+    };
+
     let header = CompressedPostingHeader {
         max_score: posting.max_score,
         length: posting.length,
-        posting_tail_codec: posting_tail_codec_to_u8(posting.posting_tail_codec),
+        posting_tail_codec: posting_tail_codec_to_proto(posting.posting_tail_codec) as i32,
+        position_storage: position_storage as i32,
+        position_stream_codec: position_stream_codec as i32,
     };
-    write_json_header(writer, &header)?;
+    w.write_header(&header)?;
 
     let schema = Arc::new(Schema::new(vec![Field::new(
         BLOCKS_COLUMN,
@@ -344,20 +300,19 @@ fn serialize_compressed(writer: &mut dyn Write, posting: &CompressedPostingList)
         false,
     )]));
     let batch = RecordBatch::try_new(schema, vec![Arc::new(posting.blocks.clone())])?;
-    write_ipc_stream(&batch, writer)?;
+    w.write_ipc(&batch)?;
 
-    match &posting.positions {
-        Some(storage) => write_position_storage(writer, storage)?,
-        None => write_u8(writer, POSITIONS_TAG_NONE)?,
+    if let Some(storage) = &posting.positions {
+        write_position_sections(w, storage)?;
     }
     Ok(())
 }
 
-fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result<CompressedPostingList> {
-    let header: CompressedPostingHeader = read_json_header(data, offset)?;
-    let posting_tail_codec = u8_to_posting_tail_codec(header.posting_tail_codec)?;
+fn deserialize_compressed(r: &mut CacheEntryReader<'_>) -> Result<CompressedPostingList> {
+    let header: CompressedPostingHeader = r.read_header()?;
+    let posting_tail_codec = proto_to_posting_tail_codec(header.posting_tail_codec());
 
-    let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
+    let batch = r.read_ipc()?;
     let blocks = batch
         .column(0)
         .as_any()
@@ -365,12 +320,8 @@ fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result<Compressed
         .ok_or_else(|| Error::io("blocks column is not a LargeBinaryArray".to_string()))?
         .clone();
 
-    let positions_tag = read_u8(data, offset)?;
-    let positions = if positions_tag == POSITIONS_TAG_NONE {
-        None
-    } else {
-        Some(read_position_storage(data, offset, positions_tag)?)
-    };
+    let stream_codec = proto_to_position_stream_codec(header.position_stream_codec());
+    let positions = read_position_sections(r, header.position_storage(), stream_codec)?;
 
     Ok(CompressedPostingList::new(
         blocks,
@@ -385,39 +336,31 @@ fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result<Compressed
 // PostingListGroup codec
 // ---------------------------------------------------------------------------
 
-/// Serializes a [`PostingListGroup`] as a count followed by each member
-/// posting list, length-prefixed so the existing [`PostingList`] codec can be
-/// reused per entry (and its byte buffers read back zero-copy). See issue
-/// #7040.
+/// Serializes a [`PostingListGroup`] as a member-count header followed by each
+/// member posting list written **inline** through the same writer. Reusing the
+/// [`PostingList`] codec inline (rather than into per-member sub-buffers) keeps
+/// each member's Arrow IPC sections 64-byte aligned within the group entry, so
+/// they read back zero-copy. Member bodies are self-delimiting, so they need no
+/// length prefixes to separate them. See issue #7040.
 impl CacheCodecImpl for PostingListGroup {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
+    const TYPE_ID: &'static str = "lance.fts.PostingListGroup";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         let count = u32::try_from(self.posting_lists.len())
             .map_err(|_| Error::io("posting list group too large to serialize".to_string()))?;
-        writer
-            .write_all(&count.to_le_bytes())
-            .map_err(|e| Error::io(format!("failed to write group count: {e}")))?;
+        w.write_header(&PostingListGroupHeader { count })?;
         for posting in &self.posting_lists {
-            let mut buf = Vec::new();
-            posting.serialize(&mut buf)?;
-            write_len_prefixed_bytes(writer, &buf)?;
+            posting.serialize(w)?;
         }
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        if data.len() < 4 {
-            return Err(Error::io(
-                "truncated posting list group: missing count".to_string(),
-            ));
-        }
-        let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
-        offset += 4;
-        let mut posting_lists = Vec::with_capacity(count);
-        for _ in 0..count {
-            let entry = read_len_prefixed_bytes_at(data, &mut offset)
-                .map_err(|e| Error::io(e.to_string()))?;
-            posting_lists.push(PostingList::deserialize(&entry)?);
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: PostingListGroupHeader = r.read_header()?;
+        let mut posting_lists = Vec::with_capacity(header.count as usize);
+        for _ in 0..header.count {
+            posting_lists.push(PostingList::deserialize(r)?);
         }
         Ok(Self::new(posting_lists))
     }
@@ -428,20 +371,35 @@ impl CacheCodecImpl for PostingListGroup {
 // ---------------------------------------------------------------------------
 
 impl CacheCodecImpl for Positions {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        write_position_storage(writer, &self.0)
+    const TYPE_ID: &'static str = "lance.fts.Positions";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let (position_storage, position_stream_codec) = match &self.0 {
+            CompressedPositionStorage::LegacyPerDoc(_) => {
+                (PbPositionStorage::Legacy, PbPositionStreamCodec::default())
+            }
+            CompressedPositionStorage::SharedStream(stream) => (
+                PbPositionStorage::Shared,
+                position_stream_codec_to_proto(stream.codec()),
+            ),
+        };
+        let header = PositionsHeader {
+            position_storage: position_storage as i32,
+            position_stream_codec: position_stream_codec as i32,
+        };
+        w.write_header(&header)?;
+        write_position_sections(w, &self.0)
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let tag = read_u8(data, &mut offset)?;
-        if tag == POSITIONS_TAG_NONE {
-            return Err(Error::io(
-                "Positions cache entry cannot encode the None variant".to_string(),
-            ));
-        }
-        let storage = read_position_storage(data, &mut offset, tag)?;
-        Ok(Self(storage))
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: PositionsHeader = r.read_header()?;
+        let stream_codec = proto_to_position_stream_codec(header.position_stream_codec());
+        read_position_sections(r, header.position_storage(), stream_codec)?
+            .map(Self)
+            .ok_or_else(|| {
+                Error::io("Positions cache entry cannot encode the None variant".to_string())
+            })
     }
 }
 
@@ -455,7 +413,8 @@ mod tests {
     use arrow_array::LargeBinaryArray;
     use arrow_array::builder::{Int32Builder, ListBuilder};
     use bytes::Bytes;
-    use lance_core::cache::CacheCodecImpl;
+    use lance_core::Result;
+    use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter};
 
     use super::super::index::{
         CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec,
@@ -502,16 +461,26 @@ mod tests {
         }
     }
 
-    fn roundtrip_posting_list(entry: &PostingList) -> PostingList {
+    /// Serialize a codec body (no envelope) into a standalone buffer.
+    fn body_bytes<T: CacheCodecImpl>(entry: &T) -> Bytes {
         let mut buf = Vec::new();
-        entry.serialize(&mut buf).unwrap();
-        PostingList::deserialize(&Bytes::from(buf)).unwrap()
+        let mut w = CacheEntryWriter::new(&mut buf);
+        entry.serialize(&mut w).unwrap();
+        Bytes::from(buf)
+    }
+
+    /// Deserialize a codec body (no envelope) at the current build's version.
+    fn from_body<T: CacheCodecImpl>(data: &Bytes) -> Result<T> {
+        let mut r = CacheEntryReader::new(data, 0, T::CURRENT_VERSION);
+        T::deserialize(&mut r)
+    }
+
+    fn roundtrip_posting_list(entry: &PostingList) -> PostingList {
+        from_body::<PostingList>(&body_bytes(entry)).unwrap()
     }
 
     fn roundtrip_positions(entry: &Positions) -> Positions {
-        let mut buf = Vec::new();
-        entry.serialize(&mut buf).unwrap();
-        Positions::deserialize(&Bytes::from(buf)).unwrap()
+        from_body::<Positions>(&body_bytes(entry)).unwrap()
     }
 
     fn assert_slice_points_into_bytes(slice: &[u8], bytes: &Bytes) {
@@ -652,13 +621,9 @@ mod tests {
                 expected_stream.clone(),
             )),
         );
-        let mut buf = Vec::new();
-        PostingList::Compressed(posting)
-            .serialize(&mut buf)
-            .unwrap();
-        let serialized = Bytes::from(buf);
+        let serialized = body_bytes(&PostingList::Compressed(posting));
 
-        let restored = PostingList::deserialize(&serialized).unwrap();
+        let restored = from_body::<PostingList>(&serialized).unwrap();
         let PostingList::Compressed(restored) = restored else {
             panic!("expected Compressed variant");
         };
@@ -695,9 +660,7 @@ mod tests {
             vec![plain.clone(), compressed, plain],
         ] {
             let group = PostingListGroup::new(members.clone());
-            let mut buf = Vec::new();
-            group.serialize(&mut buf).unwrap();
-            let restored = PostingListGroup::deserialize(&Bytes::from(buf)).unwrap();
+            let restored = from_body::<PostingListGroup>(&body_bytes(&group)).unwrap();
             assert_eq!(restored.posting_lists.len(), members.len());
             for (a, b) in members.iter().zip(restored.posting_lists.iter()) {
                 match (a, b) {
@@ -743,9 +706,241 @@ mod tests {
             None,
         );
         let entry = PostingList::Plain(plain);
-        let mut buf = Vec::new();
-        entry.serialize(&mut buf).unwrap();
+        let mut buf = body_bytes(&entry).to_vec();
         buf.truncate(buf.len() / 2);
-        assert!(PostingList::deserialize(&Bytes::from(buf)).is_err());
+        assert!(from_body::<PostingList>(&Bytes::from(buf)).is_err());
+    }
+
+    /// Tests covering the stabilized envelope + compressed proto format,
+    /// exercised through the full type-erased [`CacheCodec`] (envelope + body).
+    mod stable_format {
+        use std::sync::Arc;
+
+        use arrow_array::Array;
+        use lance_core::cache::CacheCodec;
+        use prost::Message;
+
+        use super::*;
+        use crate::cache_pb::{CompressedPostingHeader, PostingTailCodec as PbPostingTailCodec};
+
+        type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
+
+        fn codec() -> CacheCodec {
+            CacheCodec::from_impl::<PostingList>()
+        }
+
+        /// Serialize an entry through the full codec (envelope + body).
+        fn serialize_entry(entry: PostingList) -> Vec<u8> {
+            let any: ArcAny = Arc::new(entry);
+            let mut buf = Vec::new();
+            codec().serialize(&any, &mut buf).unwrap();
+            buf
+        }
+
+        /// A `Bytes` whose base address is 64-byte aligned, modelling a backend
+        /// that reads cache entries into an aligned buffer.
+        fn aligned_bytes(payload: &[u8]) -> Bytes {
+            const ALIGN: usize = 64;
+            let mut v = vec![0u8; payload.len() + ALIGN];
+            let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+            v[pad..pad + payload.len()].copy_from_slice(payload);
+            Bytes::from(v).slice(pad..pad + payload.len())
+        }
+
+        fn compressed_with_shared_positions() -> PostingList {
+            let blocks =
+                LargeBinaryArray::from_opt_vec(vec![Some(&[9u8; 48][..]), Some(&[1u8; 48])]);
+            let stream = SharedPositionStream::new(
+                PositionStreamCodec::PackedDelta,
+                vec![0u32, 4, 11],
+                Bytes::from((0u8..64).collect::<Vec<_>>()),
+            );
+            PostingList::Compressed(CompressedPostingList::new(
+                blocks,
+                7.0,
+                3,
+                PostingTailCodec::VarintDelta,
+                Some(CompressedPositionStorage::SharedStream(stream)),
+            ))
+        }
+
+        /// The compressed `blocks` (an aligned IPC section) and the shared
+        /// position blob (a raw section) must both be borrowed zero-copy from
+        /// the input even though the envelope pushes them to a non-zero,
+        /// non-aligned starting offset.
+        #[test]
+        fn compressed_sections_are_zero_copy_through_envelope() {
+            let serialized = aligned_bytes(&serialize_entry(compressed_with_shared_positions()));
+            let restored = codec().deserialize(&serialized).hit().unwrap();
+            let restored = restored.downcast::<PostingList>().unwrap();
+            let PostingList::Compressed(restored) = restored.as_ref() else {
+                panic!("expected Compressed");
+            };
+
+            let base = serialized.as_ptr() as usize;
+            let end = base + serialized.len();
+            let points_in = |ptr: usize| ptr >= base && ptr < end;
+
+            // blocks IPC section decoded in place (no realigning memcpy).
+            for buf in restored.blocks.to_data().buffers() {
+                assert!(
+                    points_in(buf.as_ptr() as usize),
+                    "blocks buffer was realigned out of the input — misaligned IPC section",
+                );
+            }
+            // shared position raw blob borrowed in place.
+            let Some(CompressedPositionStorage::SharedStream(stream)) = &restored.positions else {
+                panic!("expected shared stream");
+            };
+            assert!(points_in(stream.bytes().as_ptr() as usize));
+        }
+
+        /// Every member of a `PostingListGroup` must also decode zero-copy. The
+        /// group writes its members inline so each member's IPC sections stay
+        /// 64-byte aligned within the entry; embedding members in per-member
+        /// sub-buffers would land them at arbitrary offsets and force a
+        /// realigning memcpy on load.
+        #[test]
+        fn group_member_sections_are_zero_copy_through_envelope() {
+            let make_member = |fill: u8| {
+                let blocks =
+                    LargeBinaryArray::from_opt_vec(vec![Some(&[fill; 48][..]), Some(&[fill; 48])]);
+                PostingList::Compressed(CompressedPostingList::new(
+                    blocks,
+                    7.0,
+                    3,
+                    PostingTailCodec::VarintDelta,
+                    None,
+                ))
+            };
+            let group = PostingListGroup::new(vec![make_member(9), make_member(1)]);
+
+            let group_codec = CacheCodec::from_impl::<PostingListGroup>();
+            let any: ArcAny = Arc::new(group);
+            let mut buf = Vec::new();
+            group_codec.serialize(&any, &mut buf).unwrap();
+            let serialized = aligned_bytes(&buf);
+
+            let restored = group_codec.deserialize(&serialized).hit().unwrap();
+            let restored = restored.downcast::<PostingListGroup>().unwrap();
+
+            let base = serialized.as_ptr() as usize;
+            let end = base + serialized.len();
+            let points_in = |ptr: usize| ptr >= base && ptr < end;
+
+            assert_eq!(restored.posting_lists.len(), 2);
+            for member in &restored.posting_lists {
+                let PostingList::Compressed(member) = member else {
+                    panic!("expected Compressed member");
+                };
+                for buf in member.blocks.to_data().buffers() {
+                    assert!(
+                        points_in(buf.as_ptr() as usize),
+                        "group member blocks buffer was realigned out of the input — \
+                         misaligned IPC section",
+                    );
+                }
+            }
+        }
+
+        /// The plain posting's row-id/frequency IPC section must also decode
+        /// zero-copy through the envelope + proto header.
+        #[test]
+        fn plain_sections_are_zero_copy_through_envelope() {
+            let plain = PostingList::Plain(PlainPostingList::new(
+                ScalarBuffer::from((0u64..64).collect::<Vec<_>>()),
+                ScalarBuffer::from(vec![1.0f32; 64]),
+                Some(2.0),
+                None,
+            ));
+            let serialized = aligned_bytes(&serialize_entry(plain));
+            let restored = codec().deserialize(&serialized).hit().unwrap();
+            let restored = restored.downcast::<PostingList>().unwrap();
+            let PostingList::Plain(restored) = restored.as_ref() else {
+                panic!("expected Plain");
+            };
+
+            let base = serialized.as_ptr() as usize;
+            let end = base + serialized.len();
+            // The row_ids ScalarBuffer must borrow from the input allocation.
+            let ptr = restored.row_ids.as_ptr() as usize;
+            assert!(
+                ptr >= base && ptr < end,
+                "row_ids buffer was realigned out of the input — misaligned IPC section",
+            );
+        }
+
+        /// Additive proto fields (lever #1) must not break decoding: an unknown
+        /// field number appended to the header is ignored.
+        #[test]
+        fn header_proto_ignores_unknown_fields() {
+            let header = CompressedPostingHeader {
+                max_score: 1.5,
+                length: 9,
+                posting_tail_codec: PbPostingTailCodec::VarintDelta as i32,
+                ..Default::default()
+            };
+            let mut bytes = header.encode_to_vec();
+            // Append an unknown field #15, varint wire type (0), value 7.
+            bytes.push(15 << 3);
+            bytes.push(7);
+            let decoded = CompressedPostingHeader::decode(bytes.as_slice()).unwrap();
+            assert_eq!(decoded.length, 9);
+            assert_eq!(decoded.max_score, 1.5);
+        }
+
+        /// An entry written by a different codec (foreign TYPE_ID) misses.
+        #[test]
+        fn foreign_type_id_is_miss() {
+            // A PostingListGroup entry carries a different TYPE_ID in its
+            // envelope; reading it as a PostingList must miss, not misread it.
+            let group = PostingListGroup::new(vec![]);
+            let any: ArcAny = Arc::new(group);
+            let mut buf = Vec::new();
+            CacheCodec::from_impl::<PostingListGroup>()
+                .serialize(&any, &mut buf)
+                .unwrap();
+            assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none());
+        }
+
+        /// An entry written by a newer build (higher type_version) misses.
+        #[test]
+        fn future_type_version_is_miss() {
+            let mut buf = serialize_entry(compressed_with_shared_positions());
+            // Patch the envelope's type_version (it follows magic[4] + ver[1] +
+            // type_id_len[2] + type_id[N]) to a value beyond what this build understands.
+            let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize;
+            let version_off = 4 + 1 + 2 + type_id_len;
+            buf[version_off..version_off + 4].copy_from_slice(&u32::MAX.to_le_bytes());
+            assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none());
+        }
+
+        /// A pre-stabilization blob (no magic) self-heals to a miss.
+        #[test]
+        fn pre_stabilization_blob_is_miss() {
+            // Old format led with a u64 LE length prefix, never our magic.
+            let mut blob = (30u64).to_le_bytes().to_vec();
+            blob.extend_from_slice(&[0u8; 30]);
+            assert!(codec().deserialize(&Bytes::from(blob)).hit().is_none());
+        }
+
+        /// A structurally-valid envelope whose body leads with an out-of-range
+        /// variant tag self-heals to a `BodyError` miss rather than panicking or
+        /// misreading the remaining bytes.
+        #[test]
+        fn unknown_posting_variant_is_miss() {
+            use lance_core::cache::{CacheDecode, CacheMissReason};
+
+            let mut buf = serialize_entry(compressed_with_shared_positions());
+            // The variant tag is the first body byte, right after the envelope
+            // (magic[4] + ver[1] + type_id_len[2] + type_id[N] + type_version[4]).
+            let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize;
+            let variant_off = 4 + 1 + 2 + type_id_len + 4;
+            buf[variant_off] = 2; // neither PLAIN (0) nor COMPRESSED (1)
+            match codec().deserialize(&Bytes::from(buf)) {
+                CacheDecode::Miss(reason) => assert_eq!(reason, CacheMissReason::BodyError),
+                CacheDecode::Hit(_) => panic!("expected a BodyError miss, got a hit"),
+            }
+        }
     }
 }
diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs
index 56547c6510b..41a18c3bd68 100644
--- a/rust/lance-index/src/scalar/inverted/index.rs
+++ b/rust/lance-index/src/scalar/inverted/index.rs
@@ -37,11 +37,12 @@ use datafusion::physical_plan::metrics::Time;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use fst::{Automaton, IntoStreamer, Streamer};
 use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream};
-use itertools::Itertools;
+use itertools::{Either, Itertools};
 use lance_arrow::{RecordBatchExt, iter_str_array};
 use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::error::{DataFusionResult, LanceOptionExt};
+use lance_core::utils::address::RowAddress;
 use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu};
 use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS};
 use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result};
@@ -2947,10 +2948,9 @@ impl DeepSizeOf for CompressedPositionStorage {
 #[derive(Debug, Clone, PartialEq, Eq, Default)]
 pub struct SharedPositionStream {
     codec: PositionStreamCodec,
-    block_offsets: Vec<u32>,
-    // Stored as `Bytes` so that the cache deserialization path can hand
-    // ownership of an IPC-decoded slice in without copying. Cloning the
-    // stream is then an `Arc` bump rather than an O(N) buffer copy.
+    block_offsets: Arc<[u32]>,
+    // Stored with shared ownership so cache hits can clone position streams
+    // without copying either offsets or bytes.
     bytes: bytes::Bytes,
 }
 
@@ -2958,7 +2958,7 @@ impl SharedPositionStream {
     pub fn new(codec: PositionStreamCodec, block_offsets: Vec<u32>, bytes: bytes::Bytes) -> Self {
         Self {
             codec,
-            block_offsets,
+            block_offsets: Arc::from(block_offsets.into_boxed_slice()),
             bytes,
         }
     }
@@ -2991,11 +2991,11 @@ impl SharedPositionStream {
     }
 
     pub fn block_offsets(&self) -> &[u32] {
-        &self.block_offsets
+        self.block_offsets.as_ref()
     }
 
     pub fn size(&self) -> usize {
-        self.block_offsets.capacity() * std::mem::size_of::<u32>() + self.bytes.len()
+        self.block_offsets.len() * std::mem::size_of::<u32>() + self.bytes.len()
     }
 }
 
@@ -4615,18 +4615,25 @@ impl DocSet {
         self.row_ids[doc_id as usize]
     }
 
-    pub fn doc_id(&self, row_id: u64) -> Option<u64> {
+    /// Resolve a `row_id` to every `doc_id` it owns.
+    ///
+    /// A scalar column maps each row to a single document, but a
+    /// `list<string>` column indexes every element as its own document, so a
+    /// single `row_id` can own several `doc_id`s sharing that key in `inv`.
+    /// The prefilter path (`flat_search`) walks an allow-list of row_ids and
+    /// must evaluate *all* of a row's documents; resolving to one `doc_id`
+    /// silently drops matches at non-last list positions (lancedb#3352).
+    pub fn doc_ids(&self, row_id: u64) -> impl Iterator<Item = u64> + '_ {
         if self.inv.is_empty() {
-            // in legacy format, the row id is doc id
-            match self.row_ids.binary_search(&row_id) {
-                Ok(_) => Some(row_id),
-                Err(_) => None,
-            }
+            // In the legacy format the row id is the doc id (one document per row).
+            let found = self.row_ids.binary_search(&row_id).is_ok();
+            Either::Left(found.then_some(row_id).into_iter())
         } else {
-            match self.inv.binary_search_by_key(&row_id, |x| x.0) {
-                Ok(idx) => Some(self.inv[idx].1 as u64),
-                Err(_) => None,
-            }
+            // `inv` is sorted by row_id, so the entries sharing this key form a
+            // contiguous run; yield the doc_id of each.
+            let lo = self.inv.partition_point(|entry| entry.0 < row_id);
+            let hi = self.inv.partition_point(|entry| entry.0 <= row_id);
+            Either::Right(self.inv[lo..hi].iter().map(|entry| entry.1 as u64))
         }
     }
     pub fn total_tokens_num(&self) -> u64 {
@@ -4750,23 +4757,36 @@ impl DocSet {
             });
         }
 
-        // if frag reuse happened, we'll need to remap the row_ids. And after row_ids been
-        // remapped, we'll need resort to make sure binary_search works.
+        // If frag reuse happened, remap the row_ids through it. Crucially we
+        // must NOT drop the rows the reuse index deleted, because the posting
+        // lists reference doc_ids *positionally* (a doc_id is an index into
+        // these arrays, fixed at build time). Dropping deleted rows would
+        // renumber every later doc_id and desync the posting lists, so wand
+        // would index `num_tokens`/`row_ids` out of bounds or score the wrong
+        // doc. Instead we tombstone deleted rows in place: their slot survives
+        // (so doc_ids stay aligned with the posting lists) carrying
+        // `RowAddress::TOMBSTONE_ROW`, which wand skips, and they are left out
+        // of `inv` so a row_id lookup never resolves to a deleted doc. The
+        // heavyweight physical remap (`DocSet::remap`) is what actually
+        // renumbers and compacts; this load-time path only has to stay
+        // consistent until then.
         if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() {
             let mut row_ids = Vec::with_capacity(row_id_col.len());
-            let mut num_tokens = Vec::with_capacity(num_tokens_col.len());
-            for (row_id, num_token) in row_id_col.values().iter().zip(num_tokens_col.values()) {
-                if let Some(new_row_id) = frag_reuse_index_ref.remap_row_id(*row_id) {
-                    row_ids.push(new_row_id);
-                    num_tokens.push(*num_token);
+            let num_tokens = num_tokens_col.values().to_vec();
+            let mut inv = Vec::with_capacity(row_id_col.len());
+            for (doc_id, row_id) in row_id_col.values().iter().enumerate() {
+                match frag_reuse_index_ref.remap_row_id(*row_id) {
+                    Some(new_row_id) => {
+                        row_ids.push(new_row_id);
+                        inv.push((new_row_id, doc_id as u32));
+                    }
+                    None => {
+                        // Deleted: keep the slot (doc_ids must not shift) but
+                        // tombstone it and leave it out of `inv`.
+                        row_ids.push(RowAddress::TOMBSTONE_ROW);
+                    }
                 }
             }
-
-            let mut inv: Vec<(u64, u32)> = row_ids
-                .iter()
-                .enumerate()
-                .map(|(doc_id, row_id)| (*row_id, doc_id as u32))
-                .collect();
             inv.sort_unstable_by_key(|entry| entry.0);
 
             let total_tokens = num_tokens.iter().map(|&x| x as u64).sum();
@@ -5475,6 +5495,21 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_shared_position_stream_clone_shares_block_offsets() {
+        let stream = SharedPositionStream::new(
+            PositionStreamCodec::PackedDelta,
+            vec![0_u32, 4, 11],
+            bytes::Bytes::from_static(b"shared position bytes"),
+        );
+        let original_offsets = stream.block_offsets().as_ptr();
+
+        let cloned = stream.clone();
+
+        assert_eq!(cloned.block_offsets(), stream.block_offsets());
+        assert_eq!(cloned.block_offsets().as_ptr(), original_offsets);
+    }
+
     #[test]
     fn test_posting_builder_roundtrip_shared_positions() {
         let entries = vec![
@@ -6446,6 +6481,16 @@ mod tests {
         ) -> Result<crate::scalar::IndexFile> {
             self.inner.copy_index_file(name, dest_store).await
         }
+        async fn copy_index_file_to(
+            &self,
+            name: &str,
+            new_name: &str,
+            dest_store: &dyn IndexStore,
+        ) -> Result<crate::scalar::IndexFile> {
+            self.inner
+                .copy_index_file_to(name, new_name, dest_store)
+                .await
+        }
         async fn rename_index_file(
             &self,
             name: &str,
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 6024747025b..5a2a701dc73 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -355,16 +355,7 @@ impl InvertedIndexParams {
             builder = builder.filter_dynamic(Stemmer::new(self.language));
         }
         if self.remove_stop_words {
-            let stop_word_filter = match &self.custom_stop_words {
-                Some(words) => StopWordFilter::remove(words.iter().cloned()),
-                None => StopWordFilter::new(self.language).ok_or_else(|| {
-                    Error::invalid_input(format!(
-                        "removing stop words for language {:?} is not supported yet",
-                        self.language
-                    ))
-                })?,
-            };
-            builder = builder.filter_dynamic(stop_word_filter);
+            builder = builder.filter_dynamic(self.stop_word_filter()?);
         }
         if self.ascii_folding {
             builder = builder.filter_dynamic(AsciiFoldingFilter);
@@ -382,6 +373,19 @@ impl InvertedIndexParams {
         }
     }
 
+    fn stop_word_filter(&self) -> Result<StopWordFilter> {
+        match &self.custom_stop_words {
+            Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())),
+            None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()),
+            None => StopWordFilter::new(self.language).ok_or_else(|| {
+                Error::invalid_input(format!(
+                    "removing stop words for language {:?} is not supported yet",
+                    self.language
+                ))
+            }),
+        }
+    }
+
     fn build_base_tokenizer(&self) -> Result<TextAnalyzerBuilder> {
         match self.base_tokenizer.as_str() {
             "simple" => Ok(TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()),
@@ -503,4 +507,52 @@ mod tests {
         stream.process(&mut |token| tokens.push(token.text.clone()));
         assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]);
     }
+
+    #[test]
+    fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() {
+        let mut tokenizer = InvertedIndexParams::default()
+            .stem(false)
+            .base_tokenizer("simple".to_string())
+            .build()
+            .unwrap();
+        let mut stream = tokenizer.token_stream_for_search("the 的 lance data");
+        let mut tokens = Vec::new();
+        while let Some(token) = stream.next() {
+            tokens.push(token.text.clone());
+        }
+        assert_eq!(
+            tokens,
+            vec!["的".to_string(), "lance".to_string(), "data".to_string()]
+        );
+    }
+
+    #[test]
+    fn test_custom_stop_words_replace_language_builtins() {
+        let mut tokenizer = InvertedIndexParams::default()
+            .stem(false)
+            .custom_stop_words(Some(vec!["lance".to_string()]))
+            .build()
+            .unwrap();
+        let mut stream = tokenizer.token_stream_for_search("the lance data");
+        let mut tokens = Vec::new();
+        while let Some(token) = stream.next() {
+            tokens.push(token.text.clone());
+        }
+        assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]);
+    }
+
+    #[test]
+    fn test_icu_stop_words_use_all_builtin_lists() {
+        let mut tokenizer = InvertedIndexParams::default()
+            .stem(false)
+            .base_tokenizer("icu".to_string())
+            .build()
+            .unwrap();
+        let mut stream = tokenizer.token_stream_for_search("the 的 lance data");
+        let mut tokens = Vec::new();
+        while let Some(token) = stream.next() {
+            tokens.push(token.text.clone());
+        }
+        assert_eq!(tokens, vec!["lance".to_string(), "data".to_string()]);
+    }
 }
diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs
index 609ec08041f..dc6d2a860fb 100644
--- a/rust/lance-index/src/scalar/inverted/wand.rs
+++ b/rust/lance-index/src/scalar/inverted/wand.rs
@@ -736,6 +736,15 @@ impl<'a, S: Scorer> Wand<'a, S> {
                 }
                 DocInfo::Located(doc) => doc.row_id,
             };
+            // Skip docs the fragment-reuse remap deleted. They are tombstoned
+            // in the DocSet (slot kept so posting-list doc_ids stay aligned)
+            // and must not surface in results.
+            if docs_has_row_ids && row_id == RowAddress::TOMBSTONE_ROW {
+                if self.operator == Operator::Or {
+                    self.push_back_leads(doc.doc_id() + 1);
+                }
+                continue;
+            }
             if docs_has_row_ids && !mask.selected(row_id) {
                 if self.operator == Operator::Or {
                     self.push_back_leads(doc.doc_id() + 1);
@@ -767,14 +776,15 @@ impl<'a, S: Scorer> Wand<'a, S> {
                 self.score(doc_length)
             };
 
-            let freqs = self.iter_term_freqs().collect();
             if candidates.len() < limit {
+                let freqs = self.iter_term_freqs().collect();
                 candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
                 if candidates.len() == limit {
                     let kth = candidates.peek().unwrap().0.0.score.0;
                     self.update_threshold(kth, params.wand_factor);
                 }
             } else if score > candidates.peek().unwrap().0.0.score.0 {
+                let freqs = self.iter_term_freqs().collect();
                 candidates.pop();
                 candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
                 let kth = candidates.peek().unwrap().0.0.score.0;
@@ -819,11 +829,16 @@ impl<'a, S: Scorer> Wand<'a, S> {
         }
 
         // we need to map the row ids to doc ids, and sort them,
-        // because WAND PostingIterator can't go back to the previous doc id
+        // because WAND PostingIterator can't go back to the previous doc id.
+        // A list column maps one row id to several doc ids, so expand every
+        // document the row owns — keying on a single doc id would drop matches
+        // at non-last list positions (lancedb#3352).
         let doc_ids = row_ids
-            .filter_map(|row_addr| {
+            .flat_map(|row_addr| {
                 let row_id: u64 = row_addr.into();
-                self.docs.doc_id(row_id).map(|doc_id| (doc_id, row_id))
+                self.docs
+                    .doc_ids(row_id)
+                    .map(move |doc_id| (doc_id, row_id))
             })
             .sorted_unstable()
             .collect::<Vec<_>>();
@@ -885,15 +900,16 @@ impl<'a, S: Scorer> Wand<'a, S> {
 
             self.collect_tail_matches(doc_id);
             let score = self.score(doc_length);
-            let freqs = self.iter_term_freqs().collect();
 
             if candidates.len() < limit {
+                let freqs = self.iter_term_freqs().collect();
                 candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
                 if candidates.len() == limit {
                     let kth = candidates.peek().unwrap().0.0.score.0;
                     self.update_threshold(kth, params.wand_factor);
                 }
             } else if score > candidates.peek().unwrap().0.0.score.0 {
+                let freqs = self.iter_term_freqs().collect();
                 candidates.pop();
                 candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
                 let kth = candidates.peek().unwrap().0.0.score.0;
@@ -967,41 +983,37 @@ impl<'a, S: Scorer> Wand<'a, S> {
                 continue;
             }
 
-            let Some(doc) = self.lead.first().and_then(|posting| posting.doc()) else {
+            let Some(first_doc) = self.lead.first().and_then(|posting| posting.doc()) else {
                 self.push_back_leads(target + 1);
                 continue;
             };
-            let doc_length = match &doc {
+            let doc_length = match &first_doc {
                 DocInfo::Raw(doc) => self.docs.num_tokens(doc.doc_id),
                 DocInfo::Located(doc) => self.docs.num_tokens_by_row_id(doc.row_id),
             };
-            let mut lead_score = self
-                .lead
-                .iter()
-                .filter_map(|posting| {
-                    posting.doc().map(|lead_doc| {
-                        posting.score(&self.scorer, lead_doc.frequency(), doc_length)
-                    })
-                })
-                .sum::<f32>();
+            let mut lead_score = 0.0;
+            if let Some(first_posting) = self.lead.first() {
+                lead_score += first_posting.score(&self.scorer, first_doc.frequency(), doc_length);
+            }
+            for posting in self.lead.iter().skip(1) {
+                if let Some(lead_doc) = posting.doc() {
+                    lead_score += posting.score(&self.scorer, lead_doc.frequency(), doc_length);
+                }
+            }
 
             while lead_score <= self.threshold {
                 if lead_score + self.tail_max_score <= self.threshold {
-                    self.push_back_leads(doc.doc_id() + 1);
+                    self.push_back_leads(first_doc.doc_id() + 1);
                     break;
                 }
                 if !self.advance_tail_top(target, doc_length, &mut lead_score) {
-                    self.push_back_leads(doc.doc_id() + 1);
+                    self.push_back_leads(first_doc.doc_id() + 1);
                     break;
                 }
             }
 
             if !self.lead.is_empty() {
-                return Ok(self
-                    .lead
-                    .first()
-                    .and_then(|posting| posting.doc())
-                    .map(|doc| (doc, lead_score)));
+                return Ok(Some((first_doc, lead_score)));
             }
         }
 
@@ -1392,10 +1404,9 @@ impl<'a, S: Scorer> Wand<'a, S> {
         };
         self.tail_max_score -= upper_bound;
         posting.next(target);
-        match posting.doc().map(|doc| doc.doc_id()) {
-            Some(doc_id) if doc_id == target => {
-                let frequency = posting.doc().expect("posting must exist").frequency();
-                *lead_score += posting.score(&self.scorer, frequency, doc_length);
+        match posting.doc() {
+            Some(doc) if doc.doc_id() == target => {
+                *lead_score += posting.score(&self.scorer, doc.frequency(), doc_length);
                 self.lead.push(posting);
             }
             Some(_) => self.push_head(posting),
@@ -1418,14 +1429,10 @@ impl<'a, S: Scorer> Wand<'a, S> {
         for tail_posting in tail.into_vec() {
             let mut posting = tail_posting.posting;
             posting.next(target);
-            match posting.doc().map(|doc| doc.doc_id()) {
-                Some(doc_id) if doc_id == target => {
+            match posting.doc() {
+                Some(doc) if doc.doc_id() == target => {
                     if let (Some(doc_length), Some(score)) = (doc_length, score.as_deref_mut()) {
-                        let frequency = posting
-                            .doc()
-                            .expect("posting moved to target should have doc")
-                            .frequency();
-                        *score += posting.score(&self.scorer, frequency, doc_length);
+                        *score += posting.score(&self.scorer, doc.frequency(), doc_length);
                     }
                     self.lead.push(posting)
                 }
@@ -2211,6 +2218,74 @@ mod tests {
         assert_eq!(matched, vec![2]);
     }
 
+    #[test]
+    fn test_doc_ids_resolves_every_document_a_row_owns() {
+        // A list<string> column indexes each element as its own document, so
+        // one row id owns several doc ids. row 100 -> {0, 1}, row 101 -> {2}.
+        let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]);
+        let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]);
+        let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap();
+
+        assert_eq!(docs.doc_ids(100).collect::<Vec<_>>(), vec![0, 1]);
+        assert_eq!(docs.doc_ids(101).collect::<Vec<_>>(), vec![2]);
+        assert!(docs.doc_ids(999).next().is_none());
+
+        // legacy shape (row id == doc id) still resolves to a single document.
+        let mut legacy = DocSet::default();
+        legacy.append(7, 1);
+        assert_eq!(legacy.doc_ids(7).collect::<Vec<_>>(), vec![7]);
+        assert!(legacy.doc_ids(8).next().is_none());
+    }
+
+    #[rstest]
+    fn test_flat_search_finds_list_row_with_match_at_non_last_position(
+        #[values(false, true)] is_compressed: bool,
+    ) {
+        // row 100 owns two element-documents (doc 0, doc 1) that share its row
+        // id; row 101 owns doc 2. The query term lives only in doc 0 — the
+        // *non-last* element of row 100. Resolving the row to a single doc id
+        // would evaluate doc 1, miss the term, and drop the row (lancedb#3352).
+        let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]);
+        let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]);
+        let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap();
+
+        let posting = PostingIterator::with_query_weight(
+            String::from("needle"),
+            0,
+            0,
+            1.0,
+            generate_posting_list(vec![0], 1.0, None, is_compressed),
+            docs.len(),
+        );
+
+        let mut wand = Wand::new(
+            Operator::Or,
+            vec![posting].into_iter(),
+            &docs,
+            InverseDocLengthScorer,
+        );
+        wand.threshold = 0.5;
+
+        let selected = vec![RowAddress::from(100_u64)];
+        let result = wand
+            .flat_search(
+                &FtsSearchParams::default(),
+                Box::new(selected.into_iter()),
+                &NoOpMetricsCollector,
+            )
+            .unwrap();
+
+        // flat_search resolves the prefilter against the DocSet, so the single
+        // match comes back as a concrete RowId(100) rather than a deferred
+        // Pending addr. Asserting on the whole result avoids a never-taken
+        // match arm that would otherwise read as uncovered.
+        let addrs = result.into_iter().map(|doc| doc.addr).collect::<Vec<_>>();
+        assert!(
+            matches!(addrs.as_slice(), [CandidateAddr::RowId(100)]),
+            "expected exactly row 100, got {addrs:?}"
+        );
+    }
+
     #[test]
     fn test_block_max_score_matches_stored_value() {
         let doc_ids = vec![0_u32];
diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs
index cf357d89585..8e07a607bff 100644
--- a/rust/lance-index/src/scalar/label_list.rs
+++ b/rust/lance-index/src/scalar/label_list.rs
@@ -18,8 +18,9 @@ use datafusion::execution::RecordBatchStream;
 use datafusion::physical_plan::{SendableRecordBatchStream, stream::RecordBatchStreamAdapter};
 use datafusion_common::ScalarValue;
 use futures::{StreamExt, TryStream, TryStreamExt, stream::BoxStream};
-use lance_arrow::ipc::{read_len_prefixed_bytes_at, write_len_prefixed_bytes};
-use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache};
+use lance_core::cache::{
+    CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache,
+};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::error::LanceOptionExt;
 use lance_core::{Error, ROW_ID, Result};
@@ -532,27 +533,30 @@ impl LabelListIndexState {
 }
 
 impl CacheCodecImpl for LabelListIndexState {
+    const TYPE_ID: &'static str = "lance.scalar.LabelListIndexState";
+    const CURRENT_VERSION: u32 = 1;
+
     /// Wire format:
     /// ```text
-    /// [u64 list_nulls_len][list_nulls bytes]
-    /// [bitmap state bytes (self-delimiting)]
+    /// RAW_BLOB : list_nulls (roaring tree map, portable encoding)
+    /// <nested BitmapIndexState body (self-delimiting)>
     /// ```
-    fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> {
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         let mut nulls_bytes = Vec::with_capacity(self.list_nulls.serialized_size());
         self.list_nulls.serialize_into(&mut nulls_bytes)?;
-        write_len_prefixed_bytes(writer, &nulls_bytes)?;
-        self.bitmap_state.serialize(writer)?;
+        w.write_raw(&nulls_bytes)?;
+        // The bitmap state writes its own self-delimiting body inline.
+        self.bitmap_state.serialize(w)?;
         Ok(())
     }
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let nulls_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let nulls_bytes = r.read_raw()?;
         let list_nulls = Arc::new(RowAddrTreeMap::deserialize_from(nulls_bytes.as_ref())?);
         // The bitmap state is self-delimiting (length-prefixed null map +
-        // Arrow IPC stream with EOS marker), so we can hand the remaining
-        // tail to it directly.
-        let bitmap_state = BitmapIndexState::deserialize(&data.slice(offset..))?;
+        // Arrow IPC stream with EOS marker); it continues reading the body
+        // right after the `list_nulls` blob read above.
+        let bitmap_state = BitmapIndexState::deserialize(r)?;
         Ok(Self {
             bitmap_state,
             list_nulls,
@@ -728,3 +732,91 @@ impl ScalarIndexPlugin for LabelListIndexPlugin {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::BTreeMap;
+
+    use datafusion_common::ScalarValue;
+    use lance_core::cache::CacheCodec;
+    use lance_core::utils::address::RowAddress;
+
+    use super::super::bitmap::BitmapIndexState;
+    use super::super::btree::OrderableScalarValue;
+    use super::*;
+
+    fn sample_state() -> LabelListIndexState {
+        let mut index_map = BTreeMap::new();
+        for k in 0..32i32 {
+            index_map.insert(
+                OrderableScalarValue(ScalarValue::Int32(Some(k))),
+                k as usize,
+            );
+        }
+        let mut bitmap_nulls = RowAddrTreeMap::new();
+        bitmap_nulls.insert(RowAddress::new_from_parts(0, 3).into());
+        let bitmap_state =
+            BitmapIndexState::new_for_test(index_map, bitmap_nulls, DataType::Int32).unwrap();
+
+        let mut list_nulls = RowAddrTreeMap::new();
+        list_nulls.insert(RowAddress::new_from_parts(0, 9).into());
+        LabelListIndexState {
+            bitmap_state,
+            list_nulls: Arc::new(list_nulls),
+        }
+    }
+
+    #[test]
+    fn test_label_list_state_codec_roundtrip() {
+        let state = sample_state();
+        let mut buf = Vec::new();
+        state
+            .serialize(&mut CacheEntryWriter::new(&mut buf))
+            .unwrap();
+        let data = Bytes::from(buf);
+        let mut reader = CacheEntryReader::new(&data, 0, LabelListIndexState::CURRENT_VERSION);
+        let restored = LabelListIndexState::deserialize(&mut reader).unwrap();
+
+        assert_eq!(&*restored.list_nulls, &*state.list_nulls);
+        assert_eq!(
+            restored.bitmap_state.lookup_batch(),
+            state.bitmap_state.lookup_batch()
+        );
+        assert_eq!(
+            restored.bitmap_state.null_map(),
+            state.bitmap_state.null_map()
+        );
+    }
+
+    /// The nested bitmap lookup batch must decode zero-copy through the full
+    /// envelope, proving the leading `list_nulls` RAW_BLOB does not knock the
+    /// nested IPC section off its 64-byte boundary.
+    #[test]
+    fn test_label_list_nested_lookup_is_zero_copy() {
+        const ALIGN: usize = 64;
+        let codec = CacheCodec::from_impl::<LabelListIndexState>();
+        let any: Arc<dyn std::any::Any + Send + Sync> = Arc::new(sample_state());
+        let mut buf = Vec::new();
+        codec.serialize(&any, &mut buf).unwrap();
+
+        let mut v = vec![0u8; buf.len() + ALIGN];
+        let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+        v[pad..pad + buf.len()].copy_from_slice(&buf);
+        let data = Bytes::from(v).slice(pad..pad + buf.len());
+
+        let restored = codec.deserialize(&data).hit().unwrap();
+        let restored = restored.downcast::<LabelListIndexState>().unwrap();
+
+        let base = data.as_ptr() as usize;
+        let end = base + data.len();
+        for col in restored.bitmap_state.lookup_batch().columns() {
+            for buffer in col.to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    ptr >= base && ptr < end,
+                    "nested bitmap lookup buffer was realigned — misaligned IPC section",
+                );
+            }
+        }
+    }
+}
diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs
index 562945b8f0d..2f82deb8403 100644
--- a/rust/lance-index/src/scalar/lance_format.rs
+++ b/rust/lance-index/src/scalar/lance_format.rs
@@ -99,6 +99,24 @@ impl LanceIndexStore {
         self.file_sizes = file_sizes;
         self
     }
+
+    fn index_file_path(&self, name: &str) -> Result<Path> {
+        let relative_path = Path::parse(name).map_err(|err| {
+            Error::invalid_input(format!("invalid index file path {name:?}: {err}"))
+        })?;
+        if self.index_dir.is_root() {
+            return Ok(relative_path);
+        }
+        if relative_path.is_root() {
+            return Ok(self.index_dir.clone());
+        }
+        Path::parse(format!(
+            "{}/{}",
+            self.index_dir.as_ref(),
+            relative_path.as_ref()
+        ))
+        .map_err(|err| Error::invalid_input(format!("invalid index file path {name:?}: {err}")))
+    }
 }
 
 #[async_trait]
@@ -397,7 +415,7 @@ impl IndexStore for LanceIndexStore {
         name: &str,
         schema: Arc<Schema>,
     ) -> Result<Box<dyn IndexWriter>> {
-        let path = self.index_dir.clone().join(name);
+        let path = self.index_file_path(name)?;
         let schema = schema.as_ref().try_into()?;
         let writer = self.object_store.create(&path).await?;
         let writer = current_writer::FileWriter::try_new(
@@ -415,7 +433,7 @@ impl IndexStore for LanceIndexStore {
     }
 
     async fn open_index_file(&self, name: &str) -> Result<Arc<dyn IndexReader>> {
-        let path = self.index_dir.clone().join(name);
+        let path = self.index_file_path(name)?;
         // Use cached file size if available, otherwise unknown (requires HEAD call)
         let cached_size = self
             .file_sizes
@@ -436,7 +454,7 @@ impl IndexStore for LanceIndexStore {
             Err(e) => {
                 // If the error is a version conflict we can try to read the file with v1 reader
                 if let Error::VersionConflict { .. } = e {
-                    let path = self.index_dir.clone().join(name);
+                    let path = self.index_file_path(name)?;
                     let file_reader = PreviousFileReader::try_new_self_described(
                         &self.object_store,
                         &path,
@@ -452,7 +470,16 @@ impl IndexStore for LanceIndexStore {
     }
 
     async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<IndexFile> {
-        let path = self.index_dir.clone().join(name);
+        self.copy_index_file_to(name, name, dest_store).await
+    }
+
+    async fn copy_index_file_to(
+        &self,
+        name: &str,
+        new_name: &str,
+        dest_store: &dyn IndexStore,
+    ) -> Result<IndexFile> {
+        let path = self.index_file_path(name)?;
 
         let other_store = dest_store.as_any().downcast_ref::<Self>();
         match other_store {
@@ -460,21 +487,21 @@ impl IndexStore for LanceIndexStore {
                 // If both this store and the destination are lance stores we can use object_store's copy
                 // This does blindly assume that both stores are using the same underlying object_store
                 // but there is no easy way to verify this and it happens to always be true at the moment
-                let dest_path = dest_store.index_dir.clone().join(name);
+                let dest_path = dest_store.index_file_path(new_name)?;
                 self.object_store.copy(&path, &dest_path).await?;
                 let size_bytes = match self.file_sizes.get(name) {
                     Some(size_bytes) => *size_bytes,
                     None => self.object_store.size(&path).await?,
                 };
                 Ok(IndexFile {
-                    path: name.to_string(),
+                    path: new_name.to_string(),
                     size_bytes,
                 })
             }
             _ => {
                 let reader = self.open_index_file(name).await?;
                 let mut writer = dest_store
-                    .new_index_file(name, Arc::new(reader.schema().into()))
+                    .new_index_file(new_name, Arc::new(reader.schema().into()))
                     .await?;
 
                 for offset in (0..reader.num_rows()).step_by(4096) {
@@ -488,8 +515,8 @@ impl IndexStore for LanceIndexStore {
     }
 
     async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<IndexFile> {
-        let path = self.index_dir.clone().join(name);
-        let new_path = self.index_dir.clone().join(new_name);
+        let path = self.index_file_path(name)?;
+        let new_path = self.index_file_path(new_name)?;
         self.object_store.copy(&path, &new_path).await?;
         self.object_store.delete(&path).await?;
         let size_bytes = match self.file_sizes.get(name) {
@@ -503,7 +530,7 @@ impl IndexStore for LanceIndexStore {
     }
 
     async fn delete_index_file(&self, name: &str) -> Result<()> {
-        let path = self.index_dir.clone().join(name);
+        let path = self.index_file_path(name)?;
         self.object_store.delete(&path).await
     }
 
diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs
index 72ef8d53a92..b452ef78c85 100644
--- a/rust/lance-index/src/scalar/ngram.rs
+++ b/rust/lance-index/src/scalar/ngram.rs
@@ -5,7 +5,10 @@ use std::any::Any;
 use std::collections::BTreeMap;
 use std::iter::once;
 use std::time::Instant;
-use std::{collections::HashMap, sync::Arc};
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+};
 
 use super::lance_format::LanceIndexStore;
 use super::{
@@ -49,6 +52,9 @@ use roaring::{RoaringBitmap, RoaringTreemap};
 use serde::Serialize;
 use tracing::instrument;
 
+mod ngram_regex;
+pub(crate) use ngram_regex::regex_can_use_index;
+
 const TOKENS_COL: &str = "tokens";
 const POSTING_LIST_COL: &str = "posting_list";
 const POSTINGS_FILENAME: &str = "ngram_postings.lance";
@@ -476,6 +482,45 @@ impl ScalarIndex for NGramIndex {
                 let row_ids = NGramPostingList::intersect(list_refs);
                 Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids)))
             }
+            TextQuery::Regex(pattern) => {
+                let trigram_query = ngram_regex::regex_to_trigram_query(pattern);
+                match &trigram_query {
+                    // No usable trigram structure (e.g. `a.b`, `.*`): the index
+                    // cannot prune, so every row must be rechecked.
+                    ngram_regex::TrigramQuery::All => {
+                        Ok(SearchResult::at_least(RowAddrTreeMap::new()))
+                    }
+                    // The pattern is provably unsatisfiable.
+                    ngram_regex::TrigramQuery::None => {
+                        Ok(SearchResult::exact(RowAddrTreeMap::new()))
+                    }
+                    _ => {
+                        let mut tokens = HashSet::new();
+                        ngram_regex::collect_tokens(&trigram_query, &mut tokens);
+                        // Fetch the posting list for every referenced trigram that
+                        // the index contains; absent tokens are skipped here and
+                        // `eval_trigram_query` treats them as empty posting lists.
+                        let present = tokens.into_iter().filter_map(|token| {
+                            self.tokens.get(&token).map(|offset| (token, *offset))
+                        });
+                        let lists = futures::stream::iter(present.map(|(token, offset)| {
+                            self.list_reader
+                                .ngram_list(offset, metrics)
+                                .map(move |result| result.map(|list| (token, list)))
+                        }))
+                        .buffer_unordered(self.io_parallelism)
+                        .try_collect::<Vec<(u32, Arc<NGramPostingList>)>>()
+                        .await?;
+                        metrics.record_comparisons(lists.len());
+                        let bitmaps: HashMap<u32, RoaringTreemap> = lists
+                            .into_iter()
+                            .map(|(token, list)| (token, list.bitmap.clone()))
+                            .collect();
+                        let row_ids = ngram_regex::eval_trigram_query(&trigram_query, &bitmaps);
+                        Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids)))
+                    }
+                }
+            }
         }
     }
 
@@ -1279,6 +1324,9 @@ impl ScalarIndexPlugin for NGramIndexPlugin {
         Some(Box::new(TextQueryParser::new(
             index_name,
             self.name().to_string(),
+            // needs_recheck: ngram results are an inexact candidate superset.
+            true,
+            // supports_regex: the ngram index can answer regex queries.
             true,
         )))
     }
@@ -1538,6 +1586,107 @@ mod tests {
         assert_eq!(expected, res);
     }
 
+    #[test_log::test(tokio::test)]
+    async fn test_ngram_regex_search() {
+        // Same corpus as test_basic_ngram_index.
+        let data = StringArray::from_iter_values([
+            "cat",         // 0
+            "dog",         // 1
+            "cat dog",     // 2
+            "dog cat",     // 3
+            "elephant",    // 4
+            "mouse",       // 5
+            "rhino",       // 6
+            "giraffe",     // 7
+            "rhinos nose", // 8
+        ]);
+        let row_ids = UInt64Array::from_iter_values((0..data.len()).map(|i| i as u64));
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false),
+            Field::new(ROW_ID, DataType::UInt64, false),
+        ]));
+        let data =
+            RecordBatch::try_new(schema.clone(), vec![Arc::new(data), Arc::new(row_ids)]).unwrap();
+        let data = Box::pin(RecordBatchStreamAdapter::new(
+            schema,
+            stream::once(std::future::ready(Ok(data))),
+        ));
+
+        let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap();
+        let (index, _tmpdir) = do_train(builder, data).await;
+
+        async fn search(index: &NGramIndex, pattern: &str) -> SearchResult {
+            index
+                .search(
+                    &TextQuery::Regex(pattern.to_string()),
+                    &NoOpMetricsCollector,
+                )
+                .await
+                .unwrap()
+        }
+
+        // A plain literal yields the same candidates as contains("cat").
+        assert_eq!(
+            search(&index, "cat").await,
+            SearchResult::at_most(RowAddrTreeMap::from_iter([0, 2, 3]))
+        );
+
+        // Alternation -> union of each branch's rows.
+        assert_eq!(
+            search(&index, "(cat|dog)").await,
+            SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 2, 3]))
+        );
+
+        // AND across `.*`: must contain both the `rhino` and `nose` trigrams, so
+        // row 6 ("rhino") is correctly excluded and only row 8 survives.
+        assert_eq!(
+            search(&index, "rhino.*nose").await,
+            SearchResult::at_most(RowAddrTreeMap::from_iter([8]))
+        );
+
+        // No derivable trigram -> recheck everything.
+        assert_eq!(
+            search(&index, "a.b").await,
+            SearchResult::at_least(RowAddrTreeMap::new())
+        );
+
+        // A trigram that is absent from the index -> empty candidate set.
+        assert_eq!(
+            search(&index, "zzz").await,
+            SearchResult::at_most(RowAddrTreeMap::new())
+        );
+    }
+
+    #[test_log::test(tokio::test)]
+    async fn test_ngram_regex_search_nulls() {
+        // Rows: cat(0), dog(1), NULL(2), NULL(3), cat dog(4).
+        let data = simple_data_with_nulls();
+        let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap();
+        let (index, _tmpdir) = do_train(builder, data).await;
+
+        // The NULL rows (2, 3) must never appear in the candidate set.
+        let res = index
+            .search(&TextQuery::Regex("cat".to_string()), &NoOpMetricsCollector)
+            .await
+            .unwrap();
+        assert_eq!(
+            res,
+            SearchResult::at_most(RowAddrTreeMap::from_iter([0, 4]))
+        );
+
+        let res = index
+            .search(
+                &TextQuery::Regex("(cat|dog)".to_string()),
+                &NoOpMetricsCollector,
+            )
+            .await
+            .unwrap();
+        assert_eq!(
+            res,
+            SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 4]))
+        );
+    }
+
     fn test_data_schema() -> Arc<Schema> {
         Arc::new(Schema::new(vec![
             Field::new(VALUE_COLUMN_NAME, DataType::Utf8, true),
diff --git a/rust/lance-index/src/scalar/ngram/ngram_regex.rs b/rust/lance-index/src/scalar/ngram/ngram_regex.rs
new file mode 100644
index 00000000000..ee67c479a71
--- /dev/null
+++ b/rust/lance-index/src/scalar/ngram/ngram_regex.rs
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Deriving a trigram pre-filter from a regular expression.
+//!
+//! This is the query-side counterpart of the ngram index that lets us
+//! accelerate `regexp_like` / `regexp_match` predicates the same way the index
+//! already accelerates `contains`. The idea (the same one Postgres `pg_trgm`
+//! and Russ Cox's Google Code Search use) is to derive, from the regex, a
+//! boolean condition over trigram presence that is *necessary* for any string
+//! to match, evaluate it against the inverted index, and let the scan recheck
+//! the true regex on the surviving rows.
+//!
+//! The derived condition is a [`TrigramQuery`] -- an AND/OR tree of trigram
+//! tokens. `AND` maps onto posting-list intersection and `OR` onto union, which
+//! is exactly the set algebra the ngram index is built for.
+//!
+//! # Soundness
+//!
+//! The single invariant that matters is that the condition must never require a
+//! trigram that a matching string could lack -- otherwise we would drop real
+//! matches (a false negative, far worse than a false positive, which the recheck
+//! removes). Everything here is therefore a conservative *over*-approximation:
+//! when in doubt we emit [`TrigramQuery::All`] ("no constraint, recheck
+//! everything"). Concretely:
+//!
+//! * Every trigram requirement is produced by [`trigrams_of_string`], which runs
+//!   the *same* tokenizer the index was built with, so a string shorter than a
+//!   trigram (or with no alphanumeric run) contributes no requirement.
+//! * Non-singleton character classes and case-insensitive folds are treated as
+//!   a single unknown character (`All`), because the index's normalization does
+//!   not agree with Unicode case folding (e.g. `(?i)c` also matches `ℂ`, which the
+//!   index does not fold to `c`). Literal runs -- the common case -- are fully used.
+//! * When the exact / prefix / suffix string sets grow past a bound we first fold
+//!   their trigrams into the running condition and only then drop the strings, so
+//!   collapsing precision never removes a necessary trigram.
+
+use std::collections::{BTreeSet, HashMap, HashSet};
+
+use regex_syntax::hir::{Class, Hir, HirKind};
+use roaring::RoaringTreemap;
+
+use super::{NGRAM_N, NGRAM_TOKENIZER, ngram_to_token, tokenize_visitor};
+
+/// Maximum number of strings kept in an `exact` / `prefix` / `suffix` set before
+/// it is folded into the trigram condition and dropped.
+const MAX_SET_SIZE: usize = 16;
+/// Maximum length (in characters) of a string kept in a set. Longer strings are
+/// trimmed to a sound shorter affix.
+const MAX_STRING_LEN: usize = 32;
+
+/// A boolean condition over trigram presence that is *necessary* for a regex to
+/// match. `All` means "no constraint" and `None` means "unsatisfiable"; trees
+/// built via the `and` / `or` constructors only ever carry them at the root.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub enum TrigramQuery {
+    /// No constraint: every row is a candidate (the scan must recheck all rows).
+    All,
+    /// Unsatisfiable: no row can match.
+    None,
+    /// The given trigram token must be present.
+    Trigram(u32),
+    /// Every child condition must hold (posting-list intersection).
+    And(Vec<Self>),
+    /// At least one child condition must hold (posting-list union).
+    Or(Vec<Self>),
+}
+
+impl TrigramQuery {
+    /// Build an `AND` of conditions, applying identity (`All`), absorbing
+    /// (`None`), flattening, sorting and de-duplication so the result is
+    /// canonical and free of nested `All`/`None`.
+    fn and(items: Vec<Self>) -> Self {
+        let mut flat = Vec::with_capacity(items.len());
+        for item in items {
+            match item {
+                Self::All => {}                               // identity
+                Self::None => return Self::None,              // absorbing
+                Self::And(children) => flat.extend(children), // flatten
+                other => flat.push(other),
+            }
+        }
+        flat.sort();
+        flat.dedup();
+        match flat.len() {
+            0 => Self::All,
+            1 => flat.pop().unwrap(),
+            _ => Self::And(flat),
+        }
+    }
+
+    /// Build an `OR` of conditions, applying absorbing (`All`), identity
+    /// (`None`), flattening, sorting and de-duplication.
+    fn or(items: Vec<Self>) -> Self {
+        let mut flat = Vec::with_capacity(items.len());
+        for item in items {
+            match item {
+                Self::All => return Self::All,               // absorbing
+                Self::None => {}                             // identity
+                Self::Or(children) => flat.extend(children), // flatten
+                other => flat.push(other),
+            }
+        }
+        flat.sort();
+        flat.dedup();
+        match flat.len() {
+            0 => Self::None,
+            1 => flat.pop().unwrap(),
+            _ => Self::Or(flat),
+        }
+    }
+}
+
+/// Information about the set of strings a sub-expression can match, used to
+/// build a necessary trigram condition bottom-up. For every string `s` the
+/// sub-expression matches: `s` is in `exact` (when it is `Some`), `s` starts with
+/// a member of `prefix` and ends with a member of `suffix` (when those sets are
+/// non-empty; an empty set means "unknown"), and `s` satisfies `match_q`.
+struct RegexInfo {
+    /// Whether the sub-expression can match the empty string.
+    emptyable: bool,
+    /// The complete set of strings the sub-expression matches, or `None` if that
+    /// set is unbounded / unknown.
+    exact: Option<BTreeSet<String>>,
+    /// Strings that every match must start with (empty = unknown).
+    prefix: BTreeSet<String>,
+    /// Strings that every match must end with (empty = unknown).
+    suffix: BTreeSet<String>,
+    /// A necessary trigram condition for the sub-expression.
+    match_q: TrigramQuery,
+}
+
+impl RegexInfo {
+    /// The empty string (also used for zero-width anchors): matches only `""`.
+    fn empty_string() -> Self {
+        let empty = BTreeSet::from([String::new()]);
+        Self {
+            emptyable: true,
+            exact: Some(empty.clone()),
+            prefix: empty.clone(),
+            suffix: empty,
+            match_q: TrigramQuery::All,
+        }
+    }
+
+    /// A fixed literal string.
+    fn literal(s: &str) -> Self {
+        let set = BTreeSet::from([s.to_string()]);
+        Self {
+            emptyable: s.is_empty(),
+            exact: Some(set.clone()),
+            prefix: set.clone(),
+            suffix: set,
+            match_q: trigrams_of_string(s),
+        }
+    }
+
+    /// A single unknown character (a character class we cannot pin down).
+    fn any_char() -> Self {
+        Self {
+            emptyable: false,
+            exact: None,
+            prefix: BTreeSet::new(),
+            suffix: BTreeSet::new(),
+            match_q: TrigramQuery::All,
+        }
+    }
+
+    /// Enforce the size/length bounds. A set dropped for being oversized is
+    /// folded into `match_q` first; over-long affixes are only truncated (the
+    /// shorter affix stays valid, its cut-off tail is not folded). Idempotent.
+    fn bound(&mut self) {
+        let oversized_exact = self.exact.as_ref().is_some_and(|exact| {
+            exact.len() > MAX_SET_SIZE || exact.iter().any(|s| s.chars().count() > MAX_STRING_LEN)
+        });
+        if oversized_exact {
+            let exact = self.exact.take().expect("checked above");
+            self.fold_into_match(&exact);
+        }
+
+        self.prefix = self
+            .prefix
+            .iter()
+            .map(|s| leading(s, MAX_STRING_LEN))
+            .collect();
+        if self.prefix.len() > MAX_SET_SIZE {
+            let prefix = std::mem::take(&mut self.prefix);
+            self.fold_into_match(&prefix);
+        }
+
+        self.suffix = self
+            .suffix
+            .iter()
+            .map(|s| trailing(s, MAX_STRING_LEN))
+            .collect();
+        if self.suffix.len() > MAX_SET_SIZE {
+            let suffix = std::mem::take(&mut self.suffix);
+            self.fold_into_match(&suffix);
+        }
+    }
+
+    /// AND into `match_q` the OR of each member's trigram condition, where `set`
+    /// is a complete set of possible affixes/strings. Sound as `set` is exhaustive.
+    fn fold_into_match(&mut self, set: &BTreeSet<String>) {
+        let folded = trigrams_of_set(set.iter());
+        let current = std::mem::replace(&mut self.match_q, TrigramQuery::All);
+        self.match_q = TrigramQuery::and(vec![current, folded]);
+    }
+}
+
+/// AND together the trigrams of `s`. Reuses the index's own tokenizer so the
+/// tokens are normalized (lowercase, ASCII-folded, alphanumeric-bounded)
+/// exactly as they were stored. Returns `All` if `s` yields no trigram (too
+/// short, or no run of three alphanumeric characters).
+fn trigrams_of_string(s: &str) -> TrigramQuery {
+    let mut tokens = Vec::new();
+    tokenize_visitor(&NGRAM_TOKENIZER, s, |ngram| {
+        tokens.push(TrigramQuery::Trigram(ngram_to_token(ngram, NGRAM_N)));
+    });
+    TrigramQuery::and(tokens)
+}
+
+/// OR together the trigram conditions of each string in `set`. An empty set
+/// means "unknown" and yields `All` (no constraint); if any member yields `All`
+/// the whole OR is `All`.
+fn trigrams_of_set<'a>(set: impl IntoIterator<Item = &'a String>) -> TrigramQuery {
+    let queries: Vec<_> = set.into_iter().map(|s| trigrams_of_string(s)).collect();
+    if queries.is_empty() {
+        return TrigramQuery::All;
+    }
+    TrigramQuery::or(queries)
+}
+
+/// Concatenate every string in `a` with every string in `b`.
+fn cross_concat(a: &BTreeSet<String>, b: &BTreeSet<String>) -> BTreeSet<String> {
+    let mut out = BTreeSet::new();
+    for x in a {
+        for y in b {
+            out.insert(format!("{x}{y}"));
+        }
+    }
+    out
+}
+
+/// The first `n` characters of `s` (a sound shorter prefix).
+fn leading(s: &str, n: usize) -> String {
+    s.chars().take(n).collect()
+}
+
+/// The last `n` characters of `s` (a sound shorter suffix).
+fn trailing(s: &str, n: usize) -> String {
+    let count = s.chars().count();
+    s.chars().skip(count.saturating_sub(n)).collect()
+}
+
+/// If `class` matches exactly one scalar value (ASCII only for byte classes), return it.
+fn singleton_char(class: &Class) -> Option<char> {
+    match class {
+        Class::Unicode(u) => {
+            let ranges = u.ranges();
+            match ranges {
+                [r] if r.start() == r.end() => Some(r.start()),
+                _ => None,
+            }
+        }
+        Class::Bytes(b) => {
+            let ranges = b.ranges();
+            match ranges {
+                [r] if r.start() == r.end() && r.start() < 0x80 => Some(r.start() as char),
+                _ => None,
+            }
+        }
+    }
+}
+
+/// Compute the [`RegexInfo`] for `hir` bottom-up.
+fn analyze(hir: &Hir) -> RegexInfo {
+    let mut info = match hir.kind() {
+        // Zero-width: the empty match. Anchors (^, $, \b) carry no trigram.
+        HirKind::Empty | HirKind::Look(_) => RegexInfo::empty_string(),
+        HirKind::Literal(lit) => match std::str::from_utf8(&lit.0) {
+            Ok(s) => RegexInfo::literal(s),
+            // A literal that is not valid UTF-8 cannot be reasoned about here.
+            Err(_) => RegexInfo::any_char(),
+        },
+        HirKind::Class(class) => match singleton_char(class) {
+            Some(ch) => RegexInfo::literal(ch.encode_utf8(&mut [0u8; 4])),
+            None => RegexInfo::any_char(),
+        },
+        HirKind::Repetition(rep) => {
+            let inner = analyze(&rep.sub);
+            let at_least_one = rep.min >= 1;
+            RegexInfo {
+                emptyable: !at_least_one || inner.emptyable,
+                // We do not unroll bounded repetitions, so the matched set is
+                // unbounded as far as we are concerned.
+                exact: None,
+                prefix: if at_least_one {
+                    inner.prefix.clone()
+                } else {
+                    BTreeSet::new()
+                },
+                suffix: if at_least_one {
+                    inner.suffix.clone()
+                } else {
+                    BTreeSet::new()
+                },
+                // Only a required occurrence (min >= 1) contributes; the single
+                // inner match is necessary, never multiplied.
+                match_q: if at_least_one {
+                    inner.match_q
+                } else {
+                    TrigramQuery::All
+                },
+            }
+        }
+        HirKind::Capture(cap) => analyze(&cap.sub),
+        HirKind::Concat(subs) => analyze_concat(subs),
+        HirKind::Alternation(subs) => analyze_alternation(subs),
+    };
+    info.bound();
+    info
+}
+
+fn analyze_concat(subs: &[Hir]) -> RegexInfo {
+    let mut acc = RegexInfo::empty_string();
+    for sub in subs {
+        acc = concat_info(acc, analyze(sub));
+    }
+    acc
+}
+
+/// Combine two adjacent sub-expressions. This is the subtle part: it recovers
+/// trigrams that straddle the junction via the cross product of `acc.suffix` and
+/// `next.prefix`.
+fn concat_info(acc: RegexInfo, next: RegexInfo) -> RegexInfo {
+    let emptyable = acc.emptyable && next.emptyable;
+
+    // Trigrams spanning the junction (computed from the pre-merge affixes).
+    let boundary = if acc.suffix.is_empty() || next.prefix.is_empty() {
+        TrigramQuery::All
+    } else {
+        trigrams_of_set(cross_concat(&acc.suffix, &next.prefix).iter())
+    };
+
+    // exact = acc.exact x next.exact, only while both are finite and small.
+    let exact = match (&acc.exact, &next.exact) {
+        (Some(a), Some(b)) if a.len().saturating_mul(b.len()) <= MAX_SET_SIZE => {
+            Some(cross_concat(a, b))
+        }
+        _ => None,
+    };
+
+    // A match starts with acc's full string (when known) then next's prefix,
+    // otherwise with acc's own prefix.
+    let prefix = match &acc.exact {
+        Some(a) if !next.prefix.is_empty() => cross_concat(a, &next.prefix),
+        Some(a) => a.clone(),
+        None => acc.prefix.clone(),
+    };
+
+    // Mirror image for the suffix (driven by the right side).
+    let suffix = match &next.exact {
+        Some(b) if !acc.suffix.is_empty() => cross_concat(&acc.suffix, b),
+        Some(b) => b.clone(),
+        None => next.suffix.clone(),
+    };
+
+    let match_q = TrigramQuery::and(vec![acc.match_q, next.match_q, boundary]);
+
+    let mut info = RegexInfo {
+        emptyable,
+        exact,
+        prefix,
+        suffix,
+        match_q,
+    };
+    info.bound();
+    info
+}
+
+fn analyze_alternation(subs: &[Hir]) -> RegexInfo {
+    let infos: Vec<RegexInfo> = subs.iter().map(analyze).collect();
+
+    let emptyable = infos.iter().any(|i| i.emptyable);
+
+    let exact = if infos.iter().all(|i| i.exact.is_some()) {
+        Some(
+            infos
+                .iter()
+                .flat_map(|i| i.exact.as_ref().unwrap().iter().cloned())
+                .collect(),
+        )
+    } else {
+        None
+    };
+
+    // A common prefix exists only if every branch contributes one.
+    let prefix = if infos.iter().all(|i| !i.prefix.is_empty()) {
+        infos
+            .iter()
+            .flat_map(|i| i.prefix.iter().cloned())
+            .collect()
+    } else {
+        BTreeSet::new()
+    };
+    let suffix = if infos.iter().all(|i| !i.suffix.is_empty()) {
+        infos
+            .iter()
+            .flat_map(|i| i.suffix.iter().cloned())
+            .collect()
+    } else {
+        BTreeSet::new()
+    };
+
+    let match_q = TrigramQuery::or(infos.into_iter().map(|i| i.match_q).collect());
+
+    RegexInfo {
+        emptyable,
+        exact,
+        prefix,
+        suffix,
+        match_q,
+    }
+}
+
+/// Derive a necessary trigram condition from a regular expression pattern.
+///
+/// Returns [`TrigramQuery::All`] when no useful condition can be derived (an
+/// unparsable pattern, or one with no trigram-able literal structure such as
+/// `a.b` or `.*`); callers must treat that as "recheck everything".
+pub fn regex_to_trigram_query(pattern: &str) -> TrigramQuery {
+    // An unparsable pattern cannot be accelerated; rechecking is still safe.
+    let Ok(hir) = regex_syntax::parse(pattern) else {
+        return TrigramQuery::All;
+    };
+    let info = analyze(&hir);
+
+    let mut conditions = vec![info.match_q];
+    if let Some(exact) = &info.exact {
+        if exact.is_empty() {
+            // The expression matches nothing.
+            return TrigramQuery::None;
+        }
+        conditions.push(trigrams_of_set(exact.iter()));
+    }
+    conditions.push(trigrams_of_set(info.prefix.iter()));
+    conditions.push(trigrams_of_set(info.suffix.iter()));
+    TrigramQuery::and(conditions)
+}
+
+/// Whether a regular expression yields any trigram condition the index can use
+/// to prune candidates. When it does not (e.g. `a.b`, `.*`, or a case-insensitive
+/// pattern), callers should leave the predicate to a full scan rather than route
+/// it to the index, which would otherwise have to ask the scan to recheck every
+/// row -- a path the index result type (`AtLeast`) does not support.
+pub fn regex_can_use_index(pattern: &str) -> bool {
+    regex_to_trigram_query(pattern) != TrigramQuery::All
+}
+
+/// Collect the distinct trigram tokens referenced anywhere in the tree.
+pub fn collect_tokens(query: &TrigramQuery, out: &mut HashSet<u32>) {
+    match query {
+        TrigramQuery::Trigram(token) => {
+            out.insert(*token);
+        }
+        TrigramQuery::And(items) | TrigramQuery::Or(items) => {
+            for item in items {
+                collect_tokens(item, out);
+            }
+        }
+        TrigramQuery::All | TrigramQuery::None => {}
+    }
+}
+
+/// Evaluate the tree against a map of `trigram token -> posting list`. A token
+/// missing from the map contributes an empty set (sound: a required trigram that
+/// is absent everywhere yields no rows; an absent OR branch contributes
+/// nothing). `All` / `None` are handled by the caller before evaluation.
+pub fn eval_trigram_query(
+    query: &TrigramQuery,
+    bitmaps: &HashMap<u32, RoaringTreemap>,
+) -> RoaringTreemap {
+    match query {
+        TrigramQuery::Trigram(token) => bitmaps.get(token).cloned().unwrap_or_default(),
+        TrigramQuery::And(items) => {
+            let mut iter = items.iter();
+            let mut acc = match iter.next() {
+                Some(first) => eval_trigram_query(first, bitmaps),
+                None => return RoaringTreemap::new(),
+            };
+            for item in iter {
+                if acc.is_empty() {
+                    break;
+                }
+                acc &= &eval_trigram_query(item, bitmaps);
+            }
+            acc
+        }
+        TrigramQuery::Or(items) => {
+            let mut acc = RoaringTreemap::new();
+            for item in items {
+                acc |= &eval_trigram_query(item, bitmaps);
+            }
+            acc
+        }
+        TrigramQuery::All | TrigramQuery::None => RoaringTreemap::new(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A single trigram condition, hashed the same way the index hashes it.
+    fn tri(trigram: &str) -> TrigramQuery {
+        TrigramQuery::Trigram(ngram_to_token(trigram, NGRAM_N))
+    }
+
+    fn q(pattern: &str) -> TrigramQuery {
+        regex_to_trigram_query(pattern)
+    }
+
+    #[test]
+    fn test_single_literal_trigram() {
+        assert_eq!(q("foo"), tri("foo"));
+    }
+
+    #[test]
+    fn test_multi_trigram_literal() {
+        assert_eq!(
+            q("foobar"),
+            TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")])
+        );
+    }
+
+    #[test]
+    fn test_wildcard_splits_into_and() {
+        // `.*` breaks the literal run; both sides are required.
+        assert_eq!(
+            q("foo.*bar"),
+            TrigramQuery::and(vec![tri("foo"), tri("bar")])
+        );
+    }
+
+    #[test]
+    fn test_alternation_is_or() {
+        assert_eq!(
+            q("(cat|dog)"),
+            TrigramQuery::or(vec![tri("cat"), tri("dog")])
+        );
+    }
+
+    #[test]
+    fn test_anchors_are_transparent() {
+        assert_eq!(
+            q("^rhino"),
+            TrigramQuery::and(vec![tri("rhi"), tri("hin"), tri("ino")])
+        );
+        assert_eq!(q("nose$"), TrigramQuery::and(vec![tri("nos"), tri("ose")]));
+    }
+
+    #[test]
+    fn test_boundary_trigram_recovered_across_groups() {
+        // A capturing group is not merged into the adjacent literals, so this
+        // exercises the suffix x prefix cross product that recovers the `foo`
+        // trigram straddling the `(o)` group boundary in "foobar".
+        assert_eq!(
+            q("fo(o)bar"), // spellchecker:disable-line
+            TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")])
+        );
+    }
+
+    #[test]
+    fn test_no_trigram_yields_all() {
+        // No run of three literal characters anywhere.
+        assert_eq!(q("a.b"), TrigramQuery::All);
+        assert_eq!(q(".*"), TrigramQuery::All);
+        // Every alternation branch is shorter than a trigram, so we must not
+        // require either two-character branch as a (non-existent) trigram.
+        assert_eq!(q("fo|ba"), TrigramQuery::All); // spellchecker:disable-line
+    }
+
+    #[test]
+    fn test_case_insensitive_not_accelerated() {
+        // Unicode case folding (e.g. `(?i)c` also matches U+2102) does not agree
+        // with the index's normalization, so case-insensitive patterns are left
+        // unaccelerated (correct via recheck) rather than risk a false negative.
+        assert_eq!(q("(?i)Cat"), TrigramQuery::All);
+    }
+
+    #[test]
+    fn test_unparsable_pattern_yields_all() {
+        assert_eq!(q("("), TrigramQuery::All);
+    }
+
+    #[test]
+    fn test_large_alternation_stays_bounded() {
+        // More than MAX_SET_SIZE branches: must still produce a sound OR without
+        // panicking or exploding.
+        let pattern = (0..40)
+            .map(|i| format!("aa{i:02}zz"))
+            .collect::<Vec<_>>()
+            .join("|");
+        let result = q(&pattern);
+        // Every branch has the shape `aaNNzz`, so the alternation is satisfiable;
+        // the only property asserted here is that bounding never yields `None`.
+        assert_ne!(result, TrigramQuery::None);
+    }
+
+    #[test]
+    fn test_plus_requires_inner() {
+        // `(abc)+` must contain at least one `abc`.
+        assert_eq!(q("(abc)+"), tri("abc"));
+    }
+
+    #[test]
+    fn test_optional_group_is_not_required() {
+        // `(foo)?bar` -> foo optional, bar required.
+        assert_eq!(q("(foo)?bar"), tri("bar"));
+    }
+
+    #[test]
+    fn test_eval_and_or_with_missing_tokens() {
+        let foo = ngram_to_token("foo", NGRAM_N);
+        let bar = ngram_to_token("bar", NGRAM_N);
+        let mut bitmaps = HashMap::new();
+        bitmaps.insert(foo, RoaringTreemap::from_iter([1u64, 2, 3]));
+        bitmaps.insert(bar, RoaringTreemap::from_iter([2u64, 3, 4]));
+        // `baz` is absent from the index.
+
+        // AND intersects.
+        let and = TrigramQuery::and(vec![tri("foo"), tri("bar")]);
+        assert_eq!(
+            eval_trigram_query(&and, &bitmaps),
+            RoaringTreemap::from_iter([2u64, 3])
+        );
+
+        // OR unions.
+        let or = TrigramQuery::or(vec![tri("foo"), tri("bar")]);
+        assert_eq!(
+            eval_trigram_query(&or, &bitmaps),
+            RoaringTreemap::from_iter([1u64, 2, 3, 4])
+        );
+
+        // A missing token is empty: it zeroes an AND but is harmless in an OR.
+        let and_missing = TrigramQuery::and(vec![tri("foo"), tri("baz")]);
+        assert!(eval_trigram_query(&and_missing, &bitmaps).is_empty());
+        let or_missing = TrigramQuery::or(vec![tri("foo"), tri("baz")]);
+        assert_eq!(
+            eval_trigram_query(&or_missing, &bitmaps),
+            RoaringTreemap::from_iter([1u64, 2, 3])
+        );
+    }
+
+    #[test]
+    fn test_collect_tokens() {
+        let query = TrigramQuery::and(vec![
+            tri("foo"),
+            TrigramQuery::or(vec![tri("bar"), tri("baz")]),
+        ]);
+        let mut tokens = HashSet::new();
+        collect_tokens(&query, &mut tokens);
+        assert_eq!(
+            tokens,
+            HashSet::from([
+                ngram_to_token("foo", NGRAM_N),
+                ngram_to_token("bar", NGRAM_N),
+                ngram_to_token("baz", NGRAM_N),
+            ])
+        );
+    }
+}
diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs
index 9f2228740c2..8e7e20c211a 100644
--- a/rust/lance-index/src/scalar/zonemap.rs
+++ b/rust/lance-index/src/scalar/zonemap.rs
@@ -151,26 +151,11 @@ impl ZoneMapIndex {
         Self::zone_has_finite_min(zone) && !(zone.max.is_null() || Self::scalar_is_nan(&zone.max))
     }
 
-    fn finite_value_may_be_in_zone(value: &ScalarValue, zone: &ZoneMapStatistics) -> bool {
-        if !Self::zone_has_finite_min(zone) || value < &zone.min {
-            return false;
-        }
-
-        if Self::scalar_is_nan(&zone.max) {
-            // A NaN max means this zone had both NaNs and finite values.  The
-            // finite max is not persisted, so keep the zone as a false positive
-            // instead of using total ordering to prune it.
-            return true;
-        }
-
-        !zone.max.is_null() && value <= &zone.max
-    }
-
     /// Evaluates whether a zone could potentially contain values matching the query.
     ///
-    /// NaN query values use the explicit `nan_count`.  When the stored max is
-    /// NaN we do not treat it as a finite upper bound; that representation means
-    /// the zone had finite values plus NaNs, and the finite max was not persisted.
+    /// NaN query values use the explicit `nan_count`. For finite query values,
+    /// `ScalarValue` total ordering keeps finite values below a stored NaN max,
+    /// so zones with finite values plus NaNs remain conservative false positives.
     fn evaluate_zone_against_query(
         &self,
         zone: &ZoneMapStatistics,
@@ -206,7 +191,7 @@ impl ZoneMapIndex {
                     return Ok(false);
                 }
 
-                Ok(Self::finite_value_may_be_in_zone(target, zone))
+                Ok(target >= &zone.min && target <= &zone.max)
             }
             SargableQuery::Range(start, end) => {
                 // Zone overlaps with query range if there's any intersection between
@@ -336,22 +321,28 @@ impl ZoneMapIndex {
                             ScalarValue::Float16(Some(f)) => {
                                 if f.is_nan() {
                                     zone.nan_count > 0
+                                } else if !Self::zone_has_finite_min(zone) {
+                                    false
                                 } else {
-                                    Self::finite_value_may_be_in_zone(value, zone)
+                                    value >= &zone.min && value <= &zone.max
                                 }
                             }
                             ScalarValue::Float32(Some(f)) => {
                                 if f.is_nan() {
                                     zone.nan_count > 0
+                                } else if !Self::zone_has_finite_min(zone) {
+                                    false
                                 } else {
-                                    Self::finite_value_may_be_in_zone(value, zone)
+                                    value >= &zone.min && value <= &zone.max
                                 }
                             }
                             ScalarValue::Float64(Some(f)) => {
                                 if f.is_nan() {
                                     zone.nan_count > 0
+                                } else if !Self::zone_has_finite_min(zone) {
+                                    false
                                 } else {
-                                    Self::finite_value_may_be_in_zone(value, zone)
+                                    value >= &zone.min && value <= &zone.max
                                 }
                             }
                             _ => {
@@ -1438,6 +1429,17 @@ mod tests {
             );
         }
 
+        let zone = &index.zones[0];
+        assert!(matches!(
+            zone.max,
+            ScalarValue::Float32(Some(value)) if value.is_nan()
+        ));
+        let finite_target = ScalarValue::Float32(Some(1000.0));
+        assert!(
+            finite_target >= zone.min && finite_target <= zone.max,
+            "ScalarValue total ordering keeps finite values below NaN max"
+        );
+
         // Test search for NaN values using Equals with NaN
         let query = SargableQuery::Equals(ScalarValue::Float32(Some(f32::NAN)));
         let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs
index d0df2fcb7e2..3c5a6601a8a 100644
--- a/rust/lance-index/src/vector.rs
+++ b/rust/lance-index/src/vector.rs
@@ -419,6 +419,14 @@ pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
 
     /// the index type of this vector index.
     fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
+
+    /// Returns the cumulative I/O performed while opening this index (file
+    /// footers, IVF centroids, quantization metadata). This is a one-time cost,
+    /// reported once on the query that actually opens the index. The default
+    /// implementation returns `None`, for indices that do not track it.
+    fn open_io_stats(&self) -> Option<lance_io::scheduler::ScanStats> {
+        None
+    }
 }
 
 // it can be an IVF index or a partition of IVF index
diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs
index 0fdd918edab..7a47fa88d54 100644
--- a/rust/lance-index/src/vector/bq.rs
+++ b/rust/lance-index/src/vector/bq.rs
@@ -18,6 +18,9 @@ use crate::vector::bq::storage::RabitQuantizationMetadata;
 use crate::vector::quantizer::QuantizerBuildParams;
 
 pub mod builder;
+pub(crate) mod dist_table_quant;
+pub mod ex_dot;
+pub mod prune;
 pub mod rotation;
 pub mod storage;
 pub mod transform;
diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs
index 178a6bb5435..9eb7fc76903 100644
--- a/rust/lance-index/src/vector/bq/builder.rs
+++ b/rust/lance-index/src/vector/bq/builder.rs
@@ -25,7 +25,7 @@ use crate::vector::bq::transform::{
     SCALE_FACTORS_FIELD,
 };
 use crate::vector::bq::{
-    RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rabit_ex_code_bytes,
+    RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits,
     rotation::{apply_fast_rotation, fast_rotation_signs_len, random_fast_rotation_signs},
     validate_rq_num_bits,
 };
@@ -78,21 +78,6 @@ fn pack_sign_bits(codes: &mut [u8], rotated: &[f32]) {
     }
 }
 
-#[inline]
-fn pack_ex_code_bits(codes: &mut [u8], ex_values: &[u8], ex_bits: u8) {
-    codes.fill(0);
-    let ex_bits = ex_bits as usize;
-    for (dim_idx, &value) in ex_values.iter().enumerate() {
-        let bit_offset = dim_idx * ex_bits;
-        for bit_idx in 0..ex_bits {
-            if (value >> bit_idx) & 1 != 0 {
-                let dst_bit = bit_offset + bit_idx;
-                codes[dst_bit / u8::BITS as usize] |= 1u8 << (dst_bit % u8::BITS as usize);
-            }
-        }
-    }
-}
-
 const EX_QUANTIZATION_EPSILON: f32 = 1.0e-5;
 const EX_TIGHT_START: [f32; 9] = [0.0, 0.15, 0.20, 0.52, 0.59, 0.71, 0.75, 0.77, 0.81];
 
@@ -200,7 +185,7 @@ fn quantize_ex_code(
         *ex_code_value = ex_code;
     }
 
-    pack_ex_code_bits(ex_code_dst, ex_code_values_dst, ex_bits);
+    crate::vector::bq::ex_dot::pack_blocked_row(ex_code_values_dst, ex_bits, ex_code_dst);
     residual_dot_code
 }
 
@@ -599,7 +584,11 @@ impl RabitQuantizer {
             .as_slice();
         let code_dim = self.code_dim();
         let code_bytes = rabit_binary_code_bytes(code_dim);
-        let ex_code_bytes = rabit_ex_code_bytes(code_dim, ex_bits)?;
+        let ex_code_bytes = if ex_bits == 0 {
+            0
+        } else {
+            crate::vector::bq::ex_dot::blocked_ex_code_bytes(code_dim, ex_bits)
+        };
 
         let mut encoded_codes = vec![0u8; n * code_bytes];
         let mut encoded_ex_codes = (ex_bits != 0).then(|| vec![0u8; n * ex_code_bytes]);
@@ -901,7 +890,7 @@ mod tests {
     use lance_linalg::distance::DistanceType;
     use rstest::rstest;
 
-    use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN;
+    use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN;
 
     #[rstest]
     #[case(8)]
@@ -978,14 +967,14 @@ mod tests {
         assert!(
             !fields
                 .iter()
-                .any(|field| field.name() == RABIT_EX_CODE_COLUMN)
+                .any(|field| field.name() == RABIT_BLOCKED_EX_CODE_COLUMN)
         );
 
         let q = RabitQuantizer::new_with_rotation::<Float32Type>(3, 128, RQRotationType::Fast);
         let fields = q.extra_fields();
         for expected in [
             ERROR_FACTORS_FIELD.name().as_str(),
-            RABIT_EX_CODE_COLUMN,
+            RABIT_BLOCKED_EX_CODE_COLUMN,
             EX_ADD_FACTORS_FIELD.name().as_str(),
             EX_SCALE_FACTORS_FIELD.name().as_str(),
         ] {
@@ -1095,7 +1084,8 @@ mod tests {
                     .unwrap()
                     .as_fixed_size_list()
                     .value_length(),
-                32
+                // dim=32 is padded to one 64-dim block at ex_bits=8.
+                64
             );
         }
 
diff --git a/rust/lance-index/src/vector/bq/dist_table_quant.rs b/rust/lance-index/src/vector/bq/dist_table_quant.rs
new file mode 100644
index 00000000000..22196f06edb
--- /dev/null
+++ b/rust/lance-index/src/vector/bq/dist_table_quant.rs
@@ -0,0 +1,935 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! SIMD kernels for quantizing the RaBitQ FastScan distance table.
+//!
+//! Once per (query, probed partition) the `dim * 4`-entry `f32` distance
+//! table is quantized into `u8` (fast/normal approx modes) or `u16`
+//! (accurate mode) FastScan LUT entries: a min/max pass over the table
+//! followed by an affine quantize-and-narrow pass. Both passes are branchy
+//! in scalar form, so they get the same runtime-dispatch treatment as
+//! [`super::ex_dot`]: explicit AVX-512/AVX2 kernels on x86_64 and a portable
+//! fold elsewhere that LLVM auto-vectorizes (NEON is part of the aarch64
+//! baseline).
+//!
+//! Table values are sums of rotated-query components: always finite, never
+//! NaN, so lanewise IEEE `min`/`max` matches `total_cmp` ordering. The only
+//! divergence is the sign of zero, which callers cannot observe: `d - qmin`
+//! and the `qmin == qmax` early-out are arithmetically identical either way.
+//!
+//! Quantization rounds half-to-even so that the scalar fallback and the SIMD
+//! kernels agree bit-exactly. All paths round with fixed-mode rounding,
+//! independent of the dynamic MXCSR rounding mode native code may have
+//! installed: the SIMD kernels use the converts' static rounding and the
+//! scalar path (also the SIMD tails) rounds via `f32::floor` rather than
+//! `f32::round_ties_even`, which can lower to an MXCSR-honoring instruction on
+//! x86. Relative to the pre-SIMD implementation (`f32::round`,
+//! half-away-from-zero) this can move a LUT entry by 1 on exact .5 ties, which
+//! is within the table's inherent quantization error.
+
+use std::mem::MaybeUninit;
+use std::sync::LazyLock;
+
+use super::storage::SEGMENT_NUM_CODES;
+
+type MinMaxFn = fn(&[f32]) -> (f32, f32);
+type QuantizeU8Fn = fn(&[f32], f32, f32, &mut [MaybeUninit<u8>]);
+type QuantizeU16Fn = fn(&[f32], f32, f32, &mut [MaybeUninit<u16>]);
+
+/// Tells the caller how to reconstruct binary inner-product distances from
+/// the FastScan accumulator sums computed against the quantized LUT.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum DistTableDequant {
+    /// Reconstruct each distance with the affine map
+    /// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin`. Returned whenever
+    /// that map is finite, including for a zero or sub-resolution range; in
+    /// that case the LUT is all zeros and every distance collapses to the
+    /// constant `num_tables * qmin`.
+    Affine { qmin: f32, qmax: f32 },
+    /// `num_tables * {qmin, qmax, qmax - qmin}` overflowed `f32`, so the affine
+    /// reconstruction would yield NaN/inf. The LUT is all zeros; the caller
+    /// must compute exact distances directly from the `f32` table.
+    Exact,
+}
+
+/// Fill the caller-owned `quantized_dist_table` scratch buffer with the `u8`
+/// FastScan LUT entries for `dist_table`, and report through
+/// [`DistTableDequant`] how the caller must dequantize the FastScan sums.
+/// `dist_table` must be non-empty and all of its values finite.
+pub fn quantize_dist_table_into(
+    dist_table: &[f32],
+    quantized_dist_table: &mut Vec<u8>,
+) -> DistTableDequant {
+    debug_assert!(!dist_table.is_empty(), "dist table must be non-empty");
+    let len = dist_table.len();
+    let (qmin, qmax) = min_max(dist_table);
+
+    // Every outcome rebuilds the LUT from scratch, so drop the stale entries
+    // once up front.
+    quantized_dist_table.clear();
+
+    if dequant_overflows(len, qmin, qmax) {
+        // The affine reconstruction would be non-finite. The caller computes
+        // exact distances and ignores the LUT, but the buffer is still left
+        // valid (all zeros).
+        quantized_dist_table.resize(len, 0);
+        return DistTableDequant::Exact;
+    }
+
+    let factor = u8::MAX as f32 / (qmax - qmin);
+    if factor.is_finite() {
+        // Quantize directly into the buffer's uninitialized spare capacity.
+        quantized_dist_table.reserve(len);
+        let spare = &mut quantized_dist_table.spare_capacity_mut()[..len];
+        quantize_u8(dist_table, qmin, factor, spare);
+        // SAFETY: the kernel initialized every element in the reserved range.
+        unsafe { quantized_dist_table.set_len(len) };
+    } else {
+        // Zero or sub-`u8`-resolution range (e.g. an all-zeros query): the LUT
+        // carries no information, but the finite affine map still sends every
+        // sum to the constant `num_tables * qmin`.
+        quantized_dist_table.resize(len, 0);
+    }
+    DistTableDequant::Affine { qmin, qmax }
+}
+
+/// `u16` variant of [`quantize_dist_table_into`], used by the accurate approx
+/// mode.
+pub fn quantize_dist_table_u16_into(
+    dist_table: &[f32],
+    quantized_dist_table: &mut Vec<u16>,
+) -> DistTableDequant {
+    debug_assert!(!dist_table.is_empty(), "dist table must be non-empty");
+    let len = dist_table.len();
+    let (qmin, qmax) = min_max(dist_table);
+
+    // Every outcome rebuilds the LUT from scratch.
+    quantized_dist_table.clear();
+
+    if dequant_overflows(len, qmin, qmax) {
+        // Affine dequantization would be non-finite: zeroed LUT, exact path.
+        quantized_dist_table.resize(len, 0);
+        return DistTableDequant::Exact;
+    }
+
+    let factor = u16::MAX as f32 / (qmax - qmin);
+    if factor.is_finite() {
+        // Quantize directly into the buffer's uninitialized spare capacity.
+        quantized_dist_table.reserve(len);
+        let spare = &mut quantized_dist_table.spare_capacity_mut()[..len];
+        quantize_u16(dist_table, qmin, factor, spare);
+        // SAFETY: the kernel initialized every element in the reserved range.
+        unsafe { quantized_dist_table.set_len(len) };
+    } else {
+        // Zero or sub-`u16`-resolution range: the LUT carries no information.
+        quantized_dist_table.resize(len, 0);
+    }
+    DistTableDequant::Affine { qmin, qmax }
+}
+
+/// Reports whether the caller's affine dequantization
+/// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin` could overflow `f32`
+/// for some row. A row's reconstructed binary IP lies in
+/// `[num_tables * qmin, num_tables * qmax]` and its quantized term is at most
+/// `num_tables * (qmax - qmin)`; if any of the three is non-finite the table
+/// must fall back to exact distances. The bound does not depend on the scale:
+/// the `1 / SCALE` factor cancels against the `q_sum <= num_tables * SCALE`
+/// range. Real dist tables are bounded sums of rotated-query components and
+/// never get close; the guard only makes a pathological query degrade to
+/// exact distances instead of producing NaN.
+fn dequant_overflows(table_len: usize, qmin: f32, qmax: f32) -> bool {
+    let num_tables = (table_len / SEGMENT_NUM_CODES) as f32;
+    [qmin, qmax, qmax - qmin]
+        .into_iter()
+        .any(|term| !(num_tables * term).is_finite())
+}
+
+fn min_max(values: &[f32]) -> (f32, f32) {
+    static MIN_MAX_KERNEL: LazyLock<MinMaxFn> = LazyLock::new(select_min_max);
+    (*MIN_MAX_KERNEL)(values)
+}
+
+fn quantize_u8(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit<u8>]) {
+    static QUANTIZE_U8_KERNEL: LazyLock<QuantizeU8Fn> = LazyLock::new(select_quantize_u8);
+    (*QUANTIZE_U8_KERNEL)(values, qmin, factor, out)
+}
+
+fn quantize_u16(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit<u16>]) {
+    static QUANTIZE_U16_KERNEL: LazyLock<QuantizeU16Fn> = LazyLock::new(select_quantize_u16);
+    (*QUANTIZE_U16_KERNEL)(values, qmin, factor, out)
+}
+
+fn select_min_max() -> MinMaxFn {
+    #[cfg(target_arch = "x86_64")]
+    {
+        let has_avx512f = std::arch::is_x86_feature_detected!("avx512f");
+        if has_avx512f {
+            return x86::min_max_avx512_dispatch;
+        } else if std::arch::is_x86_feature_detected!("avx2") {
+            return x86::min_max_avx2_dispatch;
+        }
+    }
+    min_max_fold
+}
+
+fn select_quantize_u8() -> QuantizeU8Fn {
+    #[cfg(target_arch = "x86_64")]
+    {
+        let has_avx512f = std::arch::is_x86_feature_detected!("avx512f");
+        if has_avx512f {
+            return x86::quantize_u8_avx512_dispatch;
+        } else if std::arch::is_x86_feature_detected!("avx2") {
+            return x86::quantize_u8_avx2_dispatch;
+        }
+    }
+    quantize_u8_scalar
+}
+
+fn select_quantize_u16() -> QuantizeU16Fn {
+    #[cfg(target_arch = "x86_64")]
+    {
+        let has_avx512f = std::arch::is_x86_feature_detected!("avx512f");
+        if has_avx512f {
+            return x86::quantize_u16_avx512_dispatch;
+        } else if std::arch::is_x86_feature_detected!("avx2") {
+            return x86::quantize_u16_avx2_dispatch;
+        }
+    }
+    quantize_u16_scalar
+}
+
+const FOLD_LANES: usize = 16;
+
+/// Portable min/max over `values`, folded across `FOLD_LANES` independent
+/// lanes; serves as the scalar fallback and the aarch64 path. Plain `if`
+/// comparisons are used instead of `f32::min`/`max` (which carry NaN
+/// bookkeeping) so targets with baseline SIMD get lanewise min/max.
+fn min_max_fold(values: &[f32]) -> (f32, f32) {
+    let mut lane_min = [f32::INFINITY; FOLD_LANES];
+    let mut lane_max = [f32::NEG_INFINITY; FOLD_LANES];
+    let mut blocks = values.chunks_exact(FOLD_LANES);
+    // Whole blocks update each lane independently.
+    for block in blocks.by_ref() {
+        let block: &[f32; FOLD_LANES] = block.try_into().expect("chunks_exact length");
+        for ((lo, hi), &v) in lane_min.iter_mut().zip(lane_max.iter_mut()).zip(block) {
+            *lo = if v < *lo { v } else { *lo };
+            *hi = if v > *hi { v } else { *hi };
+        }
+    }
+
+    // Collapse the per-lane results first, then fold in the tail that did not
+    // fill a whole block, keeping the same comparison order per bound.
+    let lanes = lane_min.iter().zip(lane_max.iter());
+    let tail = blocks.remainder().iter().map(|v| (v, v));
+    lanes
+        .chain(tail)
+        .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), (&a, &b)| {
+            let lo = if a < lo { a } else { lo };
+            let hi = if b > hi { b } else { hi };
+            (lo, hi)
+        })
+}
+
+/// Rounds `x` to the nearest integer with ties going to the even neighbour,
+/// the same rule the SIMD converts apply, using fixed-mode operations only so
+/// the result never depends on a dynamic rounding mode installed by native
+/// code.
+///
+/// On x86, `f32::round_ties_even` can lower to an MXCSR-honoring instruction
+/// (outside an SSE4.1 context), so nearest-even is assembled from
+/// `f32::floor`, which is always fixed-mode. Callers pass a non-negative
+/// quantization product, so only the upward tie is reachable in practice, but
+/// the form holds for any finite `x` whose floor fits in `i64`. On other
+/// targets (e.g. aarch64) the standard `round_ties_even` is already a
+/// fixed-mode instruction (`frintn`) that vectorizes, so it is used directly.
+#[inline(always)]
+fn round_ties_even_fixed(x: f32) -> f32 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        let floor = x.floor();
+        let excess = x - floor;
+        let tie_to_odd = excess == 0.5 && (floor as i64) % 2 != 0;
+        floor + if excess > 0.5 || tie_to_odd { 1.0 } else { 0.0 }
+    }
+    #[cfg(not(target_arch = "x86_64"))]
+    {
+        x.round_ties_even()
+    }
+}
+
+fn quantize_u8_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit<u8>]) {
+    debug_assert_eq!(values.len(), out.len());
+    values.iter().zip(out.iter_mut()).for_each(|(&dist, slot)| {
+        slot.write(round_ties_even_fixed((dist - qmin) * factor) as u8);
+    });
+}
+
+fn quantize_u16_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit<u16>]) {
+    debug_assert_eq!(values.len(), out.len());
+    values.iter().zip(out.iter_mut()).for_each(|(&dist, slot)| {
+        slot.write(round_ties_even_fixed((dist - qmin) * factor) as u16);
+    });
+}
+
+#[cfg(target_arch = "x86_64")]
+mod x86 {
+    use std::arch::x86_64::*;
+    use std::mem::MaybeUninit;
+
+    use super::{quantize_u8_scalar, quantize_u16_scalar};
+
+    pub(super) fn min_max_avx512_dispatch(values: &[f32]) -> (f32, f32) {
+        // SAFETY: this wrapper is only selected after AVX-512F was detected.
+        unsafe { min_max_avx512(values) }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn min_max_avx512(values: &[f32]) -> (f32, f32) {
+        // Two accumulators per direction break the lanewise min/max latency
+        // chain; they are reduced once at the end.
+        let mut min0 = _mm512_set1_ps(f32::INFINITY);
+        let mut min1 = min0;
+        let mut max0 = _mm512_set1_ps(f32::NEG_INFINITY);
+        let mut max1 = max0;
+        let mut chunks = values.chunks_exact(32);
+        for chunk in &mut chunks {
+            // SAFETY: `chunks_exact(32)` yields exactly 32 consecutive floats.
+            let (v0, v1) = unsafe {
+                (
+                    _mm512_loadu_ps(chunk.as_ptr()),
+                    _mm512_loadu_ps(chunk.as_ptr().add(16)),
+                )
+            };
+            min0 = _mm512_min_ps(min0, v0);
+            max0 = _mm512_max_ps(max0, v0);
+            min1 = _mm512_min_ps(min1, v1);
+            max1 = _mm512_max_ps(max1, v1);
+        }
+        let mut min = _mm512_reduce_min_ps(_mm512_min_ps(min0, min1));
+        let mut max = _mm512_reduce_max_ps(_mm512_max_ps(max0, max1));
+        for &v in chunks.remainder() {
+            min = if v < min { v } else { min };
+            max = if v > max { v } else { max };
+        }
+        (min, max)
+    }
+
+    pub(super) fn min_max_avx2_dispatch(values: &[f32]) -> (f32, f32) {
+        // SAFETY: this wrapper is only selected after AVX2 was detected.
+        unsafe { min_max_avx2(values) }
+    }
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn min_max_avx2(values: &[f32]) -> (f32, f32) {
+        let mut min0 = _mm256_set1_ps(f32::INFINITY);
+        let mut min1 = min0;
+        let mut max0 = _mm256_set1_ps(f32::NEG_INFINITY);
+        let mut max1 = max0;
+        let mut chunks = values.chunks_exact(16);
+        for chunk in &mut chunks {
+            // SAFETY: `chunks_exact(16)` yields exactly 16 consecutive floats.
+            let (v0, v1) = unsafe {
+                (
+                    _mm256_loadu_ps(chunk.as_ptr()),
+                    _mm256_loadu_ps(chunk.as_ptr().add(8)),
+                )
+            };
+            min0 = _mm256_min_ps(min0, v0);
+            max0 = _mm256_max_ps(max0, v0);
+            min1 = _mm256_min_ps(min1, v1);
+            max1 = _mm256_max_ps(max1, v1);
+        }
+        let mut min = reduce_min_avx2(_mm256_min_ps(min0, min1));
+        let mut max = reduce_max_avx2(_mm256_max_ps(max0, max1));
+        for &v in chunks.remainder() {
+            min = if v < min { v } else { min };
+            max = if v > max { v } else { max };
+        }
+        (min, max)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    fn reduce_min_avx2(v: __m256) -> f32 {
+        let halves = _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v));
+        let pairs = _mm_min_ps(halves, _mm_movehl_ps(halves, halves));
+        let single = _mm_min_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs));
+        _mm_cvtss_f32(single)
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    fn reduce_max_avx2(v: __m256) -> f32 {
+        let halves = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v));
+        let pairs = _mm_max_ps(halves, _mm_movehl_ps(halves, halves));
+        let single = _mm_max_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs));
+        _mm_cvtss_f32(single)
+    }
+
+    /// Load 16 floats and affine-quantize them into `i32` lanes, rounding to
+    /// nearest-even with static rounding (`_MM_FROUND_TO_NEAREST_INT`) so the
+    /// result does not depend on the dynamic MXCSR rounding mode and matches
+    /// the scalar [`super::round_ties_even_fixed`].
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn quantize16_epi32(src: *const f32, min: __m512, factor: __m512) -> __m512i {
+        // SAFETY: the caller guarantees 16 floats are readable at `src`.
+        let v = unsafe { _mm512_loadu_ps(src) };
+        let scaled = _mm512_mul_ps(_mm512_sub_ps(v, min), factor);
+        _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled)
+    }
+
+    pub(super) fn quantize_u8_avx512_dispatch(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u8>],
+    ) {
+        // SAFETY: this wrapper is only selected after AVX-512F was detected.
+        unsafe { quantize_u8_avx512(values, qmin, factor, out) }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn quantize_u8_avx512(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u8>],
+    ) {
+        assert_eq!(values.len(), out.len());
+        let min = _mm512_set1_ps(qmin);
+        let factor_v = _mm512_set1_ps(factor);
+        let full = values.len() - values.len() % 16;
+        let src = values.as_ptr();
+        let dst = out.as_mut_ptr().cast::<u8>();
+        for i in (0..full).step_by(16) {
+            // SAFETY: `i + 16 <= values.len() == out.len()` (asserted above).
+            unsafe {
+                let q = quantize16_epi32(src.add(i), min, factor_v);
+                // Unsigned-saturating i32 -> u8 narrow: lanes are in
+                // [0, 255] plus float epsilon, which saturation clips.
+                _mm_storeu_si128(dst.add(i).cast(), _mm512_cvtusepi32_epi8(q));
+            }
+        }
+        quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]);
+    }
+
+    pub(super) fn quantize_u16_avx512_dispatch(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u16>],
+    ) {
+        // SAFETY: this wrapper is only selected after AVX-512F was detected.
+        unsafe { quantize_u16_avx512(values, qmin, factor, out) }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn quantize_u16_avx512(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u16>],
+    ) {
+        assert_eq!(values.len(), out.len());
+        let min = _mm512_set1_ps(qmin);
+        let factor_v = _mm512_set1_ps(factor);
+        let full = values.len() - values.len() % 16;
+        let src = values.as_ptr();
+        let dst = out.as_mut_ptr().cast::<u16>();
+        for i in (0..full).step_by(16) {
+            // SAFETY: `i + 16 <= values.len() == out.len()` (asserted above).
+            unsafe {
+                let q = quantize16_epi32(src.add(i), min, factor_v);
+                _mm256_storeu_si256(dst.add(i).cast(), _mm512_cvtusepi32_epi16(q));
+            }
+        }
+        quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]);
+    }
+
+    /// Load 8 floats and affine-quantize them into `i32` lanes. AVX2 has no
+    /// embedded-rounding convert, so round to nearest-even explicitly with
+    /// `_mm256_round_ps` (which ignores MXCSR); the subsequent convert then
+    /// sees an integral value, so its dynamic rounding mode cannot change the
+    /// result, keeping it bit-identical to the scalar
+    /// [`super::round_ties_even_fixed`].
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    unsafe fn quantize8_epi32(src: *const f32, min: __m256, factor: __m256) -> __m256i {
+        // SAFETY: the caller guarantees 8 floats are readable at `src`.
+        let v = unsafe { _mm256_loadu_ps(src) };
+        let scaled = _mm256_mul_ps(_mm256_sub_ps(v, min), factor);
+        let rounded = _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled);
+        _mm256_cvtps_epi32(rounded)
+    }
+
+    pub(super) fn quantize_u8_avx2_dispatch(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u8>],
+    ) {
+        // SAFETY: this wrapper is only selected after AVX2 was detected.
+        unsafe { quantize_u8_avx2(values, qmin, factor, out) }
+    }
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn quantize_u8_avx2(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u8>],
+    ) {
+        assert_eq!(values.len(), out.len());
+        let min = _mm256_set1_ps(qmin);
+        let factor_v = _mm256_set1_ps(factor);
+        // The 32->16 and 16->8 packs interleave the two 128-bit lanes; this
+        // permutation of 32-bit groups restores natural order.
+        let restore = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+        let full = values.len() - values.len() % 32;
+        let src = values.as_ptr();
+        let dst = out.as_mut_ptr().cast::<u8>();
+        for i in (0..full).step_by(32) {
+            // SAFETY: `i + 32 <= values.len() == out.len()` (asserted above).
+            unsafe {
+                let q0 = quantize8_epi32(src.add(i), min, factor_v);
+                let q1 = quantize8_epi32(src.add(i + 8), min, factor_v);
+                let q2 = quantize8_epi32(src.add(i + 16), min, factor_v);
+                let q3 = quantize8_epi32(src.add(i + 24), min, factor_v);
+                // Unsigned-saturating i32 -> u16 -> u8 narrows: lanes are in
+                // [0, 255] plus float epsilon, which saturation clips.
+                let lo = _mm256_packus_epi32(q0, q1);
+                let hi = _mm256_packus_epi32(q2, q3);
+                let bytes = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(lo, hi), restore);
+                _mm256_storeu_si256(dst.add(i).cast(), bytes);
+            }
+        }
+        quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]);
+    }
+
+    pub(super) fn quantize_u16_avx2_dispatch(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u16>],
+    ) {
+        // SAFETY: this wrapper is only selected after AVX2 was detected.
+        unsafe { quantize_u16_avx2(values, qmin, factor, out) }
+    }
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn quantize_u16_avx2(
+        values: &[f32],
+        qmin: f32,
+        factor: f32,
+        out: &mut [MaybeUninit<u16>],
+    ) {
+        assert_eq!(values.len(), out.len());
+        let min = _mm256_set1_ps(qmin);
+        let factor_v = _mm256_set1_ps(factor);
+        let full = values.len() - values.len() % 16;
+        let src = values.as_ptr();
+        let dst = out.as_mut_ptr().cast::<u16>();
+        for i in (0..full).step_by(16) {
+            // SAFETY: `i + 16 <= values.len() == out.len()` (asserted above).
+            unsafe {
+                let q0 = quantize8_epi32(src.add(i), min, factor_v);
+                let q1 = quantize8_epi32(src.add(i + 8), min, factor_v);
+                // The pack interleaves the 128-bit lanes as
+                // [q0_lo, q1_lo, q0_hi, q1_hi]; the 64-bit-lane permute
+                // restores [q0_lo, q0_hi, q1_lo, q1_hi].
+                let packed = _mm256_packus_epi32(q0, q1);
+                let words = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
+                _mm256_storeu_si256(dst.add(i).cast(), words);
+            }
+        }
+        quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::SmallRng;
+    use rand::{Rng, SeedableRng};
+    use rstest::rstest;
+
+    /// Scalar reference for the documented semantics (`total_cmp` min/max plus
+    /// nearest-even rounding): the min/max half. Panics on empty `values`.
+    fn reference_min_max(values: &[f32]) -> (f32, f32) {
+        let min = values
+            .iter()
+            .cloned()
+            .min_by(|a, b| a.total_cmp(b))
+            .unwrap();
+        let max = values
+            .iter()
+            .cloned()
+            .max_by(|a, b| a.total_cmp(b))
+            .unwrap();
+        (min, max)
+    }
+
+    fn reference_u8(values: &[f32]) -> (DistTableDequant, Vec<u8>) { // reference u8 quantizer
+        let (qmin, qmax) = reference_min_max(values);
+        if dequant_overflows(values.len(), qmin, qmax) { // overflow case: `Exact`, zeroed table
+            return (DistTableDequant::Exact, vec![0; values.len()]);
+        }
+        let factor = u8::MAX as f32 / (qmax - qmin);
+        if !factor.is_finite() { // zero or sub-resolution range: `Affine`, zeroed table
+            return (
+                DistTableDequant::Affine { qmin, qmax },
+                vec![0; values.len()],
+            );
+        }
+        let quantized = values
+            .iter()
+            .map(|&d| ((d - qmin) * factor).round_ties_even() as u8) // nearest-even rounding
+            .collect();
+        (DistTableDequant::Affine { qmin, qmax }, quantized)
+    }
+
+    fn reference_u16(values: &[f32]) -> (DistTableDequant, Vec<u16>) { // reference u16 quantizer
+        let (qmin, qmax) = reference_min_max(values);
+        if dequant_overflows(values.len(), qmin, qmax) { // overflow case: `Exact`, zeroed table
+            return (DistTableDequant::Exact, vec![0; values.len()]);
+        }
+        let factor = u16::MAX as f32 / (qmax - qmin);
+        if !factor.is_finite() { // zero or sub-resolution range: `Affine`, zeroed table
+            return (
+                DistTableDequant::Affine { qmin, qmax },
+                vec![0; values.len()],
+            );
+        }
+        let quantized = values
+            .iter()
+            .map(|&d| ((d - qmin) * factor).round_ties_even() as u16) // nearest-even rounding
+            .collect();
+        (DistTableDequant::Affine { qmin, qmax }, quantized)
+    }
+
+    fn available_kernels() -> Vec<(&'static str, MinMaxFn, QuantizeU8Fn, QuantizeU16Fn)> {
+        // `mut` is only exercised on x86_64 where extra kernels may be pushed.
+        #[allow(unused_mut)]
+        let mut kernels = vec![(
+            "scalar", // always present, on every target
+            min_max_fold as MinMaxFn,
+            quantize_u8_scalar as QuantizeU8Fn,
+            quantize_u16_scalar as QuantizeU16Fn,
+        )];
+        #[cfg(target_arch = "x86_64")]
+        {
+            if std::arch::is_x86_feature_detected!("avx2") { // runtime detection, not compile-time
+                kernels.push((
+                    "avx2",
+                    x86::min_max_avx2_dispatch,
+                    x86::quantize_u8_avx2_dispatch,
+                    x86::quantize_u16_avx2_dispatch,
+                ));
+            }
+            if std::arch::is_x86_feature_detected!("avx512f") {
+                kernels.push((
+                    "avx512",
+                    x86::min_max_avx512_dispatch,
+                    x86::quantize_u8_avx512_dispatch,
+                    x86::quantize_u16_avx512_dispatch,
+                ));
+            }
+        }
+        kernels
+    }
+
+    /// Every available kernel must agree bit-exactly with the reference on
+    /// the given input.
+    fn check_against_reference(values: &[f32]) {
+        let (expected_dequant_u8, expected_u8) = reference_u8(values);
+        let (expected_dequant_u16, expected_u16) = reference_u16(values);
+        let (expected_min, expected_max) = reference_min_max(values);
+
+        for (name, min_max_fn, quantize_u8_fn, quantize_u16_fn) in available_kernels() {
+            let (qmin, qmax) = min_max_fn(values);
+            assert_eq!(
+                (qmin, qmax),
+                (expected_min, expected_max),
+                "kernel={name} len={}",
+                values.len()
+            );
+
+            // The quantize kernels are only invoked on the populated path, so
+            // mirror that guard before exercising them directly.
+            let overflows = dequant_overflows(values.len(), qmin, qmax);
+            let factor_u8 = u8::MAX as f32 / (qmax - qmin);
+            if !overflows && factor_u8.is_finite() {
+                let mut out_u8 = Vec::with_capacity(values.len());
+                quantize_u8_fn(
+                    values,
+                    qmin,
+                    factor_u8,
+                    &mut out_u8.spare_capacity_mut()[..values.len()], // capacity may exceed the request
+                );
+                // SAFETY: the kernel initialized every element.
+                unsafe { out_u8.set_len(values.len()) };
+                assert_eq!(out_u8, expected_u8, "kernel={name} len={}", values.len());
+            }
+
+            let factor_u16 = u16::MAX as f32 / (qmax - qmin);
+            if !overflows && factor_u16.is_finite() {
+                let mut out_u16 = Vec::with_capacity(values.len());
+                quantize_u16_fn(
+                    values,
+                    qmin,
+                    factor_u16,
+                    &mut out_u16.spare_capacity_mut()[..values.len()], // capacity may exceed the request
+                );
+                // SAFETY: the kernel initialized every element.
+                unsafe { out_u16.set_len(values.len()) };
+                assert_eq!(out_u16, expected_u16, "kernel={name} len={}", values.len());
+            }
+        }
+
+        // The public entry points exercise the dispatched kernels, the
+        // dequantization classification, and the scratch-buffer handling.
+        let mut out_u8 = Vec::new();
+        assert_eq!(
+            quantize_dist_table_into(values, &mut out_u8),
+            expected_dequant_u8,
+            "len={}",
+            values.len()
+        );
+        assert_eq!(out_u8, expected_u8, "len={}", values.len());
+        let mut out_u16 = Vec::new();
+        assert_eq!(
+            quantize_dist_table_u16_into(values, &mut out_u16),
+            expected_dequant_u16,
+            "len={}",
+            values.len()
+        );
+        assert_eq!(out_u16, expected_u16, "len={}", values.len());
+    }
+
+    #[rstest]
+    fn test_quantize_matches_reference( // lengths straddle multiples of 16, 32 and 64
+        #[values(1, 2, 15, 16, 17, 31, 32, 33, 63, 64, 100, 6144, 6160)] len: usize,
+        #[values(1.0, 1e-3, 1e4)] scale: f32,
+    ) {
+        let mut rng = SmallRng::seed_from_u64(42 + len as u64); // fixed seed per length
+        let values = (0..len)
+            .map(|_| rng.random_range(-scale..scale))
+            .collect::<Vec<_>>();
+        check_against_reference(&values);
+    }
+
+    /// Integer tables with range 510 (resp. 131070) make `factor` exactly
+    /// 0.5, so odd values land on exact .5 ties; all kernels must round them
+    /// to even and agree with each other.
+    #[test]
+    fn test_exact_half_ties_round_to_even() {
+        let values = (0..=510).map(|v| v as f32).collect::<Vec<_>>(); // 255 / 510 = 0.5
+        check_against_reference(&values);
+        let mut quantized = Vec::new();
+        assert_eq!(
+            quantize_dist_table_into(&values, &mut quantized),
+            DistTableDequant::Affine {
+                qmin: 0.0,
+                qmax: 510.0
+            }
+        );
+        // Spot-check nearest-even: 0.5 -> 0, 1.5 -> 2, 127.5 -> 128,
+        // 254.5 -> 254.
+        assert_eq!(&quantized[..6], &[0, 0, 1, 2, 2, 2]);
+        assert_eq!(quantized[255], 128);
+        assert_eq!(quantized[509], 254);
+        assert_eq!(quantized[510], 255);
+
+        // Integers up to 131070 are exactly representable in f32.
+        let values = (0..=510).map(|v| (v * 257) as f32).collect::<Vec<_>>(); // max 510 * 257 = 131070
+        check_against_reference(&values);
+        let mut quantized = Vec::new();
+        assert_eq!(
+            quantize_dist_table_u16_into(&values, &mut quantized),
+            DistTableDequant::Affine {
+                qmin: 0.0,
+                qmax: 131070.0
+            }
+        );
+        // value * 0.5 = 128.5 -> 128, 385.5 -> 386 under nearest-even.
+        assert_eq!(&quantized[..4], &[0, 128, 257, 386]);
+        assert_eq!(quantized[510], u16::MAX);
+    }
+
+    #[test]
+    fn test_negative_and_mixed_sign_values() {
+        let mut rng = SmallRng::seed_from_u64(7);
+        let values = (0..1000)
+            .map(|_| rng.random_range(-100.0f32..-1.0)) // strictly negative table
+            .collect::<Vec<_>>();
+        check_against_reference(&values);
+        let values = (0..999)
+            .map(|i| (i as f32 - 499.5) * 0.75) // deterministic ramp spanning both signs
+            .collect::<Vec<_>>();
+        check_against_reference(&values);
+    }
+
+    #[rstest]
+    fn test_all_equal_input_zeroes_table(#[values(0.0, -7.25, 3.5)] value: f32) {
+        let values = vec![value; 100];
+        check_against_reference(&values);
+        // Zero range: a zeroed LUT plus the finite affine map (every sum maps
+        // to `num_tables * value`).
+        let expected = DistTableDequant::Affine {
+            qmin: value,
+            qmax: value,
+        };
+        let mut quantized = vec![1u8; 5]; // stale non-zero contents of a different length
+        assert_eq!(quantize_dist_table_into(&values, &mut quantized), expected);
+        assert_eq!(quantized, vec![0; 100]);
+        let mut quantized = vec![1u16; 5]; // same stale-buffer setup for the u16 variant
+        assert_eq!(
+            quantize_dist_table_u16_into(&values, &mut quantized),
+            expected
+        );
+        assert_eq!(quantized, vec![0; 100]);
+    }
+
+    /// A finite sub-resolution range zeroes the LUT but still dequantizes
+    /// with the finite affine map (`Affine`), whereas a range whose
+    /// `num_tables`-scaled reconstruction overflows must signal `Exact` so the
+    /// caller computes exact distances instead of `0 * inf = NaN`.
+    #[test]
+    fn test_degenerate_range_classification() {
+        // factor = 255 / 1e-38 overflows to +inf, but the reconstruction
+        // (num_tables * {0, 1e-38}) stays finite -> Affine, zeroed LUT.
+        let mut tiny_range = vec![0.0f32; 32];
+        tiny_range[1] = 1e-38;
+        // num_tables * (2e38 - (-2e38)) overflows f32 -> Exact.
+        let mut huge_range = vec![0.0f32; 32];
+        huge_range[0] = -2e38;
+        huge_range[1] = 2e38;
+        // factor = 65535 / 1e-35 overflows only in the u16 variant; the u8
+        // variant still quantizes normally.
+        let mut u16_only = vec![0.0f32; 32];
+        u16_only[1] = 1e-35;
+
+        for values in [&tiny_range, &huge_range, &u16_only] {
+            check_against_reference(values);
+        }
+        let mut quantized_u8 = Vec::new(); // reused across the u8 calls below
+        assert_eq!(
+            quantize_dist_table_into(&tiny_range, &mut quantized_u8),
+            DistTableDequant::Affine {
+                qmin: 0.0,
+                qmax: 1e-38
+            }
+        );
+        assert_eq!(quantized_u8, vec![0; 32]);
+        assert_eq!(
+            quantize_dist_table_into(&huge_range, &mut quantized_u8),
+            DistTableDequant::Exact
+        );
+        assert_eq!(quantized_u8, vec![0; 32]);
+        let mut quantized_u16 = Vec::new();
+        assert_eq!(
+            quantize_dist_table_u16_into(&u16_only, &mut quantized_u16),
+            DistTableDequant::Affine {
+                qmin: 0.0,
+                qmax: 1e-35
+            }
+        );
+        assert_eq!(quantized_u16, vec![0; 32]);
+        assert_eq!(
+            quantize_dist_table_into(&u16_only, &mut quantized_u8),
+            DistTableDequant::Affine {
+                qmin: 0.0,
+                qmax: 1e-35
+            }
+        );
+        assert_eq!(quantized_u8[1], u8::MAX); // index 1 holds the table maximum
+    }
+
+    /// `-0.0 == 0.0` must keep taking the zero-range path (zeroed LUT,
+    /// `Affine`) even though SIMD min/max may pick either sign for the
+    /// extremes.
+    #[test]
+    fn test_signed_zero_mix_zeroes_table() {
+        let mut values = vec![0.0f32; 64];
+        values.iter_mut().step_by(2).for_each(|v| *v = -0.0); // even indices become -0.0
+        let mut quantized = Vec::new();
+        match quantize_dist_table_into(&values, &mut quantized) {
+            DistTableDequant::Affine { qmin, qmax } => assert_eq!(qmin, qmax), // `==` ignores the zero's sign
+            other => panic!("expected Affine, got {other:?}"),
+        }
+        assert_eq!(quantized, vec![0; 64]);
+    }
+
+    /// Every quantizer — scalar, AVX2, AVX-512, including the SIMD kernels'
+    /// scalar tails — must round with fixed nearest-even, independent of the
+    /// dynamic MXCSR rounding mode. Run each with MXCSR forced to
+    /// round-toward-zero and require it still matches the nearest-even
+    /// reference (computed under the default mode). The factors (0.5 for u8,
+    /// 128.5 for u16) put odd integers on exact .5 ties, where truncation
+    /// (1.5 -> 1) and nearest-even (1.5 -> 2) disagree, so a path that honored
+    /// MXCSR would fail. The length (511) is deliberately not a multiple of
+    /// the SIMD step so the kernels' scalar tails are exercised too.
+    #[cfg(target_arch = "x86_64")]
+    #[test]
+    #[allow(deprecated)] // _mm_getcsr/_mm_setcsr: no stable non-asm replacement.
+    fn test_quantize_rounding_ignores_mxcsr() {
+        use std::arch::x86_64::{_MM_ROUND_MASK, _MM_ROUND_TOWARD_ZERO, _mm_getcsr, _mm_setcsr};
+
+        let values = (0..=510).map(|v| v as f32).collect::<Vec<_>>();
+        // Computed under the default (nearest-even) rounding mode.
+        let (_, expected_u8) = reference_u8(&values);
+        let (_, expected_u16) = reference_u16(&values);
+        let factor_u8 = u8::MAX as f32 / 510.0; // 0.5
+        let factor_u16 = u16::MAX as f32 / 510.0; // 128.5
+
+        for (name, _, quantize_u8_fn, quantize_u16_fn) in available_kernels() {
+            let mut out_u8 = Vec::with_capacity(values.len());
+            let mut out_u16 = Vec::with_capacity(values.len());
+            // SAFETY: SSE is baseline on x86_64. MXCSR is restored before any
+            // assertion so a failure cannot leak the truncating mode.
+            let saved = unsafe { _mm_getcsr() };
+            unsafe {
+                _mm_setcsr((saved & !_MM_ROUND_MASK) | _MM_ROUND_TOWARD_ZERO);
+                quantize_u8_fn(
+                    &values,
+                    0.0,
+                    factor_u8,
+                    &mut out_u8.spare_capacity_mut()[..values.len()],
+                );
+                quantize_u16_fn(
+                    &values,
+                    0.0,
+                    factor_u16,
+                    &mut out_u16.spare_capacity_mut()[..values.len()],
+                );
+                _mm_setcsr(saved);
+                out_u8.set_len(values.len());
+                out_u16.set_len(values.len());
+            }
+            assert_eq!(out_u8, expected_u8, "kernel={name} under truncating MXCSR");
+            assert_eq!(
+                out_u16, expected_u16,
+                "kernel={name} under truncating MXCSR"
+            );
+        }
+    }
+
+    /// The scratch buffer must be fully overwritten across reuses with
+    /// different lengths.
+    #[test]
+    fn test_scratch_buffer_reuse() {
+        let mut rng = SmallRng::seed_from_u64(11);
+        let mut scratch_u8 = vec![7u8; 500]; // stale contents that must not survive
+        let mut scratch_u16 = vec![7u16; 500];
+        for len in [48, 512, 16] { // shorter than, longer than, then shorter than the stale 500
+            let values = (0..len)
+                .map(|_| rng.random_range(-1.0f32..1.0))
+                .collect::<Vec<_>>();
+            quantize_dist_table_into(&values, &mut scratch_u8);
+            assert_eq!(scratch_u8, reference_u8(&values).1);
+            quantize_dist_table_u16_into(&values, &mut scratch_u16);
+            assert_eq!(scratch_u16, reference_u16(&values).1);
+        }
+    }
+}
diff --git a/rust/lance-index/src/vector/bq/ex_dot.rs b/rust/lance-index/src/vector/bq/ex_dot.rs
new file mode 100644
index 00000000000..1aeb83ba40c
--- /dev/null
+++ b/rust/lance-index/src/vector/bq/ex_dot.rs
@@ -0,0 +1,1078 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Inner-product kernels between an `f32` query and bit-packed RaBitQ ex-codes.
+//!
+//! Multi-bit RaBitQ reranking reduces to `sum_d query[d] * ex_code[d]`, where
+//! `ex_code[d]` is an unsigned `ex_bits`-wide integer. Materializing a
+//! `dim * 2^ex_bits` lookup table and gathering one entry per dimension is
+//! cache-hostile (the table is 1MiB for `ex_bits=8`, `dim=1024`); these kernels
+//! instead unpack the codes with shifts and masks and FMA them against the
+//! query directly, following the kernel design of the RaBitQ reference library
+//! (<https://github.com/VectorDB-NTU/RaBitQ-Library>, Apache-2.0).
+//!
+//! Codes are stored in the *blocked* layout: dims are grouped into 64-dim
+//! blocks (the last block zero-padded) and bit-interleaved within each block
+//! so that the SIMD unpack emits codes in natural dim order:
+//!
+//! ```text
+//! per 64-dim block (T = ex_bits - 1, the top bit; "run k" = dims 16k..16k+16):
+//! 1 bit:  [8B]  bit i of the LE word = dim i
+//! 2 bits: [16B] byte b = dims {b, b+16, b+32, b+48} at bit pairs 0/2/4/6
+//! 3 bits: [16B 2-bit plane as above][8B top-bit plane]
+//! 4 bits: [32B] byte 8j+b = dim 16j+b (low nibble) | dim 16j+8+b (high nibble)
+//! 5 bits: [32B 4-bit plane: byte b = dims b|b+16; byte 16+b = dims b+32|b+48]
+//!         [8B top-bit plane]
+//! 6 bits: [48B] byte 16k+b = dim 16k+b (6 low bits) | bits 2k..2k+2 of
+//!         dim 48+b (2 high bits)
+//! 7 bits: [48B as 6 bits][8B top-bit plane]
+//! 8 bits: [64B] identity
+//! top-bit plane: top bit of dim 16k+b at bit 8*(b%8) + 2k + b/8 of a LE u64
+//! ```
+//!
+//! Because unpack order is natural, the kernels read the rotated query
+//! directly; it only needs zero-padding ([`pad_query_into`]) when the rotated
+//! dim is not a multiple of 64. Legacy indexes store ex codes sequentially
+//! (LSB-first bit stream) and are repacked once at load time
+//! ([`repack_sequential_row`]); for `ex_bits` ∈ {1, 8} the two layouts agree
+//! (modulo trailing padding, which the kernels tolerate) and rows are used as
+//! stored.
+
+use std::sync::LazyLock;
+
+/// Dims are packed in blocks of this size; the query is zero-padded to a
+/// whole number of blocks when the rotated dim is not already a multiple.
+pub const EX_DOT_BLOCK_DIMS: usize = 64; // four 16-dim runs per block
+
+/// Query length (in `f32`s) the kernels consume: `dim` rounded up to whole 64-dim blocks.
+pub fn padded_query_len(dim: usize) -> usize {
+    dim.next_multiple_of(EX_DOT_BLOCK_DIMS)
+}
+
+/// Whether the legacy sequential layout of a row already matches the blocked
+/// layout (modulo trailing zero padding, which the kernels tolerate), so
+/// legacy rows can be consumed without repacking.
+pub fn sequential_matches_blocked(ex_bits: u8) -> bool {
+    matches!(ex_bits, 1 | 8) // only the 1-bit and 8-bit widths
+}
+
+/// Bytes per row of the blocked ex-code layout (`dim` padded to whole 64-dim blocks).
+pub fn blocked_ex_code_bytes(dim: usize, ex_bits: u8) -> usize {
+    debug_assert!((1..=8).contains(&ex_bits));
+    padded_query_len(dim) * ex_bits as usize / 8 // exact: the padded length is a multiple of 64
+}
+
+/// Dimensions per unpacking group for the given code width.
+fn group_dims(ex_bits: u8) -> usize {
+    match ex_bits {
+        1 | 4 | 8 => 16, // these widths unpack one 16-dim run at a time
+        _ => EX_DOT_BLOCK_DIMS, // the others are interleaved across the whole block
+    }
+}
+
+fn group_bytes(ex_bits: u8) -> usize { // packed code bytes per unpacking group
+    group_dims(ex_bits) * ex_bits as usize / 8
+}
+
+/// Extract the `ex_bits`-wide code of `dim_idx` from a sequentially bit-packed
+/// row (LSB-first, codes may straddle byte boundaries).
+#[inline]
+pub fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 {
+    debug_assert!(ex_bits > 0);
+    let bit_offset = dim_idx * ex_bits as usize;
+    let byte_idx = bit_offset / u8::BITS as usize;
+    let bit_shift = bit_offset % u8::BITS as usize;
+    let bits = row_codes[byte_idx] as u16 // 16-bit window; panics if `byte_idx` is out of range
+        | row_codes
+            .get(byte_idx + 1) // the following byte may be absent at the end of the row
+            .map(|byte| (*byte as u16) << u8::BITS)
+            .unwrap_or_default();
+    let mask = (1u16 << ex_bits) - 1;
+    ((bits >> bit_shift) & mask) as u8
+}
+
+/// Zero-pad the rotated query to a whole number of 64-dim blocks. Only needed
+/// when `dim` is not a multiple of [`EX_DOT_BLOCK_DIMS`]; aligned queries are
+/// passed to the kernels as-is.
+pub fn pad_query_into(rotated_query: &[f32], out: &mut [f32]) {
+    debug_assert_eq!(out.len(), padded_query_len(rotated_query.len()));
+    out[..rotated_query.len()].copy_from_slice(rotated_query); // panics if `out` is too short
+    out[rotated_query.len()..].fill(0.0); // zero lanes add nothing to the dot product
+}
+
+/// Pack the top bit of each of 64 codes into a `u64` so kernels can position
+/// it with two shifts per 16-code run: the top bit of dim `16k + b` is stored
+/// at bit `8 * (b % 8) + 2k + b / 8`.
+fn pack_top_plane(block_values: &[u8; 64], top_bit: u8) -> u64 {
+    let mut plane = 0u64;
+    for k in 0..4 { // k: 16-dim run within the block
+        for b in 0..16 { // b: position within the run
+            let bit = (block_values[16 * k + b] >> top_bit) & 1;
+            plane |= (bit as u64) << (8 * (b % 8) + 2 * k + b / 8);
+        }
+    }
+    plane
+}
+
+/// Shift `plane` so that its bit `8j + from_bit` lands at bit `8j + to_bit`.
+#[inline(always)]
+fn shift_plane(plane: u64, from_bit: usize, to_bit: usize) -> u64 {
+    if from_bit >= to_bit {
+        plane >> (from_bit - to_bit) // move down; the branch keeps the shift amount non-negative
+    } else {
+        plane << (to_bit - from_bit) // move up
+    }
+}
+
+/// Pack one block of 64 code values (natural dim order) into the blocked
+/// layout described in the module docs.
+fn pack_block(ex_bits: u8, block_values: &[u8; 64], out: &mut [u8]) {
+    let v = block_values;
+    match ex_bits {
+        1 => { // 8 bytes: bit t of byte b is dim 8b + t
+            for (b, byte) in out[..8].iter_mut().enumerate() {
+                *byte = (0..8).fold(0, |acc, t| acc | ((v[8 * b + t] & 1) << t));
+            }
+        }
+        2 | 3 => { // 16-byte 2-bit plane, plus an 8-byte top-bit plane for 3 bits
+            for b in 0..16 {
+                out[b] = (v[b] & 0b11)
+                    | ((v[16 + b] & 0b11) << 2)
+                    | ((v[32 + b] & 0b11) << 4)
+                    | ((v[48 + b] & 0b11) << 6);
+            }
+            if ex_bits == 3 {
+                out[16..24].copy_from_slice(&pack_top_plane(v, 2).to_le_bytes());
+            }
+        }
+        4 => { // 32 bytes: two nibbles per byte, 8 bytes per 16-dim run
+            for unit in 0..4 {
+                for b in 0..8 {
+                    out[8 * unit + b] =
+                        (v[16 * unit + b] & 0x0f) | ((v[16 * unit + 8 + b] & 0x0f) << 4);
+                }
+            }
+        }
+        5 => { // 32-byte nibble plane followed by an 8-byte top-bit plane
+            for b in 0..16 {
+                out[b] = (v[b] & 0x0f) | ((v[16 + b] & 0x0f) << 4);
+                out[16 + b] = (v[32 + b] & 0x0f) | ((v[48 + b] & 0x0f) << 4);
+            }
+            out[32..40].copy_from_slice(&pack_top_plane(v, 4).to_le_bytes());
+        }
+        6 | 7 => {
+            // Runs 0..3 keep their 6 low bits in place; the fourth run's dims
+            // are split into three 2-bit pieces stored in the runs' top bits.
+            for k in 0..3 {
+                for b in 0..16 {
+                    out[16 * k + b] =
+                        (v[16 * k + b] & 0x3f) | (((v[48 + b] >> (2 * k)) & 0b11) << 6);
+                }
+            }
+            if ex_bits == 7 {
+                out[48..56].copy_from_slice(&pack_top_plane(v, 6).to_le_bytes());
+            }
+        }
+        8 => out[..64].copy_from_slice(v), // identity: one byte per dim
+        _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+    }
+}
+
+/// Pack one row of unpacked code values (one `u8` per dim) into the blocked
+/// layout; the writer path. `out` must have [`blocked_ex_code_bytes`] bytes.
+pub fn pack_blocked_row(values: &[u8], ex_bits: u8, out: &mut [u8]) {
+    debug_assert_eq!(out.len(), blocked_ex_code_bytes(values.len(), ex_bits));
+    let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8;
+    let mut block_values = [0u8; 64]; // staging buffer for one block in natural dim order
+    for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() {
+        let base = (block * EX_DOT_BLOCK_DIMS).min(values.len()); // clamp: no underflow below
+        let count = EX_DOT_BLOCK_DIMS.min(values.len() - base);
+        block_values[..count].copy_from_slice(&values[base..base + count]);
+        block_values[count..].fill(0); // dims past the end of the row pack as zero codes
+        pack_block(ex_bits, &block_values, out);
+    }
+}
+
+/// Repack one legacy sequentially bit-packed row into the blocked layout.
+/// `out` must have [`blocked_ex_code_bytes`] bytes.
+pub fn repack_sequential_row(seq_row: &[u8], dim: usize, ex_bits: u8, out: &mut [u8]) {
+    debug_assert_eq!(out.len(), blocked_ex_code_bytes(dim, ex_bits));
+    let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8;
+    let mut block_values = [0u8; 64];
+    for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() {
+        block_values.fill(0); // dims past `dim` stay zero
+        let base = block * EX_DOT_BLOCK_DIMS;
+        let count = EX_DOT_BLOCK_DIMS.min(dim.saturating_sub(base)); // 0 for blocks past `dim`
+        for (i, value) in block_values[..count].iter_mut().enumerate() {
+            *value = packed_ex_code_value(seq_row, base + i, ex_bits);
+        }
+        pack_block(ex_bits, &block_values, out);
+    }
+}
+
+/// Unpack one code group into per-dim values (natural dim order). Reference
+/// implementation for the SIMD unpackers; also the scalar fallback.
+fn unpack_group(ex_bits: u8, group_codes: &[u8], out: &mut [u8; 64]) {
+    debug_assert_eq!(group_codes.len(), group_bytes(ex_bits));
+    match ex_bits {
+        1 => { // 2 bytes -> 16 codes; only `out[..16]` is written
+            for (i, value) in out[..16].iter_mut().enumerate() {
+                *value = (group_codes[i / 8] >> (i % 8)) & 1;
+            }
+        }
+        2 => { // 16 bytes -> 64 codes
+            for k in 0..4 {
+                for b in 0..16 {
+                    out[16 * k + b] = (group_codes[b] >> (2 * k)) & 0b11;
+                }
+            }
+        }
+        3 => { // 2-bit plane plus the top-bit plane at bytes 16..24
+            let plane = u64::from_le_bytes(group_codes[16..24].try_into().unwrap());
+            for k in 0..4 {
+                for b in 0..16 {
+                    let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1;
+                    out[16 * k + b] = ((group_codes[b] >> (2 * k)) & 0b11) | ((top as u8) << 2);
+                }
+            }
+        }
+        4 => { // 8 bytes -> 16 codes: low nibbles are dims 0..8, high nibbles dims 8..16
+            for b in 0..8 {
+                out[b] = group_codes[b] & 0x0f;
+                out[8 + b] = group_codes[b] >> 4;
+            }
+        }
+        5 => { // 32-byte nibble plane plus the top-bit plane at bytes 32..40
+            let plane = u64::from_le_bytes(group_codes[32..40].try_into().unwrap());
+            for k in 0..4 {
+                for b in 0..16 {
+                    let nibble = (group_codes[16 * (k / 2) + b] >> (4 * (k % 2))) & 0x0f;
+                    let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1;
+                    out[16 * k + b] = nibble | ((top as u8) << 4);
+                }
+            }
+        }
+        6 | 7 => { // 48 bytes, plus the top-bit plane at bytes 48..56 for 7 bits
+            for k in 0..3 {
+                for b in 0..16 {
+                    out[16 * k + b] = group_codes[16 * k + b] & 0x3f;
+                }
+            }
+            for b in 0..16 { // reassemble the fourth run from the three 2-bit pieces
+                out[48 + b] = (group_codes[b] >> 6)
+                    | ((group_codes[16 + b] >> 6) << 2)
+                    | ((group_codes[32 + b] >> 6) << 4);
+            }
+            if ex_bits == 7 {
+                let plane = u64::from_le_bytes(group_codes[48..56].try_into().unwrap());
+                for k in 0..4 {
+                    for b in 0..16 {
+                        let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1;
+                        out[16 * k + b] |= (top as u8) << 6;
+                    }
+                }
+            }
+        }
+        8 => out[..16].copy_from_slice(group_codes), // identity; panics unless the group is 16 bytes
+        _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+    }
+}
+
+/// Kernel signature: `sum_d query[d] * code[d]` for one row of blocked codes.
+///
+/// The query must cover a whole number of 64-dim blocks (the rotated query
+/// as-is for aligned dims, otherwise zero-padded via [`pad_query_into`]);
+/// `codes` is the blocked row slice. Rows shorter than the padded query
+/// length are treated as zero-padded.
+pub type ExDotFn = fn(&[f32], &[u8]) -> f32;
+
+/// Resolve the dot kernel for `ex_bits` once; the result can be cached by the
+/// caller for per-candidate use.
+pub fn ex_dot_kernel(ex_bits: u8) -> ExDotFn {
+    debug_assert!((1..=8).contains(&ex_bits));
+    static KERNELS: LazyLock<[ExDotFn; 8]> = // all eight widths are resolved on first use
+        LazyLock::new(|| std::array::from_fn(|i| select_ex_dot_kernel(i as u8 + 1)));
+    KERNELS[usize::from(ex_bits) - 1] // slot i holds the kernel for ex_bits = i + 1
+}
+
+fn select_ex_dot_kernel(ex_bits: u8) -> ExDotFn { // prefers AVX-512, then AVX2+FMA, NEON, scalar
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::arch::is_x86_feature_detected!("avx512f") {
+            return x86::avx512_kernel(ex_bits);
+        }
+        if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma")
+        {
+            return x86::avx2_kernel(ex_bits);
+        }
+    }
+    #[cfg(target_arch = "aarch64")]
+    {
+        // NEON is part of the aarch64 baseline.
+        return neon::kernel(ex_bits);
+    }
+    #[allow(unreachable_code)] // unreachable on aarch64, where the block above always returns
+    scalar_kernel(ex_bits)
+}
+
+fn scalar_kernel(ex_bits: u8) -> ExDotFn { // maps the runtime width to a const-generic instance
+    match ex_bits {
+        1 => ex_dot_scalar::<1>,
+        2 => ex_dot_scalar::<2>,
+        3 => ex_dot_scalar::<3>,
+        4 => ex_dot_scalar::<4>,
+        5 => ex_dot_scalar::<5>,
+        6 => ex_dot_scalar::<6>,
+        7 => ex_dot_scalar::<7>,
+        8 => ex_dot_scalar::<8>,
+        _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+    }
+}
+
+fn ex_dot_scalar<const EX_BITS: u8>(ex_query: &[f32], codes: &[u8]) -> f32 {
+    let group_dims = group_dims(EX_BITS);
+    let bytes_per_group = group_bytes(EX_BITS);
+    debug_assert_eq!(ex_query.len() % EX_DOT_BLOCK_DIMS, 0);
+    debug_assert!(codes.len() * u8::BITS as usize <= ex_query.len() * EX_BITS as usize);
+
+    let mut sum = 0.0f32;
+    let mut unpacked = [0u8; 64];
+    let mut padded = [0u8; 56]; // largest group: 64 dims * 7 bits / 8 = 56 bytes
+    for (group, query) in ex_query.chunks_exact(group_dims).enumerate() {
+        let start = group * bytes_per_group;
+        if start >= codes.len() {
+            // The remaining query lanes are zero padding.
+            break;
+        }
+        let group_codes = if start + bytes_per_group <= codes.len() {
+            &codes[start..start + bytes_per_group]
+        } else {
+            let avail = codes.len() - start; // short final group: zero-extend it in `padded`
+            padded[..bytes_per_group].fill(0);
+            padded[..avail].copy_from_slice(&codes[start..]);
+            &padded[..bytes_per_group]
+        };
+        unpack_group(EX_BITS, group_codes, &mut unpacked);
+        for (q, &code) in query.iter().zip(unpacked[..group_dims].iter()) {
+            sum += q * code as f32;
+        }
+    }
+    sum
+}
+
+#[cfg(target_arch = "x86_64")]
+mod x86 {
+    use super::ExDotFn;
+    use std::arch::x86_64::*;
+
+    pub(super) fn avx2_kernel(ex_bits: u8) -> ExDotFn { // picks the AVX2 entry point for a width
+        match ex_bits {
+            1 => dot_u1_avx2_dispatch,
+            2 => dot_u2_avx2_dispatch,
+            3 => dot_u3_avx2_dispatch,
+            4 => dot_u4_avx2_dispatch,
+            5 => dot_u5_avx2_dispatch,
+            6 => dot_u6_avx2_dispatch,
+            7 => dot_u7_avx2_dispatch,
+            8 => dot_u8_avx2_dispatch,
+            _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+        }
+    }
+
+    pub(super) fn avx512_kernel(ex_bits: u8) -> ExDotFn { // picks the AVX-512 entry point for a width
+        match ex_bits {
+            1 => dot_u1_avx512_dispatch,
+            2 => dot_u2_avx512_dispatch,
+            3 => dot_u3_avx512_dispatch,
+            4 => dot_u4_avx512_dispatch,
+            5 => dot_u5_avx512_dispatch,
+            6 => dot_u6_avx512_dispatch,
+            7 => dot_u7_avx512_dispatch,
+            8 => dot_u8_avx512_dispatch,
+            _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+        }
+    }
+
+    /// Broadcast a byte to the 8 bytes of a `u64`.
+    #[inline(always)]
+    fn splat_byte(byte: u8) -> u64 {
+        byte as u64 * 0x0101_0101_0101_0101 // cannot overflow: at most 0xff in every byte
+    }
+
+    // Unpack helpers. They read exactly one group of code bytes and return
+    // runs of 16 codes in natural dim order, matching the query lanes. Only
+    // SSE2 (baseline on x86_64) is required.
+
+    /// 16 1-bit codes from 2 bytes: compare each replicated byte against
+    /// per-lane bit masks to turn set bits into 0/1 bytes.
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u1(ptr: *const u8) -> [__m128i; 1] { // `ptr` must be readable for 2 bytes
+        let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) };
+        let bytes = _mm_set_epi64x(splat_byte(b1) as i64, splat_byte(b0) as i64); // low half <- b0
+        let bit_select = _mm_set1_epi64x(0x8040_2010_0804_0201u64 as i64); // lane j selects bit j
+        let selected = _mm_cmpeq_epi8(_mm_and_si128(bytes, bit_select), bit_select);
+        [_mm_and_si128(selected, _mm_set1_epi8(1))]
+    }
+
+    /// 64 2-bit codes from 16 bytes: bit pair `r` of byte `b` is lane `b` of
+    /// run `r`. The 16-bit shifts drag bits across byte boundaries, which the
+    /// per-byte mask removes.
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u2(ptr: *const u8) -> [__m128i; 4] {
+        let raw = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        let mask = _mm_set1_epi8(0b11);
+        [
+            _mm_and_si128(raw, mask),
+            _mm_and_si128(_mm_srli_epi16::<2>(raw), mask),
+            _mm_and_si128(_mm_srli_epi16::<4>(raw), mask),
+            _mm_and_si128(_mm_srli_epi16::<6>(raw), mask),
+        ]
+    }
+
+    /// Position the top-bit plane (see [`super::pack_top_plane`]) of run `k`
+    /// at `top_bit` within each byte.
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> __m128i {
+        let lo = super::shift_plane(plane, 2 * k, top_bit);
+        let hi = super::shift_plane(plane, 2 * k + 1, top_bit);
+        _mm_and_si128(
+            _mm_set_epi64x(hi as i64, lo as i64),
+            _mm_set1_epi8(1 << top_bit),
+        )
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u3(ptr: *const u8) -> [__m128i; 4] {
+        let mut runs = unsafe { unpack_u2(ptr) }; // bits 0-1 from the first 16 bytes
+        let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() }; // bit-2 plane
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = _mm_or_si128(*run, top_plane_run(plane, k, 2));
+        }
+        runs
+    }
+
+    /// 16 4-bit codes from 8 bytes: the low nibbles of the 8 bytes fill lanes
+    /// 0..8 and their high nibbles fill lanes 8..16.
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u4(ptr: *const u8) -> [__m128i; 1] {
+        let word = unsafe { (ptr as *const u64).read_unaligned() };
+        let mask = 0x0f0f_0f0f_0f0f_0f0fu64;
+        [_mm_set_epi64x(
+            ((word >> 4) & mask) as i64,
+            (word & mask) as i64,
+        )]
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u5(ptr: *const u8) -> [__m128i; 4] {
+        let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
+        let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; // bit-4 plane
+        let mask = _mm_set1_epi8(0x0f);
+        let mut runs = [
+            _mm_and_si128(blk0, mask), // run 0: low nibbles of blk0
+            _mm_and_si128(_mm_srli_epi16::<4>(blk0), mask), // run 1: high nibbles of blk0
+            _mm_and_si128(blk1, mask), // run 2: low nibbles of blk1
+            _mm_and_si128(_mm_srli_epi16::<4>(blk1), mask), // run 3: high nibbles of blk1
+        ];
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = _mm_or_si128(*run, top_plane_run(plane, k, 4));
+        }
+        runs
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u6(ptr: *const u8) -> [__m128i; 4] {
+        let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
+        let blk2 = unsafe { _mm_loadu_si128(ptr.add(32) as *const __m128i) };
+        let mask6 = _mm_set1_epi8(0x3f);
+        let mask2 = _mm_set1_epi8(0b1100_0000u8 as i8); // bits 6-7 of every byte
+        let stolen = _mm_or_si128(
+            _mm_or_si128(
+                _mm_srli_epi16::<6>(_mm_and_si128(blk0, mask2)), // blk0 -> bits 0-1
+                _mm_srli_epi16::<4>(_mm_and_si128(blk1, mask2)), // blk1 -> bits 2-3
+            ),
+            _mm_srli_epi16::<2>(_mm_and_si128(blk2, mask2)), // blk2 -> bits 4-5
+        );
+        [
+            _mm_and_si128(blk0, mask6),
+            _mm_and_si128(blk1, mask6),
+            _mm_and_si128(blk2, mask6),
+            stolen, // run 3: assembled from the top bit pairs of the three blocks
+        ]
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u7(ptr: *const u8) -> [__m128i; 4] {
+        let mut runs = unsafe { unpack_u6(ptr) }; // bits 0-5 from the first 48 bytes
+        let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; // bit-6 plane
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = _mm_or_si128(*run, top_plane_run(plane, k, 6));
+        }
+        runs
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn unpack_u8x16(ptr: *const u8) -> [__m128i; 1] {
+        [unsafe { _mm_loadu_si128(ptr as *const __m128i) }]
+    }
+
+    /// FMA 16 code bytes against 16 query floats (AVX2: two 8-float halves).
+    #[inline]
+    #[target_feature(enable = "avx2", enable = "fma")]
+    unsafe fn fma16_avx2(codes: __m128i, query: *const f32, acc: &mut [__m256; 2]) {
+        let lo = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(codes));
+        acc[0] = _mm256_fmadd_ps(lo, unsafe { _mm256_loadu_ps(query) }, acc[0]);
+        let hi = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128::<8>(codes)));
+        acc[1] = _mm256_fmadd_ps(hi, unsafe { _mm256_loadu_ps(query.add(8)) }, acc[1]);
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    unsafe fn reduce_add_avx2(acc: [__m256; 2]) -> f32 {
+        let v = _mm256_add_ps(acc[0], acc[1]); // merge the two accumulators lane-wise
+        let halves = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v));
+        let pairs = _mm_add_ps(halves, _mm_movehl_ps(halves, halves)); // lanes 0-1: 0+2, 1+3
+        let total = _mm_add_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); // lane 0: full sum
+        _mm_cvtss_f32(total)
+    }
+
+    /// FMA 16 code bytes against 16 query floats (AVX-512: one 16-float lane).
+    #[inline]
+    #[target_feature(enable = "avx512f")]
+    unsafe fn fma16_avx512(codes: __m128i, query: *const f32, acc: &mut __m512) {
+        let values = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(codes));
+        *acc = _mm512_fmadd_ps(values, unsafe { _mm512_loadu_ps(query) }, *acc);
+    }
+
+    macro_rules! x86_dot_kernel {
+        ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => {
+            #[target_feature(enable = "avx2", enable = "fma")]
+            unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 {
+                const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 };
+                const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8;
+                debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0);
+                debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits);
+
+                let groups = ex_query.len() / GROUP_DIMS;
+                let full_groups = (codes.len() / GROUP_BYTES).min(groups);
+                // One accumulator per 8-float half of a run: two independent
+                // FMA chains, summed once at the end.
+                let mut acc = [_mm256_setzero_ps(); 2];
+                for group in 0..full_groups {
+                    // SAFETY: `group < full_groups` keeps both the code group
+                    // and the query run in bounds.
+                    let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_avx2(
+                                codes16,
+                                ex_query.as_ptr().add(group * GROUP_DIMS + run * 16),
+                                &mut acc,
+                            )
+                        };
+                    }
+                }
+                let consumed = full_groups * GROUP_BYTES;
+                if consumed < codes.len() && full_groups < groups {
+                    // Zero-pad the partial tail so `$unpack` still reads a full group.
+                    let mut padded = [0u8; GROUP_BYTES];
+                    padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]);
+                    let runs = unsafe { $unpack(padded.as_ptr()) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_avx2(
+                                codes16,
+                                ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16),
+                                &mut acc,
+                            )
+                        };
+                    }
+                }
+                unsafe { reduce_add_avx2(acc) }
+            }
+
+            fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 {
+                // SAFETY: only selected when AVX2 and FMA were detected.
+                unsafe { $name(ex_query, codes) }
+            }
+        };
+    }
+
+    macro_rules! x86_dot_kernel_avx512 {
+        ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => {
+            #[target_feature(enable = "avx512f")]
+            unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 {
+                const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 };
+                const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8;
+                debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0);
+                debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits);
+
+                let groups = ex_query.len() / GROUP_DIMS;
+                let full_groups = (codes.len() / GROUP_BYTES).min(groups);
+                // Alternating by group as well as run keeps two independent
+                // FMA chains even for the single-run widths.
+                let mut acc = [_mm512_setzero_ps(); 2];
+                for group in 0..full_groups {
+                    // SAFETY: `group < full_groups` keeps both the code group
+                    // and the query run in bounds.
+                    let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_avx512(
+                                codes16,
+                                ex_query.as_ptr().add(group * GROUP_DIMS + run * 16),
+                                &mut acc[(group + run) % 2],
+                            )
+                        };
+                    }
+                }
+                let consumed = full_groups * GROUP_BYTES;
+                if consumed < codes.len() && full_groups < groups {
+                    let mut padded = [0u8; GROUP_BYTES];
+                    padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]);
+                    let runs = unsafe { $unpack(padded.as_ptr()) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_avx512(
+                                codes16,
+                                ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16),
+                                &mut acc[(full_groups + run) % 2],
+                            )
+                        };
+                    }
+                }
+                _mm512_reduce_add_ps(_mm512_add_ps(acc[0], acc[1]))
+            }
+
+            fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 {
+                // SAFETY: only selected when AVX-512F was detected.
+                unsafe { $name(ex_query, codes) }
+            }
+        };
+    }
+
+    x86_dot_kernel!(dot_u1_avx2, dot_u1_avx2_dispatch, unpack_u1, 1, 1);
+    x86_dot_kernel!(dot_u2_avx2, dot_u2_avx2_dispatch, unpack_u2, 2, 4);
+    x86_dot_kernel!(dot_u3_avx2, dot_u3_avx2_dispatch, unpack_u3, 3, 4);
+    x86_dot_kernel!(dot_u4_avx2, dot_u4_avx2_dispatch, unpack_u4, 4, 1);
+    x86_dot_kernel!(dot_u5_avx2, dot_u5_avx2_dispatch, unpack_u5, 5, 4);
+    x86_dot_kernel!(dot_u6_avx2, dot_u6_avx2_dispatch, unpack_u6, 6, 4);
+    x86_dot_kernel!(dot_u7_avx2, dot_u7_avx2_dispatch, unpack_u7, 7, 4);
+    x86_dot_kernel!(dot_u8_avx2, dot_u8_avx2_dispatch, unpack_u8x16, 8, 1);
+
+    x86_dot_kernel_avx512!(dot_u1_avx512, dot_u1_avx512_dispatch, unpack_u1, 1, 1);
+    x86_dot_kernel_avx512!(dot_u2_avx512, dot_u2_avx512_dispatch, unpack_u2, 2, 4);
+    x86_dot_kernel_avx512!(dot_u3_avx512, dot_u3_avx512_dispatch, unpack_u3, 3, 4);
+    x86_dot_kernel_avx512!(dot_u4_avx512, dot_u4_avx512_dispatch, unpack_u4, 4, 1);
+    x86_dot_kernel_avx512!(dot_u5_avx512, dot_u5_avx512_dispatch, unpack_u5, 5, 4);
+    x86_dot_kernel_avx512!(dot_u6_avx512, dot_u6_avx512_dispatch, unpack_u6, 6, 4);
+    x86_dot_kernel_avx512!(dot_u7_avx512, dot_u7_avx512_dispatch, unpack_u7, 7, 4);
+    x86_dot_kernel_avx512!(dot_u8_avx512, dot_u8_avx512_dispatch, unpack_u8x16, 8, 1);
+}
+
+#[cfg(target_arch = "aarch64")]
+mod neon {
+    use super::ExDotFn;
+    use std::arch::aarch64::*;
+
+    pub(super) fn kernel(ex_bits: u8) -> ExDotFn {
+        match ex_bits {
+            1 => dot_u1_neon_dispatch,
+            2 => dot_u2_neon_dispatch,
+            3 => dot_u3_neon_dispatch,
+            4 => dot_u4_neon_dispatch,
+            5 => dot_u5_neon_dispatch,
+            6 => dot_u6_neon_dispatch,
+            7 => dot_u7_neon_dispatch,
+            8 => dot_u8_neon_dispatch,
+            _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"),
+        }
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u1(ptr: *const u8) -> [uint8x16_t; 1] {
+        let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) }; // needs 2 readable bytes
+        let bytes = vcombine_u8(vdup_n_u8(b0), vdup_n_u8(b1)); // lanes 0..8 = b0, 8..16 = b1
+        let bit_select = vreinterpretq_u8_u64(vdupq_n_u64(0x8040_2010_0804_0201));
+        [vandq_u8(vtstq_u8(bytes, bit_select), vdupq_n_u8(1))] // set bit -> 1, clear -> 0
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u2(ptr: *const u8) -> [uint8x16_t; 4] {
+        let raw = unsafe { vld1q_u8(ptr) }; // 16 bytes = 64 two-bit codes
+        let mask = vdupq_n_u8(0b11);
+        [
+            vandq_u8(raw, mask), // run 0: bits 0-1 of every byte
+            vandq_u8(vshrq_n_u8::<2>(raw), mask), // run 1: bits 2-3
+            vandq_u8(vshrq_n_u8::<4>(raw), mask), // run 2: bits 4-5
+            vshrq_n_u8::<6>(raw), // run 3: bits 6-7; the shift leaves nothing to mask
+        ]
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> uint8x16_t {
+        let lo = super::shift_plane(plane, 2 * k, top_bit);
+        let hi = super::shift_plane(plane, 2 * k + 1, top_bit);
+        vandq_u8(
+            vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(lo), vcreate_u64(hi))),
+            vdupq_n_u8(1 << top_bit),
+        )
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u3(ptr: *const u8) -> [uint8x16_t; 4] {
+        let mut runs = unsafe { unpack_u2(ptr) }; // bits 0-1 from the first 16 bytes
+        let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() }; // bit-2 plane
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = vorrq_u8(*run, top_plane_run(plane, k, 2));
+        }
+        runs
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u4(ptr: *const u8) -> [uint8x16_t; 1] {
+        let word = unsafe { (ptr as *const u64).read_unaligned() }; // 8 bytes = 16 nibbles
+        let mask = 0x0f0f_0f0f_0f0f_0f0fu64;
+        [vreinterpretq_u8_u64(vcombine_u64(
+            vcreate_u64(word & mask), // lanes 0..8: low nibbles
+            vcreate_u64((word >> 4) & mask), // lanes 8..16: high nibbles
+        ))]
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u5(ptr: *const u8) -> [uint8x16_t; 4] {
+        let blk0 = unsafe { vld1q_u8(ptr) };
+        let blk1 = unsafe { vld1q_u8(ptr.add(16)) };
+        let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; // bit-4 plane
+        let mask = vdupq_n_u8(0x0f);
+        let mut runs = [
+            vandq_u8(blk0, mask), // run 0: low nibbles of blk0
+            vshrq_n_u8::<4>(blk0), // run 1: high nibbles of blk0
+            vandq_u8(blk1, mask), // run 2: low nibbles of blk1
+            vshrq_n_u8::<4>(blk1), // run 3: high nibbles of blk1
+        ];
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = vorrq_u8(*run, top_plane_run(plane, k, 4));
+        }
+        runs
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u6(ptr: *const u8) -> [uint8x16_t; 4] {
+        let blk0 = unsafe { vld1q_u8(ptr) };
+        let blk1 = unsafe { vld1q_u8(ptr.add(16)) };
+        let blk2 = unsafe { vld1q_u8(ptr.add(32)) };
+        let mask6 = vdupq_n_u8(0x3f);
+        let stolen = vorrq_u8(
+            vorrq_u8(
+                vshrq_n_u8::<6>(blk0), // blk0 bits 6-7 -> bits 0-1
+                vshlq_n_u8::<2>(vshrq_n_u8::<6>(blk1)), // blk1 bits 6-7 -> bits 2-3
+            ),
+            vshlq_n_u8::<4>(vshrq_n_u8::<6>(blk2)), // blk2 bits 6-7 -> bits 4-5
+        );
+        [
+            vandq_u8(blk0, mask6),
+            vandq_u8(blk1, mask6),
+            vandq_u8(blk2, mask6),
+            stolen, // run 3: assembled from the top bit pairs of the three blocks
+        ]
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u7(ptr: *const u8) -> [uint8x16_t; 4] {
+        let mut runs = unsafe { unpack_u6(ptr) }; // bits 0-5 from the first 48 bytes
+        let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; // bit-6 plane
+        for (k, run) in runs.iter_mut().enumerate() {
+            *run = vorrq_u8(*run, top_plane_run(plane, k, 6));
+        }
+        runs
+    }
+
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn unpack_u8x16(ptr: *const u8) -> [uint8x16_t; 1] {
+        [unsafe { vld1q_u8(ptr) }]
+    }
+
+    /// FMA 16 code bytes against 16 query floats over four 4-float lanes.
+    #[inline]
+    #[target_feature(enable = "neon")]
+    unsafe fn fma16_neon(codes: uint8x16_t, query: *const f32, acc: &mut [float32x4_t; 4]) {
+        let lo = vmovl_u8(vget_low_u8(codes));
+        let hi = vmovl_u8(vget_high_u8(codes));
+        let c0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo)));
+        let c1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo)));
+        let c2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi)));
+        let c3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi)));
+        unsafe {
+            acc[0] = vfmaq_f32(acc[0], c0, vld1q_f32(query));
+            acc[1] = vfmaq_f32(acc[1], c1, vld1q_f32(query.add(4)));
+            acc[2] = vfmaq_f32(acc[2], c2, vld1q_f32(query.add(8)));
+            acc[3] = vfmaq_f32(acc[3], c3, vld1q_f32(query.add(12)));
+        }
+    }
+
+    macro_rules! neon_dot_kernel {
+        ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => {
+            #[target_feature(enable = "neon")]
+            unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 {
+                const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 };
+                const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8;
+                debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0);
+                debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits);
+
+                let groups = ex_query.len() / GROUP_DIMS;
+                let full_groups = (codes.len() / GROUP_BYTES).min(groups);
+                let mut acc = [vdupq_n_f32(0.0); 4];
+                for group in 0..full_groups {
+                    // SAFETY: `group < full_groups` keeps both the code group
+                    // and the query run in bounds.
+                    let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_neon(
+                                codes16,
+                                ex_query.as_ptr().add(group * GROUP_DIMS + run * 16),
+                                &mut acc,
+                            )
+                        };
+                    }
+                }
+                let consumed = full_groups * GROUP_BYTES;
+                if consumed < codes.len() && full_groups < groups {
+                    // Zero-pad the final partial code group on the stack.
+                    let mut padded = [0u8; GROUP_BYTES];
+                    padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]);
+                    let runs = unsafe { $unpack(padded.as_ptr()) };
+                    for (run, codes16) in runs.into_iter().enumerate() {
+                        unsafe {
+                            fma16_neon(
+                                codes16,
+                                ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16),
+                                &mut acc,
+                            )
+                        };
+                    }
+                }
+                vaddvq_f32(vaddq_f32(
+                    vaddq_f32(acc[0], acc[1]),
+                    vaddq_f32(acc[2], acc[3]),
+                ))
+            }
+
+            fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 {
+                // SAFETY: NEON is part of the aarch64 baseline.
+                unsafe { $name(ex_query, codes) }
+            }
+        };
+    }
+
+    neon_dot_kernel!(dot_u1_neon, dot_u1_neon_dispatch, unpack_u1, 1, 1);
+    neon_dot_kernel!(dot_u2_neon, dot_u2_neon_dispatch, unpack_u2, 2, 4);
+    neon_dot_kernel!(dot_u3_neon, dot_u3_neon_dispatch, unpack_u3, 3, 4);
+    neon_dot_kernel!(dot_u4_neon, dot_u4_neon_dispatch, unpack_u4, 4, 1);
+    neon_dot_kernel!(dot_u5_neon, dot_u5_neon_dispatch, unpack_u5, 5, 4);
+    neon_dot_kernel!(dot_u6_neon, dot_u6_neon_dispatch, unpack_u6, 6, 4);
+    neon_dot_kernel!(dot_u7_neon, dot_u7_neon_dispatch, unpack_u7, 7, 4);
+    neon_dot_kernel!(dot_u8_neon, dot_u8_neon_dispatch, unpack_u8x16, 8, 1);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::SmallRng;
+    use rand::{Rng, SeedableRng};
+    use rstest::rstest;
+
+    /// Bit-pack code values sequentially (LSB-first), the on-disk ex-code layout.
+    fn pack_sequential(values: &[u8], ex_bits: u8) -> Vec<u8> {
+        let mut out = vec![0u8; (values.len() * ex_bits as usize).div_ceil(8)];
+        for (dim, &value) in values.iter().enumerate() {
+            let bit_offset = dim * ex_bits as usize;
+            let bits = (value as u16) << (bit_offset % 8);
+            out[bit_offset / 8] |= bits as u8;
+            if bits >> 8 != 0 {
+                out[bit_offset / 8 + 1] |= (bits >> 8) as u8;
+            }
+        }
+        out
+    }
+
+    fn kernel_codes(values: &[u8], dim: usize, ex_bits: u8) -> Vec<u8> {
+        debug_assert_eq!(values.len(), dim);
+        let mut out = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)];
+        pack_blocked_row(values, ex_bits, &mut out);
+        out
+    }
+
+    fn available_kernels(ex_bits: u8) -> Vec<(&'static str, ExDotFn)> {
+        // `mut` is only exercised on x86_64 where extra kernels may be pushed.
+        #[allow(unused_mut)]
+        let mut kernels = vec![
+            ("scalar", scalar_kernel(ex_bits)),
+            ("dispatched", ex_dot_kernel(ex_bits)),
+        ];
+        #[cfg(target_arch = "x86_64")]
+        {
+            if std::arch::is_x86_feature_detected!("avx2")
+                && std::arch::is_x86_feature_detected!("fma")
+            {
+                kernels.push(("avx2", x86::avx2_kernel(ex_bits)));
+            }
+            if std::arch::is_x86_feature_detected!("avx512f") {
+                kernels.push(("avx512", x86::avx512_kernel(ex_bits)));
+            }
+        }
+        kernels
+    }
+
+    #[rstest]
+    fn test_ex_dot_matches_reference(
+        #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8,
+        #[values(7, 16, 60, 64, 100, 128, 1024, 1536, 2048)] dim: usize,
+    ) {
+        let mut rng = SmallRng::seed_from_u64(42 + ex_bits as u64 * 1000 + dim as u64);
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+        let values = (0..dim)
+            .map(|_| rng.random_range(0..=max_code))
+            .collect::<Vec<_>>();
+        let query = (0..dim)
+            .map(|_| rng.random_range(-1.0f32..1.0))
+            .collect::<Vec<_>>();
+
+        let expected = query
+            .iter()
+            .zip(values.iter())
+            .map(|(q, &c)| *q as f64 * c as f64)
+            .sum::<f64>();
+
+        let codes = kernel_codes(&values, dim, ex_bits);
+        let mut ex_query = vec![0.0; padded_query_len(dim)];
+        pad_query_into(&query, &mut ex_query);
+
+        let tolerance = 1e-3 * expected.abs().max(1.0);
+        for (name, kernel) in available_kernels(ex_bits) {
+            let actual = kernel(&ex_query, &codes) as f64;
+            assert!(
+                (actual - expected).abs() <= tolerance,
+                "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}"
+            );
+        }
+    }
+
+    #[rstest]
+    fn test_unpack_group_roundtrip(#[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8) {
+        let mut rng = SmallRng::seed_from_u64(7 + ex_bits as u64);
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+        let values = (0..EX_DOT_BLOCK_DIMS)
+            .map(|_| rng.random_range(0..=max_code))
+            .collect::<Vec<_>>();
+        let codes = kernel_codes(&values, EX_DOT_BLOCK_DIMS, ex_bits);
+
+        // Unpacking each kernel group must reproduce the values in natural
+        // dim order.
+        let dims = group_dims(ex_bits);
+        let bytes = group_bytes(ex_bits);
+        let mut unpacked = [0u8; 64];
+        for group in 0..EX_DOT_BLOCK_DIMS / dims {
+            unpack_group(
+                ex_bits,
+                &codes[group * bytes..(group + 1) * bytes],
+                &mut unpacked,
+            );
+            assert_eq!(
+                &unpacked[..dims],
+                &values[group * dims..(group + 1) * dims],
+                "ex_bits={ex_bits} group={group}"
+            );
+        }
+    }
+
+    /// The legacy sequential rows must repack into exactly what the writer
+    /// produces from the unpacked values.
+    #[rstest]
+    fn test_repack_sequential_matches_blocked(
+        #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8,
+        #[values(7, 64, 100, 1536)] dim: usize,
+    ) {
+        let mut rng = SmallRng::seed_from_u64(11 + ex_bits as u64 * 100 + dim as u64);
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+        let values = (0..dim)
+            .map(|_| rng.random_range(0..=max_code))
+            .collect::<Vec<_>>();
+        let seq = pack_sequential(&values, ex_bits);
+
+        let mut repacked = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)];
+        repack_sequential_row(&seq, dim, ex_bits, &mut repacked);
+        assert_eq!(repacked, kernel_codes(&values, dim, ex_bits));
+
+        // For the widths where the sequential layout is already blocked
+        // (modulo trailing padding), the raw row must be a prefix.
+        if sequential_matches_blocked(ex_bits) {
+            assert_eq!(&repacked[..seq.len()], &seq);
+            assert!(repacked[seq.len()..].iter().all(|&byte| byte == 0));
+        }
+    }
+
+    /// Dense dim sweep for the bit-plane widths (3, 5 and 7 bits): every tail
+    /// shape within the 64-dim kernel group, plus multi-group sizes.
+    #[rstest]
+    fn test_ex_dot_plane_widths_dense_dims(#[values(3, 5, 7)] ex_bits: u8) {
+        let mut rng = SmallRng::seed_from_u64(97 + ex_bits as u64);
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
+        for dim in (1..=160).chain([255, 256, 1000, 1536, 2048]) {
+            let values = (0..dim)
+                .map(|_| rng.random_range(0..=max_code))
+                .collect::<Vec<_>>();
+            let query = (0..dim)
+                .map(|_| rng.random_range(-1.0f32..1.0))
+                .collect::<Vec<_>>();
+            let expected = query
+                .iter()
+                .zip(values.iter())
+                .map(|(q, &c)| *q as f64 * c as f64)
+                .sum::<f64>();
+
+            let codes = kernel_codes(&values, dim, ex_bits);
+            let mut ex_query = vec![0.0; padded_query_len(dim)];
+            pad_query_into(&query, &mut ex_query);
+            let tolerance = 1e-3 * expected.abs().max(1.0);
+            for (name, kernel) in available_kernels(ex_bits) {
+                let actual = kernel(&ex_query, &codes) as f64;
+                assert!(
+                    (actual - expected).abs() <= tolerance,
+                    "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_pad_query_pads_with_zeros() {
+        let query = vec![1.0f32; 100];
+        let mut padded = vec![f32::NAN; padded_query_len(query.len())];
+        pad_query_into(&query, &mut padded);
+        assert_eq!(padded.len(), 128);
+        assert_eq!(&padded[..100], &query[..]);
+        assert!(padded[100..].iter().all(|&value| value == 0.0));
+    }
+}
diff --git a/rust/lance-index/src/vector/bq/prune.rs b/rust/lance-index/src/vector/bq/prune.rs
new file mode 100644
index 00000000000..e67ab6642b8
--- /dev/null
+++ b/rust/lance-index/src/vector/bq/prune.rs
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! SIMD kernels for the RaBitQ top-k lower-bound pruning scan.
+//!
+//! Multi-bit IVF_RQ search gates the exact ex-code rerank with a per-row
+//! distance lower bound: after the binary FastScan fills the per-row binary
+//! inner products, every row of the partition is classified against the query
+//! upper bound and the current top-k heap threshold, and only the survivors
+//! (typically well under 1%) are reranked. The classification is the per-row
+//! formula of `RabitDistCalculator::raw_query_lower_bound`:
+//!
+//! ```text
+//! lower_bound = (binary_ip - 0.5 * sum_q) * scale_factor
+//!             + add_factor + query_factor
+//!             - error_factor * query_error
+//! ```
+//!
+//! These kernels evaluate the formula and both comparisons for
+//! [`PRUNE_LANES`] rows at a time, returning bit masks instead of values so
+//! the caller can skip whole groups (the overwhelmingly common case) and run
+//! the existing scalar rerank only for the surviving lanes.
+//!
+//! Correctness contract:
+//!
+//! - The lower bound is computed with exactly the operation order of the
+//!   scalar helper — multiplies and adds, never FMA. A fused multiply-add
+//!   rounds differently, which could prune a row the scalar code would have
+//!   kept; with bit-identical lower bounds the masks reproduce the scalar
+//!   `>=` decisions exactly, keeping heap contents and prune-stats counters
+//!   unchanged.
+//! - Comparisons use ordered-quiet GE predicates (`_CMP_GE_OQ`), matching
+//!   scalar `>=`: a NaN lower bound is never pruned and falls through to the
+//!   exact rerank.
+//! - The heap threshold may be a stale snapshot (it only ever tightens); the
+//!   caller re-checks surviving lanes against live values, so a stale
+//!   threshold can only over-select survivors, never wrongly prune.
+
+use std::sync::LazyLock;
+
+/// Rows classified per kernel invocation.
+pub const PRUNE_LANES: usize = 16;
+
+/// Per-query constants of the lower-bound formula, mirroring
+/// `RabitDistCalculator::raw_query_lower_bound` term by term.
+#[derive(Debug, Clone, Copy)]
+pub struct LowerBoundTerms {
+    /// `0.5 * sum_q`, subtracted from the binary inner product.
+    pub half_sum_q: f32,
+    pub query_factor: f32,
+    pub query_error: f32,
+}
+
+/// Classify [`PRUNE_LANES`] rows against the pruning bounds.
+///
+/// Arguments are the per-row binary inner products, scale factors, add
+/// factors, and error factors, followed by the formula constants, the query
+/// upper bound, and the heap threshold (`None` while the heap is not full,
+/// which disables the heap mask).
+///
+/// Returns `(pruned_upper_bound, pruned_heap)` masks: bit `i` of
+/// `pruned_upper_bound` is set when `lower_bound[i] >= upper_bound`, and bit
+/// `i` of `pruned_heap` is set when the row is not already pruned by the
+/// upper bound and `lower_bound[i] >= heap_threshold`. Surviving rows are the
+/// zero bits of the OR of both masks.
+pub type PruneMaskFn = fn(
+    &[f32; PRUNE_LANES],
+    &[f32; PRUNE_LANES],
+    &[f32; PRUNE_LANES],
+    &[f32; PRUNE_LANES],
+    LowerBoundTerms,
+    f32,
+    Option<f32>,
+) -> (u16, u16);
+
+/// Kernel for the running CPU. Selection runs once, on the first call, and
+/// the returned function pointer may be held by the caller per partition.
+pub fn prune_mask_kernel() -> PruneMaskFn {
+    static SELECTED: LazyLock<PruneMaskFn> = LazyLock::new(select_prune_mask_kernel);
+    *LazyLock::force(&SELECTED)
+}
+
+fn select_prune_mask_kernel() -> PruneMaskFn {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::arch::is_x86_feature_detected!("avx512f") {
+            return x86::prune_masks_avx512_dispatch;
+        }
+        if std::arch::is_x86_feature_detected!("avx2") {
+            return x86::prune_masks_avx2_dispatch;
+        }
+    }
+    // On aarch64 the plain 16-wide loop auto-vectorizes to NEON (part of the
+    // baseline), so no dedicated kernel is needed.
+    prune_masks_portable
+}
+
+/// Portable implementation; also the reference for the SIMD kernels.
+fn prune_masks_portable(
+    dists: &[f32; PRUNE_LANES],
+    scale_factors: &[f32; PRUNE_LANES],
+    add_factors: &[f32; PRUNE_LANES],
+    error_factors: &[f32; PRUNE_LANES],
+    terms: LowerBoundTerms,
+    upper_bound: f32,
+    heap_threshold: Option<f32>,
+) -> (u16, u16) {
+    let mut lower_bounds = [0.0f32; PRUNE_LANES];
+    for lane in 0..PRUNE_LANES {
+        lower_bounds[lane] = ((dists[lane] - terms.half_sum_q) * scale_factors[lane]
+            + add_factors[lane]
+            + terms.query_factor)
+            - error_factors[lane] * terms.query_error;
+    }
+    let mut pruned_upper_bound = 0u16;
+    for (lane, lower_bound) in lower_bounds.iter().enumerate() {
+        pruned_upper_bound |= u16::from(*lower_bound >= upper_bound) << lane;
+    }
+    let mut pruned_heap = 0u16;
+    if let Some(threshold) = heap_threshold {
+        for (lane, lower_bound) in lower_bounds.iter().enumerate() {
+            pruned_heap |= u16::from(*lower_bound >= threshold) << lane;
+        }
+        pruned_heap &= !pruned_upper_bound;
+    }
+    (pruned_upper_bound, pruned_heap)
+}
+
+#[cfg(target_arch = "x86_64")]
+mod x86 {
+    use super::{LowerBoundTerms, PRUNE_LANES};
+    use std::arch::x86_64::*;
+
+    /// Lower bounds for 8 lanes with the scalar operation order (no FMA).
+    #[inline]
+    #[target_feature(enable = "avx")]
+    fn lower_bounds_avx(
+        dists: __m256,
+        scale_factors: __m256,
+        add_factors: __m256,
+        error_factors: __m256,
+        half_sum_q: __m256,
+        query_factor: __m256,
+        query_error: __m256,
+    ) -> __m256 {
+        let binary_distance = _mm256_add_ps(
+            _mm256_add_ps(
+                _mm256_mul_ps(_mm256_sub_ps(dists, half_sum_q), scale_factors),
+                add_factors,
+            ),
+            query_factor,
+        );
+        _mm256_sub_ps(binary_distance, _mm256_mul_ps(error_factors, query_error))
+    }
+
+    #[inline]
+    #[target_feature(enable = "avx")]
+    fn ge_mask_avx(lower_bounds_lo: __m256, lower_bounds_hi: __m256, bound: f32) -> u16 {
+        let bound = _mm256_set1_ps(bound);
+        let lo = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_lo, bound));
+        let hi = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_hi, bound));
+        (lo | (hi << 8)) as u16
+    }
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn prune_masks_avx2(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+    ) -> (u16, u16) {
+        let half_sum_q = _mm256_set1_ps(terms.half_sum_q);
+        let query_factor = _mm256_set1_ps(terms.query_factor);
+        let query_error = _mm256_set1_ps(terms.query_error);
+        // SAFETY: the array references guarantee 16 readable floats each.
+        let lower_bounds_lo = unsafe {
+            lower_bounds_avx(
+                _mm256_loadu_ps(dists.as_ptr()),
+                _mm256_loadu_ps(scale_factors.as_ptr()),
+                _mm256_loadu_ps(add_factors.as_ptr()),
+                _mm256_loadu_ps(error_factors.as_ptr()),
+                half_sum_q,
+                query_factor,
+                query_error,
+            )
+        };
+        let lower_bounds_hi = unsafe {
+            lower_bounds_avx(
+                _mm256_loadu_ps(dists.as_ptr().add(8)),
+                _mm256_loadu_ps(scale_factors.as_ptr().add(8)),
+                _mm256_loadu_ps(add_factors.as_ptr().add(8)),
+                _mm256_loadu_ps(error_factors.as_ptr().add(8)),
+                half_sum_q,
+                query_factor,
+                query_error,
+            )
+        };
+        let pruned_upper_bound = ge_mask_avx(lower_bounds_lo, lower_bounds_hi, upper_bound);
+        let pruned_heap = match heap_threshold {
+            Some(threshold) => {
+                ge_mask_avx(lower_bounds_lo, lower_bounds_hi, threshold) & !pruned_upper_bound
+            }
+            None => 0,
+        };
+        (pruned_upper_bound, pruned_heap)
+    }
+
+    pub(super) fn prune_masks_avx2_dispatch(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+    ) -> (u16, u16) {
+        // SAFETY: only selected when AVX2 was detected.
+        unsafe {
+            prune_masks_avx2(
+                dists,
+                scale_factors,
+                add_factors,
+                error_factors,
+                terms,
+                upper_bound,
+                heap_threshold,
+            )
+        }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn prune_masks_avx512(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+    ) -> (u16, u16) {
+        // SAFETY: the array references guarantee 16 readable floats each.
+        let (dists, scale_factors, add_factors, error_factors) = unsafe {
+            (
+                _mm512_loadu_ps(dists.as_ptr()),
+                _mm512_loadu_ps(scale_factors.as_ptr()),
+                _mm512_loadu_ps(add_factors.as_ptr()),
+                _mm512_loadu_ps(error_factors.as_ptr()),
+            )
+        };
+        let binary_distance = _mm512_add_ps(
+            _mm512_add_ps(
+                _mm512_mul_ps(
+                    _mm512_sub_ps(dists, _mm512_set1_ps(terms.half_sum_q)),
+                    scale_factors,
+                ),
+                add_factors,
+            ),
+            _mm512_set1_ps(terms.query_factor),
+        );
+        let lower_bounds = _mm512_sub_ps(
+            binary_distance,
+            _mm512_mul_ps(error_factors, _mm512_set1_ps(terms.query_error)),
+        );
+        let pruned_upper_bound =
+            _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(upper_bound));
+        let pruned_heap = match heap_threshold {
+            Some(threshold) => {
+                _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(threshold))
+                    & !pruned_upper_bound
+            }
+            None => 0,
+        };
+        (pruned_upper_bound, pruned_heap)
+    }
+
+    pub(super) fn prune_masks_avx512_dispatch(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+    ) -> (u16, u16) {
+        // SAFETY: only selected when AVX-512F was detected.
+        unsafe {
+            prune_masks_avx512(
+                dists,
+                scale_factors,
+                add_factors,
+                error_factors,
+                terms,
+                upper_bound,
+                heap_threshold,
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::SmallRng;
+    use rand::{Rng, SeedableRng};
+
+    fn available_kernels() -> Vec<(&'static str, PruneMaskFn)> {
+        // `mut` is only exercised on x86_64 where extra kernels may be pushed.
+        #[allow(unused_mut)]
+        let mut kernels = vec![
+            ("portable", prune_masks_portable as PruneMaskFn),
+            ("dispatched", prune_mask_kernel()),
+        ];
+        #[cfg(target_arch = "x86_64")]
+        {
+            if std::arch::is_x86_feature_detected!("avx2") {
+                kernels.push(("avx2", x86::prune_masks_avx2_dispatch));
+            }
+            if std::arch::is_x86_feature_detected!("avx512f") {
+                kernels.push(("avx512", x86::prune_masks_avx512_dispatch));
+            }
+        }
+        kernels
+    }
+
+    /// Per-lane reference mirroring `raw_query_lower_bound` and the scalar
+    /// pruning checks of the top-k scan.
+    fn reference_masks(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+    ) -> (u16, u16) {
+        let mut pruned_upper_bound = 0u16;
+        let mut pruned_heap = 0u16;
+        for lane in 0..PRUNE_LANES {
+            let lower_bound = (dists[lane] - terms.half_sum_q) * scale_factors[lane]
+                + add_factors[lane]
+                + terms.query_factor
+                - error_factors[lane] * terms.query_error;
+            if lower_bound >= upper_bound {
+                pruned_upper_bound |= 1 << lane;
+            } else if heap_threshold.is_some_and(|threshold| lower_bound >= threshold) {
+                pruned_heap |= 1 << lane;
+            }
+        }
+        (pruned_upper_bound, pruned_heap)
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    fn assert_kernels_match_reference(
+        dists: &[f32; PRUNE_LANES],
+        scale_factors: &[f32; PRUNE_LANES],
+        add_factors: &[f32; PRUNE_LANES],
+        error_factors: &[f32; PRUNE_LANES],
+        terms: LowerBoundTerms,
+        upper_bound: f32,
+        heap_threshold: Option<f32>,
+        case: &str,
+    ) {
+        let expected = reference_masks(
+            dists,
+            scale_factors,
+            add_factors,
+            error_factors,
+            terms,
+            upper_bound,
+            heap_threshold,
+        );
+        for (name, kernel) in available_kernels() {
+            let actual = kernel(
+                dists,
+                scale_factors,
+                add_factors,
+                error_factors,
+                terms,
+                upper_bound,
+                heap_threshold,
+            );
+            assert_eq!(
+                actual, expected,
+                "kernel={name} case={case}: masks {actual:04x?} != {expected:04x?}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_prune_masks_match_reference_on_random_inputs() {
+        let mut rng = SmallRng::seed_from_u64(42);
+        for round in 0..200 {
+            let mut dists = [0.0f32; PRUNE_LANES];
+            let mut scale_factors = [0.0f32; PRUNE_LANES];
+            let mut add_factors = [0.0f32; PRUNE_LANES];
+            let mut error_factors = [0.0f32; PRUNE_LANES];
+            for lane in 0..PRUNE_LANES {
+                dists[lane] = rng.random_range(-100.0f32..100.0);
+                scale_factors[lane] = rng.random_range(-2.0f32..2.0);
+                add_factors[lane] = rng.random_range(-10.0f32..10.0);
+                error_factors[lane] = rng.random_range(0.0f32..5.0);
+            }
+            let terms = LowerBoundTerms {
+                half_sum_q: rng.random_range(-50.0f32..50.0),
+                query_factor: rng.random_range(-10.0f32..10.0),
+                query_error: rng.random_range(0.0f32..2.0),
+            };
+            let upper_bound = rng.random_range(-50.0f32..50.0);
+            let heap_threshold = if round % 3 == 0 {
+                None
+            } else {
+                Some(rng.random_range(-50.0f32..50.0))
+            };
+            assert_kernels_match_reference(
+                &dists,
+                &scale_factors,
+                &add_factors,
+                &error_factors,
+                terms,
+                upper_bound,
+                heap_threshold,
+                &format!("random round {round}"),
+            );
+        }
+    }
+
+    #[test]
+    fn test_prune_masks_exact_boundaries() {
+        // With scale=1, add=0, error_factor=0, half_sum_q=0, query_factor=0 the
+        // lower bound is the input itself, so bounds can sit exactly on lanes.
+        let dists: [f32; PRUNE_LANES] = std::array::from_fn(|lane| lane as f32);
+        let scale_factors = [1.0f32; PRUNE_LANES];
+        let add_factors = [0.0f32; PRUNE_LANES];
+        let error_factors = [0.0f32; PRUNE_LANES];
+        let terms = LowerBoundTerms {
+            half_sum_q: 0.0,
+            query_factor: 0.0,
+            query_error: 1.0,
+        };
+        // Equality must prune (scalar uses `>=`): lanes 3.. hit the upper
+        // bound, lanes 1..3 hit only the heap threshold.
+        let (pruned_upper_bound, pruned_heap) = prune_masks_portable(
+            &dists,
+            &scale_factors,
+            &add_factors,
+            &error_factors,
+            terms,
+            3.0,
+            Some(1.0),
+        );
+        assert_eq!(pruned_upper_bound, 0xfff8);
+        assert_eq!(pruned_heap, 0x0006);
+        assert_kernels_match_reference(
+            &dists,
+            &scale_factors,
+            &add_factors,
+            &error_factors,
+            terms,
+            3.0,
+            Some(1.0),
+            "exact boundaries",
+        );
+        // No heap threshold: only the upper-bound mask is set.
+        assert_kernels_match_reference(
+            &dists,
+            &scale_factors,
+            &add_factors,
+            &error_factors,
+            terms,
+            3.0,
+            None,
+            "no heap threshold",
+        );
+    }
+
+    #[test]
+    fn test_prune_masks_nan_and_infinity_semantics() {
+        let mut dists = [0.0f32; PRUNE_LANES];
+        dists[0] = f32::NAN;
+        dists[1] = f32::INFINITY;
+        dists[2] = f32::NEG_INFINITY;
+        dists[3] = 1.0;
+        let mut scale_factors = [1.0f32; PRUNE_LANES];
+        scale_factors[4] = f32::NAN;
+        let add_factors = [0.0f32; PRUNE_LANES];
+        let mut error_factors = [0.0f32; PRUNE_LANES];
+        error_factors[5] = f32::INFINITY;
+        let terms = LowerBoundTerms {
+            half_sum_q: 0.0,
+            query_factor: 0.0,
+            query_error: 1.0,
+        };
+        for (upper_bound, heap_threshold) in [
+            (0.5, Some(0.0)),
+            (f32::INFINITY, Some(f32::NEG_INFINITY)),
+            (f32::NAN, Some(f32::NAN)),
+            (0.5, None),
+        ] {
+            assert_kernels_match_reference(
+                &dists,
+                &scale_factors,
+                &add_factors,
+                &error_factors,
+                terms,
+                upper_bound,
+                heap_threshold,
+                &format!("special values ub={upper_bound} thr={heap_threshold:?}"),
+            );
+        }
+        // NaN lower bounds (lane 0 via a NaN binary inner product, lane 4 via
+        // a NaN scale factor) must never be pruned by either mask.
+        let (pruned_upper_bound, pruned_heap) = prune_masks_portable(
+            &dists,
+            &scale_factors,
+            &add_factors,
+            &error_factors,
+            terms,
+            0.5,
+            Some(0.0),
+        );
+        assert_eq!(pruned_upper_bound & 0b1_0001, 0);
+        assert_eq!(pruned_heap & 0b1_0001, 0);
+    }
+}
diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs
index bd70f176c5d..2f4fe69792a 100644
--- a/rust/lance-index/src/vector/bq/storage.rs
+++ b/rust/lance-index/src/vector/bq/storage.rs
@@ -17,7 +17,7 @@ use arrow_array::{
 use arrow_schema::{DataType, Field, SchemaRef};
 use async_trait::async_trait;
 use bytes::{Bytes, BytesMut};
-use itertools::Itertools;
+use itertools::{Itertools, izip};
 use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray, RecordBatchExt};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::{Error, ROW_ID, Result};
@@ -41,6 +41,14 @@ use serde::{Deserialize, Serialize};
 use crate::frag_reuse::FragReuseIndex;
 use crate::pb;
 use crate::vector::ApproxMode;
+use crate::vector::bq::dist_table_quant::{
+    DistTableDequant, quantize_dist_table_into, quantize_dist_table_u16_into,
+};
+use crate::vector::bq::ex_dot::{
+    EX_DOT_BLOCK_DIMS, ExDotFn, blocked_ex_code_bytes, ex_dot_kernel, pad_query_into,
+    padded_query_len, repack_sequential_row, sequential_matches_blocked,
+};
+use crate::vector::bq::prune::{LowerBoundTerms, PRUNE_LANES, prune_mask_kernel};
 use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place};
 use crate::vector::bq::transform::{
     ADD_FACTORS_COLUMN, ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN,
@@ -59,7 +67,14 @@ use crate::vector::storage::{
 
 pub const RABIT_METADATA_KEY: &str = "lance:rabit";
 pub const RABIT_CODE_COLUMN: &str = "_rabit_codes";
+/// Legacy ex-code column: sequential LSB-first bit stream per row. Read-only;
+/// rows are repacked into the blocked layout at load time.
 pub const RABIT_EX_CODE_COLUMN: &str = "__ex_codes";
+/// Ex-code column in the blocked layout consumed by the ex-dot kernels (see
+/// `ex_dot` module docs). Indexes written with this column cannot be read by
+/// older versions, which fail with a missing-column error instead of
+/// misinterpreting the bytes.
+pub const RABIT_BLOCKED_EX_CODE_COLUMN: &str = "__blocked_ex_codes";
 pub const SEGMENT_LENGTH: usize = 4;
 pub const SEGMENT_NUM_CODES: usize = 1 << SEGMENT_LENGTH;
 const RABIT_PRUNE_STATS_ENV: &str = "LANCE_RQ_PRUNE_STATS";
@@ -122,16 +137,28 @@ fn emit_rabit_prune_stats(message: &str) {
     );
 }
 
-fn record_rabit_prune_stats(
+/// Per-scan tallies of the raw-query lower-bound gating, reported through
+/// `record_rabit_prune_stats`.
+#[derive(Default)]
+struct RabitPruneCounters {
     candidates: usize,
     pruned_upper_bound: usize,
     pruned_heap: usize,
     exact: usize,
     exact_rejected: usize,
-) {
+}
+
+fn record_rabit_prune_stats(counters: &RabitPruneCounters) {
     if !rabit_prune_stats_enabled() {
         return;
     }
+    let RabitPruneCounters {
+        candidates,
+        pruned_upper_bound,
+        pruned_heap,
+        exact,
+        exact_rejected,
+    } = *counters;
 
     let stats = RABIT_PRUNE_STATS.get_or_init(RabitPruneStats::default);
     let calls = stats.calls.fetch_add(1, Ordering::Relaxed) + 1;
@@ -210,10 +237,10 @@ pub fn rabit_ex_code_field(rotated_dim: usize, num_bits: u8) -> Result<Option<Fi
         return Ok(None);
     }
     Ok(Some(Field::new(
-        RABIT_EX_CODE_COLUMN,
+        RABIT_BLOCKED_EX_CODE_COLUMN,
         DataType::FixedSizeList(
             Arc::new(Field::new("item", DataType::UInt8, true)),
-            rabit_ex_code_bytes(rotated_dim, ex_bits)? as i32,
+            blocked_ex_code_bytes(rotated_dim, ex_bits) as i32,
         ),
         true,
     )))
@@ -349,11 +376,6 @@ impl RabitQuantizationMetadata {
         let code_dim = self.code_dim();
         let ex_bits = rabit_ex_bits(self.num_bits)?;
         let dist_table_len = code_dim * 4;
-        let ex_dist_table_len = if ex_bits == 0 {
-            0
-        } else {
-            code_dim * (1usize << ex_bits)
-        };
 
         let mut rotated_query = vec![0.0; code_dim];
         self.rotate_vector_with_residual_into(query, None, &mut rotated_query);
@@ -361,8 +383,13 @@ impl RabitQuantizationMetadata {
         let mut dist_table = vec![0.0; dist_table_len];
         build_dist_table_direct_into::<Float32Type>(&rotated_query, &mut dist_table);
 
-        let mut ex_dist_table = vec![0.0; ex_dist_table_len];
-        build_ex_dist_table_direct_into(&rotated_query, ex_bits, &mut ex_dist_table);
+        // The kernels consume the rotated query directly; a zero-padded copy
+        // is only needed when the rotated dim is not block-aligned.
+        let mut ex_query = Vec::new();
+        if ex_bits > 0 && !code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) {
+            ex_query.resize(padded_query_len(code_dim), 0.0);
+            pad_query_into(&rotated_query, &mut ex_query);
+        }
 
         let sum_q = rotated_query.iter().copied().sum();
         Ok(RabitRawQueryContext {
@@ -370,7 +397,7 @@ impl RabitQuantizationMetadata {
             ex_bits,
             rotated_query,
             dist_table,
-            ex_dist_table,
+            ex_query,
             sum_q,
         })
     }
@@ -462,6 +489,10 @@ pub struct RabitQuantizationStorage {
     add_factors: Float32Array,
     scale_factors: Float32Array,
     error_factors: Option<Float32Array>,
+    // ex codes in the blocked kernel layout; always aliases the batch column
+    // (legacy sequential batches are normalized at load, replacing the
+    // sequential column with the repacked one, so rewrites emit the blocked
+    // format).
     ex_codes: Option<FixedSizeListArray>,
     packed_ex_codes: Option<FixedSizeListArray>,
     ex_add_factors: Option<Float32Array>,
@@ -560,12 +591,17 @@ impl RabitQuantizationStorage {
         let RabitDistCalculatorParts {
             dim,
             dist_table,
-            ex_dist_table,
+            ex_query,
             sum_q,
             query_factor,
             query_error,
             approx_mode,
         } = parts;
+        let ex_code_len = self
+            .ex_codes
+            .as_ref()
+            .map(|codes| codes.value_length() as usize)
+            .unwrap_or_default();
         let ex_codes = self
             .ex_codes
             .as_ref()
@@ -579,10 +615,11 @@ impl RabitQuantizationStorage {
             self.metadata.num_bits,
             self.metadata.query_estimator,
             dist_table,
-            ex_dist_table,
+            ex_query,
             sum_q,
             self.codes.values().as_primitive::<UInt8Type>().values(),
             ex_codes,
+            ex_code_len,
             self.add_factors.values(),
             self.scale_factors.values(),
             self.error_factors
@@ -767,25 +804,56 @@ fn copy_subtract_f32(lhs: &[f32], rhs: &[f32], output: &mut [f32]) {
 struct RabitDistCalculatorParts<'a> {
     dim: usize,
     dist_table: Cow<'a, [f32]>,
-    ex_dist_table: Cow<'a, [f32]>,
+    ex_query: Cow<'a, [f32]>,
     sum_q: f32,
     query_factor: f32,
     query_error: f32,
     approx_mode: ApproxMode,
 }
 
+/// Loop-invariant inputs of the raw-query multi-bit top-k scans: the row
+/// count, the resolved ex-code state for exact reranking, and the query
+/// bounds.
+struct RawQueryTopkContext<'a> {
+    n: usize,
+    k: usize,
+    ex_bits: u8,
+    ex_codes: &'a [u8],
+    ex_add_factors: &'a [f32],
+    ex_scale_factors: &'a [f32],
+    query_lower_bound: f32,
+    query_upper_bound: f32,
+}
+
+/// Choose the slice passed to the ex-dot kernels. When the rotated query
+/// length is already a multiple of `EX_DOT_BLOCK_DIMS` it is used directly;
+/// otherwise the caller-provided `padded` slice is used.
+fn kernel_query<'a>(rotated_query: &'a [f32], padded: &'a [f32]) -> &'a [f32] {
+    let block_aligned = rotated_query.len().is_multiple_of(EX_DOT_BLOCK_DIMS);
+    if !block_aligned {
+        return padded;
+    }
+    rotated_query
+}
+
 pub struct RabitDistCalculator<'a> {
     dim: usize,
     num_bits: u8,
     query_estimator: RabitQueryEstimator,
     // n * d / 8 binary-code bytes
     codes: &'a [u8],
+    // per-row ex codes in the blocked kernel layout
     ex_codes: Option<&'a [u8]>,
+    // bytes per ex-code row; legacy rows for layout-compatible widths may be
+    // shorter than the blocked size, which the kernels treat as zero padding
+    ex_code_len: usize,
     // this is a flattened 2D array of size d/4 * 16,
     // we split the query codes into d/4 chunks, each chunk is with 4 elements,
     // then dist_table[i][j] is the distance between the i-th query code and the code j
     dist_table: Cow<'a, [f32]>,
-    ex_dist_table: Cow<'a, [f32]>,
+    // the rotated query, zero-padded to a 64-dim multiple when needed; also
+    // the source for the FastScan ex LUT on the legacy bypass path
+    ex_query: Cow<'a, [f32]>,
+    ex_dot: Option<ExDotFn>,
     add_factors: &'a [f32],
     scale_factors: &'a [f32],
     error_factors: Option<&'a [f32]>,
@@ -807,10 +875,11 @@ impl<'a> RabitDistCalculator<'a> {
         num_bits: u8,
         query_estimator: RabitQueryEstimator,
         dist_table: Cow<'a, [f32]>,
-        ex_dist_table: Cow<'a, [f32]>,
+        ex_query: Cow<'a, [f32]>,
         sum_q: f32,
         codes: &'a [u8],
         ex_codes: Option<&'a [u8]>,
+        ex_code_len: usize,
         add_factors: &'a [f32],
         scale_factors: &'a [f32],
         error_factors: Option<&'a [f32]>,
@@ -821,14 +890,17 @@ impl<'a> RabitDistCalculator<'a> {
         query_error: f32,
         approx_mode: ApproxMode,
     ) -> Self {
+        let ex_dot = (num_bits > 1).then(|| ex_dot_kernel(num_bits - 1));
         Self {
             dim,
             num_bits,
             query_estimator,
             codes,
             ex_codes,
+            ex_code_len,
             dist_table,
-            ex_dist_table,
+            ex_query,
+            ex_dot,
             add_factors,
             scale_factors,
             error_factors,
@@ -843,6 +915,34 @@ impl<'a> RabitDistCalculator<'a> {
         }
     }
 
+    /// Dot product of the query with row `id`'s ex codes, i.e.
+    /// `sum_d query[d] * ex_code[d]`, computed by the stored ex-dot kernel.
+    #[inline]
+    fn ex_code_dot(&self, ex_codes: &[u8], id: usize) -> f32 {
+        let Some(kernel) = self.ex_dot else {
+            panic!("raw-query multi-bit RQ requires an ex-dot kernel");
+        };
+        let stride = self.ex_code_len;
+        let row = &ex_codes[id * stride..(id + 1) * stride];
+        kernel(self.ex_query.as_ref(), row)
+    }
+
+    /// Fill `dists[0..n]` with exact per-row binary distances computed
+    /// directly from the f32 dist table — the fallback when the quantized
+    /// reconstruction scale would be non-finite ([`DistTableDequant::Exact`]).
+    ///
+    /// Previous contents of `dists` are dropped; afterwards it holds one
+    /// distance per row id in `0..n`, in row order.
+    fn fill_exact_binary_distances(&self, n: usize, code_len: usize, dists: &mut Vec<f32>) {
+        dists.clear();
+        // `extend` initializes each slot as it grows the vector, so no
+        // element is ever exposed uninitialized and no `unsafe` `set_len`
+        // is needed to reach the final length of `n`.
+        dists.extend((0..n).map(|id| {
+            compute_single_rq_distance(self.codes, id, n, code_len, &self.dist_table)
+        }));
+    }
+
     #[allow(clippy::uninit_vec)]
     fn binary_distances_with_scratch(
         &self,
@@ -864,7 +964,16 @@ impl<'a> RabitDistCalculator<'a> {
             );
         }
 
-        let (qmin, qmax) = quantize_dist_table_into(&self.dist_table, quantized_dists_table);
+        let (qmin, qmax) = match quantize_dist_table_into(&self.dist_table, quantized_dists_table) {
+            DistTableDequant::Affine { qmin, qmax } => (qmin, qmax),
+            DistTableDequant::Exact => {
+                // The affine reconstruction would be non-finite; compute every
+                // binary distance exactly and report no SIMD rows so the
+                // ex-rerank caller takes the per-row path for all of them.
+                self.fill_exact_binary_distances(n, code_len, dists);
+                return 0;
+            }
+        };
         let remainder = n % BATCH_SIZE;
         let simd_len = n - remainder;
         quantized_dists.clear();
@@ -924,7 +1033,16 @@ impl<'a> RabitDistCalculator<'a> {
         hacc_dist_table: &mut Vec<u8>,
         quantized_dists: &mut Vec<u32>,
     ) -> usize {
-        let (qmin, qmax) = quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table);
+        let (qmin, qmax) =
+            match quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table) {
+                DistTableDequant::Affine { qmin, qmax } => (qmin, qmax),
+                DistTableDequant::Exact => {
+                    // See binary_distances_with_scratch: non-finite affine
+                    // scale falls back to exact per-row distances.
+                    self.fill_exact_binary_distances(n, code_len, dists);
+                    return 0;
+                }
+            };
         simd::dist_table::transfer_4bit_dist_table_u16(quantized_dist_table, hacc_dist_table);
         let remainder = n % BATCH_SIZE;
         let simd_len = n - remainder;
@@ -1030,8 +1148,6 @@ impl<'a> RabitDistCalculator<'a> {
         let ex_scale_factors = self
             .ex_scale_factors
             .expect("raw-query multi-bit RQ requires ex scale factors");
-        let ex_code_len =
-            rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated");
         let code_scale = (1u32 << ex_bits) as f32;
         let code_bias = -(code_scale - 0.5);
 
@@ -1039,12 +1155,11 @@ impl<'a> RabitDistCalculator<'a> {
             self.packed_ex_codes
                 .map(|packed_ex_codes| {
                     let fastscan_len = simd_len;
-                    let fastscan_code_len = ex_fastscan_code_len(self.dim, ex_bits)
-                        .expect("RabitQ num_bits should be validated");
+                    let fastscan_code_len = self.ex_code_len;
                     let (qmin, qmax, quantization_max) = quantize_ex_fastscan_dist_table_into(
-                        self.dim,
                         ex_bits,
-                        &self.ex_dist_table,
+                        self.ex_code_len,
+                        self.ex_query.as_ref(),
                         quantized_dists_table,
                     );
                     quantized_dists.clear();
@@ -1088,14 +1203,7 @@ impl<'a> RabitDistCalculator<'a> {
             .enumerate()
             .skip(fastscan_len)
             .for_each(|(id, dist)| {
-                let ex_dist = compute_single_rq_ex_distance(
-                    ex_codes,
-                    id,
-                    ex_code_len,
-                    ex_bits,
-                    self.dim,
-                    &self.ex_dist_table,
-                );
+                let ex_dist = self.ex_code_dot(ex_codes, id);
                 let full_dot = code_scale * *dist + ex_dist + code_bias * self.sum_q;
                 *dist = full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor;
             });
@@ -1121,44 +1229,37 @@ impl<'a> RabitDistCalculator<'a> {
         id: usize,
         binary_ip: f32,
         ex_bits: u8,
-        ex_code_len: usize,
         ex_codes: &[u8],
         ex_add_factors: &[f32],
         ex_scale_factors: &[f32],
     ) -> f32 {
-        let ex_dist = compute_single_rq_ex_distance(
-            ex_codes,
-            id,
-            ex_code_len,
-            ex_bits,
-            self.dim,
-            &self.ex_dist_table,
-        );
+        let ex_dist = self.ex_code_dot(ex_codes, id);
         let code_bias = -((1u32 << ex_bits) as f32 - 0.5);
         let full_dot = (1u32 << ex_bits) as f32 * binary_ip + ex_dist + code_bias * self.sum_q;
         full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor
     }
 
+    /// Compute the binary inner products into `dists` and resolve the inputs
+    /// shared by the raw-query multi-bit top-k scans. Returns `None` when the
+    /// partition has no rows.
     #[allow(clippy::too_many_arguments)]
-    fn accumulate_raw_query_multi_bit_topk_with_scratch(
+    fn raw_query_multi_bit_topk_context(
         &self,
         k: usize,
         lower_bound: Option<f32>,
         upper_bound: Option<f32>,
-        row_ids: impl Iterator<Item = (usize, u64)>,
-        res: &mut BinaryHeap<OrderedNode<u64>>,
         dists: &mut Vec<f32>,
         quantized_dists: &mut Vec<u16>,
         quantized_dists_table: &mut Vec<u8>,
         hacc_quantized_dists: &mut Vec<u32>,
-    ) {
+    ) -> Option<RawQueryTopkContext<'_>> {
         let code_len = rabit_binary_code_bytes(self.dim);
         let n = self.codes.len() / code_len;
         if n == 0 {
             dists.clear();
             quantized_dists.clear();
             hacc_quantized_dists.clear();
-            return;
+            return None;
         }
 
         self.binary_distances_with_scratch(
@@ -1170,77 +1271,233 @@ impl<'a> RabitDistCalculator<'a> {
             hacc_quantized_dists,
         );
 
-        let ex_bits = self.num_bits - 1;
-        let ex_codes = self
-            .ex_codes
-            .expect("raw-query multi-bit RQ requires ex codes");
-        let ex_add_factors = self
-            .ex_add_factors
-            .expect("raw-query multi-bit RQ requires ex add factors");
-        let ex_scale_factors = self
-            .ex_scale_factors
-            .expect("raw-query multi-bit RQ requires ex scale factors");
-        let ex_code_len =
-            rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated");
-        let query_lower_bound = lower_bound.unwrap_or(f32::MIN);
-        let query_upper_bound = upper_bound.unwrap_or(f32::MAX);
+        Some(RawQueryTopkContext {
+            n,
+            k,
+            ex_bits: self.num_bits - 1,
+            ex_codes: self
+                .ex_codes
+                .expect("raw-query multi-bit RQ requires ex codes"),
+            ex_add_factors: self
+                .ex_add_factors
+                .expect("raw-query multi-bit RQ requires ex add factors"),
+            ex_scale_factors: self
+                .ex_scale_factors
+                .expect("raw-query multi-bit RQ requires ex scale factors"),
+            query_lower_bound: lower_bound.unwrap_or(f32::MIN),
+            query_upper_bound: upper_bound.unwrap_or(f32::MAX),
+        })
+    }
+
+    /// Process one candidate row given its distance lower bound: run the
+    /// bound checks, the exact rerank, and the heap update. Shared by the
+    /// sparse scan and by the dense scan's surviving lanes and tail rows.
+    #[inline]
+    #[allow(clippy::too_many_arguments)]
+    fn accumulate_raw_query_multi_bit_row(
+        &self,
+        ctx: &RawQueryTopkContext<'_>,
+        id: usize,
+        row_id: u64,
+        binary_ip: f32,
+        raw_lower_bound: f32,
+        res: &mut BinaryHeap<OrderedNode<u64>>,
+        max_dist: &mut Option<OrderedFloat>,
+        counters: &mut RabitPruneCounters,
+    ) {
+        if raw_lower_bound >= ctx.query_upper_bound {
+            counters.pruned_upper_bound += 1;
+            return;
+        }
+        if res.len() >= ctx.k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) {
+            counters.pruned_heap += 1;
+            return;
+        }
+
+        counters.exact += 1; // survived both lower-bound prunes; pay for the exact rerank
+        let dist = self.raw_query_multi_bit_exact_distance(
+            id,
+            binary_ip,
+            ctx.ex_bits,
+            ctx.ex_codes,
+            ctx.ex_add_factors,
+            ctx.ex_scale_factors,
+        );
+        if dist < ctx.query_lower_bound || dist >= ctx.query_upper_bound {
+            counters.exact_rejected += 1; // exact distance falls outside the query range
+            return;
+        }
+        let dist = OrderedFloat(dist);
+        if res.len() < ctx.k {
+            res.push(OrderedNode::new(row_id, dist));
+            if res.len() == ctx.k {
+                *max_dist = res.peek().map(|node| node.dist); // heap just reached k entries
+            }
+        } else if max_dist.is_some_and(|max_dist| max_dist > dist) {
+            res.pop();
+            res.push(OrderedNode::new(row_id, dist));
+            *max_dist = res.peek().map(|node| node.dist);
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    fn accumulate_raw_query_multi_bit_topk_with_scratch(
+        &self,
+        k: usize,
+        lower_bound: Option<f32>,
+        upper_bound: Option<f32>,
+        row_ids: impl Iterator<Item = (usize, u64)>,
+        res: &mut BinaryHeap<OrderedNode<u64>>,
+        dists: &mut Vec<f32>,
+        quantized_dists: &mut Vec<u16>,
+        quantized_dists_table: &mut Vec<u8>,
+        hacc_quantized_dists: &mut Vec<u32>,
+    ) {
+        let Some(ctx) = self.raw_query_multi_bit_topk_context(
+            k,
+            lower_bound,
+            upper_bound,
+            dists,
+            quantized_dists,
+            quantized_dists_table,
+            hacc_quantized_dists,
+        ) else {
+            return;
+        };
         let mut max_dist = res.peek().map(|node| node.dist);
-        let mut candidates = 0;
-        let mut pruned_upper_bound = 0;
-        let mut pruned_heap = 0;
-        let mut exact = 0;
-        let mut exact_rejected = 0;
+        let mut counters = RabitPruneCounters::default();
 
         for (id, row_id) in row_ids {
             let Some(binary_ip) = dists.get(id).copied() else {
                 continue;
             };
-            candidates += 1;
+            counters.candidates += 1;
             let Some(raw_lower_bound) = self.raw_query_lower_bound(id, binary_ip) else {
                 continue;
             };
-            if raw_lower_bound >= query_upper_bound {
-                pruned_upper_bound += 1;
-                continue;
-            }
-            if res.len() >= k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) {
-                pruned_heap += 1;
-                continue;
+            self.accumulate_raw_query_multi_bit_row(
+                &ctx,
+                id,
+                row_id,
+                binary_ip,
+                raw_lower_bound,
+                res,
+                &mut max_dist,
+                &mut counters,
+            );
+        }
+        record_rabit_prune_stats(&counters);
+    }
+
+    /// Top-k scan over all rows `0..n` in order: classify [`PRUNE_LANES`]
+    /// rows at a time with the SIMD lower-bound kernel and run the scalar
+    /// rerank only for the surviving lanes.
+    #[allow(clippy::too_many_arguments)]
+    fn accumulate_raw_query_multi_bit_topk_dense_with_scratch(
+        &self,
+        k: usize,
+        lower_bound: Option<f32>,
+        upper_bound: Option<f32>,
+        row_id: impl Fn(u32) -> u64,
+        res: &mut BinaryHeap<OrderedNode<u64>>,
+        dists: &mut Vec<f32>,
+        quantized_dists: &mut Vec<u16>,
+        quantized_dists_table: &mut Vec<u8>,
+        hacc_quantized_dists: &mut Vec<u32>,
+    ) {
+        let Some(ctx) = self.raw_query_multi_bit_topk_context(
+            k,
+            lower_bound,
+            upper_bound,
+            dists,
+            quantized_dists,
+            quantized_dists_table,
+            hacc_quantized_dists,
+        ) else {
+            return;
+        };
+        let dists = dists.as_slice();
+        debug_assert_eq!(dists.len(), ctx.n);
+        let scale_factors = &self.scale_factors[..ctx.n];
+        let add_factors = &self.add_factors[..ctx.n];
+        let error_factors = &self
+            .error_factors
+            .expect("raw-query lower-bound gating requires error factors")[..ctx.n];
+        // Same expression as `raw_query_lower_bound` with `error_factors`
+        // already resolved; the masks below match it bit for bit.
+        let lower_bound_of = |id: usize, binary_ip: f32| {
+            self.raw_query_binary_distance(id, binary_ip) - error_factors[id] * self.query_error
+        };
+        let terms = LowerBoundTerms {
+            half_sum_q: 0.5 * self.sum_q,
+            query_factor: self.query_factor,
+            query_error: self.query_error,
+        };
+        let prune_masks = prune_mask_kernel();
+        let mut max_dist = res.peek().map(|node| node.dist);
+        let mut counters = RabitPruneCounters::default();
+
+        let (dist_groups, dist_tail) = dists.as_chunks::<PRUNE_LANES>();
+        let (scale_groups, _) = scale_factors.as_chunks::<PRUNE_LANES>();
+        let (add_groups, _) = add_factors.as_chunks::<PRUNE_LANES>();
+        let (error_groups, _) = error_factors.as_chunks::<PRUNE_LANES>();
+        for (group, (dist16, scale16, add16, error16)) in
+            izip!(dist_groups, scale_groups, add_groups, error_groups).enumerate()
+        {
+            counters.candidates += PRUNE_LANES;
+            // The heap threshold only ever tightens, so this group-start
+            // snapshot can only over-select survivors (which the per-row
+            // processing below re-checks against live values), never prune a
+            // row the scalar scan would have kept.
+            let heap_threshold = (res.len() >= ctx.k)
+                .then(|| max_dist.map(|max_dist| max_dist.0))
+                .flatten();
+            let (pruned_upper_bound, pruned_heap) = prune_masks(
+                dist16,
+                scale16,
+                add16,
+                error16,
+                terms,
+                ctx.query_upper_bound,
+                heap_threshold,
+            );
+            counters.pruned_upper_bound += pruned_upper_bound.count_ones() as usize;
+            counters.pruned_heap += pruned_heap.count_ones() as usize;
+            let mut survivors = !(pruned_upper_bound | pruned_heap);
+            while survivors != 0 {
+                let lane = survivors.trailing_zeros() as usize;
+                survivors &= survivors - 1;
+                let id = group * PRUNE_LANES + lane;
+                let binary_ip = dists[id];
+                self.accumulate_raw_query_multi_bit_row(
+                    &ctx,
+                    id,
+                    row_id(id as u32),
+                    binary_ip,
+                    lower_bound_of(id, binary_ip),
+                    res,
+                    &mut max_dist,
+                    &mut counters,
+                );
             }
+        }
 
-            exact += 1;
-            let dist = self.raw_query_multi_bit_exact_distance(
+        let tail_start = ctx.n - dist_tail.len();
+        for (offset, binary_ip) in dist_tail.iter().copied().enumerate() {
+            let id = tail_start + offset;
+            counters.candidates += 1;
+            self.accumulate_raw_query_multi_bit_row(
+                &ctx,
                 id,
+                row_id(id as u32),
                 binary_ip,
-                ex_bits,
-                ex_code_len,
-                ex_codes,
-                ex_add_factors,
-                ex_scale_factors,
+                lower_bound_of(id, binary_ip),
+                res,
+                &mut max_dist,
+                &mut counters,
             );
-            if dist < query_lower_bound || dist >= query_upper_bound {
-                exact_rejected += 1;
-                continue;
-            }
-            let dist = OrderedFloat(dist);
-            if res.len() < k {
-                res.push(OrderedNode::new(row_id, dist));
-                if res.len() == k {
-                    max_dist = res.peek().map(|node| node.dist);
-                }
-            } else if max_dist.is_some_and(|max_dist| max_dist > dist) {
-                res.pop();
-                res.push(OrderedNode::new(row_id, dist));
-                max_dist = res.peek().map(|node| node.dist);
-            }
         }
-        record_rabit_prune_stats(
-            candidates,
-            pruned_upper_bound,
-            pruned_heap,
-            exact,
-            exact_rejected,
-        );
+        record_rabit_prune_stats(&counters);
     }
 
     fn raw_query_lower_bound_gating_disabled_reason(&self) -> Option<&'static str> {
@@ -1276,33 +1533,6 @@ where
     dist_table
 }
 
-fn build_ex_dist_table_direct(rotated_query: &[f32], ex_bits: u8) -> Vec<f32> {
-    if ex_bits == 0 {
-        return Vec::new();
-    }
-    let entries_per_dim = 1usize << ex_bits;
-    let mut dist_table = vec![0.0; rotated_query.len() * entries_per_dim];
-    build_ex_dist_table_direct_into(rotated_query, ex_bits, &mut dist_table);
-    dist_table
-}
-
-fn build_ex_dist_table_direct_into(rotated_query: &[f32], ex_bits: u8, dist_table: &mut [f32]) {
-    if ex_bits == 0 {
-        debug_assert!(dist_table.is_empty());
-        return;
-    }
-    let entries_per_dim = 1usize << ex_bits;
-    debug_assert_eq!(dist_table.len(), rotated_query.len() * entries_per_dim);
-    for (query_value, table) in rotated_query
-        .iter()
-        .zip(dist_table.chunks_exact_mut(entries_per_dim))
-    {
-        for (code, value) in table.iter_mut().enumerate() {
-            *value = *query_value * code as f32;
-        }
-    }
-}
-
 fn build_dist_table_direct_into<T: ArrowFloatType>(qc: &[T::Native], dist_table: &mut [f32])
 where
     T::Native: AsPrimitive<f32>,
@@ -1339,95 +1569,20 @@ where
     })
 }
 
-// Quantize the distance table into a caller-owned buffer.
-#[inline]
-fn quantize_dist_table_into(dist_table: &[f32], quantized_dist_table: &mut Vec<u8>) -> (f32, f32) {
-    let (qmin, qmax) = dist_table
-        .iter()
-        .cloned()
-        .minmax_by(|a, b| a.total_cmp(b))
-        .into_option()
-        .unwrap();
-    // this happens if the query is all zeros
-    if qmin == qmax {
-        quantized_dist_table.clear();
-        quantized_dist_table.resize(dist_table.len(), 0);
-        return (qmin, qmax);
-    }
-    let factor = 255.0 / (qmax - qmin);
-    quantized_dist_table.clear();
-    quantized_dist_table.reserve(dist_table.len());
-    let spare = quantized_dist_table.spare_capacity_mut();
-    for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) {
-        quantized.write(((d - qmin) * factor).round() as u8);
-    }
-    // SAFETY: every element in the reserved range was initialized in the loop above.
-    unsafe {
-        quantized_dist_table.set_len(dist_table.len());
-    }
-
-    (qmin, qmax)
-}
-
-#[inline]
-fn quantize_dist_table_u16_into(
-    dist_table: &[f32],
-    quantized_dist_table: &mut Vec<u16>,
-) -> (f32, f32) {
-    let (qmin, qmax) = dist_table
-        .iter()
-        .cloned()
-        .minmax_by(|a, b| a.total_cmp(b))
-        .into_option()
-        .unwrap();
-    if qmin == qmax {
-        quantized_dist_table.clear();
-        quantized_dist_table.resize(dist_table.len(), 0);
-        return (qmin, qmax);
-    }
-
-    let factor = u16::MAX as f32 / (qmax - qmin);
-    quantized_dist_table.clear();
-    quantized_dist_table.reserve(dist_table.len());
-    let spare = quantized_dist_table.spare_capacity_mut();
-    for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) {
-        quantized.write(((d - qmin) * factor).round() as u16);
-    }
-    // SAFETY: every element in the reserved range was initialized in the loop above.
-    unsafe {
-        quantized_dist_table.set_len(dist_table.len());
-    }
-
-    (qmin, qmax)
-}
-
-#[inline]
-fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 {
-    debug_assert!(ex_bits > 0);
-    let bit_offset = dim_idx * ex_bits as usize;
-    let byte_idx = bit_offset / u8::BITS as usize;
-    let bit_shift = bit_offset % u8::BITS as usize;
-    let bits = row_codes[byte_idx] as u16
-        | row_codes
-            .get(byte_idx + 1)
-            .map(|byte| (*byte as u16) << u8::BITS)
-            .unwrap_or_default();
-    let mask = (1u16 << ex_bits) - 1;
-    ((bits >> bit_shift) & mask) as u8
-}
-
+/// Build the u8 FastScan LUT for the ex codes directly from the rotated
+/// query (`ex_query`, natural dim order, padding dims zero): the underlying
+/// per-dim table is the pure multiplication `q[d] * code`, so no intermediate
+/// `dim * 2^ex_bits` table is materialized.
 fn quantize_ex_fastscan_dist_table_into(
-    dim: usize,
     ex_bits: u8,
-    ex_dist_table: &[f32],
+    ex_code_len: usize,
+    ex_query: &[f32],
     quantized_dist_table: &mut Vec<u8>,
 ) -> (f32, f32, f32) {
     debug_assert!(supports_ex_fastscan(ex_bits));
 
-    let entries_per_dim = 1usize << ex_bits;
-    debug_assert_eq!(ex_dist_table.len(), dim * entries_per_dim);
-    let num_split_tables =
-        ex_fastscan_code_len(dim, ex_bits).expect("RabitQ num_bits should be validated") * 2;
+    // One split table per code nibble of the row.
+    let num_split_tables = ex_code_len * 2;
     let quantization_max = (u16::MAX as usize / num_split_tables)
         .min(u8::MAX as usize)
         .max(1) as f32;
@@ -1436,7 +1591,7 @@ fn quantize_ex_fastscan_dist_table_into(
     let mut qmax = f32::NEG_INFINITY;
     for table_idx in 0..num_split_tables {
         for code in 0..SEGMENT_NUM_CODES {
-            let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code);
+            let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code);
             qmin = qmin.min(value);
             qmax = qmax.max(value);
         }
@@ -1452,7 +1607,7 @@ fn quantize_ex_fastscan_dist_table_into(
     let factor = quantization_max / (qmax - qmin);
     for table_idx in 0..num_split_tables {
         for code in 0..SEGMENT_NUM_CODES {
-            let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code);
+            let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code);
             quantized_dist_table.push(((value - qmin) * factor).round() as u8);
         }
     }
@@ -1465,91 +1620,153 @@ fn supports_ex_fastscan(ex_bits: u8) -> bool {
     matches!(ex_bits, 2 | 4 | 8)
 }
 
-#[inline]
-fn ex_fastscan_code_len(dim: usize, ex_bits: u8) -> Option<usize> {
-    match ex_bits {
-        2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(),
-        _ => None,
-    }
-}
-
+/// The FastScan LUT value for one nibble of a blocked-layout code byte:
+/// `table_idx / 2` is the byte position within a row and `table_idx % 2`
+/// selects its low/high nibble (see the `ex_dot` module docs for the
+/// byte-to-dim mapping per width). Dims beyond the query length (block
+/// padding) contribute zero.
 #[inline]
 fn ex_fastscan_dist_table_value(
-    dim: usize,
+    ex_query: &[f32],
     ex_bits: u8,
-    ex_dist_table: &[f32],
     table_idx: usize,
     code: usize,
 ) -> f32 {
+    let query = |dim_idx: usize| ex_query.get(dim_idx).copied().unwrap_or(0.0);
+    let byte_idx = table_idx / 2;
+    let high_nibble = table_idx % 2 == 1;
     match ex_bits {
         2 => {
-            let dim_idx = table_idx * 2;
-            let low = code & 0b11;
-            let high = (code >> 2) & 0b11;
-            ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, low)
-                + ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx + 1, high)
+            // byte 16g+b = dims {64g+b, +16, +32, +48} at bit pairs; the low
+            // nibble covers the first two dims, the high nibble the last two.
+            let dim_idx = 64 * (byte_idx / 16) + byte_idx % 16 + 32 * usize::from(high_nibble);
+            let low = (code & 0b11) as f32;
+            let high = ((code >> 2) & 0b11) as f32;
+            query(dim_idx) * low + query(dim_idx + 16) * high
+        }
+        4 => {
+            // byte 32g+8j+b = dim 64g+16j+b (low nibble) | dim +8 (high).
+            let in_block = byte_idx % 32;
+            let dim_idx = 64 * (byte_idx / 32)
+                + 16 * (in_block / 8)
+                + in_block % 8
+                + 8 * usize::from(high_nibble);
+            query(dim_idx) * code as f32
         }
-        4 => ex_dist_table_value(ex_dist_table, dim, ex_bits, table_idx, code),
         8 => {
-            let dim_idx = table_idx / 2;
-            if table_idx.is_multiple_of(2) {
-                ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code)
+            // byte = dim identity; the high nibble carries code bits 4..8.
+            let code = if high_nibble {
+                code << SEGMENT_LENGTH
             } else {
-                ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code << SEGMENT_LENGTH)
-            }
+                code
+            };
+            query(byte_idx) * code as f32
         }
         _ => unreachable!("unsupported RabitQ ex_bits={ex_bits} for FastScan"),
     }
 }
 
-#[inline]
-fn ex_dist_table_value(
-    ex_dist_table: &[f32],
-    dim: usize,
-    ex_bits: u8,
-    dim_idx: usize,
-    code: usize,
-) -> f32 {
-    if dim_idx >= dim {
-        return 0.0;
-    }
-    let entries_per_dim = 1usize << ex_bits;
-    ex_dist_table[dim_idx * entries_per_dim + code]
-}
-
-#[inline]
-fn compute_single_rq_ex_distance(
-    ex_codes: &[u8],
-    id: usize,
-    ex_code_len: usize,
-    ex_bits: u8,
-    dim: usize,
-    ex_dist_table: &[f32],
-) -> f32 {
-    if ex_bits == 0 {
-        return 0.0;
-    }
-    let entries_per_dim = 1usize << ex_bits;
-    let row_codes = &ex_codes[id * ex_code_len..(id + 1) * ex_code_len];
-    (0..dim)
-        .map(|dim_idx| {
-            let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize;
-            ex_dist_table[dim_idx * entries_per_dim + code]
-        })
-        .sum()
-}
-
+/// Transpose ex codes for the FastScan bulk path. That path is only reached when
+/// lower-bound gating is disabled (legacy indexes without error factors); gated
+/// indexes rerank per candidate and never read this copy, so it is not built for them.
+/// Returns `None` without ex codes, with error factors, or if `ex_bits` is not 2, 4, or 8.
 fn maybe_pack_ex_codes(
     ex_codes: Option<&FixedSizeListArray>,
     ex_bits: u8,
+    error_factors: Option<&Float32Array>,
 ) -> Option<FixedSizeListArray> {
     let ex_codes = ex_codes?;
+    if error_factors.is_some() {
+        return None; // gated index: the packed copy would never be read
+    }
     match ex_bits {
         2 | 4 | 8 => Some(pack_codes(ex_codes)),
         _ => None,
     }
 }
 
+/// Bring legacy sequential ex codes into the blocked kernel layout. Rows are
+/// repacked unless `sequential_matches_blocked(ex_bits)` holds and the stored
+/// row width already equals the blocked width; then a clone of the column is returned.
+fn blocked_ex_codes_from_sequential(
+    seq_codes: &FixedSizeListArray,
+    dim: usize,
+    ex_bits: u8,
+) -> Result<FixedSizeListArray> {
+    if sequential_matches_blocked(ex_bits)
+        && seq_codes.value_length() as usize == blocked_ex_code_bytes(dim, ex_bits)
+    {
+        return Ok(seq_codes.clone()); // layouts coincide: no repack needed
+    }
+    let seq_code_len = seq_codes.value_length() as usize;
+    let seq_values = seq_codes.values().as_primitive::<UInt8Type>().values();
+    let blocked_code_len = blocked_ex_code_bytes(dim, ex_bits);
+    let mut blocked_values = vec![0u8; seq_codes.len() * blocked_code_len];
+    for (seq_row, blocked_row) in seq_values
+        .chunks_exact(seq_code_len)
+        .zip(blocked_values.chunks_exact_mut(blocked_code_len))
+    {
+        repack_sequential_row(seq_row, dim, ex_bits, blocked_row);
+    }
+    Ok(FixedSizeListArray::try_new_from_values(
+        UInt8Array::from(blocked_values),
+        blocked_code_len as i32,
+    )?)
+}
+
+/// Load the ex-code column of an index batch into the blocked kernel layout,
+/// accepting both the blocked format and the legacy sequential format. A legacy
+/// batch comes back with its sequential column replaced by the blocked one, so
+/// rewrites (remap, optimize merges) emit the blocked format and legacy indexes
+/// upgrade on next rewrite. Invalid-input error if the column is missing or mis-sized.
+pub(crate) fn load_blocked_ex_codes(
+    batch: RecordBatch,
+    rotated_dim: usize,
+    num_bits: u8,
+) -> Result<(RecordBatch, FixedSizeListArray)> {
+    let ex_bits = rabit_ex_bits(num_bits)?;
+    if let Some(column) = batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN) {
+        let codes = column.as_fixed_size_list().clone();
+        let expected_bytes = blocked_ex_code_bytes(rotated_dim, ex_bits);
+        if codes.value_length() as usize != expected_bytes {
+            return Err(Error::invalid_input(format!(
+                "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes",
+                RABIT_BLOCKED_EX_CODE_COLUMN,
+                codes.value_length(),
+                rotated_dim,
+                ex_bits,
+                expected_bytes
+            )));
+        }
+        return Ok((batch, codes)); // already blocked: the batch is passed through unchanged
+    }
+    let column = batch.column_by_name(RABIT_EX_CODE_COLUMN).ok_or_else(|| {
+        Error::invalid_input(format!(
+            "RabitQ num_bits={} requires {} column",
+            num_bits, RABIT_BLOCKED_EX_CODE_COLUMN
+        ))
+    })?;
+    let codes = column.as_fixed_size_list().clone();
+    let expected_bytes = rabit_ex_code_bytes(rotated_dim, ex_bits)?;
+    if codes.value_length() as usize != expected_bytes {
+        return Err(Error::invalid_input(format!(
+            "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes",
+            RABIT_EX_CODE_COLUMN,
+            codes.value_length(),
+            rotated_dim,
+            ex_bits,
+            expected_bytes
+        )));
+    }
+    let blocked = blocked_ex_codes_from_sequential(&codes, rotated_dim, ex_bits)?;
+    let ex_code_field = rabit_ex_code_field(rotated_dim, num_bits)?
+        .expect("multi-bit RabitQ always has an ex-code field");
+    let batch = batch
+        .drop_column(RABIT_EX_CODE_COLUMN)?
+        .try_with_column(ex_code_field, Arc::new(blocked.clone()))?;
+    Ok((batch, blocked))
+}
+
 impl DistCalculator for RabitDistCalculator<'_> {
     #[inline(always)]
     fn distance(&self, id: u32) -> f32 {
@@ -1580,13 +1797,10 @@ impl DistCalculator for RabitDistCalculator<'_> {
                 let ex_scale_factors = self
                     .ex_scale_factors
                     .expect("raw-query multi-bit RQ requires ex scale factors");
-                let ex_code_len = rabit_ex_code_bytes(self.dim, ex_bits)
-                    .expect("RabitQ num_bits should be validated");
                 self.raw_query_multi_bit_exact_distance(
                     id,
                     dist,
                     ex_bits,
-                    ex_code_len,
                     ex_codes,
                     ex_add_factors,
                     ex_scale_factors,
@@ -1690,13 +1904,11 @@ impl DistCalculator for RabitDistCalculator<'_> {
             return;
         }
 
-        let code_len = rabit_binary_code_bytes(self.dim);
-        let n = self.codes.len() / code_len;
-        self.accumulate_raw_query_multi_bit_topk_with_scratch(
+        self.accumulate_raw_query_multi_bit_topk_dense_with_scratch(
             k,
             lower_bound,
             upper_bound,
-            (0..n).map(|id| (id, row_id(id as u32))),
+            row_id,
             res,
             dists,
             quantized_dists,
@@ -1865,8 +2077,6 @@ impl VectorStore for RabitQuantizationStorage {
         let code_dim = self.code_dim();
         let rotated_qr = self.rotate_query_vector(code_dim, &qr);
         let dist_table = build_dist_table_direct::<Float32Type>(&rotated_qr);
-        let ex_bits = self.metadata.num_bits - 1;
-        let ex_dist_table = build_ex_dist_table_direct(&rotated_qr, ex_bits);
         let query_factor = match self.metadata.query_estimator {
             RabitQueryEstimator::ResidualQuery => self.residual_query_factor(dist_q_c),
             RabitQueryEstimator::RawQuery => self.raw_query_factor(dist_q_c, &rotated_qr, None),
@@ -1877,12 +2087,21 @@ impl VectorStore for RabitQuantizationStorage {
                 self.raw_query_error_for_gating(dist_q_c, &rotated_qr, None)
             }
         };
-        let sum_q = rotated_qr.into_iter().sum();
+        let sum_q = rotated_qr.iter().copied().sum();
+        // The kernels read the rotated query directly; only unaligned dims
+        // need a zero-padded copy.
+        let ex_query = if code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) {
+            rotated_qr
+        } else {
+            let mut padded = vec![0.0; padded_query_len(code_dim)];
+            pad_query_into(&rotated_qr, &mut padded);
+            padded
+        };
 
         self.distance_calculator_from_parts(RabitDistCalculatorParts {
             dim: code_dim,
             dist_table: Cow::Owned(dist_table),
-            ex_dist_table: Cow::Owned(ex_dist_table),
+            ex_query: Cow::Owned(ex_query),
             sum_q,
             query_factor,
             query_error,
@@ -1921,7 +2140,10 @@ impl VectorStore for RabitQuantizationStorage {
             return self.distance_calculator_from_parts(RabitDistCalculatorParts {
                 dim: code_dim,
                 dist_table: Cow::Borrowed(&raw_query.dist_table),
-                ex_dist_table: Cow::Borrowed(&raw_query.ex_dist_table),
+                ex_query: Cow::Borrowed(kernel_query(
+                    &raw_query.rotated_query,
+                    &raw_query.ex_query,
+                )),
                 sum_q: raw_query.sum_q,
                 query_factor,
                 query_error,
@@ -1931,18 +2153,20 @@ impl VectorStore for RabitQuantizationStorage {
 
         let dist_table_len = code_dim * 4;
         let ex_bits = self.metadata.num_bits - 1;
-        let ex_dist_table_len = if ex_bits == 0 {
+        // The kernels read the rotated query in place; a zero-padded copy is
+        // only needed when the rotated dim is not block-aligned.
+        let ex_query_table_len = if ex_bits == 0 || code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) {
             0
         } else {
-            code_dim * (1usize << ex_bits)
+            padded_query_len(code_dim)
         };
-        f32_scratch.resize(code_dim + dist_table_len + ex_dist_table_len, 0.0);
+        f32_scratch.resize(code_dim + dist_table_len + ex_query_table_len, 0.0);
 
         let query_factor;
         let query_error;
         let sum_q = {
             let (rotated_qr, remaining) = f32_scratch.split_at_mut(code_dim);
-            let (dist_table, ex_dist_table) = remaining.split_at_mut(dist_table_len);
+            let (dist_table, ex_query) = remaining.split_at_mut(dist_table_len);
             match residual {
                 Some(QueryResidual::Centroid(residual_centroid)) => {
                     self.rotate_query_vector_into(
@@ -1981,17 +2205,20 @@ impl VectorStore for RabitQuantizationStorage {
                 }
             };
             build_dist_table_direct_into::<Float32Type>(rotated_qr, dist_table);
-            build_ex_dist_table_direct_into(rotated_qr, ex_bits, ex_dist_table);
+            if ex_query_table_len > 0 {
+                pad_query_into(rotated_qr, ex_query);
+            }
             rotated_qr.iter().copied().sum()
         };
 
+        let ex_query_start = code_dim + dist_table_len;
         self.distance_calculator_from_parts(RabitDistCalculatorParts {
             dim: code_dim,
-            dist_table: Cow::Borrowed(&f32_scratch[code_dim..code_dim + dist_table_len]),
-            ex_dist_table: Cow::Borrowed(
-                &f32_scratch
-                    [code_dim + dist_table_len..code_dim + dist_table_len + ex_dist_table_len],
-            ),
+            dist_table: Cow::Borrowed(&f32_scratch[code_dim..ex_query_start]),
+            ex_query: Cow::Borrowed(kernel_query(
+                &f32_scratch[..code_dim],
+                &f32_scratch[ex_query_start..ex_query_start + ex_query_table_len],
+            )),
             sum_q,
             query_factor,
             query_error,
@@ -2155,6 +2382,38 @@ pub fn unpack_codes(codes: &FixedSizeListArray) -> FixedSizeListArray {
     FixedSizeListArray::try_new_from_values(UInt8Array::from(unpacked), code_len as i32).unwrap()
 }
 
+/// Build a row-id remapping for the rows present in this partition from a
+/// fragment-reuse index, mirroring the PQ storage frag-reuse path.
+///
+/// Returns `None` when there is nothing to do (no fragment-reuse index, or the
+/// index leaves every present row id unchanged), so callers keep the zero-cost
+/// no-op path. Otherwise, returns a `HashMap` mapping every affected old row id
+/// to `Some(new_id)` for surviving rows or `None` for rows whose covering
+/// fragment was compacted away, suitable for `RabitQuantizationStorage::remap`.
+fn build_frag_reuse_mapping(
+    fri: Option<&FragReuseIndex>,
+    row_ids: &UInt64Array,
+) -> Option<HashMap<u64, Option<u64>>> {
+    let fri = fri?;
+    if fri.row_id_maps.is_empty() {
+        return None;
+    }
+    let mut mapping: HashMap<u64, Option<u64>> = HashMap::new();
+    for row_id in row_ids.values().iter() {
+        match fri.remap_row_id(*row_id) {
+            Some(new_id) if new_id == *row_id => {}
+            mapped => {
+                mapping.insert(*row_id, mapped);
+            }
+        }
+    }
+    if mapping.is_empty() {
+        None
+    } else {
+        Some(mapping)
+    }
+}
+
 #[async_trait]
 impl QuantizerStorage for RabitQuantizationStorage {
     type Metadata = RabitQuantizationMetadata;
@@ -2163,7 +2422,7 @@ impl QuantizerStorage for RabitQuantizationStorage {
         batch: RecordBatch,
         metadata: &Self::Metadata,
         distance_type: DistanceType,
-        _fri: Option<Arc<FragReuseIndex>>,
+        fri: Option<Arc<FragReuseIndex>>,
     ) -> Result<Self> {
         let distance_type = match (metadata.query_estimator, distance_type) {
             (RabitQueryEstimator::RawQuery, DistanceType::Cosine) => DistanceType::L2,
@@ -2192,31 +2451,14 @@ impl QuantizerStorage for RabitQuantizationStorage {
             .column_by_name(ERROR_FACTORS_COLUMN)
             .map(|factors| factors.as_primitive::<Float32Type>().clone());
         let ex_bits = rabit_ex_bits(metadata.num_bits)?;
+        let mut batch = batch;
         let mut ex_codes = None;
         let mut ex_add_factors = None;
         let mut ex_scale_factors = None;
         if ex_bits != 0 {
-            let codes = batch
-                .column_by_name(RABIT_EX_CODE_COLUMN)
-                .ok_or_else(|| {
-                    Error::invalid_input(format!(
-                        "RabitQ num_bits={} requires {} column",
-                        metadata.num_bits, RABIT_EX_CODE_COLUMN
-                    ))
-                })?
-                .as_fixed_size_list()
-                .clone();
-            let expected_ex_code_bytes = rabit_ex_code_bytes(metadata.rotated_dim(), ex_bits)?;
-            if codes.value_length() as usize != expected_ex_code_bytes {
-                return Err(Error::invalid_input(format!(
-                    "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes",
-                    RABIT_EX_CODE_COLUMN,
-                    codes.value_length(),
-                    metadata.rotated_dim(),
-                    ex_bits,
-                    expected_ex_code_bytes
-                )));
-            }
+            let (normalized_batch, codes) =
+                load_blocked_ex_codes(batch, metadata.rotated_dim(), metadata.num_bits)?;
+            batch = normalized_batch;
             ex_codes = Some(codes);
             ex_add_factors = Some(
                 batch
@@ -2246,16 +2488,19 @@ impl QuantizerStorage for RabitQuantizationStorage {
             if batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some()
                 || batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some()
                 || batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some()
+                || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some()
             {
                 return Err(Error::invalid_input(
                     "RabitQ num_bits=1 raw-query indexes must not contain ex-code columns"
                         .to_string(),
                 ));
             }
-        } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() {
+        } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some()
+            || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some()
+        {
             return Err(Error::invalid_input(format!(
-                "RabitQ num_bits={} does not support {} column",
-                metadata.num_bits, RABIT_EX_CODE_COLUMN
+                "RabitQ num_bits={} does not support ex-code columns",
+                metadata.num_bits
             )));
         }
 
@@ -2270,9 +2515,10 @@ impl QuantizerStorage for RabitQuantizationStorage {
 
         let mut metadata = metadata.clone();
         metadata.packed = true;
-        let packed_ex_codes = maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits);
+        let packed_ex_codes =
+            maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref());
 
-        Ok(Self {
+        let storage = Self {
             metadata,
             batch,
             distance_type,
@@ -2285,7 +2531,12 @@ impl QuantizerStorage for RabitQuantizationStorage {
             packed_ex_codes,
             ex_add_factors,
             ex_scale_factors,
-        })
+        };
+
+        match build_frag_reuse_mapping(fri.as_deref(), &storage.row_ids) {
+            Some(mapping) => storage.remap(&mapping),
+            None => Ok(storage),
+        }
     }
 
     fn metadata(&self) -> &Self::Metadata {
@@ -2353,11 +2604,18 @@ impl QuantizerStorage for RabitQuantizationStorage {
         let error_factors = batch
             .column_by_name(ERROR_FACTORS_COLUMN)
             .map(|factors| factors.as_primitive::<Float32Type>().clone());
-        let ex_codes = batch
-            .column_by_name(RABIT_EX_CODE_COLUMN)
-            .map(|codes| codes.as_fixed_size_list().clone());
+        let ex_bits = rabit_ex_bits(self.metadata.num_bits)?;
+        let (batch, ex_codes) = if ex_bits == 0 {
+            (batch, None)
+        } else {
+            // `self.batch` is already normalized at load, so this is a
+            // zero-copy column lookup.
+            let (batch, codes) =
+                load_blocked_ex_codes(batch, self.metadata.rotated_dim(), self.metadata.num_bits)?;
+            (batch, Some(codes))
+        };
         let packed_ex_codes =
-            maybe_pack_ex_codes(ex_codes.as_ref(), rabit_ex_bits(self.metadata.num_bits)?);
+            maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref());
         let ex_add_factors = batch
             .column_by_name(EX_ADD_FACTORS_COLUMN)
             .map(|factors| factors.as_primitive::<Float32Type>().clone());
@@ -2490,6 +2748,9 @@ mod tests {
     use arrow_array::{ArrayRef, Float32Array, Float64Array, UInt64Array};
     use lance_core::ROW_ID;
     use lance_linalg::distance::DistanceType;
+    use rand::rngs::SmallRng;
+    use rand::{Rng, SeedableRng};
+    use rstest::rstest;
 
     use crate::vector::bq::{RQRotationType, builder::RabitQuantizer};
     use crate::vector::quantizer::{Quantization, QuantizerStorage};
@@ -2695,7 +2956,7 @@ mod tests {
 
         assert!(rabit_ex_code_field(128, 1).unwrap().is_none());
         let ex_field = rabit_ex_code_field(128, 9).unwrap().unwrap();
-        assert_eq!(ex_field.name(), RABIT_EX_CODE_COLUMN);
+        assert_eq!(ex_field.name(), RABIT_BLOCKED_EX_CODE_COLUMN);
         let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else {
             panic!("ex-code field should be FixedSizeList");
         };
@@ -2898,6 +3159,229 @@ mod tests {
         assert_eq!(distances, vec![104.0, 22.0]);
     }
 
+    /// Exercise the ex-dot kernel through the storage API for every ex width,
+    /// including the widths without FastScan support ({1, 3, 5, 6, 7}), and a
+    /// dim that is not a multiple of the 64-dim kernel group.
+    ///
+    /// The dim must be a multiple of 8: the binary distance stage consumes
+    /// two 4-dim segments per code byte and ignores trailing dims otherwise.
+    #[test]
+    fn test_raw_query_multi_bit_distance_matches_reference_for_all_ex_widths() {
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+
+        // 72 exercises the kernels' padded-tail path; 1536 is a production
+        // embedding dim exercising the full-group path. Both the blocked
+        // format and the legacy sequential format must produce the same
+        // distances.
+        for (code_dim, num_rows) in [(72usize, 33usize), (1536, 33)] {
+            for num_bits in 2..=9u8 {
+                for legacy_format in [false, true] {
+                    let ex_bits = num_bits - 1;
+                    let mut rng = SmallRng::seed_from_u64(num_bits as u64);
+
+                    let sign_bits = (0..num_rows * code_dim)
+                        .map(|_| rng.random_bool(0.5))
+                        .collect::<Vec<_>>();
+                    let max_code = ((1u16 << ex_bits) - 1) as u8;
+                    let ex_values = (0..num_rows * code_dim)
+                        .map(|_| rng.random_range(0..=max_code))
+                        .collect::<Vec<_>>();
+
+                    let code_len = rabit_binary_code_bytes(code_dim);
+                    let mut code_bytes = vec![0u8; num_rows * code_len];
+                    for (row, bits) in sign_bits.chunks_exact(code_dim).enumerate() {
+                        for (dim, &bit) in bits.iter().enumerate() {
+                            code_bytes[row * code_len + dim / 8] |= (bit as u8) << (dim % 8);
+                        }
+                    }
+                    let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format {
+                        let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap();
+                        let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len];
+                        for (row, values) in ex_values.chunks_exact(code_dim).enumerate() {
+                            for (dim, &value) in values.iter().enumerate() {
+                                let bit_offset = dim * ex_bits as usize;
+                                let bits = (value as u16) << (bit_offset % 8);
+                                ex_code_bytes[row * ex_code_len + bit_offset / 8] |= bits as u8;
+                                if bits >> 8 != 0 {
+                                    ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |=
+                                        (bits >> 8) as u8;
+                                }
+                            }
+                        }
+                        (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes)
+                    } else {
+                        let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits);
+                        let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len];
+                        for (row, values) in ex_code_bytes
+                            .chunks_exact_mut(ex_code_len)
+                            .zip(ex_values.chunks_exact(code_dim))
+                        {
+                            crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row);
+                        }
+                        (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes)
+                    };
+
+                    let identity = Float32Array::from_iter_values((0..code_dim).flat_map(|row| {
+                        (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })
+                    }));
+                    let rotate_mat =
+                        FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap();
+                    let metadata = RabitQuantizationMetadata {
+                        rotate_mat: Some(rotate_mat),
+                        rotate_mat_position: None,
+                        fast_rotation_signs: None,
+                        rotation_type: RQRotationType::Matrix,
+                        code_dim: code_dim as u32,
+                        num_bits,
+                        packed: false,
+                        query_estimator: RabitQueryEstimator::RawQuery,
+                    };
+                    let codes = FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(code_bytes),
+                        code_len as i32,
+                    )
+                    .unwrap();
+                    let ex_codes = FixedSizeListArray::try_new_from_values(
+                        UInt8Array::from(ex_code_bytes),
+                        ex_code_len as i32,
+                    )
+                    .unwrap();
+                    let ex_add_factors = (0..num_rows)
+                        .map(|_| rng.random_range(-1.0f32..1.0))
+                        .collect::<Vec<_>>();
+                    let ex_scale_factors = (0..num_rows)
+                        .map(|_| rng.random_range(0.1f32..1.0))
+                        .collect::<Vec<_>>();
+                    let batch = RecordBatch::try_from_iter(vec![
+                        (
+                            ROW_ID,
+                            Arc::new(UInt64Array::from_iter_values(0..num_rows as u64)) as ArrayRef,
+                        ),
+                        (RABIT_CODE_COLUMN, Arc::new(codes) as ArrayRef),
+                        (
+                            ADD_FACTORS_COLUMN,
+                            Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef,
+                        ),
+                        (
+                            SCALE_FACTORS_COLUMN,
+                            Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef,
+                        ),
+                        (ex_code_column, Arc::new(ex_codes) as ArrayRef),
+                        (
+                            EX_ADD_FACTORS_COLUMN,
+                            Arc::new(Float32Array::from(ex_add_factors.clone())) as ArrayRef,
+                        ),
+                        (
+                            EX_SCALE_FACTORS_COLUMN,
+                            Arc::new(Float32Array::from(ex_scale_factors.clone())) as ArrayRef,
+                        ),
+                    ])
+                    .unwrap();
+                    let storage = RabitQuantizationStorage::try_from_batch(
+                        batch,
+                        &metadata,
+                        DistanceType::L2,
+                        None,
+                    )
+                    .unwrap();
+
+                    let query = (0..code_dim)
+                        .map(|_| rng.random_range(-1.0f32..1.0))
+                        .collect::<Vec<_>>();
+                    let sum_q = query.iter().sum::<f32>();
+                    let calc = storage.dist_calculator(
+                        Arc::new(Float32Array::from(query.clone())) as ArrayRef,
+                        0.0,
+                    );
+
+                    let code_scale = (1u32 << ex_bits) as f32;
+                    let code_bias = -(code_scale - 0.5);
+                    let expected = (0..num_rows)
+                        .map(|row| {
+                            let binary_ip = (0..code_dim)
+                                .map(|dim| {
+                                    query[dim] * sign_bits[row * code_dim + dim] as u8 as f32
+                                })
+                                .sum::<f32>();
+                            let ex_dist = (0..code_dim)
+                                .map(|dim| query[dim] * ex_values[row * code_dim + dim] as f32)
+                                .sum::<f32>();
+                            let full_dot = code_scale * binary_ip + ex_dist + code_bias * sum_q;
+                            full_dot * ex_scale_factors[row] + ex_add_factors[row]
+                        })
+                        .collect::<Vec<_>>();
+
+                    for (row, &want) in expected.iter().enumerate() {
+                        let got = calc.distance(row as u32);
+                        assert!(
+                            (got - want).abs() <= 1e-3 * want.abs().max(1.0),
+                            "num_bits={num_bits} row={row}: {got} != {want}"
+                        );
+                    }
+
+                    let mut distances = Vec::new();
+                    let mut u16_scratch = Vec::new();
+                    let mut u8_scratch = Vec::new();
+                    let mut u32_scratch = Vec::new();
+                    calc.distance_all_with_scratch(
+                        0,
+                        &mut distances,
+                        &mut u16_scratch,
+                        &mut u8_scratch,
+                        &mut u32_scratch,
+                    );
+                    assert_eq!(distances.len(), num_rows);
+                    // The bulk path quantizes the binary LUT to u8, and that error is
+                    // amplified by 2^ex_bits in the multi-bit estimate, so the value
+                    // assertions need a quantization-aware bound. The FastScan ex
+                    // widths additionally quantize the ex LUT and are covered by
+                    // `test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes`.
+                    if !matches!(ex_bits, 2 | 4 | 8) {
+                        // Worst-case |error| of one u8-quantized binary LUT lookup is
+                        // (table range) / 255 / 2, accumulated over one lookup per
+                        // 4-dim segment (two segments per code byte).
+                        let num_tables = code_dim.div_ceil(4);
+                        let mut table_min = f32::INFINITY;
+                        let mut table_max = f32::NEG_INFINITY;
+                        for segment in query.chunks(4) {
+                            for subset in 0..16usize {
+                                let value = segment
+                                    .iter()
+                                    .enumerate()
+                                    .filter(|(idx, _)| subset & (1 << idx) != 0)
+                                    .map(|(_, q)| *q)
+                                    .sum::<f32>();
+                                table_min = table_min.min(value);
+                                table_max = table_max.max(value);
+                            }
+                        }
+                        let binary_bound =
+                            code_scale * num_tables as f32 * (table_max - table_min) / 255.0 / 2.0
+                                * ex_scale_factors.iter().fold(0.0f32, |max, &s| max.max(s));
+                        for (row, (&got, &want)) in
+                            distances.iter().zip(expected.iter()).enumerate()
+                        {
+                            assert!(
+                                (got - want).abs() <= binary_bound + 1e-3,
+                                "num_bits={num_bits} row={row} (distance_all): {got} != {want} (bound {binary_bound})"
+                            );
+                        }
+                        // Rows past the SIMD batch use the exact binary path, so the
+                        // final remainder row must match the per-candidate distance.
+                        let remainder_row = num_rows - 1;
+                        let got = distances[remainder_row];
+                        let want = calc.distance(remainder_row as u32);
+                        assert!(
+                            (got - want).abs() <= 1e-3 * want.abs().max(1.0),
+                            "num_bits={num_bits} remainder row (distance_all): {got} != {want}"
+                        );
+                    }
+                }
+            }
+        }
+    }
+
     #[test]
     fn test_fast_approx_mode_uses_one_bit_scores_for_multi_bit_raw_query() {
         let code_dim = 8usize;
@@ -3061,10 +3545,17 @@ mod tests {
         assert_eq!(hacc_accum_len, num_rows);
     }
 
-    fn assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits: u8) {
-        let code_dim = 8usize;
+    fn assert_raw_query_multi_bit_distance_all_uses_fastscan(
+        num_bits: u8,
+        legacy_format: bool,
+        with_error_factors: bool,
+    ) {
+        // Not a multiple of 64, so the padded-tail LUT entries are exercised;
+        // a multiple of 8 as the binary stage requires.
+        let code_dim = 72usize;
         let num_rows = BATCH_SIZE + 1;
         let ex_bits = rabit_ex_bits(num_bits).unwrap();
+        let max_code = ((1u16 << ex_bits) - 1) as u8;
         let identity = Float32Array::from_iter_values(
             (0..code_dim)
                 .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })),
@@ -3081,16 +3572,42 @@ mod tests {
             packed: false,
             query_estimator: RabitQueryEstimator::RawQuery,
         };
+        let code_len = rabit_binary_code_bytes(code_dim);
         let codes = FixedSizeListArray::try_new_from_values(
-            UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 13) as u8)),
-            1,
+            UInt8Array::from_iter_values((0..num_rows * code_len).map(|idx| (idx * 13) as u8)),
+            code_len as i32,
         )
         .unwrap();
-        let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap();
+        let ex_values = (0..num_rows * code_dim)
+            .map(|idx| ((idx * 37) % (max_code as usize + 1)) as u8)
+            .collect::<Vec<_>>();
+        let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format {
+            let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap();
+            let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len];
+            for (row, values) in ex_values.chunks_exact(code_dim).enumerate() {
+                for (dim, &value) in values.iter().enumerate() {
+                    let bit_offset = dim * ex_bits as usize;
+                    let bits = (value as u16) << (bit_offset % 8);
+                    ex_code_bytes[row * ex_code_len + bit_offset / 8] |= bits as u8;
+                    if bits >> 8 != 0 {
+                        ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |= (bits >> 8) as u8;
+                    }
+                }
+            }
+            (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes)
+        } else {
+            let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits);
+            let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len];
+            for (row, values) in ex_code_bytes
+                .chunks_exact_mut(ex_code_len)
+                .zip(ex_values.chunks_exact(code_dim))
+            {
+                crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row);
+            }
+            (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes)
+        };
         let ex_codes = FixedSizeListArray::try_new_from_values(
-            UInt8Array::from_iter_values(
-                (0..num_rows * ex_code_len).map(|idx| (idx * 37 % 251) as u8),
-            ),
+            UInt8Array::from(ex_code_bytes),
             ex_code_len as i32,
         )
         .unwrap();
@@ -3108,7 +3625,7 @@ mod tests {
                 SCALE_FACTORS_COLUMN,
                 Arc::new(Float32Array::from(vec![1.0; num_rows])) as ArrayRef,
             ),
-            (RABIT_EX_CODE_COLUMN, Arc::new(ex_codes) as ArrayRef),
+            (ex_code_column, Arc::new(ex_codes) as ArrayRef),
             (
                 EX_ADD_FACTORS_COLUMN,
                 Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef,
@@ -3119,12 +3636,30 @@ mod tests {
             ),
         ])
         .unwrap();
+        let batch = if with_error_factors {
+            batch
+                .try_with_column(
+                    crate::vector::bq::transform::ERROR_FACTORS_FIELD.clone(),
+                    Arc::new(Float32Array::from(vec![1000.0; num_rows])) as ArrayRef,
+                )
+                .unwrap()
+        } else {
+            batch
+        };
         let storage =
             RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None)
                 .unwrap();
-        assert!(storage.packed_ex_codes.is_some());
+        // The FastScan transpose only exists for indexes that can reach the
+        // bulk bypass path (no error factors); gated indexes fall through to
+        // the exact per-row kernels in `distance_all`.
+        assert_eq!(storage.packed_ex_codes.is_some(), !with_error_factors);
 
-        let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef;
+        // A per-dim varying query so that any dim-mapping error in the
+        // FastScan LUT shows up as a value mismatch.
+        let query_values = (0..code_dim)
+            .map(|dim| (dim % 11) as f32 * 0.3 - 1.5)
+            .collect::<Vec<_>>();
+        let query = Arc::new(Float32Array::from(query_values.clone())) as ArrayRef;
         let calc = storage.dist_calculator(query, 0.0);
         let mut distances = Vec::new();
         let mut u16_scratch = Vec::new();
@@ -3140,15 +3675,57 @@ mod tests {
 
         assert_eq!(distances.len(), num_rows);
         assert_eq!(u16_scratch.len(), BATCH_SIZE);
-        assert_eq!(
-            u8_scratch.len(),
-            ex_fastscan_code_len(code_dim, ex_bits).unwrap() * 2 * SEGMENT_NUM_CODES
+        let loaded_ex_code_len = storage.ex_codes.as_ref().unwrap().value_length() as usize;
+        if with_error_factors {
+            // The gated path never builds the ex LUT; the scratch holds the
+            // binary LUT only.
+            assert_eq!(u8_scratch.len(), code_dim * 4);
+        } else {
+            assert_eq!(u8_scratch.len(), loaded_ex_code_len * 2 * SEGMENT_NUM_CODES);
+        }
+
+        // The fastscan estimate differs from the exact path only by the u8
+        // quantization of the binary LUT (amplified by 2^ex_bits) and of the
+        // ex LUT, so bound the comparison by those quantization errors.
+        let mut table_min = f32::INFINITY;
+        let mut table_max = f32::NEG_INFINITY;
+        for segment in query_values.chunks(4) {
+            for subset in 0..SEGMENT_NUM_CODES {
+                let value = segment
+                    .iter()
+                    .enumerate()
+                    .filter(|(idx, _)| subset & (1 << idx) != 0)
+                    .map(|(_, q)| *q)
+                    .sum::<f32>();
+                table_min = table_min.min(value);
+                table_max = table_max.max(value);
+            }
+        }
+        let code_scale = (1u32 << ex_bits) as f32;
+        let binary_bound =
+            code_scale * code_dim.div_ceil(4) as f32 * (table_max - table_min) / 510.0;
+        let mut padded_query = vec![0.0f32; crate::vector::bq::ex_dot::padded_query_len(code_dim)];
+        crate::vector::bq::ex_dot::pad_query_into(&query_values, &mut padded_query);
+        let mut quantized_table = Vec::new();
+        let (ex_qmin, ex_qmax, ex_qcap) = quantize_ex_fastscan_dist_table_into(
+            ex_bits,
+            loaded_ex_code_len,
+            &padded_query,
+            &mut quantized_table,
         );
+        // Without the FastScan transpose the ex stage is exact, so only the
+        // binary LUT quantization remains.
+        let ex_bound = if with_error_factors {
+            0.0
+        } else {
+            (loaded_ex_code_len * 2) as f32 * (ex_qmax - ex_qmin) / ex_qcap / 2.0
+        };
+        let bound = (binary_bound + ex_bound) * 1.5 + 1e-3;
         for (id, distance) in distances.iter().take(BATCH_SIZE).enumerate() {
             let exact = calc.distance(id as u32);
             assert!(
-                (*distance - exact).abs() < 10.0,
-                "distance_all fastscan mismatch for id {id}: actual={distance}, exact={exact}"
+                (*distance - exact).abs() <= bound,
+                "distance_all fastscan mismatch for id {id} (num_bits={num_bits} legacy={legacy_format}): actual={distance}, exact={exact}, bound={bound}"
             );
         }
         assert_eq!(distances[BATCH_SIZE], calc.distance(BATCH_SIZE as u32));
@@ -3156,8 +3733,108 @@ mod tests {
 
     #[test]
     fn test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes() {
-        for num_bits in [3, 9] {
-            assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits);
+        for num_bits in [3, 5, 9] {
+            for legacy_format in [false, true] {
+                assert_raw_query_multi_bit_distance_all_uses_fastscan(
+                    num_bits,
+                    legacy_format,
+                    false,
+                );
+            }
+            // Gated indexes (with error factors) skip the FastScan artifacts
+            // and score the bulk path with the exact kernels.
+            assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits, false, true);
+        }
+    }
+
+    /// A dist table whose `num_tables`-scaled reconstruction overflows `f32`
+    /// must fall back to exact distances rather than the affine dequant's
+    /// `0 * inf = NaN`. Covers both the u8 (Normal) and u16 (Accurate) LUT
+    /// paths end-to-end through `distance_all`, asserting the result is
+    /// NaN-free and bit-identical to the always-exact per-row computation.
+    #[rstest]
+    fn test_degenerate_dist_table_falls_back_to_exact_distances(
+        #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode,
+    ) {
+        let code_dim = 8usize;
+        let num_rows = BATCH_SIZE + 5;
+        let num_bits = 3;
+        let ex_bits = rabit_ex_bits(num_bits).unwrap();
+        let identity = Float32Array::from_iter_values(
+            (0..code_dim)
+                .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })),
+        );
+        let rotate_mat =
+            FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap();
+        let metadata = RabitQuantizationMetadata {
+            rotate_mat: Some(rotate_mat),
+            rotate_mat_position: None,
+            fast_rotation_signs: None,
+            rotation_type: RQRotationType::Matrix,
+            code_dim: code_dim as u32,
+            num_bits,
+            packed: false,
+            query_estimator: RabitQueryEstimator::RawQuery,
+        };
+        let codes = FixedSizeListArray::try_new_from_values(
+            UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 19) as u8)),
+            rabit_binary_code_bytes(code_dim) as i32,
+        )
+        .unwrap();
+        let ex_codes = make_test_ex_codes(num_rows, code_dim, num_bits);
+        let batch = make_test_batch_with_ex(codes, ex_codes);
+        let storage =
+            RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None)
+                .unwrap();
+        let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef;
+
+        let mut calc = storage.dist_calculator(query, 4.0);
+        calc.approx_mode = approx_mode;
+        // num_tables = (code_dim * 4) / SEGMENT_NUM_CODES = 2; the extrema sum
+        // (qmax - qmin = 4e38) overflows when scaled by num_tables, so the
+        // quantizer returns `Exact`. Per-row sums stay finite (each row reads
+        // one entry per segment), so the exact path is well-defined.
+        let mut degenerate = vec![0.0f32; code_dim * 4];
+        degenerate[0] = -2e38;
+        degenerate[1] = 2e38;
+        calc.dist_table = Cow::Owned(degenerate);
+
+        let code_len = rabit_binary_code_bytes(code_dim);
+        let ex_codes = calc.ex_codes.unwrap();
+        let ex_add_factors = calc.ex_add_factors.unwrap();
+        let ex_scale_factors = calc.ex_scale_factors.unwrap();
+        let expected = (0..num_rows)
+            .map(|id| {
+                let binary_ip = compute_single_rq_distance(
+                    calc.codes,
+                    id,
+                    num_rows,
+                    code_len,
+                    &calc.dist_table,
+                );
+                calc.raw_query_multi_bit_exact_distance(
+                    id,
+                    binary_ip,
+                    ex_bits,
+                    ex_codes,
+                    ex_add_factors,
+                    ex_scale_factors,
+                )
+            })
+            .collect::<Vec<_>>();
+
+        let actual = calc.distance_all(0);
+        assert_eq!(actual.len(), num_rows);
+        for id in 0..num_rows {
+            assert!(
+                !actual[id].is_nan(),
+                "approx_mode={approx_mode:?} id={id}: degenerate table produced NaN"
+            );
+            assert_eq!(
+                actual[id].to_bits(),
+                expected[id].to_bits(),
+                "approx_mode={approx_mode:?} id={id}: distance_all must match the exact path"
+            );
         }
     }
 
@@ -3239,7 +3916,6 @@ mod tests {
                         id,
                         binary_ip,
                         ex_bits,
-                        ex_code_len,
                         ex_codes,
                         ex_add_factors,
                         ex_scale_factors,
@@ -3289,6 +3965,200 @@ mod tests {
         }
     }
 
+    /// Inputs crafted so the top-k scan outcomes are fully determined by the
+    /// factor columns: with zero scale factors, a zero query factor, and a
+    /// query error of one, the lower bound is
+    /// `add_factors[id] - error_factors[id]`, and with zero ex scale factors
+    /// the exact distance is `ex_add_factors[id]`, regardless of the random
+    /// codes and query.
+    struct CraftedTopkData {
+        codes: Vec<u8>,
+        ex_codes: Vec<u8>,
+        dist_table: Vec<f32>,
+        ex_query: Vec<f32>,
+        scale_factors: Vec<f32>,
+        add_factors: Vec<f32>,
+        error_factors: Vec<f32>,
+        ex_scale_factors: Vec<f32>,
+        ex_add_factors: Vec<f32>,
+    }
+
+    const CRAFTED_TOPK_DIM: usize = 64;
+    const CRAFTED_TOPK_NUM_BITS: u8 = 5;
+
+    impl CraftedTopkData {
+        fn new(
+            exact_dists: &[f32],
+            lower_bound_margins: &[f32],
+            error_factors: Vec<f32>,
+            rng: &mut SmallRng,
+        ) -> Self {
+            let n = exact_dists.len();
+            let code_len = rabit_binary_code_bytes(CRAFTED_TOPK_DIM);
+            let ex_code_len = blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1);
+            let add_factors = izip!(exact_dists, lower_bound_margins, &error_factors)
+                .map(|(dist, margin, error)| dist - margin + error)
+                .collect();
+            Self {
+                codes: (0..n * code_len).map(|_| rng.random()).collect(),
+                ex_codes: (0..n * ex_code_len).map(|_| rng.random()).collect(),
+                dist_table: (0..CRAFTED_TOPK_DIM * 4)
+                    .map(|_| rng.random_range(-1.0f32..1.0))
+                    .collect(),
+                ex_query: (0..CRAFTED_TOPK_DIM)
+                    .map(|_| rng.random_range(-1.0f32..1.0))
+                    .collect(),
+                scale_factors: vec![0.0; n],
+                add_factors,
+                error_factors,
+                ex_scale_factors: vec![0.0; n],
+                ex_add_factors: exact_dists.to_vec(),
+            }
+        }
+
+        fn calculator(&self, approx_mode: ApproxMode) -> RabitDistCalculator<'_> {
+            RabitDistCalculator::new(
+                CRAFTED_TOPK_DIM,
+                CRAFTED_TOPK_NUM_BITS,
+                RabitQueryEstimator::RawQuery,
+                Cow::Borrowed(self.dist_table.as_slice()),
+                Cow::Borrowed(self.ex_query.as_slice()),
+                0.7,
+                &self.codes,
+                Some(&self.ex_codes),
+                blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1),
+                &self.add_factors,
+                &self.scale_factors,
+                Some(&self.error_factors),
+                Some(&self.ex_add_factors),
+                Some(&self.ex_scale_factors),
+                None,
+                0.0,
+                1.0,
+                approx_mode,
+            )
+        }
+    }
+
+    fn canonical_heap_rows(heap: BinaryHeap<OrderedNode<u64>>) -> Vec<(u32, u64)> {
+        let mut rows = heap
+            .into_iter()
+            .map(|node| (node.dist.0.to_bits(), node.id))
+            .collect::<Vec<_>>();
+        rows.sort_unstable();
+        rows
+    }
+
+    /// The dense (SIMD-pruned) scan must reproduce the sparse scalar scan
+    /// exactly: identical heap contents including row ids, and the k smallest
+    /// in-bounds exact distances overall.
+    #[rstest]
+    fn test_raw_query_multi_bit_topk_dense_matches_sparse(
+        #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode,
+        #[values("descending", "ascending", "random", "duplicates", "duplicate_ties")]
+        ordering: &str,
+    ) {
+        for n in [1usize, 15, 16, 17, 100, 4109] {
+            let mut rng = SmallRng::seed_from_u64(n as u64 * 31 + ordering.len() as u64);
+            let exact_dists: Vec<f32> = match ordering {
+                // Improving rows force constant heap updates.
+                "descending" => (0..n).map(|id| (n - id) as f32).collect(),
+                // Worsening rows force mass pruning, the common regime.
+                "ascending" => (0..n).map(|id| id as f32).collect(),
+                "random" => (0..n).map(|_| rng.random_range(0.0..n as f32)).collect(),
+                "duplicates" => (0..n).map(|id| (id % 7) as f32).collect(),
+                // Lower bound equals the distance, so heap-threshold and
+                // upper-bound comparisons hit exact `>=` ties.
+                "duplicate_ties" => (0..n).map(|id| (id % 5) as f32).collect(),
+                _ => unreachable!(),
+            };
+            let (margins, error_factors) = if ordering == "duplicate_ties" {
+                (vec![0.0; n], vec![0.0; n])
+            } else if ordering == "random" {
+                (
+                    (0..n).map(|_| rng.random_range(0.0f32..2.0)).collect(),
+                    (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(),
+                )
+            } else {
+                (
+                    vec![1.0; n],
+                    (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(),
+                )
+            };
+            let data = CraftedTopkData::new(&exact_dists, &margins, error_factors, &mut rng);
+            let calc = data.calculator(approx_mode);
+            assert!(
+                calc.raw_query_lower_bound_gating_disabled_reason()
+                    .is_none()
+            );
+
+            let max_dist = exact_dists.iter().fold(0.0f32, |acc, dist| acc.max(*dist));
+            for k in [1usize, 10, n + 7] {
+                for bounds in [(None, None), (Some(max_dist * 0.25), Some(max_dist * 0.7))] {
+                    let (lower_bound, upper_bound) = bounds;
+                    let mut dense_heap = BinaryHeap::new();
+                    let mut sparse_heap = BinaryHeap::new();
+                    let mut dists = Vec::new();
+                    let mut u16_scratch = Vec::new();
+                    let mut u8_scratch = Vec::new();
+                    let mut u32_scratch = Vec::new();
+                    // Two passes sharing the heap, as IVF partition probing
+                    // does: the second pass starts with a full, tight heap.
+                    for pass in 0..2u64 {
+                        let offset = pass * n as u64;
+                        calc.accumulate_topk_with_scratch(
+                            k,
+                            lower_bound,
+                            upper_bound,
+                            |id| id as u64 + offset,
+                            &mut dense_heap,
+                            &mut dists,
+                            &mut u16_scratch,
+                            &mut u8_scratch,
+                            &mut u32_scratch,
+                        );
+                        calc.accumulate_filtered_topk_with_scratch(
+                            k,
+                            lower_bound,
+                            upper_bound,
+                            (0..n as u32).map(|id| (id, id as u64 + offset)),
+                            |_| true,
+                            &mut sparse_heap,
+                            &mut dists,
+                            &mut u16_scratch,
+                            &mut u8_scratch,
+                            &mut u32_scratch,
+                        );
+                    }
+                    let dense = canonical_heap_rows(dense_heap);
+                    let sparse = canonical_heap_rows(sparse_heap);
+                    assert_eq!(
+                        dense, sparse,
+                        "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}"
+                    );
+
+                    // The distance multiset must be the k smallest in-bounds
+                    // distances over both passes. Row ids are not compared:
+                    // evictions among tied maxima depend on heap layout.
+                    let query_lower_bound = lower_bound.unwrap_or(f32::MIN);
+                    let query_upper_bound = upper_bound.unwrap_or(f32::MAX);
+                    let mut expected = (0..2 * n)
+                        .map(|row| exact_dists[row % n])
+                        .filter(|dist| *dist >= query_lower_bound && *dist < query_upper_bound)
+                        .map(|dist| dist.to_bits())
+                        .collect::<Vec<_>>();
+                    expected.sort_unstable();
+                    expected.truncate(k);
+                    let actual = dense.iter().map(|(dist, _)| *dist).collect::<Vec<_>>();
+                    assert_eq!(
+                        actual, expected,
+                        "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}"
+                    );
+                }
+            }
+        }
+    }
+
     #[test]
     fn test_raw_query_one_bit_distance_uses_binary_factors_without_ex_columns() {
         let code_dim = 8usize;
@@ -3457,7 +4327,8 @@ mod tests {
         )
         .unwrap_err();
         assert!(
-            err.to_string().contains("requires __ex_codes column"),
+            err.to_string()
+                .contains("requires __blocked_ex_codes column"),
             "{}",
             err
         );
@@ -3501,9 +4372,11 @@ mod tests {
         .unwrap();
 
         assert!(storage.metadata().packed);
+        // Legacy batches are normalized to the blocked column at load.
         let stored_batch = storage.to_batches().unwrap().next().unwrap();
+        assert!(stored_batch.column_by_name(RABIT_EX_CODE_COLUMN).is_none());
         assert_eq!(
-            stored_batch[RABIT_EX_CODE_COLUMN]
+            stored_batch[RABIT_BLOCKED_EX_CODE_COLUMN]
                 .as_fixed_size_list()
                 .value_length(),
             64
@@ -3571,11 +4444,19 @@ mod tests {
 
     #[test]
     fn test_remap_preserves_multi_bit_rq_split_columns() {
+        // num_bits=9 keeps sequential ex codes; num_bits 4/6/8 (ex_bits
+        // 3/5/7) also exercise the bit-plane repack rebuild in `remap`.
+        for num_bits in [4, 6, 8, 9u8] {
+            test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits);
+        }
+    }
+
+    fn test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits: u8) {
         let original_codes = make_test_codes(50, 64);
         let code_dim = original_codes.value_length() as usize * 8;
-        let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9);
+        let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, num_bits);
         let mut metadata = make_test_metadata(code_dim);
-        metadata.num_bits = 9;
+        metadata.num_bits = num_bits;
         let storage = RabitQuantizationStorage::try_from_batch(
             make_test_batch_with_ex(original_codes.clone(), ex_codes),
             &metadata,
@@ -3599,11 +4480,14 @@ mod tests {
         );
         assert_eq!(remapped_row_ids, expected_row_ids.values());
 
+        // Legacy batches are normalized to the blocked format at load, so the
+        // remapped batch carries the blocked column.
+        let ex_code_len = blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap());
         assert_eq!(
-            remapped_batch[RABIT_EX_CODE_COLUMN]
+            remapped_batch[RABIT_BLOCKED_EX_CODE_COLUMN]
                 .as_fixed_size_list()
                 .value_length(),
-            64
+            ex_code_len as i32
         );
         assert_eq!(
             &remapped_batch[EX_ADD_FACTORS_COLUMN]
@@ -3623,5 +4507,20 @@ mod tests {
                 .values()[..5],
             &[0.25, 1.25, 2.25, 4.25, 5.25]
         );
+
+        // The remapped storage must hold the same kernel-layout ex codes as a
+        // storage freshly loaded from the remapped batch.
+        let reloaded = RabitQuantizationStorage::try_from_batch(
+            remapped_batch,
+            &remapped.metadata,
+            DistanceType::L2,
+            None,
+        )
+        .unwrap();
+        assert_eq!(remapped.ex_codes, reloaded.ex_codes);
+        assert_eq!(
+            remapped.ex_codes.as_ref().unwrap().value_length() as usize,
+            blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap())
+        );
     }
 }
diff --git a/rust/lance-index/src/vector/bq/transform.rs b/rust/lance-index/src/vector/bq/transform.rs
index c2fc0608102..c87695e14cd 100644
--- a/rust/lance-index/src/vector/bq/transform.rs
+++ b/rust/lance-index/src/vector/bq/transform.rs
@@ -17,7 +17,9 @@ use tracing::instrument;
 
 use crate::vector::bq::builder::RabitQuantizer;
 use crate::vector::bq::rabit_ex_bits;
-use crate::vector::bq::storage::{RABIT_CODE_COLUMN, RABIT_EX_CODE_COLUMN, RabitQueryEstimator};
+use crate::vector::bq::storage::{
+    RABIT_BLOCKED_EX_CODE_COLUMN, RABIT_CODE_COLUMN, RabitQueryEstimator,
+};
 use crate::vector::quantizer::Quantization;
 use crate::vector::transform::Transformer;
 use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN};
@@ -281,7 +283,7 @@ impl Transformer for RQTransformer {
     #[instrument(name = "RQTransformer::transform", level = "debug", skip_all)]
     fn transform(&self, batch: &RecordBatch) -> Result<RecordBatch> {
         let has_split_codes = self.rq.num_bits() == 1
-            || (batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some()
+            || (batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some()
                 && batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some()
                 && batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some());
         if batch.column_by_name(RABIT_CODE_COLUMN).is_some() && has_split_codes {
@@ -494,7 +496,8 @@ mod tests {
 
     use crate::vector::bq::RQRotationType;
     use crate::vector::bq::builder::RabitQuantizer;
-    use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN;
+    use crate::vector::bq::ex_dot::blocked_ex_code_bytes;
+    use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN;
     use crate::vector::transform::Transformer;
     use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN};
 
@@ -535,15 +538,19 @@ mod tests {
         .unwrap();
 
         let transformed = transformer.transform(&batch).unwrap();
-        assert!(transformed.column_by_name(RABIT_EX_CODE_COLUMN).is_some());
+        assert!(
+            transformed
+                .column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN)
+                .is_some()
+        );
         assert_eq!(
-            transformed[RABIT_EX_CODE_COLUMN]
+            transformed[RABIT_BLOCKED_EX_CODE_COLUMN]
                 .as_fixed_size_list()
                 .value_length(),
-            3
+            blocked_ex_code_bytes(8, 3) as i32
         );
         assert!(
-            transformed[RABIT_EX_CODE_COLUMN]
+            transformed[RABIT_BLOCKED_EX_CODE_COLUMN]
                 .as_fixed_size_list()
                 .values()
                 .as_primitive::<UInt8Type>()
diff --git a/rust/lance-index/src/vector/distributed/index_merger.rs b/rust/lance-index/src/vector/distributed/index_merger.rs
index 5f59985673e..70371ad4794 100755
--- a/rust/lance-index/src/vector/distributed/index_merger.rs
+++ b/rust/lance-index/src/vector/distributed/index_merger.rs
@@ -1440,6 +1440,25 @@ pub async fn merge_partial_vector_auxiliary_files(
                     )));
                 }
 
+                // Shards written by older lance versions carry sequential ex
+                // codes; normalize every batch to the blocked layout before
+                // concatenation so mixed-version shards merge correctly
+                // (concat_batches combines columns by position and would
+                // otherwise mix the two layouts silently).
+                let batches = match rq_meta.as_ref() {
+                    Some(meta) if meta.num_bits > 1 => batches
+                        .into_iter()
+                        .map(|batch| {
+                            crate::vector::bq::storage::load_blocked_ex_codes(
+                                batch,
+                                meta.rotated_dim(),
+                                meta.num_bits,
+                            )
+                            .map(|(batch, _)| batch)
+                        })
+                        .collect::<Result<Vec<_>>>()?,
+                    _ => batches,
+                };
                 let schema = batches[0].schema();
                 let partition_batch = concat_batches(&schema, batches.iter())?;
                 if let Some(w) = v2w_opt.as_mut() {
@@ -1527,7 +1546,7 @@ mod tests {
     use prost::Message;
 
     use crate::vector::bq::RQRotationType;
-    use crate::vector::bq::storage::{RABIT_EX_CODE_COLUMN, RabitQueryEstimator};
+    use crate::vector::bq::storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQueryEstimator};
     use crate::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN};
     lance_testing::define_stage_event_progress!(
         RecordingProgress,
@@ -2529,11 +2548,14 @@ mod tests {
             let batch = batch.unwrap();
             if !checked_split_columns {
                 let schema = batch.schema();
-                let ex_code_field = schema.field_with_name(RABIT_EX_CODE_COLUMN).unwrap();
+                let ex_code_field = schema
+                    .field_with_name(RABIT_BLOCKED_EX_CODE_COLUMN)
+                    .unwrap();
                 let DataType::FixedSizeList(_, ex_code_bytes) = ex_code_field.data_type() else {
                     panic!("RQ ex-code field should be FixedSizeList");
                 };
-                assert_eq!(*ex_code_bytes, 6);
+                // code_dim=16 padded to one 64-dim block at ex_bits=3.
+                assert_eq!(*ex_code_bytes, 24);
                 assert!(schema.field_with_name(ERROR_FACTORS_FIELD.name()).is_ok());
                 assert!(schema.field_with_name(EX_ADD_FACTORS_COLUMN).is_ok());
                 assert!(schema.field_with_name(EX_SCALE_FACTORS_COLUMN).is_ok());
diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs
index 68747713aac..de5a7ac28bd 100644
--- a/rust/lance-index/src/vector/pq/storage.rs
+++ b/rust/lance-index/src/vector/pq/storage.rs
@@ -221,7 +221,7 @@ impl ProductQuantizationStorage {
                 "Row ID column not found from PQ storage".to_string(),
             ));
         };
-        let row_ids: Arc<UInt64Array> = row_ids
+        let mut row_ids: Arc<UInt64Array> = row_ids
             .as_primitive_opt::<UInt64Type>()
             .ok_or(Error::index(
                 "Row ID column is not of type UInt64".to_string(),
@@ -293,6 +293,11 @@ impl ProductQuantizationStorage {
                 .as_primitive::<UInt8Type>()
                 .clone()
                 .into();
+            // Refresh the stored row ids from the remapped batch. Without this
+            // the storage keeps the pre-remap (compacted-away) addresses while
+            // its codes are remapped, so search returns stale row ids and the
+            // take fails with "fragment ... does not exist".
+            row_ids = batch[ROW_ID].as_primitive::<UInt64Type>().clone().into();
         }
 
         let distance_type = match distance_type {
diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs
index b036e187b77..a14308197ed 100644
--- a/rust/lance-index/src/vector/storage.rs
+++ b/rust/lance-index/src/vector/storage.rs
@@ -14,10 +14,12 @@ use lance_core::{Error, ROW_ID, Result};
 use lance_encoding::decoder::FilterExpression;
 use lance_file::reader::FileReader;
 use lance_io::ReadBatchParams;
+use lance_io::scheduler::IoStats;
 use lance_linalg::distance::DistanceType;
 use prost::Message;
 use std::{
     any::Any,
+    borrow::Cow,
     collections::BinaryHeap,
     mem::size_of,
     ops::{Deref, DerefMut},
@@ -249,7 +251,10 @@ pub struct RabitRawQueryContext {
     pub ex_bits: u8,
     pub rotated_query: Vec<f32>,
     pub dist_table: Vec<f32>,
-    pub ex_dist_table: Vec<f32>,
+    /// The rotated query zero-padded to a 64-dim multiple for the ex-dot
+    /// kernels; empty when `code_dim` is already aligned (the kernels then
+    /// read `rotated_query` directly).
+    pub ex_query: Vec<f32>,
     pub sum_q: f32,
 }
 
@@ -620,15 +625,29 @@ impl<Q: Quantization> IvfQuantizationStorage<Q> {
         self.ivf.num_partitions()
     }
 
-    pub async fn load_partition(&self, part_id: usize) -> Result<Q::Storage> {
+    /// Load a partition's quantization storage, optionally measuring the exact
+    /// I/O it performs into `io_stats`.
+    ///
+    /// When `io_stats` is `Some`, the partition is read through a reader whose
+    /// scheduler also records into the sink (a cheap clone that shares all
+    /// cached metadata, so no file is re-opened).  When `None`, the normal
+    /// uninstrumented reader is used.
+    pub async fn load_partition(
+        &self,
+        part_id: usize,
+        io_stats: Option<IoStats>,
+    ) -> Result<Q::Storage> {
         let range = self.ivf.row_range(part_id);
         let batch = if range.is_empty() {
             let schema = self.reader.schema();
             let arrow_schema = arrow_schema::Schema::from(schema.as_ref());
             RecordBatch::new_empty(Arc::new(arrow_schema))
         } else {
-            let batches = self
-                .reader
+            let reader = match &io_stats {
+                Some(io_stats) => Cow::Owned(self.reader.with_io_stats(io_stats.recorder())),
+                None => Cow::Borrowed(&self.reader),
+            };
+            let batches = reader
                 .read_stream(
                     ReadBatchParams::Range(range),
                     u32::MAX,
diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs
index 4f43cb00668..efe4b9b0c24 100644
--- a/rust/lance-io/src/scheduler.rs
+++ b/rust/lance-io/src/scheduler.rs
@@ -15,6 +15,7 @@ use std::sync::{Arc, Mutex};
 use std::time::Instant;
 use tokio::sync::Notify;
 
+use lance_core::utils::io_stats::IoStatsRecorder;
 use lance_core::utils::parse::str_is_truthy;
 use lance_core::{Error, Result};
 
@@ -475,8 +476,25 @@ impl StatsCollector {
             Ordering::Relaxed,
         );
     }
+
+    /// Add already-aggregated counts (e.g. a snapshot captured from another
+    /// scheduler) into these counters.
+    fn add(&self, iops: u64, requests: u64, bytes_read: u64) {
+        self.iops.fetch_add(iops, Ordering::Relaxed);
+        self.requests.fetch_add(requests, Ordering::Relaxed);
+        self.bytes_read.fetch_add(bytes_read, Ordering::Relaxed);
+    }
 }
 
+impl IoStatsRecorder for StatsCollector {
+    fn record_request(&self, request: &[Range<u64>]) {
+        // Inherent methods take precedence in resolution, so this delegates to
+        // the inherent `record_request` above rather than recursing.
+        Self::record_request(self, request)
+    }
+}
+
+#[derive(Debug, Clone, Copy, Default)]
 pub struct ScanStats {
     pub iops: u64,
     pub requests: u64,
@@ -493,6 +511,57 @@ impl ScanStats {
     }
 }
 
+/// A shareable, cloneable handle to a set of cumulative I/O counters.
+///
+/// All clones share the same underlying counters.  This serves two purposes:
+///
+/// 1. It backs each [`ScanScheduler`]'s own running totals.
+/// 2. It can be attached to an individual [`FileScheduler`] (via
+///    [`FileScheduler::with_io_stats`]) as a *secondary* sink, so a caller can
+///    measure the exact bytes/IOPS performed through that file handle for a
+///    bounded scope (e.g. a single query) without disturbing the scheduler's
+///    global totals.  Read the result back with [`IoStats::snapshot`].
+#[derive(Debug, Clone)]
+pub struct IoStats(Arc<StatsCollector>);
+
+impl IoStats {
+    pub fn new() -> Self {
+        Self(Arc::new(StatsCollector::new()))
+    }
+
+    /// Record a single completed request.  `request` holds the byte ranges as
+    /// actually submitted to storage (post coalescing/splitting), so the counts
+    /// reflect physical I/O.
+    pub fn record_request(&self, request: &[Range<u64>]) {
+        self.0.record_request(request);
+    }
+
+    /// Take an immutable snapshot of the current cumulative counters.
+    pub fn snapshot(&self) -> ScanStats {
+        ScanStats::new(self.0.as_ref())
+    }
+
+    /// Return this handle as a type-erased [`IoStatsRecorder`], suitable for
+    /// attaching to a file reader (e.g. `FileReader::with_io_stats`).  The
+    /// returned recorder shares the same underlying counters as `self`.
+    pub fn recorder(&self) -> Arc<dyn IoStatsRecorder> {
+        self.0.clone()
+    }
+
+    /// Add a snapshot of already-aggregated statistics into this sink.  Used to
+    /// fold in I/O measured on a separate scheduler (e.g. the one-time reads
+    /// performed while opening an index).
+    pub fn add_scan_stats(&self, stats: &ScanStats) {
+        self.0.add(stats.iops, stats.requests, stats.bytes_read);
+    }
+}
+
+impl Default for IoStats {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 enum IoQueueType {
     Standard(Arc<IoQueue>),
     Lite(Arc<lite::IoQueue>),
@@ -509,7 +578,7 @@ enum IoQueueType {
 pub struct ScanScheduler {
     object_store: Arc<ObjectStore>,
     io_queue: IoQueueType,
-    stats: Arc<StatsCollector>,
+    stats: IoStats,
 }
 
 impl Debug for ScanScheduler {
@@ -606,7 +675,7 @@ impl ScanScheduler {
         Arc::new(Self {
             object_store,
             io_queue,
-            stats: Arc::new(StatsCollector::new()),
+            stats: IoStats::new(),
         })
     }
 
@@ -646,6 +715,7 @@ impl ScanScheduler {
             base_priority,
             max_iop_size,
             bypass_backpressure: false,
+            extra_stats: None,
         })
     }
 
@@ -791,7 +861,7 @@ impl ScanScheduler {
     }
 
     pub fn stats(&self) -> ScanStats {
-        ScanStats::new(self.stats.as_ref())
+        self.stats.snapshot()
     }
 
     #[cfg(test)]
@@ -829,6 +899,10 @@ pub struct FileScheduler {
     base_priority: u64,
     max_iop_size: u64,
     bypass_backpressure: bool,
+    /// Optional secondary statistics sink.  When set, every request submitted
+    /// through this handle is also recorded here, in addition to the
+    /// scheduler's global totals.  Used to measure per-scope I/O.
+    extra_stats: Option<Arc<dyn IoStatsRecorder>>,
 }
 
 fn is_close_together(range1: &Range<u64>, range2: &Range<u64>, block_size: u64) -> bool {
@@ -899,6 +973,9 @@ impl FileScheduler {
         }
 
         self.root.stats.record_request(&updated_requests);
+        if let Some(extra_stats) = &self.extra_stats {
+            extra_stats.record_request(&updated_requests);
+        }
 
         let bytes_vec_fut = self.root.submit_request(
             self.reader.clone(),
@@ -964,6 +1041,23 @@ impl FileScheduler {
             max_iop_size: self.max_iop_size,
             base_priority: priority,
             bypass_backpressure: self.bypass_backpressure,
+            extra_stats: self.extra_stats.clone(),
+        }
+    }
+
+    /// Returns a copy of this scheduler that additionally records the I/O it
+    /// performs into `stats`, on top of the scheduler's global statistics.
+    ///
+    /// This is the mechanism for measuring exact per-scope (e.g. per-query) I/O:
+    /// attach a recorder here (e.g. via [`IoStats::recorder`]), perform the reads
+    /// through the returned handle, then read the totals back with
+    /// [`IoStats::snapshot`].  The returned handle is cheap to create (a few
+    /// `Arc` clones) and reuses the same underlying reader, so it does not
+    /// re-open the file.
+    pub fn with_io_stats(&self, stats: Arc<dyn IoStatsRecorder>) -> Self {
+        Self {
+            extra_stats: Some(stats),
+            ..self.clone()
         }
     }
 
@@ -1183,6 +1277,59 @@ mod tests {
         assert_eq!(11, scheduler.stats().iops);
     }
 
+    #[tokio::test]
+    async fn test_io_stats_sink() {
+        let tmp_file = TempObjFile::default();
+        let obj_store = Arc::new(ObjectStore::local());
+
+        const DATA_SIZE: u64 = 1024 * 1024;
+        let mut some_data = vec![0; DATA_SIZE as usize];
+        rand::rng().fill_bytes(&mut some_data);
+        obj_store.put(&tmp_file, &some_data).await.unwrap();
+
+        let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing());
+
+        // Attach a per-scope sink to one file handle.
+        let sink = IoStats::new();
+        let file_scheduler = scheduler
+            .open_file(&tmp_file, &CachedFileSize::unknown())
+            .await
+            .unwrap()
+            .with_io_stats(sink.recorder());
+
+        // Three reads within 4KiB coalesce into a single physical IOP.  The sink
+        // and the scheduler's global totals must agree exactly, because both are
+        // recorded from the same post-coalescing request.
+        file_scheduler
+            .submit_request(vec![50_000..51_000, 52_000..53_000, 54_000..55_000], 0)
+            .await
+            .unwrap();
+
+        let global = scheduler.stats();
+        let scoped = sink.snapshot();
+        assert_eq!(1, scoped.iops);
+        assert_eq!(1, scoped.requests);
+        // Coalesced range 50_000..55_000 => 5000 physical bytes.
+        assert_eq!(5000, scoped.bytes_read);
+        assert_eq!(global.iops, scoped.iops);
+        assert_eq!(global.requests, scoped.requests);
+        assert_eq!(global.bytes_read, scoped.bytes_read);
+
+        // A sibling handle without the sink: the global totals advance but the
+        // sink stays put, proving per-scope isolation.
+        let other = scheduler
+            .open_file(&tmp_file, &CachedFileSize::unknown())
+            .await
+            .unwrap();
+        other.submit_request(vec![0..1000], 0).await.unwrap();
+
+        let global_after = scheduler.stats();
+        let scoped_after = sink.snapshot();
+        assert_eq!(global.bytes_read + 1000, global_after.bytes_read);
+        assert_eq!(scoped.bytes_read, scoped_after.bytes_read);
+        assert_eq!(scoped.iops, scoped_after.iops);
+    }
+
     #[tokio::test]
     async fn test_priority() {
         let some_path = Path::parse("foo").unwrap();
diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml
index cf91deb69d7..6a188ec3c62 100644
--- a/rust/lance-linalg/Cargo.toml
+++ b/rust/lance-linalg/Cargo.toml
@@ -18,6 +18,7 @@ lance-arrow = { workspace = true }
 lance-core = { workspace = true }
 num-traits = { workspace = true }
 rand = { workspace = true }
+rayon = { workspace = true }
 
 [dev-dependencies]
 approx = { workspace = true }
@@ -50,10 +51,6 @@ harness = false
 name = "cosine"
 harness = false
 
-[[bench]]
-name = "hamming"
-harness = false
-
 [[bench]]
 name = "norm_l2"
 harness = false
diff --git a/rust/lance-linalg/benches/hamming.rs b/rust/lance-linalg/benches/hamming.rs
deleted file mode 100644
index 9af3bf4614b..00000000000
--- a/rust/lance-linalg/benches/hamming.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright The Lance Authors
-
-use std::iter::repeat_with;
-
-use std::hint::black_box;
-
-use criterion::{Criterion, criterion_group, criterion_main};
-use lance_linalg::distance::hamming::{hamming, hamming_scalar};
-use rand::Rng;
-
-const DIMENSION: usize = 1024;
-const TOTAL: usize = 1024 * 1024; // 1M vectors
-
-fn bench_hamming(c: &mut Criterion) {
-    let mut rng = rand::rng();
-
-    let key = repeat_with(|| rng.random::<u8>())
-        .take(DIMENSION)
-        .collect::<Vec<_>>();
-    let target = repeat_with(|| rng.random::<u8>())
-        .take(TOTAL * DIMENSION)
-        .collect::<Vec<_>>();
-
-    c.bench_function("hamming,scalar", |b| {
-        b.iter(|| {
-            black_box(
-                target
-                    .chunks_exact(DIMENSION)
-                    .map(|tgt| hamming_scalar(&key, tgt))
-                    .sum::<f32>(),
-            );
-        })
-    });
-
-    c.bench_function("hamming,auto_vec", |b| {
-        b.iter(|| {
-            black_box(
-                target
-                    .chunks_exact(DIMENSION)
-                    .map(|tgt| hamming(&key, tgt))
-                    .sum::<f32>(),
-            );
-        })
-    });
-}
-
-criterion_group!(
-    name=benches;
-    config = Criterion::default().significance_level(0.1).sample_size(10);
-    targets = bench_hamming);
-criterion_main!(benches);
diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs
index a356d5c1225..23d1cae2d63 100644
--- a/rust/lance-linalg/src/distance.rs
+++ b/rust/lance-linalg/src/distance.rs
@@ -27,7 +27,11 @@ pub mod norm_l2;
 
 pub use cosine::*;
 pub use dot::*;
-use hamming::hamming_distance_arrow_batch;
+pub use hamming::{
+    Cluster, ClusteringResult, PairwiseResult, UnionFind, cluster_edges, cluster_pairwise_result,
+    extract_hashes_from_fixed_list, hamming_distance_arrow_batch, hamming_u64,
+    pairwise_hamming_distance, pairwise_hamming_distance_parallel,
+};
 pub use l2::*;
 use lance_core::deepsize::DeepSizeOf;
 pub use norm_l2::*;
diff --git a/rust/lance-linalg/src/distance/hamming.rs b/rust/lance-linalg/src/distance/hamming.rs
index d8fd60f4054..a6f4b038195 100644
--- a/rust/lance-linalg/src/distance/hamming.rs
+++ b/rust/lance-linalg/src/distance/hamming.rs
@@ -2,14 +2,24 @@
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
 //! Hamming distance.
+//!
+//! This module provides hamming distance computation for binary vectors,
+//! including SIMD-accelerated pairwise hamming distance for 64-bit hashes.
 
+use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::{Error, Result};
+use arrow_array::builder::{ListBuilder, UInt64Builder};
 use arrow_array::cast::AsArray;
 use arrow_array::types::UInt8Type;
-use arrow_array::{Array, FixedSizeListArray, Float32Array};
-use arrow_schema::DataType;
+use arrow_array::{
+    Array, ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator,
+    RecordBatchReader, UInt32Array, UInt64Array,
+};
+use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use rayon::prelude::*;
+
+use crate::{Error, Result};
 
 pub trait Hamming {
     /// Hamming distance between two vectors.
@@ -86,6 +96,640 @@ pub fn hamming_distance_arrow_batch(
     )))
 }
 
+/// Hamming distance between two 64-bit values: the number of set bits in `a ^ b`.
+#[inline(always)]
+pub fn hamming_u64(a: u64, b: u64) -> u32 {
+    (a ^ b).count_ones()
+}
+
+/// Result of pairwise hamming distance computation.
+#[derive(Debug, Clone)]
+pub struct PairwiseResult {
+    pub row_id_a: Vec<u64>,
+    pub row_id_b: Vec<u64>,
+    pub distances: Vec<u32>,
+}
+
+impl PairwiseResult {
+    pub fn new() -> Self {
+        Self {
+            row_id_a: Vec::new(),
+            row_id_b: Vec::new(),
+            distances: Vec::new(),
+        }
+    }
+
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            row_id_a: Vec::with_capacity(capacity),
+            row_id_b: Vec::with_capacity(capacity),
+            distances: Vec::with_capacity(capacity),
+        }
+    }
+
+    pub fn push(&mut self, a: u64, b: u64, dist: u32) {
+        self.row_id_a.push(a);
+        self.row_id_b.push(b);
+        self.distances.push(dist);
+    }
+
+    pub fn len(&self) -> usize {
+        self.row_id_a.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.row_id_a.is_empty()
+    }
+
+    pub fn extend(&mut self, other: Self) {
+        self.row_id_a.extend(other.row_id_a);
+        self.row_id_b.extend(other.row_id_b);
+        self.distances.extend(other.distances);
+    }
+
+    /// Convert to Arrow RecordBatch, consuming self.
+    pub fn into_record_batch(self) -> RecordBatch {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("row_id_a", DataType::UInt64, false),
+            Field::new("row_id_b", DataType::UInt64, false),
+            Field::new("distance", DataType::UInt32, false),
+        ]));
+
+        let row_id_a = Arc::new(UInt64Array::from(self.row_id_a));
+        let row_id_b = Arc::new(UInt64Array::from(self.row_id_b));
+        let distances = Arc::new(UInt32Array::from(self.distances));
+
+        RecordBatch::try_new(schema, vec![row_id_a, row_id_b, distances])
+            .expect("Failed to create RecordBatch")
+    }
+}
+
+impl Default for PairwiseResult {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Write `hamming(query, targets[i])` into `results[i]`, using SIMD kernels when available.
+/// Panics if `targets` and `results` differ in length, since the kernels write unchecked.
+#[inline]
+pub fn hamming_batch_u64(query: u64, targets: &[u64], results: &mut [u32]) {
+    assert_eq!(targets.len(), results.len());
+    hamming_batch_simd(query, targets, results);
+}
+
+/// Picks AVX-512 VPOPCNTDQ, then AVX2, else scalar; `results` must not be shorter than `targets`.
+#[inline]
+fn hamming_batch_simd(query: u64, targets: &[u64], results: &mut [u32]) {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
+            unsafe {
+                hamming_batch_avx512(query, targets, results);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("avx2") {
+            unsafe {
+                hamming_batch_avx2(query, targets, results);
+            }
+            return;
+        }
+    }
+
+    // Scalar fallback (LLVM auto-vectorizes well on Apple Silicon)
+    hamming_batch_scalar(query, targets, results);
+}
+
+/// Portable scalar fallback: `count_ones()` of `query ^ target`, unrolled eight at a time.
+#[inline]
+fn hamming_batch_scalar(query: u64, targets: &[u64], results: &mut [u32]) {
+    // Unroll for better auto-vectorization
+    let n = targets.len();
+    let chunks = n / 8;
+    let mut i = 0;
+
+    for _ in 0..chunks {
+        results[i] = (query ^ targets[i]).count_ones();
+        results[i + 1] = (query ^ targets[i + 1]).count_ones();
+        results[i + 2] = (query ^ targets[i + 2]).count_ones();
+        results[i + 3] = (query ^ targets[i + 3]).count_ones();
+        results[i + 4] = (query ^ targets[i + 4]).count_ones();
+        results[i + 5] = (query ^ targets[i + 5]).count_ones();
+        results[i + 6] = (query ^ targets[i + 6]).count_ones();
+        results[i + 7] = (query ^ targets[i + 7]).count_ones();
+        i += 8;
+    }
+
+    // Handle remainder
+    while i < n {
+        results[i] = (query ^ targets[i]).count_ones();
+        i += 1;
+    }
+}
+
+/// AVX-512 VPOPCNTDQ, 8 x u64 per step. Caller must keep `results.len() >= targets.len()`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")]
+unsafe fn hamming_batch_avx512(query: u64, targets: &[u64], results: &mut [u32]) {
+    use std::arch::x86_64::*;
+
+    let n = targets.len();
+    let query_vec = _mm512_set1_epi64(query as i64);
+
+    let chunks = n / 8;
+    let remainder = n % 8;
+
+    for i in 0..chunks {
+        let offset = i * 8;
+        let targets_ptr = targets.as_ptr().add(offset) as *const __m512i;
+        let target_vec = _mm512_loadu_si512(targets_ptr);
+
+        let xor_result = _mm512_xor_si512(query_vec, target_vec);
+        let popcount = _mm512_popcnt_epi64(xor_result);
+        let popcount_32 = _mm512_cvtepi64_epi32(popcount);
+
+        _mm256_storeu_si256(
+            results.as_mut_ptr().add(offset) as *mut __m256i,
+            popcount_32,
+        );
+    }
+
+    if remainder > 0 {
+        let offset = chunks * 8;
+        for j in 0..remainder {
+            results[offset + j] = (query ^ targets[offset + j]).count_ones();
+        }
+    }
+}
+
+/// AVX2 PSHUFB nibble-lookup popcount. Caller must keep `results.len() >= targets.len()`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn hamming_batch_avx2(query: u64, targets: &[u64], results: &mut [u32]) {
+    use std::arch::x86_64::*;
+
+    let n = targets.len();
+
+    let lookup = _mm256_setr_epi8(
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3,
+        3, 4,
+    );
+    let low_mask = _mm256_set1_epi8(0x0f);
+    let query_vec = _mm256_set1_epi64x(query as i64);
+
+    let chunks = n / 4;
+    let remainder = n % 4;
+
+    for i in 0..chunks {
+        let offset = i * 4;
+        let targets_ptr = targets.as_ptr().add(offset) as *const __m256i;
+        let target_vec = _mm256_loadu_si256(targets_ptr);
+
+        let xor_result = _mm256_xor_si256(query_vec, target_vec);
+
+        // Popcount using nibble lookup
+        let lo = _mm256_and_si256(xor_result, low_mask);
+        let hi = _mm256_and_si256(_mm256_srli_epi16(xor_result, 4), low_mask);
+        let popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
+        let popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
+        let popcnt_bytes = _mm256_add_epi8(popcnt_lo, popcnt_hi);
+        let popcount = _mm256_sad_epu8(popcnt_bytes, _mm256_setzero_si256());
+
+        let results_ptr = results.as_mut_ptr().add(offset);
+        *results_ptr = _mm256_extract_epi32::<0>(popcount) as u32;
+        *results_ptr.add(1) = _mm256_extract_epi32::<2>(popcount) as u32;
+        *results_ptr.add(2) = _mm256_extract_epi32::<4>(popcount) as u32;
+        *results_ptr.add(3) = _mm256_extract_epi32::<6>(popcount) as u32;
+    }
+
+    if remainder > 0 {
+        let offset = chunks * 4;
+        for j in 0..remainder {
+            results[offset + j] = (query ^ targets[offset + j]).count_ones();
+        }
+    }
+}
+
+/// Compute hamming distances for every unordered pair `(i, j)`, `i < j`, of hashes.
+///
+/// Pairs are emitted in row-major order; only those with distance <= threshold are kept.
+///
+/// # Arguments
+/// * `hashes` - Slice of 64-bit hash values
+/// * `row_ids` - Optional row IDs, indexed in step with `hashes` (defaults to indices if None)
+/// * `threshold` - Optional maximum distance to include (None keeps every pair)
+pub fn pairwise_hamming_distance(
+    hashes: &[u64],
+    row_ids: Option<&[u64]>,
+    threshold: Option<u32>,
+) -> PairwiseResult {
+    let n = hashes.len();
+    if n < 2 {
+        return PairwiseResult::new();
+    }
+
+    let threshold = threshold.unwrap_or(u32::MAX);
+    let num_pairs = n * (n - 1) / 2;
+    let mut result = PairwiseResult::with_capacity(num_pairs.min(1_000_000));
+
+    for i in 0..n {
+        for j in (i + 1)..n {
+            let dist = hamming_u64(hashes[i], hashes[j]);
+            if dist <= threshold {
+                let id_a = row_ids.map_or(i as u64, |ids| ids[i]);
+                let id_b = row_ids.map_or(j as u64, |ids| ids[j]);
+                result.push(id_a, id_b, dist);
+            }
+        }
+    }
+
+    result
+}
+
+/// Compute pairwise hamming distances in parallel using rayon + SIMD.
+///
+/// Uses chunked parallelization for balanced workload distribution.
+pub fn pairwise_hamming_distance_parallel(
+    hashes: &[u64],
+    row_ids: Option<&[u64]>,
+    threshold: Option<u32>,
+) -> PairwiseResult {
+    let n = hashes.len();
+    if n < 2 {
+        return PairwiseResult::new();
+    }
+
+    let threshold = threshold.unwrap_or(u32::MAX);
+    let total_pairs = n * (n - 1) / 2;
+
+    // For small datasets, use sequential to avoid thread overhead
+    if total_pairs < 10_000 {
+        return pairwise_hamming_distance(hashes, row_ids, Some(threshold));
+    }
+
+    let threads = rayon::current_num_threads();
+    let pairs_per_chunk = total_pairs.div_ceil(threads);
+    let chunks = compute_balanced_chunks(n, pairs_per_chunk);
+
+    let results: Vec<PairwiseResult> = chunks
+        .into_par_iter()
+        .map(|(start_row, end_row)| {
+            process_row_range(hashes, row_ids, threshold, start_row, end_row)
+        })
+        .collect();
+
+    let mut combined = PairwiseResult::new();
+    for r in results {
+        combined.extend(r);
+    }
+    combined
+}
+
+/// Compute balanced chunks for parallel processing.
+fn compute_balanced_chunks(n: usize, target_pairs_per_chunk: usize) -> Vec<(usize, usize)> {
+    let mut chunks = Vec::new();
+    let mut current_start = 0;
+    let mut current_pairs = 0;
+
+    for i in 0..n {
+        let pairs_for_row = n - i - 1;
+        current_pairs += pairs_for_row;
+
+        if current_pairs >= target_pairs_per_chunk || i == n - 1 {
+            chunks.push((current_start, i + 1));
+            current_start = i + 1;
+            current_pairs = 0;
+        }
+    }
+
+    chunks
+}
+
+/// Collect pairs `(i, j)`, `start_row <= i < end_row`, `j > i`, with distance `<= threshold`.
+fn process_row_range(
+    hashes: &[u64],
+    row_ids: Option<&[u64]>,
+    threshold: u32,
+    start_row: usize,
+    end_row: usize,
+) -> PairwiseResult {
+    let n = hashes.len();
+    let mut result = PairwiseResult::new();
+    // One scratch buffer serves every row: row `start_row` has the most partners.
+    let mut scratch = vec![0u32; n.saturating_sub(start_row + 1)];
+    for i in start_row..end_row {
+        let remaining = n - i - 1;
+        if remaining == 0 {
+            continue;
+        }
+        let distances = &mut scratch[..remaining];
+        hamming_batch_u64(hashes[i], &hashes[i + 1..], distances);
+
+        let id_a = row_ids.map_or(i as u64, |ids| ids[i]);
+        for (j_offset, &dist) in distances.iter().enumerate() {
+            if dist <= threshold {
+                let j = i + 1 + j_offset;
+                let id_b = row_ids.map_or(j as u64, |ids| ids[j]);
+                result.push(id_a, id_b, dist);
+            }
+        }
+    }
+
+    result
+}
+
+/// Read each row of a FixedSizeList<UInt8, 8> as a little-endian u64 (validity is not checked).
+pub fn extract_hashes_from_fixed_list(array: &FixedSizeListArray) -> Result<Vec<u64>> {
+    let list_size = array.value_length();
+    if list_size != 8 {
+        return Err(Error::InvalidArgumentError(format!(
+            "Expected FixedSizeList with size 8, got size {}",
+            list_size
+        )));
+    }
+
+    let values = array
+        .values()
+        .as_any()
+        .downcast_ref::<arrow_array::UInt8Array>()
+        .ok_or_else(|| {
+            Error::InvalidArgumentError("Expected UInt8Array values in FixedSizeList".to_string())
+        })?;
+
+    let n = array.len();
+    let mut hashes = Vec::with_capacity(n);
+
+    for i in 0..n {
+        let start = i * 8;
+        let bytes = &values.values()[start..start + 8];
+        let mut arr = [0u8; 8];
+        arr.copy_from_slice(bytes);
+        hashes.push(u64::from_le_bytes(arr));
+    }
+
+    Ok(hashes)
+}
+
+/// Union-Find data structure with path compression for clustering.
+pub struct UnionFind {
+    parent: HashMap<u64, u64>,
+    rank: HashMap<u64, u32>,
+}
+
+impl UnionFind {
+    pub fn new() -> Self {
+        Self {
+            parent: HashMap::new(),
+            rank: HashMap::new(),
+        }
+    }
+
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            parent: HashMap::with_capacity(capacity),
+            rank: HashMap::with_capacity(capacity),
+        }
+    }
+
+    /// Find the root of a node with path compression.
+    pub fn find(&mut self, x: u64) -> u64 {
+        if let std::collections::hash_map::Entry::Vacant(e) = self.parent.entry(x) {
+            e.insert(x);
+            self.rank.insert(x, 0);
+            return x;
+        }
+
+        let mut current = x;
+        let mut path = Vec::new();
+
+        while self.parent[&current] != current {
+            path.push(current);
+            current = self.parent[&current];
+        }
+        let root = current;
+
+        for node in path {
+            self.parent.insert(node, root);
+        }
+
+        root
+    }
+
+    /// Union two nodes, using union by rank.
+    pub fn union(&mut self, a: u64, b: u64) -> bool {
+        let root_a = self.find(a);
+        let root_b = self.find(b);
+
+        if root_a == root_b {
+            return false;
+        }
+
+        let rank_a = self.rank[&root_a];
+        let rank_b = self.rank[&root_b];
+
+        if rank_a < rank_b {
+            self.parent.insert(root_a, root_b);
+        } else if rank_a > rank_b {
+            self.parent.insert(root_b, root_a);
+        } else if root_a < root_b {
+            self.parent.insert(root_b, root_a);
+            *self.rank.get_mut(&root_a).unwrap() += 1;
+        } else {
+            self.parent.insert(root_a, root_b);
+            *self.rank.get_mut(&root_b).unwrap() += 1;
+        }
+
+        true
+    }
+
+    pub fn nodes(&self) -> impl Iterator<Item = &u64> {
+        self.parent.keys()
+    }
+
+    pub fn len(&self) -> usize {
+        self.parent.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.parent.is_empty()
+    }
+}
+
+impl Default for UnionFind {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A cluster with representative and duplicates.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Cluster {
+    /// The representative row ID (smallest in the cluster).
+    pub representative: u64,
+    /// List of duplicate row IDs (excludes the representative).
+    pub duplicates: Vec<u64>,
+}
+
+impl Cluster {
+    pub fn size(&self) -> usize {
+        1 + self.duplicates.len()
+    }
+}
+
+/// Result of the clustering operation.
+#[derive(Debug, Clone)]
+pub struct ClusteringResult {
+    /// List of clusters, each with a representative and duplicates.
+    pub clusters: Vec<Cluster>,
+}
+
+impl ClusteringResult {
+    pub fn num_clusters(&self) -> usize {
+        self.clusters.len()
+    }
+
+    pub fn num_duplicates(&self) -> usize {
+        self.clusters.iter().map(|c| c.duplicates.len()).sum()
+    }
+
+    pub fn num_unique(&self) -> usize {
+        self.clusters.len()
+    }
+
+    /// Get the schema for clustering result batches.
+    pub fn schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("representative", DataType::UInt64, false),
+            Field::new(
+                "duplicates",
+                DataType::List(Arc::new(Field::new("item", DataType::UInt64, true))),
+                false,
+            ),
+        ]))
+    }
+
+    /// Convert to Arrow RecordBatch with columns:
+    /// - `representative`: `UInt64`
+    /// - `duplicates`: `List<UInt64>`
+    pub fn to_record_batch(&self) -> RecordBatch {
+        let schema = Self::schema();
+
+        let mut representatives = Vec::with_capacity(self.clusters.len());
+        let mut duplicates_builder = ListBuilder::new(UInt64Builder::new());
+
+        for cluster in &self.clusters {
+            representatives.push(cluster.representative);
+            for &dup in &cluster.duplicates {
+                duplicates_builder.values().append_value(dup);
+            }
+            duplicates_builder.append(true);
+        }
+
+        let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives));
+        let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish());
+
+        RecordBatch::try_new(schema, vec![representative_array, duplicates_array])
+            .expect("Failed to create RecordBatch")
+    }
+
+    /// Convert to a RecordBatchReader that yields batches of the specified size.
+    ///
+    /// # Arguments
+    /// * `batch_size` - Number of clusters per batch (default: 10000; 0 is treated as 1)
+    pub fn into_reader(self, batch_size: Option<usize>) -> Box<dyn RecordBatchReader + Send> {
+        let batch_size = batch_size.unwrap_or(10_000).max(1);
+        let schema = Self::schema();
+
+        if self.clusters.is_empty() {
+            // Return empty reader
+            let batches: Vec<std::result::Result<RecordBatch, arrow_schema::ArrowError>> = vec![];
+            return Box::new(RecordBatchIterator::new(batches, schema));
+        }
+
+        let batches: Vec<std::result::Result<RecordBatch, arrow_schema::ArrowError>> = self
+            .clusters
+            .chunks(batch_size)
+            .map(|chunk| {
+                let mut representatives = Vec::with_capacity(chunk.len());
+                let mut duplicates_builder = ListBuilder::new(UInt64Builder::new());
+
+                for cluster in chunk {
+                    representatives.push(cluster.representative);
+                    for &dup in &cluster.duplicates {
+                        duplicates_builder.values().append_value(dup);
+                    }
+                    duplicates_builder.append(true);
+                }
+
+                let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives));
+                let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish());
+
+                RecordBatch::try_new(Self::schema(), vec![representative_array, duplicates_array])
+            })
+            .collect();
+
+        Box::new(RecordBatchIterator::new(batches, schema))
+    }
+}
+
+/// Group the endpoints of `edges` into connected components with union-find.
+///
+/// Each component of two or more nodes becomes a `Cluster` whose representative
+/// is the smallest row ID; the rest are its duplicates, in ascending order.
+/// Single-node components are dropped, and clusters are sorted by representative.
+pub fn cluster_edges<I>(edges: I) -> ClusteringResult
+where
+    I: IntoIterator<Item = (u64, u64)>,
+{
+    let mut uf = UnionFind::new();
+
+    for (a, b) in edges {
+        uf.union(a, b);
+    }
+
+    let mut clusters_map: HashMap<u64, Vec<u64>> = HashMap::new();
+    let nodes: Vec<u64> = uf.nodes().copied().collect();
+
+    for node in nodes {
+        let root = uf.find(node);
+        clusters_map.entry(root).or_default().push(node);
+    }
+
+    let mut clusters = Vec::new();
+    for (_root, mut members) in clusters_map {
+        members.sort_unstable();
+
+        if members.len() > 1 {
+            let representative = *members.iter().min().unwrap();
+            let duplicates: Vec<u64> = members
+                .into_iter()
+                .filter(|&m| m != representative)
+                .collect();
+
+            clusters.push(Cluster {
+                representative,
+                duplicates,
+            });
+        }
+    }
+
+    clusters.sort_by_key(|c| c.representative);
+
+    ClusteringResult { clusters }
+}
+
+/// Cluster edges from PairwiseResult.
+pub fn cluster_pairwise_result(result: &PairwiseResult) -> ClusteringResult {
+    let edges = result
+        .row_id_a
+        .iter()
+        .zip(result.row_id_b.iter())
+        .map(|(&a, &b)| (a, b));
+
+    cluster_edges(edges)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -102,4 +746,677 @@ mod tests {
         let y = vec![0b1101_1010, 0b1010_1010, 0b1010_1001];
         assert_eq!(hamming(&x, &y), 2.0);
     }
+
+    #[test]
+    fn test_hamming_u64() {
+        assert_eq!(hamming_u64(0, 0), 0);
+        assert_eq!(hamming_u64(0, 1), 1);
+        assert_eq!(hamming_u64(0b1111, 0b0000), 4);
+        assert_eq!(hamming_u64(u64::MAX, 0), 64);
+        assert_eq!(hamming_u64(0xAAAAAAAAAAAAAAAA, 0x5555555555555555), 64);
+    }
+
+    #[test]
+    fn test_hamming_batch_u64() {
+        let query = 0u64;
+        let targets: Vec<u64> = (0..128).collect();
+        let mut results = vec![0u32; 128];
+
+        hamming_batch_u64(query, &targets, &mut results);
+
+        assert_eq!(results[0], 0);
+        assert_eq!(results[1], 1);
+        assert_eq!(results[3], 2); // 0b11 has 2 bits set
+        assert_eq!(results[7], 3); // 0b111 has 3 bits set
+    }
+
+    #[test]
+    fn test_pairwise_basic() {
+        let hashes = vec![0b0000u64, 0b0001, 0b0011, 0b0111];
+        let result = pairwise_hamming_distance(&hashes, None, None);
+
+        assert_eq!(result.len(), 6); // C(4,2) = 6 pairs
+        assert!(result.distances.iter().all(|&d| d <= 3));
+    }
+
+    #[test]
+    fn test_pairwise_with_threshold() {
+        let hashes = vec![0b0000u64, 0b0001, 0b1111];
+        let result = pairwise_hamming_distance(&hashes, None, Some(1));
+
+        assert_eq!(result.len(), 1);
+        assert_eq!(result.row_id_a[0], 0);
+        assert_eq!(result.row_id_b[0], 1);
+        assert_eq!(result.distances[0], 1);
+    }
+
+    #[test]
+    fn test_pairwise_with_row_ids() {
+        let hashes = vec![0b0000u64, 0b0001];
+        let row_ids = vec![100u64, 200u64];
+        let result = pairwise_hamming_distance(&hashes, Some(&row_ids), None);
+
+        assert_eq!(result.len(), 1);
+        assert_eq!(result.row_id_a[0], 100);
+        assert_eq!(result.row_id_b[0], 200);
+    }
+
+    #[test]
+    fn test_pairwise_parallel() {
+        let hashes: Vec<u64> = (0..100).collect();
+        let result_seq = pairwise_hamming_distance(&hashes, None, None);
+        let result_par = pairwise_hamming_distance_parallel(&hashes, None, None);
+
+        assert_eq!(result_seq.len(), result_par.len());
+    }
+
+    #[test]
+    fn test_union_find_basic() {
+        let mut uf = UnionFind::new();
+
+        assert_eq!(uf.find(1), 1);
+        assert_eq!(uf.find(2), 2);
+        assert_eq!(uf.find(3), 3);
+
+        assert!(uf.union(1, 2));
+        assert_eq!(uf.find(1), uf.find(2));
+
+        assert!(uf.union(2, 3));
+        assert_eq!(uf.find(1), uf.find(3));
+
+        assert!(!uf.union(1, 3));
+    }
+
+    #[test]
+    fn test_cluster_edges_simple() {
+        let edges = vec![(1, 2), (2, 3), (4, 5)];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 2);
+
+        let c1 = result
+            .clusters
+            .iter()
+            .find(|c| c.representative == 1)
+            .unwrap();
+        assert_eq!(c1.duplicates.len(), 2);
+        assert!(c1.duplicates.contains(&2));
+        assert!(c1.duplicates.contains(&3));
+
+        let c2 = result
+            .clusters
+            .iter()
+            .find(|c| c.representative == 4)
+            .unwrap();
+        assert_eq!(c2.duplicates.len(), 1);
+        assert!(c2.duplicates.contains(&5));
+    }
+
+    #[test]
+    fn test_cluster_pairwise_result() {
+        let hashes = vec![0b0000u64, 0b0001, 0b0011]; // distances: (0,1)=1, (0,2)=2, (1,2)=1
+        let pairwise = pairwise_hamming_distance(&hashes, None, Some(1)); // threshold 1
+
+        // Only pairs with distance <= 1: (0,1) and (1,2)
+        assert_eq!(pairwise.len(), 2);
+
+        let clustering = cluster_pairwise_result(&pairwise);
+        // All three should be in one cluster since 0-1-2 are connected
+        assert_eq!(clustering.num_clusters(), 1);
+        assert_eq!(clustering.clusters[0].representative, 0);
+        assert_eq!(clustering.clusters[0].duplicates.len(), 2);
+    }
+
+    #[test]
+    fn test_into_record_batch() {
+        let hashes = vec![0b0000u64, 0b0001, 0b0011];
+        let result = pairwise_hamming_distance(&hashes, None, None);
+        let batch = result.into_record_batch();
+
+        assert_eq!(batch.num_rows(), 3);
+        assert_eq!(batch.num_columns(), 3);
+        assert_eq!(batch.schema().field(0).name(), "row_id_a");
+        assert_eq!(batch.schema().field(1).name(), "row_id_b");
+        assert_eq!(batch.schema().field(2).name(), "distance");
+    }
+
+    // =========================================================================
+    // Additional tests from pairwise-hamming reference implementation
+    // =========================================================================
+
+    /// Reference implementation for validation - simple O(n²) nested loop
+    fn reference_pairwise(hashes: &[u64], threshold: Option<u32>) -> Vec<(usize, usize, u32)> {
+        let threshold = threshold.unwrap_or(u32::MAX);
+        let mut results = Vec::new();
+        for i in 0..hashes.len() {
+            for j in (i + 1)..hashes.len() {
+                let dist = (hashes[i] ^ hashes[j]).count_ones();
+                if dist <= threshold {
+                    results.push((i, j, dist));
+                }
+            }
+        }
+        results
+    }
+
+    /// Convert PairwiseResult to sorted vec for comparison
+    fn result_to_sorted_vec(result: &PairwiseResult) -> Vec<(u64, u64, u32)> {
+        let mut v: Vec<_> = result
+            .row_id_a
+            .iter()
+            .zip(result.row_id_b.iter())
+            .zip(result.distances.iter())
+            .map(|((&a, &b), &d)| (a, b, d))
+            .collect();
+        v.sort();
+        v
+    }
+
+    #[test]
+    fn test_pairwise_correctness_small() {
+        // Deterministic hashes with known distances
+        let hashes = vec![
+            0b0000_0000u64, // 0
+            0b0000_0001u64, // 1 bit from 0
+            0b0000_0011u64, // 2 bits from 0, 1 bit from 1
+            0b0000_0111u64, // 3 bits from 0, 2 bits from 1, 1 bit from 2
+            0b0000_1111u64, // 4 bits from 0, 3 bits from 1, 2 bits from 2, 1 bit from 3
+        ];
+
+        let result = pairwise_hamming_distance(&hashes, None, None);
+        let reference = reference_pairwise(&hashes, None);
+
+        assert_eq!(result.len(), reference.len());
+        assert_eq!(result.len(), 10); // C(5,2) = 10 pairs
+
+        // Verify specific distances
+        let result_vec = result_to_sorted_vec(&result);
+        for (i, j, expected_dist) in &reference {
+            let found = result_vec
+                .iter()
+                .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64);
+            assert!(found.is_some(), "Missing pair ({}, {})", i, j);
+            assert_eq!(
+                found.unwrap().2,
+                *expected_dist,
+                "Wrong distance for pair ({}, {})",
+                i,
+                j
+            );
+        }
+    }
+
+    #[test]
+    fn test_pairwise_correctness_1000_deterministic() {
+        // Generate deterministic hashes using simple linear pattern
+        let hashes: Vec<u64> = (0u64..1000)
+            .map(|i| i.wrapping_mul(0x123456789ABCDEF))
+            .collect();
+
+        let result_seq = pairwise_hamming_distance(&hashes, None, Some(10));
+        let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(10));
+        let reference = reference_pairwise(&hashes, Some(10));
+
+        // Both implementations should match reference
+        assert_eq!(
+            result_seq.len(),
+            reference.len(),
+            "Sequential result count mismatch"
+        );
+        assert_eq!(
+            result_par.len(),
+            reference.len(),
+            "Parallel result count mismatch"
+        );
+
+        // Verify all pairs match
+        let seq_sorted = result_to_sorted_vec(&result_seq);
+        let par_sorted = result_to_sorted_vec(&result_par);
+
+        for (i, j, dist) in &reference {
+            let seq_found = seq_sorted
+                .iter()
+                .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64);
+            let par_found = par_sorted
+                .iter()
+                .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64);
+
+            assert!(
+                seq_found.is_some(),
+                "Sequential missing pair ({}, {})",
+                i,
+                j
+            );
+            assert!(par_found.is_some(), "Parallel missing pair ({}, {})", i, j);
+            assert_eq!(seq_found.unwrap().2, *dist);
+            assert_eq!(par_found.unwrap().2, *dist);
+        }
+    }
+
+    #[test]
+    fn test_pairwise_correctness_10000_deterministic() {
+        // Larger test with 10K hashes
+        let hashes: Vec<u64> = (0u64..10_000)
+            .map(|i| {
+                // Mix bits using a simple hash-like transformation
+                let x = i.wrapping_mul(0xDEADBEEFCAFEBABE);
+                x ^ (x >> 17) ^ (x << 13)
+            })
+            .collect();
+
+        let result_seq = pairwise_hamming_distance(&hashes, None, Some(5));
+        let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(5));
+
+        // Both should find the same number of pairs
+        assert_eq!(
+            result_seq.len(),
+            result_par.len(),
+            "10K test: sequential found {} pairs, parallel found {} pairs",
+            result_seq.len(),
+            result_par.len()
+        );
+
+        // Verify they contain the same pairs (sorted comparison)
+        let seq_sorted = result_to_sorted_vec(&result_seq);
+        let par_sorted = result_to_sorted_vec(&result_par);
+        assert_eq!(seq_sorted, par_sorted, "10K test: pair contents differ");
+    }
+
+    #[test]
+    fn test_pairwise_total_pairs_count() {
+        // Without threshold, should return exactly n*(n-1)/2 pairs
+        for n in [10, 50, 100, 500] {
+            let hashes: Vec<u64> = (0..n).map(|i| i as u64).collect();
+            let result = pairwise_hamming_distance_parallel(&hashes, None, None);
+            let expected = n * (n - 1) / 2;
+            assert_eq!(
+                result.len(),
+                expected,
+                "n={}: expected {} pairs, got {}",
+                n,
+                expected,
+                result.len()
+            );
+        }
+    }
+
+    #[test]
+    fn test_pairwise_threshold_filtering() {
+        // Identical hashes are all at distance 0 from each other.
+        let hashes = vec![0xABCDEF0123456789u64; 100];
+        let result = pairwise_hamming_distance_parallel(&hashes, None, Some(0));
+
+        // Threshold 0 is inclusive, so every one of the C(100,2) pairs is kept.
+        assert_eq!(result.len(), 100 * 99 / 2);
+        assert!(result.distances.iter().all(|&d| d == 0));
+
+        // Distinct hashes differ in at least one bit, so no pair can pass threshold 0.
+        let different_hashes: Vec<u64> = (0u64..100).collect();
+        let result2 = pairwise_hamming_distance_parallel(&different_hashes, None, Some(0));
+        // An exact count catches a filter that lets non-matching pairs through.
+        assert_eq!(result2.len(), 0);
+    }
+
+    #[test]
+    fn test_pairwise_row_ids_preserved() {
+        let hashes: Vec<u64> = (0u64..100).collect();
+        let row_ids: Vec<u64> = (1000u64..1100).collect(); // offset row IDs
+
+        let result = pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(5));
+
+        // All row IDs should be in range [1000, 1100)
+        for &id in &result.row_id_a {
+            assert!((1000..1100).contains(&id), "row_id_a {} out of range", id);
+        }
+        for &id in &result.row_id_b {
+            assert!((1000..1100).contains(&id), "row_id_b {} out of range", id);
+        }
+        // row_id_a should always be less than row_id_b (upper triangular)
+        for (&a, &b) in result.row_id_a.iter().zip(result.row_id_b.iter()) {
+            assert!(a < b, "Expected row_id_a < row_id_b, got {} >= {}", a, b);
+        }
+    }
+
+    #[test]
+    fn test_pairwise_distance_bounds() {
+        // All distances should be in [0, 64] for u64 hashes
+        let hashes: Vec<u64> = (0u64..1000).map(|i| i.wrapping_mul(0x123456789)).collect();
+
+        let result = pairwise_hamming_distance_parallel(&hashes, None, None);
+
+        for &d in &result.distances {
+            assert!(d <= 64, "Distance {} exceeds maximum 64", d);
+        }
+    }
+
+    #[test]
+    fn test_pairwise_symmetry() {
+        // Despite the name, this validates each reported distance, not d(a,b) = d(b,a)
+        let hashes: Vec<u64> = vec![
+            0x0000000000000000,
+            0xFFFFFFFFFFFFFFFF,
+            0xAAAAAAAAAAAAAAAA,
+            0x5555555555555555,
+            0x123456789ABCDEF0,
+        ];
+
+        let result = pairwise_hamming_distance(&hashes, None, None);
+
+        // Row IDs are used as indices into `hashes` to recompute XOR + popcount per pair
+        for idx in 0..result.len() {
+            let i = result.row_id_a[idx] as usize;
+            let j = result.row_id_b[idx] as usize;
+            let dist = result.distances[idx];
+
+            let expected = (hashes[i] ^ hashes[j]).count_ones();
+            assert_eq!(dist, expected, "Distance mismatch for pair ({}, {})", i, j);
+        }
+    }
+
+    #[test]
+    fn test_balanced_chunks() {
+        // Split the n*(n-1)/2 pairs into roughly 16 equal shares of work.
+        let n = 10000;
+        let total_pairs = n * (n - 1) / 2;
+        let target_per_chunk = total_pairs / 16;
+
+        let chunks = compute_balanced_chunks(n, target_per_chunk);
+
+        // Should have roughly 16 chunks
+        assert!(
+            (14..=18).contains(&chunks.len()),
+            "Expected ~16 chunks, got {}",
+            chunks.len()
+        );
+
+        // Only undersized chunks are rejected: each must reach 80% of the target,
+        // except the final chunk, which takes whatever rows are left.
+        let lower = target_per_chunk * 80 / 100;
+        for (start, end) in &chunks {
+            // Row i is paired with every later row, i.e. n - i - 1 partners.
+            let mut chunk_pairs = 0usize;
+            for i in *start..*end {
+                chunk_pairs += n - i - 1;
+            }
+            assert!(
+                chunk_pairs >= lower || *end == n,
+                "Chunk [{}, {}) has {} pairs, expected ~{}",
+                start,
+                end,
+                chunk_pairs,
+                target_per_chunk
+            );
+        }
+
+        // Chunks should cover all rows without gaps
+        assert_eq!(chunks[0].0, 0);
+        assert_eq!(chunks.last().unwrap().1, n);
+        for pair in chunks.windows(2) {
+            assert_eq!(pair[1].0, pair[0].1, "Gap between chunks");
+        }
+    }
+
+    // =========================================================================
+    // SIMD-specific tests
+    // =========================================================================
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_avx2_popcount() {
+        if !is_x86_feature_detected!("avx2") {
+            return;
+        }
+
+        let query = 0u64;
+        let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127];
+        let mut results = vec![0u32; 8];
+        // SAFETY: AVX2 was detected at runtime above; `results` is as long as `targets`.
+        unsafe {
+            hamming_batch_avx2(query, &targets, &mut results);
+        }
+
+        assert_eq!(results[0], 0); // 0 ^ 0 = 0 bits
+        assert_eq!(results[1], 1); // 0 ^ 1 = 1 bit
+        assert_eq!(results[2], 2); // 0 ^ 3 = 2 bits
+        assert_eq!(results[3], 3); // 0 ^ 7 = 3 bits
+        assert_eq!(results[4], 4); // 0 ^ 15 = 4 bits
+        assert_eq!(results[5], 5); // 0 ^ 31 = 5 bits
+        assert_eq!(results[6], 6); // 0 ^ 63 = 6 bits
+        assert_eq!(results[7], 7); // 0 ^ 127 = 7 bits
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_avx2_max_distance() {
+        if !is_x86_feature_detected!("avx2") {
+            return;
+        }
+
+        let query = 0u64;
+        let targets = vec![u64::MAX; 4];
+        let mut results = vec![0u32; 4];
+        // SAFETY: AVX2 was detected at runtime above; `results` is as long as `targets`.
+        unsafe {
+            hamming_batch_avx2(query, &targets, &mut results);
+        }
+
+        for &r in &results {
+            assert_eq!(r, 64);
+        }
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_avx512_popcount() {
+        if !is_x86_feature_detected!("avx512vpopcntdq") || !is_x86_feature_detected!("avx512f") {
+            return;
+        }
+
+        let query = 0u64;
+        let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127];
+        let mut results = vec![0u32; 8];
+        // SAFETY: both AVX-512 features were detected above; `results` is as long as `targets`.
+        unsafe {
+            hamming_batch_avx512(query, &targets, &mut results);
+        }
+
+        assert_eq!(results[0], 0);
+        assert_eq!(results[1], 1);
+        assert_eq!(results[2], 2);
+        assert_eq!(results[3], 3);
+        assert_eq!(results[4], 4);
+        assert_eq!(results[5], 5);
+        assert_eq!(results[6], 6);
+        assert_eq!(results[7], 7);
+    }
+
+    // =========================================================================
+    // Additional clustering tests
+    // =========================================================================
+
+    #[test]
+    fn test_union_find_path_compression() {
+        let mut uf = UnionFind::new();
+
+        // Create a chain: 1 -> 2 -> 3 -> 4 -> 5
+        uf.union(4, 5);
+        uf.union(3, 4);
+        uf.union(2, 3);
+        uf.union(1, 2);
+
+        // All should have the same root
+        let root = uf.find(1);
+        assert_eq!(uf.find(2), root);
+        assert_eq!(uf.find(3), root);
+        assert_eq!(uf.find(4), root);
+        assert_eq!(uf.find(5), root);
+    }
+
+    #[test]
+    fn test_cluster_edges_single_cluster() {
+        // All connected: 1-2-3-4-5
+        let edges = vec![(1, 2), (2, 3), (3, 4), (4, 5)];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 1);
+        let cluster = &result.clusters[0];
+        assert_eq!(cluster.representative, 1);
+        assert_eq!(cluster.duplicates.len(), 4);
+        assert_eq!(cluster.size(), 5);
+    }
+
+    #[test]
+    fn test_cluster_edges_no_duplicates() {
+        // No edges means no clusters
+        let edges: Vec<(u64, u64)> = vec![];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 0);
+        assert_eq!(result.num_duplicates(), 0);
+    }
+
+    #[test]
+    fn test_cluster_edges_self_loop() {
+        // Self-loop shouldn't create a cluster (size 1)
+        let edges = vec![(1, 1), (2, 3)];
+        let result = cluster_edges(edges);
+
+        // Only {2,3} should be a cluster
+        assert_eq!(result.num_clusters(), 1);
+        assert_eq!(result.clusters[0].representative, 2);
+    }
+
+    #[test]
+    fn test_cluster_edges_duplicate_edges() {
+        // Duplicate edges should be handled correctly
+        let edges = vec![(1, 2), (1, 2), (2, 3), (2, 3), (3, 1)];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 1);
+        assert_eq!(result.clusters[0].size(), 3);
+    }
+
+    #[test]
+    fn test_cluster_edges_large() {
+        // Create 100 clusters of size 10 each
+        let mut edges = Vec::new();
+        for cluster_id in 0..100u64 {
+            let base = cluster_id * 10;
+            for i in 0..9 {
+                edges.push((base + i, base + i + 1));
+            }
+        }
+
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 100);
+        for cluster in &result.clusters {
+            assert_eq!(cluster.size(), 10);
+            assert_eq!(cluster.duplicates.len(), 9);
+        }
+    }
+
+    #[test]
+    fn test_cluster_edges_random_order() {
+        // Same edges in different order should produce same result
+        let edges1 = vec![(1, 2), (2, 3), (4, 5), (3, 4)];
+        let edges2 = vec![(4, 5), (1, 2), (3, 4), (2, 3)];
+        let edges3 = vec![(3, 4), (4, 5), (2, 3), (1, 2)];
+
+        let r1 = cluster_edges(edges1);
+        let r2 = cluster_edges(edges2);
+        let r3 = cluster_edges(edges3);
+
+        // All should produce the same single cluster
+        assert_eq!(r1.num_clusters(), 1);
+        assert_eq!(r2.num_clusters(), 1);
+        assert_eq!(r3.num_clusters(), 1);
+
+        assert_eq!(r1.clusters[0].representative, 1);
+        assert_eq!(r2.clusters[0].representative, 1);
+        assert_eq!(r3.clusters[0].representative, 1);
+
+        assert_eq!(r1.clusters[0].size(), 5);
+        assert_eq!(r2.clusters[0].size(), 5);
+        assert_eq!(r3.clusters[0].size(), 5);
+    }
+
+    #[test]
+    fn test_cluster_edges_non_contiguous_ids() {
+        // Row IDs don't need to be contiguous
+        let edges = vec![(100, 200), (200, 500), (1000, 2000)];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 2);
+
+        let c1 = result
+            .clusters
+            .iter()
+            .find(|c| c.representative == 100)
+            .unwrap();
+        assert_eq!(c1.duplicates, vec![200, 500]);
+
+        let c2 = result
+            .clusters
+            .iter()
+            .find(|c| c.representative == 1000)
+            .unwrap();
+        assert_eq!(c2.duplicates, vec![2000]);
+    }
+
+    #[test]
+    fn test_cluster_representative_is_minimum() {
+        // Representative should always be the minimum row ID in cluster
+        let edges = vec![
+            (5, 3),
+            (3, 7),
+            (7, 1), // 1 is minimum
+            (100, 50),
+            (50, 75), // 50 is minimum
+        ];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 2);
+
+        let c1 = result
+            .clusters
+            .iter()
+            .find(|c| c.duplicates.contains(&7))
+            .unwrap();
+        assert_eq!(c1.representative, 1);
+
+        let c2 = result
+            .clusters
+            .iter()
+            .find(|c| c.duplicates.contains(&100))
+            .unwrap();
+        assert_eq!(c2.representative, 50);
+    }
+
+    #[test]
+    fn test_cluster_duplicates_sorted() {
+        // Duplicates should be sorted
+        let edges = vec![(1, 5), (1, 3), (1, 7), (1, 2)];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 1);
+        assert_eq!(result.clusters[0].representative, 1);
+        assert_eq!(result.clusters[0].duplicates, vec![2, 3, 5, 7]);
+    }
+
+    #[test]
+    fn test_clustering_result_stats() {
+        let edges = vec![
+            (1, 2),
+            (2, 3), // cluster of 3
+            (10, 20),
+            (20, 30),
+            (30, 40), // cluster of 4
+        ];
+        let result = cluster_edges(edges);
+
+        assert_eq!(result.num_clusters(), 2);
+        assert_eq!(result.num_duplicates(), 5); // 2 + 3
+        assert_eq!(result.num_unique(), 2);
+    }
 }
diff --git a/rust/lance-namespace-datafusion/tests/sql.rs b/rust/lance-namespace-datafusion/tests/sql.rs
index e49cd7e58e3..5332e831cb6 100755
--- a/rust/lance-namespace-datafusion/tests/sql.rs
+++ b/rust/lance-namespace-datafusion/tests/sql.rs
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
+#![recursion_limit = "256"]
+
 use std::sync::Arc;
 
 use arrow_array::{Int32Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray};
diff --git a/rust/lance-namespace-impls/BENCHMARK.md b/rust/lance-namespace-impls/BENCHMARK.md
new file mode 100644
index 00000000000..074ec303347
--- /dev/null
+++ b/rust/lance-namespace-impls/BENCHMARK.md
@@ -0,0 +1,73 @@
+# `__manifest` commit benchmark
+
+Measures how fast the copy-on-write directory catalog commits `__manifest` mutations as
+the manifest scales, with the inline scalar indices on or off.
+
+The catalog commits every mutation by rewriting the whole `__manifest` (copy-on-write)
+and atomically writing a new manifest version. This benchmark characterises:
+
+- **Continuous commit** — a single process commits `N` times into a manifest already
+  holding `rows` entries (per-commit latency + throughput).
+- **Concurrent commit** — `C` processes commit continuously for a fixed duration against
+  a manifest of `rows` entries (steady, contended TPS).
+
+## Binary: `examples/manifest_bench.rs`
+
+```
+manifest_bench seed-large --root <uri> --count <rows> --inline-optimization <true|false> \
+    [--storage-option aws_region=us-east-1]
+manifest_bench run --root <uri> --operation write-create-namespace \
+    --concurrency 1 --operations 100 --initial-entries <rows> --inline-optimization <bool>   # continuous
+manifest_bench run --root <uri> --operation write-create-namespace \
+    --concurrency 50 --duration-secs 30 --initial-entries <rows> --inline-optimization <bool> # concurrent
+```
+
+- `seed-large` bootstraps a manifest to `count` rows by writing the Lance dataset
+  directly (O(rows) once) and then triggering one CoW rewrite so the on-disk state
+  matches the steady catalog form (single fragment; inline indices when enabled).
+- `run` spawns `--concurrency` worker subprocesses. With `--operations` it runs a fixed
+  commit budget (continuous); with `--duration-secs` each worker commits until the
+  deadline (steady TPS). It prints one JSON `BenchResult` per concurrency level with
+  throughput and p50/p90/p99 latency.
+- The committed operation (`--operation`) defaults to `write-create-namespace`, the
+  cheapest pure-`__manifest` mutation (no table data). `write-create-table` /
+  `write-declare-table` are also available.
+
+S3 requires the `dir-aws` feature (on by default) and AWS credentials in the
+environment; pass `--storage-option aws_region=<region>`.
+
+## Sweep panel: `benches/manifest_commit_sweep.sh`
+
+Runs the full panel — sizes × {inline index, no index} × {continuous, concurrent×C} —
+with per-run S3-copy isolation (each run starts at exactly the bootstrapped size),
+JSONL results, a `summary.csv`, and resume support.
+
+```bash
+cargo build --release --example manifest_bench -p lance-namespace-impls
+S3_BASE=s3://<bucket>/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \
+  rust/lance-namespace-impls/benches/manifest_commit_sweep.sh
+```
+
+Default panel (override via env): `SIZES="1000 2000 5000 10000 20000 50000 100000 200000
+500000 1000000"`, `CONCURRENCY="10 20 50 100 120 150 200"`, `INLINE_VARIANTS="true false"`,
+`CONT_OPS=100`, `CONC_DURATION_SECS=30`. Results land in `$OUT_DIR` (default
+`~/manifest_cow_bench_<RUN_ID>`).
+
+## Representative results
+
+EC2 `c7i.48xlarge`, S3 `us-east-1`, op `write-create-namespace`. The catalog is a
+single-writer-throughput system: per-commit cost scales ~O(rows) and throughput does **not**
+scale with concurrency (every commit is a serialized `__manifest` version bump).
+
+Continuous (1 process, 100 commits), ops/s — inline index vs no index:
+
+| rows | inline | no index |
+|---:|---:|---:|
+| 1,000 | 2.0 | 3.5 |
+| 100,000 | 1.1 | 2.1 |
+| 1,000,000 | 0.34 | 0.53 |
+
+Concurrent steady TPS is flat across C=10..200 (e.g. inline @100k ≈ 1.4–1.5 ops/s at every C;
+@1M ≈ 0.3 ops/s). Conflicts that exceed the retry budget surface as errors and grow with C
+(≈0 at C≤20, climbing at C≥100) — the contention ceiling, not data loss. No-index commits run
+~1.5–2× faster (no per-commit index build) at the cost of unindexed reads.
diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml
index 53ff79fb333..27b9a4bc0e2 100644
--- a/rust/lance-namespace-impls/Cargo.toml
+++ b/rust/lance-namespace-impls/Cargo.toml
@@ -51,6 +51,8 @@ object_store = { workspace = true }
 arrow = { workspace = true }
 arrow-ipc = { workspace = true }
 arrow-schema = { workspace = true }
+datafusion-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
 
 # REST adapter implementation dependencies (optional, enabled by "rest-adapter" feature)
 axum = { workspace = true, optional = true }
@@ -66,6 +68,8 @@ serde_json = { workspace = true }
 futures.workspace = true
 log.workspace = true
 rand.workspace = true
+roaring.workspace = true
+uuid.workspace = true
 
 # Shared credential vending dependencies
 sha2 = { version = "0.10", optional = true }
@@ -75,6 +79,11 @@ base64 = { version = "0.22", optional = true }
 aws-sdk-sts = { version = "1.38.0", optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] }
 aws-config = { workspace = true, optional = true }
 
+# Pin: time 0.3.48 conflicts with aws-smithy-types (E0119: conflicting `From` impls), which this
+# crate pulls in via the AWS credential vendor. Capping time here forces the workspace resolver to
+# 0.3.47 even for no-lock builds. Not used directly; remove once the upstream conflict is resolved.
+time = "=0.3.47"
+
 # GCP credential vending dependencies (optional, enabled by "credential-vendor-gcp" feature)
 ring = { version = "0.17", optional = true }
 rustls-pki-types = { version = "1", optional = true }
@@ -96,6 +105,11 @@ rstest.workspace = true
 lance-table.workspace = true
 lance-arrow = { workspace = true }
 lance = { workspace = true }
+serde = { workspace = true, features = ["derive"] }
+
+[[example]]
+name = "manifest_bench"
+path = "examples/manifest_bench.rs"
 
 [lints]
 workspace = true
diff --git a/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh
new file mode 100644
index 00000000000..7384ced4152
--- /dev/null
+++ b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh
@@ -0,0 +1,146 @@
+#!/usr/bin/env bash
+# Copy-on-write __manifest commit benchmark sweep panel.
+#
+# Drives `cargo run --release --example manifest_bench` across a panel of:
+#   - bootstrap manifest sizes (rows already in __manifest)
+#   - inline scalar indices on vs off
+#   - continuous commit (single process, N commits) and
+#     concurrent commit (C processes, steady TPS over a fixed duration)
+#
+# Each run is isolated: a "golden" manifest is bootstrapped once per (size, index)
+# and server-side-copied to a fresh S3 prefix per run, so every run starts at exactly
+# the bootstrapped size. Results are written as JSONL (one BenchResult per line) and
+# summarised to CSV. The sweep is resumable: completed runs are skipped.
+#
+# Usage:
+#   S3_BASE=s3://jack-devland-build/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \
+#     ./manifest_commit_sweep.sh
+#
+# Env knobs (defaults match the requested panel):
+#   SIZES, CONCURRENCY, INLINE_VARIANTS, CONT_OPS, CONC_DURATION_SECS, RUN_TIMEOUT,
+#   RUN_ID, AWS_REGION, REPO_ROOT, OUT_DIR, BIN
+#
+# Resilient by design: a single failed run is logged and skipped rather than aborting
+# the sweep, and re-running fills the gaps (completed runs are detected and skipped).
+set -uo pipefail
+
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"
+S3_BASE="${S3_BASE:?set S3_BASE, e.g. s3://jack-devland-build/manifest-cow-bench/$RUN_ID}"
+AWS_REGION="${AWS_REGION:-us-east-1}"
+export AWS_REGION AWS_DEFAULT_REGION="$AWS_REGION"
+
+REPO_ROOT="${REPO_ROOT:-$HOME/oss/lance}"
+BIN="${BIN:-$REPO_ROOT/target/release/examples/manifest_bench}"
+OUT_DIR="${OUT_DIR:-$HOME/manifest_cow_bench_${RUN_ID}}"
+RESULTS="$OUT_DIR/results.jsonl"
+PROGRESS="$OUT_DIR/progress.log"
+mkdir -p "$OUT_DIR"
+
+SIZES=(${SIZES:-1000 2000 5000 10000 20000 50000 100000 200000 500000 1000000})
+CONCURRENCY=(${CONCURRENCY:-10 20 50 100 120 150 200})
+INLINE_VARIANTS=(${INLINE_VARIANTS:-true false})
+CONT_OPS="${CONT_OPS:-100}"
+CONC_DURATION_SECS="${CONC_DURATION_SECS:-30}"
+STORAGE_OPT=(--storage-option "aws_region=${AWS_REGION}")
+
+log() { printf '%s %s\n' "$(date -u +%H:%M:%S)" "$*" | tee -a "$PROGRESS"; }
+
+# Skip a run if its tag is already in results.jsonl; allow the space json.dumps puts after the colon.
+done_already() { grep -q "\"bench_tag\":[[:space:]]*\"$1\"" "$RESULTS" 2>/dev/null; }
+
+# Append a result line, tagging it so reruns can resume and we can pivot later.
+record() {
+    local tag="$1"; shift
+    # shellcheck disable=SC2016
+    python3 -c 'import json,sys; d=json.load(sys.stdin); d["bench_tag"]=sys.argv[1]; print(json.dumps(d))' \
+        "$tag" >> "$RESULTS"
+}
+
+s3_copy() { aws s3 cp --recursive --quiet "$1" "$2" --region "$AWS_REGION"; }
+s3_rm()   { local p; for p in "$@"; do aws s3 rm --recursive --quiet "$p" --region "$AWS_REGION" || true; done; } # all args, best-effort
+
+# Backstops for unattended runs: cap any single run and clear leaked worker processes
+# (a killed coordinator can orphan its worker children) before the next run.
+RUN_TIMEOUT="${RUN_TIMEOUT:-1200}"
+clear_stragglers() { pkill -f 'examples/manifest_bench worker' 2>/dev/null || true; sleep 1; }
+
+for inline in "${INLINE_VARIANTS[@]}"; do
+  for rows in "${SIZES[@]}"; do
+    golden="${S3_BASE}/golden/inline_${inline}_rows_${rows}"
+    boot_tag="boot_inline_${inline}_rows_${rows}"
+
+    if ! done_already "$boot_tag"; then
+      log "BOOTSTRAP inline=$inline rows=$rows -> $golden"
+      s3_rm "$golden"
+      if "$BIN" seed-large --root "$golden" --count "$rows" \
+          --inline-optimization "$inline" "${STORAGE_OPT[@]}"; then
+        echo "{\"bench_tag\":\"$boot_tag\"}" >> "$RESULTS"
+      else
+        log "BOOTSTRAP FAILED inline=$inline rows=$rows (skipping this size)"
+        continue
+      fi
+    else
+      log "skip bootstrap $boot_tag (done)"
+    fi
+
+    # ---- Continuous: single process, CONT_OPS commits ----
+    cont_tag="cont_inline_${inline}_rows_${rows}"
+    if ! done_already "$cont_tag"; then
+      run_prefix="${S3_BASE}/run/${cont_tag}"
+      log "CONTINUOUS inline=$inline rows=$rows ops=$CONT_OPS"
+      clear_stragglers
+      s3_copy "$golden" "$run_prefix"
+      timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \
+        --concurrency 1 --operations "$CONT_OPS" --initial-entries "$rows" \
+        --inline-optimization "$inline" "${STORAGE_OPT[@]}" \
+        2>>"$PROGRESS" | while read -r line; do record "$cont_tag" <<<"$line"; done
+      s3_rm "$run_prefix"
+    else
+      log "skip continuous $cont_tag (done)"
+    fi
+
+    # ---- Concurrent: C processes, steady TPS over CONC_DURATION_SECS ----
+    for c in "${CONCURRENCY[@]}"; do
+      conc_tag="conc_inline_${inline}_rows_${rows}_c_${c}"
+      if done_already "$conc_tag"; then log "skip concurrent $conc_tag (done)"; continue; fi
+      run_prefix="${S3_BASE}/run/${conc_tag}"
+      log "CONCURRENT inline=$inline rows=$rows c=$c dur=${CONC_DURATION_SECS}s"
+      clear_stragglers
+      s3_copy "$golden" "$run_prefix"
+      timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \
+        --concurrency "$c" --duration-secs "$CONC_DURATION_SECS" --initial-entries "$rows" \
+        --inline-optimization "$inline" "${STORAGE_OPT[@]}" \
+        2>>"$PROGRESS" | while read -r line; do record "$conc_tag" <<<"$line"; done
+      s3_rm "$run_prefix"
+    done
+  done
+done
+
+# ---- Summarise to CSV ----
+CSV="$OUT_DIR/summary.csv"
+python3 - "$RESULTS" "$CSV" <<'PY'
+import json, sys, csv
+rows = []
+with open(sys.argv[1]) as f:
+    for line in f:
+        d = json.loads(line)
+        if "throughput_ops_per_sec" not in d:
+            continue  # bootstrap marker
+        mode = "continuous" if d["duration_secs"] == 0 else "concurrent"
+        rows.append({
+            "mode": mode, "variant": d["variant"], "initial_entries": d["initial_entries"],
+            "concurrency": d["concurrency"], "duration_secs": d["duration_secs"],
+            "ops": d["total_operations"], "errors": d["errors"],
+            "tps": round(d["throughput_ops_per_sec"], 3),
+            "avg_ms": round(d["avg_latency_ms"], 2), "p50_ms": round(d["p50_latency_ms"], 2),
+            "p90_ms": round(d["p90_latency_ms"], 2), "p99_ms": round(d["p99_latency_ms"], 2),
+        })
+rows.sort(key=lambda r: (r["mode"], r["variant"], r["initial_entries"], r["concurrency"]))
+with open(sys.argv[2], "w", newline="") as f:
+    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else [])
+    w.writeheader(); w.writerows(rows)
+print(f"wrote {len(rows)} rows to {sys.argv[2]}")
+PY
+
+log "SWEEP COMPLETE. Results: $RESULTS  Summary: $CSV"
+s3_rm "${S3_BASE}/golden" "${S3_BASE}/run" 2>/dev/null || true
diff --git a/rust/lance-namespace-impls/examples/manifest_bench.rs b/rust/lance-namespace-impls/examples/manifest_bench.rs
new file mode 100644
index 00000000000..4841f2471d7
--- /dev/null
+++ b/rust/lance-namespace-impls/examples/manifest_bench.rs
@@ -0,0 +1,714 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Copy-on-write `__manifest` directory-catalog commit benchmark (S3 capable).
+//!
+//! Measures how fast the directory catalog commits `__manifest` mutations as the
+//! manifest scales, with the inline scalar indices on or off.
+//!
+//! Modes:
+//!   seed-large — bootstrap a `__manifest` with N rows (direct dataset write + one
+//!                CoW rewrite to build indices)
+//!   run        — coordinator: spawn `--concurrency` worker processes committing for
+//!                either a fixed op count (continuous) or a fixed duration (steady TPS)
+//!   worker     — (internal) a single committing process spawned by `run`
+//!
+//! Examples:
+//!   # Bootstrap 100k rows with inline indices
+//!   manifest_bench seed-large --root s3://bucket/bench/p --count 100000 \
+//!     --inline-optimization true --storage-option aws_region=us-east-1
+//!
+//!   # Continuous: 100 commits, single process
+//!   manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \
+//!     --concurrency 1 --operations 100 --initial-entries 100000 --inline-optimization true
+//!
+//!   # Concurrent steady TPS: 50 processes committing for 30s
+//!   manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \
+//!     --concurrency 50 --duration-secs 30 --initial-entries 100000 --inline-optimization true
+
+// A CLI benchmark tool: workers emit JSON latency records on stdout and progress on
+// stderr, so stdout/stderr printing is intentional here.
+#![allow(clippy::print_stdout, clippy::print_stderr)]
+
+use std::collections::HashMap;
+use std::io::{BufRead, BufReader};
+use std::process::{Command, Stdio};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use arrow::array::builder::{ListBuilder, StringBuilder};
+use arrow::array::{RecordBatch, RecordBatchIterator, StringArray};
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use bytes::Bytes;
+use lance::dataset::{InsertBuilder, WriteMode, WriteParams};
+use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION;
+use lance_namespace::LanceNamespace;
+use lance_namespace::models::{
+    CreateNamespaceRequest, CreateTableRequest, DeclareTableRequest, DescribeTableRequest,
+    ListNamespacesRequest, ListTablesRequest,
+};
+use lance_namespace_impls::DirectoryNamespaceBuilder;
+use serde::{Deserialize, Serialize};
+
+#[derive(Serialize, Deserialize, Clone)]
+struct LatencyRecord {
+    operation: String,
+    latency_ms: f64,
+    error: bool,
+}
+
+#[derive(Serialize)]
+struct BenchResult {
+    variant: String,
+    operation: String,
+    concurrency: usize,
+    initial_entries: usize,
+    duration_secs: u64,
+    total_operations: usize,
+    total_duration_ms: f64,
+    throughput_ops_per_sec: f64,
+    avg_latency_ms: f64,
+    p50_latency_ms: f64,
+    p90_latency_ms: f64,
+    p99_latency_ms: f64,
+    min_latency_ms: f64,
+    max_latency_ms: f64,
+    errors: usize,
+}
+
+/// Value at index `round((len - 1) * p)` of an ascending-sorted slice; `0.0` when it is empty.
+fn percentile(sorted: &[f64], p: f64) -> f64 {
+    let Some(last) = sorted.len().checked_sub(1) else {
+        return 0.0;
+    };
+    sorted[((last as f64 * p).round() as usize).min(last)]
+}
+
+#[allow(clippy::too_many_arguments)]
+fn compute_result(
+    variant: &str,
+    operation: &str,
+    concurrency: usize,
+    initial_entries: usize,
+    duration_secs: u64,
+    wall_duration: Duration,
+    mut latencies: Vec<f64>,
+    errors: usize,
+) -> BenchResult {
+    latencies.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let total = latencies.len();
+    let total_ms = wall_duration.as_secs_f64() * 1000.0;
+    let throughput = if total_ms > 0.0 {
+        total as f64 / (total_ms / 1000.0)
+    } else {
+        0.0
+    };
+    BenchResult {
+        variant: variant.to_string(),
+        operation: operation.to_string(),
+        concurrency,
+        initial_entries,
+        duration_secs,
+        total_operations: total,
+        total_duration_ms: total_ms,
+        throughput_ops_per_sec: throughput,
+        avg_latency_ms: if total > 0 {
+            latencies.iter().sum::<f64>() / total as f64
+        } else {
+            0.0
+        },
+        p50_latency_ms: percentile(&latencies, 0.50),
+        p90_latency_ms: percentile(&latencies, 0.90),
+        p99_latency_ms: percentile(&latencies, 0.99),
+        min_latency_ms: latencies.first().copied().unwrap_or(0.0),
+        max_latency_ms: latencies.last().copied().unwrap_or(0.0),
+        errors,
+    }
+}
+
+fn create_test_ipc_data() -> Vec<u8> {
+    use arrow::array::Int32Array;
+    use arrow_ipc::writer::StreamWriter;
+
+    let schema = Arc::new(ArrowSchema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, false),
+    ]));
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 2, 3])),
+            Arc::new(StringArray::from(vec!["a", "b", "c"])),
+        ],
+    )
+    .unwrap();
+    let mut buffer = Vec::new();
+    {
+        let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap();
+        writer.write(&batch).unwrap();
+        writer.finish().unwrap();
+    }
+    buffer
+}
+
+/// The `__manifest` schema used by the copy-on-write directory catalog:
+/// `object_id`, `object_type`, `location`, `metadata` (Utf8), `base_objects` (List<Utf8>).
+fn manifest_schema() -> Arc<ArrowSchema> {
+    Arc::new(ArrowSchema::new(vec![
+        Field::new("object_id", DataType::Utf8, false).with_metadata(
+            [(
+                LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_string(),
+                "0".to_string(),
+            )]
+            .into_iter()
+            .collect(),
+        ),
+        Field::new("object_type", DataType::Utf8, false),
+        Field::new("location", DataType::Utf8, true),
+        Field::new("metadata", DataType::Utf8, true),
+        Field::new(
+            "base_objects",
+            DataType::List(Arc::new(Field::new("object_id", DataType::Utf8, true))),
+            true,
+        ),
+    ]))
+}
+
+async fn build_namespace(
+    root: &str,
+    inline_optimization: bool,
+    storage_options: &HashMap<String, String>,
+) -> Box<dyn LanceNamespace> {
+    let mut properties = HashMap::new();
+    properties.insert("root".to_string(), root.to_string());
+    properties.insert("dir_listing_enabled".to_string(), "false".to_string());
+    properties.insert(
+        "inline_optimization_enabled".to_string(),
+        inline_optimization.to_string(),
+    );
+    for (k, v) in storage_options {
+        properties.insert(format!("storage.{}", k), v.clone());
+    }
+    let builder = DirectoryNamespaceBuilder::from_properties(properties, None)
+        .expect("Failed to create namespace builder from properties");
+    Box::new(builder.build().await.expect("Failed to build namespace"))
+}
+
+// ──────────────────── seed-large mode ────────────────────
+// Bootstrap a `__manifest` with N rows by writing the Lance dataset directly (fast,
+// O(N) once), then trigger a single CoW rewrite via the namespace so the on-disk state
+// matches what the catalog produces (single fragment + inline indices when enabled).
+
+const SEED_LARGE_BATCH_SIZE: usize = 50_000;
+
+fn generate_manifest_batch(start_idx: usize, batch_size: usize, total_count: usize) -> RecordBatch {
+    let ns_count = total_count / 3;
+    let actual_size = batch_size.min(total_count - start_idx);
+
+    let mut object_ids = Vec::with_capacity(actual_size);
+    let mut object_types = Vec::with_capacity(actual_size);
+    let mut locations: Vec<Option<String>> = Vec::with_capacity(actual_size);
+    let mut metadatas: Vec<Option<String>> = Vec::with_capacity(actual_size);
+
+    for i in start_idx..start_idx + actual_size {
+        if i < ns_count {
+            object_ids.push(format!("ns_{}", i));
+            object_types.push("namespace".to_string());
+            locations.push(None);
+            metadatas.push(None);
+        } else {
+            let table_idx = i - ns_count;
+            object_ids.push(format!("table_{}", table_idx));
+            object_types.push("table".to_string());
+            locations.push(Some(format!("table_{}", table_idx)));
+            metadatas.push(Some(r#"{"bench":"true"}"#.to_string()));
+        }
+    }
+
+    // base_objects is null for every bootstrapped row.
+    let mut base_objects_builder = ListBuilder::new(StringBuilder::new())
+        .with_field(Arc::new(Field::new("object_id", DataType::Utf8, true)));
+    for _ in 0..actual_size {
+        base_objects_builder.append_null();
+    }
+
+    RecordBatch::try_new(
+        manifest_schema(),
+        vec![
+            Arc::new(StringArray::from(object_ids)),
+            Arc::new(StringArray::from(object_types)),
+            Arc::new(StringArray::from(
+                locations.iter().map(|l| l.as_deref()).collect::<Vec<_>>(),
+            )),
+            Arc::new(StringArray::from(
+                metadatas.iter().map(|m| m.as_deref()).collect::<Vec<_>>(),
+            )),
+            Arc::new(base_objects_builder.finish()),
+        ],
+    )
+    .expect("Failed to create manifest batch")
+}
+
+async fn seed_large(
+    root: &str,
+    count: usize,
+    inline_optimization: bool,
+    storage_options: &HashMap<String, String>,
+) {
+    let manifest_uri = format!("{}/{}", root, "__manifest");
+    eprintln!("Seed-large: writing {} rows to {}", count, manifest_uri);
+
+    let schema = manifest_schema();
+    let mut batches = Vec::new();
+    let mut offset = 0;
+    while offset < count {
+        let batch_size = SEED_LARGE_BATCH_SIZE.min(count - offset);
+        batches.push(generate_manifest_batch(offset, batch_size, count));
+        offset += batch_size;
+    }
+    eprintln!("  generated {} batches", batches.len());
+
+    let mut write_params = WriteParams {
+        mode: WriteMode::Create,
+        ..WriteParams::default()
+    };
+    if !storage_options.is_empty() {
+        let accessor = Arc::new(
+            lance_io::object_store::StorageOptionsAccessor::with_static_options(
+                storage_options.clone(),
+            ),
+        );
+        write_params.store_params = Some(lance_io::object_store::ObjectStoreParams {
+            storage_options_accessor: Some(accessor),
+            ..Default::default()
+        });
+    }
+
+    let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
+    InsertBuilder::new(manifest_uri.as_str())
+        .with_params(&write_params)
+        .execute_stream(reader)
+        .await
+        .expect("Failed to write manifest dataset");
+    eprintln!("  wrote Lance dataset");
+
+    // Trigger one CoW rewrite so the manifest is in steady catalog form (single
+    // fragment; inline indices when enabled). For the no-index variant the first real
+    // commit performs this rewrite instead.
+    if inline_optimization {
+        eprintln!("  triggering initial CoW rewrite to build indices...");
+        let start = Instant::now();
+        let ns = build_namespace(root, true, storage_options).await;
+        let mut req = CreateNamespaceRequest::new();
+        req.id = Some(vec!["__seed_trigger__".to_string()]);
+        ns.create_namespace(req)
+            .await
+            .expect("Failed to trigger CoW rewrite");
+        eprintln!(
+            "  CoW rewrite with index build took {:.1}s",
+            start.elapsed().as_secs_f64()
+        );
+    }
+
+    let ns_count = count / 3;
+    eprintln!(
+        "Seed-large complete: {} rows ({} namespaces, {} tables)",
+        count,
+        ns_count,
+        count - ns_count
+    );
+}
+
+// ──────────────────── worker mode ────────────────────
+
+#[allow(clippy::too_many_arguments)]
+async fn worker(
+    root: &str,
+    operation: &str,
+    operations: usize,
+    duration_secs: u64,
+    warmup: usize,
+    worker_id: usize,
+    table_count: usize,
+    inline_optimization: bool,
+    storage_options: &HashMap<String, String>,
+) {
+    let ns = build_namespace(root, inline_optimization, storage_options).await;
+    let ipc_data = Bytes::from(create_test_ipc_data());
+
+    if operation.starts_with("warm-read") {
+        for _ in 0..warmup {
+            let _ =
+                run_operation(ns.as_ref(), operation, worker_id, 0, table_count, &ipc_data).await;
+        }
+    }
+
+    let emit = |op_idx: usize, start: Instant, err: bool| {
+        let record = LatencyRecord {
+            operation: operation.to_string(),
+            latency_ms: start.elapsed().as_secs_f64() * 1000.0,
+            error: err,
+        };
+        let _ = op_idx;
+        println!("{}", serde_json::to_string(&record).unwrap());
+    };
+
+    if duration_secs > 0 {
+        // Steady-TPS mode: commit continuously until the deadline.
+        let deadline = Instant::now() + Duration::from_secs(duration_secs);
+        let mut op_idx = 0;
+        while Instant::now() < deadline {
+            let start = Instant::now();
+            let err = run_operation(
+                ns.as_ref(),
+                operation,
+                worker_id,
+                op_idx,
+                table_count,
+                &ipc_data,
+            )
+            .await
+            .is_err();
+            emit(op_idx, start, err);
+            op_idx += 1;
+        }
+    } else {
+        for op_idx in 0..operations {
+            let start = Instant::now();
+            let err = run_operation(
+                ns.as_ref(),
+                operation,
+                worker_id,
+                op_idx,
+                table_count,
+                &ipc_data,
+            )
+            .await
+            .is_err();
+            emit(op_idx, start, err);
+        }
+    }
+}
+
+async fn run_operation(
+    ns: &dyn LanceNamespace,
+    operation: &str,
+    worker_id: usize,
+    op_idx: usize,
+    table_count: usize,
+    ipc_data: &Bytes,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    match operation {
+        "cold-read-list-namespaces" | "warm-read-list-namespaces" => {
+            let mut req = ListNamespacesRequest::new();
+            req.id = Some(vec![]);
+            ns.list_namespaces(req).await?;
+        }
+        "cold-read-list-tables" | "warm-read-list-tables" => {
+            let mut req = ListTablesRequest::new();
+            req.id = Some(vec![]);
+            ns.list_tables(req).await?;
+        }
+        "cold-read-describe-table" | "warm-read-describe-table" => {
+            let table_idx = (worker_id * 1_000_000 + op_idx) % table_count.max(1);
+            let req = DescribeTableRequest {
+                id: Some(vec![format!("table_{}", table_idx)]),
+                ..Default::default()
+            };
+            ns.describe_table(req).await?;
+        }
+        "write-create-namespace" => {
+            let mut req = CreateNamespaceRequest::new();
+            req.id = Some(vec![format!("bench_w{}_{}", worker_id, op_idx)]);
+            ns.create_namespace(req).await?;
+        }
+        "write-create-table" => {
+            let mut req = CreateTableRequest::new();
+            req.id = Some(vec![format!("bench_t{}_{}", worker_id, op_idx)]);
+            ns.create_table(req, ipc_data.clone()).await?;
+        }
+        "write-declare-table" => {
+            let req = DeclareTableRequest {
+                id: Some(vec![format!("bench_d{}_{}", worker_id, op_idx)]),
+                ..Default::default()
+            };
+            ns.declare_table(req).await?;
+        }
+        _ => {
+            return Err(format!("unknown operation: {}", operation).into());
+        }
+    }
+    Ok(())
+}
+
+// ──────────────────── run mode (coordinator) ────────────────────
+
+#[allow(clippy::too_many_arguments)]
+fn run_workers(
+    self_exe: &str,
+    root: &str,
+    operation: &str,
+    concurrency: usize,
+    operations: usize,
+    duration_secs: u64,
+    warmup: usize,
+    table_count: usize,
+    initial_entries: usize,
+    inline_optimization: bool,
+    variant: &str,
+    storage_options: &HashMap<String, String>,
+) -> BenchResult {
+    // Continuous mode splits a fixed op budget across workers; steady-TPS mode lets each
+    // worker run for the full duration.
+    let ops_per_worker = if duration_secs > 0 {
+        0
+    } else {
+        operations / concurrency.max(1)
+    };
+    if duration_secs == 0 && ops_per_worker == 0 {
+        return compute_result(
+            variant,
+            operation,
+            concurrency,
+            initial_entries,
+            duration_secs,
+            Duration::ZERO,
+            vec![],
+            0,
+        );
+    }
+
+    let wall_start = Instant::now();
+    let children: Vec<_> = (0..concurrency)
+        .map(|worker_id| {
+            let mut cmd = Command::new(self_exe);
+            cmd.arg("worker")
+                .arg("--root")
+                .arg(root)
+                .arg("--operation")
+                .arg(operation)
+                .arg("--operations")
+                .arg(ops_per_worker.to_string())
+                .arg("--duration-secs")
+                .arg(duration_secs.to_string())
+                .arg("--warmup")
+                .arg(warmup.to_string())
+                .arg("--worker-id")
+                .arg(worker_id.to_string())
+                .arg("--table-count")
+                .arg(table_count.to_string())
+                .arg("--inline-optimization")
+                .arg(inline_optimization.to_string());
+            for (k, v) in storage_options {
+                cmd.arg("--storage-option").arg(format!("{}={}", k, v));
+            }
+            cmd.stdout(Stdio::piped())
+                .stderr(Stdio::inherit())
+                .spawn()
+                .expect("Failed to spawn worker")
+        })
+        .collect();
+
+    let mut all_latencies = Vec::new();
+    let mut total_errors = 0;
+    for mut child in children {
+        let stdout = child.stdout.take().unwrap();
+        for line in BufReader::new(stdout).lines() {
+            let line = line.expect("failed to read worker output");
+            if let Ok(record) = serde_json::from_str::<LatencyRecord>(&line) {
+                if record.error {
+                    total_errors += 1;
+                } else {
+                    all_latencies.push(record.latency_ms);
+                }
+            }
+        }
+        let status = child.wait().expect("failed to wait for worker");
+        if !status.success() {
+            eprintln!("Worker exited with status: {}", status);
+        }
+    }
+
+    compute_result(
+        variant,
+        operation,
+        concurrency,
+        initial_entries,
+        duration_secs,
+        wall_start.elapsed(),
+        all_latencies,
+        total_errors,
+    )
+}
+
+/// Parses a comma-separated list, keeping only entries that are positive integers.
+fn parse_concurrency_list(s: &str) -> Vec<usize> {
+    s.split(',')
+        .filter_map(|tok| tok.trim().parse::<usize>().ok().filter(|&n| n > 0))
+        .collect()
+}
+
+#[tokio::main]
+async fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    if args.len() < 2 {
+        eprintln!("Usage: manifest_bench <seed-large|run|worker> [options]");
+        std::process::exit(1);
+    }
+
+    let mode = args[1].as_str();
+    let mut root = String::new();
+    let mut operation = String::new();
+    let mut operations: usize = 100;
+    let mut duration_secs: u64 = 0;
+    let mut warmup: usize = 0;
+    let mut concurrency_list = vec![1];
+    let mut count: usize = 1000;
+    let mut worker_id: usize = 0;
+    let mut table_count: usize = 667;
+    let mut initial_entries: usize = 0;
+    let mut inline_optimization = true;
+    let mut variant = String::new();
+    let mut storage_options: HashMap<String, String> = HashMap::new();
+
+    let mut i = 2;
+    while i < args.len() {
+        match args[i].as_str() {
+            "--root" => {
+                root = args[i + 1].clone();
+                i += 2;
+            }
+            "--operation" => {
+                operation = args[i + 1].clone();
+                i += 2;
+            }
+            "--operations" => {
+                operations = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--duration-secs" => {
+                duration_secs = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--warmup" => {
+                warmup = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--concurrency" => {
+                concurrency_list = parse_concurrency_list(&args[i + 1]);
+                i += 2;
+            }
+            "--count" => {
+                count = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--worker-id" => {
+                worker_id = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--table-count" => {
+                table_count = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--initial-entries" => {
+                initial_entries = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--inline-optimization" => {
+                inline_optimization = args[i + 1].parse().unwrap();
+                i += 2;
+            }
+            "--variant" => {
+                variant = args[i + 1].clone();
+                i += 2;
+            }
+            "--storage-option" => {
+                if let Some((k, v)) = args[i + 1].split_once('=') {
+                    storage_options.insert(k.to_string(), v.to_string());
+                }
+                i += 2;
+            }
+            other => {
+                eprintln!("Unknown argument: {}", other);
+                std::process::exit(1);
+            }
+        }
+    }
+
+    if variant.is_empty() {
+        variant = if inline_optimization {
+            "inline_index".to_string()
+        } else {
+            "no_index".to_string()
+        };
+    }
+
+    match mode {
+        "seed-large" => {
+            seed_large(&root, count, inline_optimization, &storage_options).await;
+        }
+        "worker" => {
+            worker(
+                &root,
+                &operation,
+                operations,
+                duration_secs,
+                warmup,
+                worker_id,
+                table_count,
+                inline_optimization,
+                &storage_options,
+            )
+            .await;
+        }
+        "run" => {
+            let self_exe = std::env::current_exe()
+                .expect("failed to get self exe path")
+                .to_string_lossy()
+                .to_string();
+            let op = if operation.is_empty() {
+                "write-create-namespace"
+            } else {
+                operation.as_str()
+            };
+
+            eprintln!("=== Manifest commit benchmark ===");
+            eprintln!(
+                "variant={} op={} root={} initial_entries={} concurrency={:?} operations={} duration_secs={}",
+                variant, op, root, initial_entries, concurrency_list, operations, duration_secs
+            );
+
+            for &concurrency in &concurrency_list {
+                let result = run_workers(
+                    &self_exe,
+                    &root,
+                    op,
+                    concurrency,
+                    operations,
+                    duration_secs,
+                    warmup,
+                    table_count,
+                    initial_entries,
+                    inline_optimization,
+                    &variant,
+                    &storage_options,
+                );
+                eprintln!(
+                    "  c={} -> {:.2} ops/s ({} ops, {} errors, p50={:.0}ms p99={:.0}ms)",
+                    concurrency,
+                    result.throughput_ops_per_sec,
+                    result.total_operations,
+                    result.errors,
+                    result.p50_latency_ms,
+                    result.p99_latency_ms
+                );
+                println!("{}", serde_json::to_string(&result).unwrap());
+            }
+            eprintln!("=== complete ===");
+        }
+        _ => {
+            eprintln!("Unknown mode: {}. Use seed-large, run, or worker.", mode);
+            std::process::exit(1);
+        }
+    }
+}
diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs
index 8859e4bc237..e97c5c836b7 100644
--- a/rust/lance-namespace-impls/src/dir.rs
+++ b/rust/lance-namespace-impls/src/dir.rs
@@ -7,6 +7,7 @@
 //! that stores tables as Lance datasets in a filesystem directory structure.
 
 pub mod manifest;
+pub mod manifest_feature_flags;
 
 use arrow::array::Float32Array;
 use arrow::record_batch::RecordBatchIterator;
@@ -195,9 +196,6 @@ pub struct DirectoryNamespaceBuilder {
     dir_listing_enabled: bool,
     inline_optimization_enabled: bool,
     table_version_tracking_enabled: bool,
-    /// When true, table versions are stored in the `__manifest` table instead of
-    /// relying on Lance's native version management.
-    table_version_storage_enabled: bool,
     /// When true, enables migration mode where the namespace checks the manifest first
     /// before falling back to directory listing for root-level tables. When false (default),
     /// root-level tables use directory listing directly without checking the manifest,
@@ -233,10 +231,6 @@ impl std::fmt::Debug for DirectoryNamespaceBuilder {
                 "table_version_tracking_enabled",
                 &self.table_version_tracking_enabled,
             )
-            .field(
-                "table_version_storage_enabled",
-                &self.table_version_storage_enabled,
-            )
             .field(
                 "dir_listing_to_manifest_migration_enabled",
                 &self.dir_listing_to_manifest_migration_enabled,
@@ -273,7 +267,6 @@ impl DirectoryNamespaceBuilder {
             dir_listing_enabled: true, // Default to enabled for backwards compatibility
             inline_optimization_enabled: true,
             table_version_tracking_enabled: false, // Default to disabled
-            table_version_storage_enabled: false,  // Default to disabled
             dir_listing_to_manifest_migration_enabled: false, // Default to disabled
             credential_vendor_properties: HashMap::new(),
             context_provider: None,
@@ -313,11 +306,10 @@ impl DirectoryNamespaceBuilder {
         self
     }
 
-    /// Enable or disable inline optimization of the __manifest table.
+    /// Enable or disable replacement index maintenance for the __manifest table.
     ///
-    /// When enabled (default), performs compaction and indexing on the __manifest table
-    /// after every write operation to maintain optimal performance.
-    /// When disabled, manual optimization must be performed separately.
+    /// When enabled (default), copy-on-write manifest rewrites build replacement indices
+    /// for fast reads. When disabled, rewrites only replace data files.
     pub fn inline_optimization_enabled(mut self, enabled: bool) -> Self {
         self.inline_optimization_enabled = enabled;
         self
@@ -335,19 +327,6 @@ impl DirectoryNamespaceBuilder {
         self
     }
 
-    /// Enable or disable table version management through the `__manifest` table.
-    ///
-    /// When enabled, table versions are tracked as `table_version` entries in the
-    /// `__manifest` Lance table. This enables:
-    /// - Centralized version tracking instead of per-table `_versions/` directories
-    ///
-    /// Requires `manifest_enabled` to be true.
-    /// When disabled (default), version storage uses per-table storage operations.
-    pub fn table_version_storage_enabled(mut self, enabled: bool) -> Self {
-        self.table_version_storage_enabled = enabled;
-        self
-    }
-
     /// Create a DirectoryNamespaceBuilder from properties HashMap.
     ///
     /// This method parses a properties map into builder configuration.
@@ -355,7 +334,7 @@ impl DirectoryNamespaceBuilder {
     /// - `root`: The root directory path (required)
     /// - `manifest_enabled`: Enable manifest-based table tracking (optional, default: true)
     /// - `dir_listing_enabled`: Enable directory listing for table discovery (optional, default: true)
-    /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true)
+    /// - `inline_optimization_enabled`: Enable replacement indices on __manifest rewrites (optional, default: true)
     /// - `storage.*`: Storage options (optional, prefix will be stripped)
     ///
     /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped):
@@ -465,12 +444,6 @@ impl DirectoryNamespaceBuilder {
             .and_then(|v| v.parse::<bool>().ok())
             .unwrap_or(false);
 
-        // Extract table_version_storage_enabled (default: false)
-        let table_version_storage_enabled = properties
-            .get("table_version_storage_enabled")
-            .and_then(|v| v.parse::<bool>().ok())
-            .unwrap_or(false);
-
         // Extract dir_listing_to_manifest_migration_enabled (default: false)
         let dir_listing_to_manifest_migration_enabled = properties
             .get("dir_listing_to_manifest_migration_enabled")
@@ -517,7 +490,6 @@ impl DirectoryNamespaceBuilder {
             dir_listing_enabled,
             inline_optimization_enabled,
             table_version_tracking_enabled,
-            table_version_storage_enabled,
             dir_listing_to_manifest_migration_enabled,
             credential_vendor_properties,
             context_provider: None,
@@ -694,14 +666,6 @@ impl DirectoryNamespaceBuilder {
     /// - Connection to the storage backend fails
     /// - Storage options are invalid
     pub async fn build(self) -> Result<DirectoryNamespace> {
-        // Validate: table_version_storage_enabled requires manifest_enabled
-        if self.table_version_storage_enabled && !self.manifest_enabled {
-            return Err(NamespaceError::InvalidInput {
-                message: "table_version_storage_enabled requires manifest_enabled=true".to_string(),
-            }
-            .into());
-        }
-
         let (object_store, base_path) =
             Self::initialize_object_store(&self.root, &self.storage_options, &self.session).await?;
 
@@ -715,11 +679,16 @@ impl DirectoryNamespaceBuilder {
                 self.dir_listing_enabled,
                 self.inline_optimization_enabled,
                 self.commit_retries,
-                self.table_version_storage_enabled,
             )
             .await
             {
                 Ok(ns) => Some(Arc::new(ns)),
+                Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => {
+                    // The manifest exists but was written with a feature flag this
+                    // build does not understand. Refuse rather than silently
+                    // degrading to a directory-listing view that ignores it.
+                    return Err(e);
+                }
                 Err(e) => {
                     // Failed to initialize manifest namespace, fall back to directory listing only
                     log::warn!(
@@ -760,7 +729,6 @@ impl DirectoryNamespaceBuilder {
             dir_listing_to_manifest_migration_enabled: self
                 .dir_listing_to_manifest_migration_enabled,
             table_version_tracking_enabled: self.table_version_tracking_enabled,
-            table_version_storage_enabled: self.table_version_storage_enabled,
             credential_vendor,
             context_provider: self.context_provider,
             vend_input_storage_options: self.vend_input_storage_options,
@@ -843,8 +811,6 @@ pub struct DirectoryNamespace {
     /// When true, `describe_table` returns `managed_versioning: true` to indicate
     /// commits should go through namespace table version APIs.
     table_version_tracking_enabled: bool,
-    /// When true, table versions are stored in the `__manifest` table.
-    table_version_storage_enabled: bool,
     /// Credential vendor created once during initialization.
     /// Used to vend temporary credentials for table access.
     credential_vendor: Option<Arc<dyn CredentialVendor>>,
@@ -1413,6 +1379,11 @@ impl DirectoryNamespace {
                     }
                     return Ok(response);
                 }
+                Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => {
+                    // An incompatible manifest must surface "please upgrade"
+                    // rather than degrading to a directory-listing view.
+                    return Err(e);
+                }
                 Err(_) if self.dir_listing_enabled && is_root_level => {
                     // Fall through to directory check only for single-level IDs
                 }
@@ -2143,6 +2114,7 @@ impl DirectoryNamespace {
     /// to the manifest to enable manifest-only mode:
     ///
     /// ```no_run
+    /// #![recursion_limit = "256"]
     /// # use lance_namespace_impls::DirectoryNamespaceBuilder;
     /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
     /// // Create namespace with dual mode (manifest + directory listing)
@@ -2211,18 +2183,16 @@ impl DirectoryNamespace {
         Ok(migrated_count)
     }
 
-    /// Delete physical manifest files for the given table version ranges (best-effort).
+    /// Delete physical manifest files for the given table version ranges.
     ///
-    /// This helper is used by `batch_delete_table_versions` in both the manifest-enabled
-    /// and non-manifest paths. It resolves each table's storage location, computes the
-    /// version file paths, and attempts to delete them. Errors are logged (best-effort)
-    /// when `best_effort` is true, or returned immediately when false.
+    /// This helper backs `batch_delete_table_versions`. It resolves each table's storage
+    /// location, computes the version file paths, and deletes them. A file that is already
+    /// missing is skipped; any other deletion failure is returned as an error immediately.
     ///
     /// Returns the number of files successfully deleted.
     async fn delete_physical_version_files(
         &self,
         table_entries: &[TableDeleteEntry],
-        best_effort: bool,
         branch: Option<&str>,
     ) -> Result<i64> {
         let mut deleted_count = 0i64;
@@ -2268,22 +2238,13 @@ impl DirectoryNamespace {
                     }
                     Err(object_store::Error::NotFound { .. }) => {}
                     Err(e) => {
-                        if best_effort {
-                            log::warn!(
-                                "Failed to delete manifest file for version {} of table {:?}: {:?}",
-                                v,
-                                te.table_id,
-                                e
-                            );
-                        } else {
-                            return Err(NamespaceError::Internal {
-                                message: format!(
-                                    "Failed to delete version {} for table at '{}': {}",
-                                    v, table_uri, e
-                                ),
-                            }
-                            .into());
+                        return Err(NamespaceError::Internal {
+                            message: format!(
+                                "Failed to delete version {} for table at '{}': {}",
+                                v, table_uri, e
+                            ),
                         }
+                        .into());
                     }
                 }
             }
@@ -2650,6 +2611,11 @@ impl LanceNamespace for DirectoryNamespace {
         {
             match manifest_ns.table_exists(request.clone()).await {
                 Ok(()) => return Ok(()),
+                Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => {
+                    // An incompatible manifest must surface "please upgrade"
+                    // rather than degrading to a directory-listing view.
+                    return Err(e);
+                }
                 Err(_) if self.dir_listing_enabled && is_root_level => {
                     // Fall through to directory check only for single-level IDs
                 }
@@ -2927,20 +2893,6 @@ impl LanceNamespace for DirectoryNamespace {
     ) -> Result<ListTableVersionsResponse> {
         self.record_op("list_table_versions");
         let branch = Self::normalized_branch(request.branch.as_deref())?;
-        // The manifest catalog has no branch concept, so a branch lists its own
-        // version chain from storage under its tree path instead.
-        if branch.is_none()
-            && self.table_version_storage_enabled
-            && let Some(ref manifest_ns) = self.manifest_ns
-        {
-            let table_id = request.id.clone().unwrap_or_default();
-            let want_descending = request.descending == Some(true);
-            return manifest_ns
-                .list_table_versions(&table_id, want_descending, request.limit)
-                .await;
-        }
-
-        // Fallback when table_version_storage is not enabled: list from _versions/ directory
         let table_uri = self.resolve_table_location(&request.id).await?;
         let table_uri = match branch {
             Some(b) => self.resolve_branch_location(&table_uri, b).await?,
@@ -3087,43 +3039,6 @@ impl LanceNamespace for DirectoryNamespace {
             );
         }
 
-        // Also record in __manifest (best-effort). Branches aren't tracked there,
-        // so for a branch the storage manifest above is the only record.
-        if branch.is_none()
-            && self.table_version_storage_enabled
-            && let Some(ref manifest_ns) = self.manifest_ns
-        {
-            let table_id_str =
-                manifest::ManifestNamespace::str_object_id(&request.id.clone().unwrap_or_default());
-            let object_id =
-                manifest::ManifestNamespace::build_version_object_id(&table_id_str, version as i64);
-            let metadata_json = serde_json::json!({
-                "manifest_path": final_path.to_string(),
-                "manifest_size": manifest_size,
-                "e_tag": final_meta.e_tag,
-                "naming_scheme": request.naming_scheme.as_deref().unwrap_or("V2"),
-            })
-            .to_string();
-
-            if let Err(e) = manifest_ns
-                .insert_into_manifest_with_metadata(
-                    vec![manifest::ManifestEntry {
-                        object_id,
-                        object_type: manifest::ObjectType::TableVersion,
-                        location: None,
-                        metadata: Some(metadata_json),
-                    }],
-                    None,
-                )
-                .await
-            {
-                log::warn!(
-                    "Failed to record table version in __manifest (best-effort): {:?}",
-                    e
-                );
-            }
-        }
-
         Ok(CreateTableVersionResponse {
             transaction_id: None,
             version: Some(Box::new(TableVersion {
@@ -3143,18 +3058,6 @@ impl LanceNamespace for DirectoryNamespace {
     ) -> Result<DescribeTableVersionResponse> {
         self.record_op("describe_table_version");
         let branch = Self::normalized_branch(request.branch.as_deref())?;
-        // When table_version_storage_enabled and a specific version is requested,
-        // query from __manifest to avoid opening the entire dataset. A branch has
-        // no manifest-catalog entry, so it resolves from storage instead.
-        if branch.is_none()
-            && self.table_version_storage_enabled
-            && let (Some(manifest_ns), Some(version)) = (&self.manifest_ns, request.version)
-        {
-            let table_id = request.id.clone().unwrap_or_default();
-            return manifest_ns.describe_table_version(&table_id, version).await;
-        }
-
-        // Fallback when table_version_storage is not enabled: inspect physical manifests directly.
         let table_uri = self.resolve_table_location(&request.id).await?;
         let table_uri = match branch {
             Some(b) => self.resolve_branch_location(&table_uri, b).await?,
@@ -3206,9 +3109,9 @@ impl LanceNamespace for DirectoryNamespace {
             .map(|r| (r.start_version, r.end_version))
             .collect();
 
-        // Reject pathological bounded ranges up front: the manifest path below
-        // builds one id per version, so (0, i64::MAX) would exhaust memory. A
-        // through-latest range (end < 0) is bounded by the manifests that exist.
+        // Reject pathological bounded ranges up front: an explicit huge bounded
+        // range like (0, i64::MAX) is almost certainly a mistake. A through-latest
+        // range (end < 0) is bounded by the manifests that actually exist on storage.
         const MAX_VERSIONS_PER_REQUEST: i128 = 1_000_000;
         let requested: i128 = ranges
             .iter()
@@ -3235,76 +3138,8 @@ impl LanceNamespace for DirectoryNamespace {
             ranges,
         }];
 
-        let mut total_deleted_count = 0i64;
-
-        // Branches are not tracked in the manifest catalog, so a branch skips the
-        // __manifest phase entirely and deletes its physical manifests directly.
-        if branch.is_none()
-            && self.table_version_storage_enabled
-            && let Some(ref manifest_ns) = self.manifest_ns
-        {
-            // Through-latest ranges (end_version < 0) would require enumerating the
-            // __manifest chain up to the latest version, which is not wired up here.
-            // Reject rather than silently delete physical files while leaving the
-            // __manifest records in place.
-            if table_entries
-                .iter()
-                .any(|te| te.ranges.iter().any(|&(_, e)| e < 0))
-            {
-                return Err(NamespaceError::Unsupported {
-                    message: "through-latest delete (end_version < 0) is not supported \
-                              for managed-versioning tables"
-                        .to_string(),
-                }
-                .into());
-            }
-
-            // Phase 1 (atomic commit point): Delete version records from __manifest
-            // for ALL tables in a single atomic operation. This is the authoritative
-            // source of truth — once __manifest entries are removed, the versions
-            // are logically deleted across all tables atomically.
-
-            // Collect all (table_id_str, ranges) for batch deletion
-            let mut all_object_ids: Vec<String> = Vec::new();
-            for te in &table_entries {
-                let table_id_str = manifest::ManifestNamespace::str_object_id(
-                    &te.table_id.clone().unwrap_or_default(),
-                );
-                for (start, end) in &te.ranges {
-                    for version in *start..*end {
-                        let object_id = manifest::ManifestNamespace::build_version_object_id(
-                            &table_id_str,
-                            version,
-                        );
-                        all_object_ids.push(object_id);
-                    }
-                }
-            }
-
-            if !all_object_ids.is_empty() {
-                total_deleted_count = manifest_ns
-                    .batch_delete_table_versions_by_object_ids(&all_object_ids)
-                    .await?;
-            }
-
-            // Phase 2: Delete physical manifest files (best-effort).
-            // Even if some file deletions fail, the versions are already removed from
-            // __manifest, so they won't be visible to readers. Leftover files are
-            // orphaned but harmless and can be cleaned up later.
-            let _ = self
-                .delete_physical_version_files(&table_entries, true, branch)
-                .await;
-
-            return Ok(BatchDeleteTableVersionsResponse {
-                deleted_count: Some(total_deleted_count),
-                transaction_id: None,
-            });
-        }
-
-        // Direct path: delete physical files (no __manifest). Reached when storage
-        // tracking is off, or for any branch (which has no __manifest entries).
-        total_deleted_count = self
-            .delete_physical_version_files(&table_entries, false, branch)
+        let total_deleted_count = self
+            .delete_physical_version_files(&table_entries, branch)
             .await?;
 
         Ok(BatchDeleteTableVersionsResponse {
@@ -5380,7 +5215,6 @@ mod tests {
             DirectoryNamespaceBuilder::new(temp.to_str().unwrap())
                 .manifest_enabled(true)
                 .table_version_tracking_enabled(true)
-                .table_version_storage_enabled(true)
                 .ops_metrics_enabled(true)
                 .build()
                 .await
@@ -5755,150 +5589,12 @@ mod tests {
         );
     }
 
-    /// The managed `__manifest` delete path (the authoritative catalog) must honor
-    /// the exclusive end: `[min, max)` removes exactly min..max from `__manifest`,
-    /// keeping max. With storage tracking on, the writes register versions in
-    /// `__manifest` and `list_table_versions` reads it back, so this exercises the
-    /// Phase-1 path that the physical-path tests never reach.
-    #[tokio::test]
-    async fn test_batch_delete_managed_manifest_exclusive() {
-        use arrow::array::Int32Array;
-        use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
-        use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange};
-
-        let temp = TempStdDir::default();
-        let ns: Arc<dyn LanceNamespace> = Arc::new(
-            DirectoryNamespaceBuilder::new(temp.to_str().unwrap())
-                .manifest_enabled(true)
-                .table_version_tracking_enabled(true)
-                .table_version_storage_enabled(true)
-                .build()
-                .await
-                .unwrap(),
-        );
-        let table_id = vec!["users".to_string()];
-        let schema = Arc::new(ArrowSchema::new(vec![Field::new(
-            "id",
-            DataType::Int32,
-            false,
-        )]));
-        let batch = |seed: i32| {
-            arrow::record_batch::RecordBatch::try_new(
-                schema.clone(),
-                vec![Arc::new(Int32Array::from(vec![seed]))],
-            )
-            .unwrap()
-        };
-
-        // Register v1, v2, v3 in __manifest via the managed write flow.
-        let mut ds = Dataset::write_into_namespace(
-            RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()),
-            ns.clone(),
-            table_id.clone(),
-            Some(WriteParams {
-                mode: WriteMode::Create,
-                ..Default::default()
-            }),
-        )
-        .await
-        .unwrap();
-        ds.append(
-            RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()),
-            None,
-        )
-        .await
-        .unwrap();
-        ds.append(
-            RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()),
-            None,
-        )
-        .await
-        .unwrap();
-
-        let before = ns
-            .list_table_versions(ListTableVersionsRequest {
-                id: Some(table_id.clone()),
-                ..Default::default()
-            })
-            .await
-            .unwrap()
-            .versions;
-        assert!(
-            before.len() >= 3,
-            "expected v1..v3 tracked in __manifest: {:?}",
-            before
-        );
-        let min_v = before.iter().map(|v| v.version).min().unwrap();
-        let max_v = before.iter().map(|v| v.version).max().unwrap();
-
-        // [min, max): exclusive end keeps max.
-        ns.batch_delete_table_versions(BatchDeleteTableVersionsRequest {
-            id: Some(table_id.clone()),
-            ranges: vec![VersionRange::new(min_v, max_v)],
-            ..Default::default()
-        })
-        .await
-        .unwrap();
-
-        let after = ns
-            .list_table_versions(ListTableVersionsRequest {
-                id: Some(table_id.clone()),
-                ..Default::default()
-            })
-            .await
-            .unwrap()
-            .versions;
-        assert_eq!(
-            after.len(),
-            1,
-            "only the exclusive end (max) should remain in __manifest: {:?}",
-            after
-        );
-        assert_eq!(after[0].version, max_v, "max must be kept");
-    }
-
-    /// On the managed path, a through-latest delete (`end_version < 0`) is rejected
-    /// rather than silently deleting physical files while leaving `__manifest`
-    /// records in place.
-    #[tokio::test]
-    async fn test_batch_delete_managed_rejects_through_latest() {
-        use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange};
-
-        let temp = TempStdDir::default();
-        let ns: Arc<dyn LanceNamespace> = Arc::new(
-            DirectoryNamespaceBuilder::new(temp.to_str().unwrap())
-                .manifest_enabled(true)
-                .table_version_tracking_enabled(true)
-                .table_version_storage_enabled(true)
-                .build()
-                .await
-                .unwrap(),
-        );
-
-        let err = ns
-            .batch_delete_table_versions(BatchDeleteTableVersionsRequest {
-                id: Some(vec!["users".to_string()]),
-                ranges: vec![VersionRange::new(0, -1)],
-                ..Default::default()
-            })
-            .await;
-        assert!(
-            err.is_err(),
-            "through-latest delete must be rejected on the managed path"
-        );
-        assert!(
-            err.unwrap_err().to_string().contains("not supported"),
-            "expected a not-supported error"
-        );
-    }
-
     /// Build a managed (manifest-tracked) namespace over `path`.
     async fn create_managed_namespace(path: &str) -> Arc<dyn LanceNamespace> {
         Arc::new(
             DirectoryNamespaceBuilder::new(path)
                 .manifest_enabled(true)
                 .table_version_tracking_enabled(true)
-                .table_version_storage_enabled(true)
                 .build()
                 .await
                 .unwrap(),
@@ -6328,7 +6024,6 @@ mod tests {
             DirectoryNamespaceBuilder::new(temp.to_str().unwrap())
                 .manifest_enabled(true)
                 .table_version_tracking_enabled(true)
-                .table_version_storage_enabled(true)
                 .ops_metrics_enabled(true)
                 .build()
                 .await
@@ -6474,49 +6169,6 @@ mod tests {
         );
     }
 
-    /// With the manifest store enabled, branch ops must still bypass the catalog
-    /// fast-path and read the chain from `tree/<branch>/_versions/`. Without the
-    /// `branch.is_none()` guard this would query `__manifest` (which has no
-    /// branch entries) and return the wrong result. The other branch tests use a
-    /// store-disabled namespace, so this pins the enabled path specifically.
-    #[tokio::test]
-    async fn test_branch_ops_skip_manifest_store_when_enabled() {
-        let temp_dir = TempStdDir::default();
-        let namespace = DirectoryNamespaceBuilder::new(temp_dir.to_str().unwrap())
-            .manifest_enabled(true)
-            .table_version_storage_enabled(true)
-            .build()
-            .await
-            .unwrap();
-
-        create_scalar_table(&namespace, "users").await;
-        create_branch_with_commits(&namespace, "users", "exp", 2).await;
-
-        // list resolves the branch chain from storage despite storage tracking
-        // being on (a successful result with tree/exp paths proves the bypass:
-        // the catalog has no "exp" entry, so the fast-path would not return these).
-        let branch_versions = list_versions(&namespace, "users", Some("exp"))
-            .await
-            .unwrap();
-        assert!(branch_versions.len() >= 2);
-        assert!(
-            branch_versions
-                .iter()
-                .all(|v| v.manifest_path.contains("tree/exp")),
-            "branch versions must come from branch storage with the store enabled: {:?}",
-            branch_versions
-        );
-
-        // describe likewise resolves from the branch's storage.
-        let req = DescribeTableVersionRequest {
-            id: Some(vec!["users".to_string()]),
-            branch: Some("exp".to_string()),
-            ..Default::default()
-        };
-        let resp = namespace.describe_table_version(req).await.unwrap();
-        assert!(resp.version.manifest_path.contains("tree/exp"));
-    }
-
     #[tokio::test]
     async fn test_create_table() {
         let (namespace, _temp_dir) = create_test_namespace().await;
@@ -11281,155 +10933,6 @@ mod tests {
         }
     }
 
-    /// Tests for multi-table transaction support via table_version_storage_enabled.
-    mod multi_table_transactions {
-        use super::*;
-        use futures::TryStreamExt;
-        use lance::dataset::builder::DatasetBuilder;
-        use lance_namespace::models::CreateTableVersionRequest;
-
-        /// Helper to create a namespace with table_version_storage_enabled enabled
-        async fn create_managed_namespace(temp_path: &str) -> Arc<DirectoryNamespace> {
-            Arc::new(
-                DirectoryNamespaceBuilder::new(temp_path)
-                    .table_version_tracking_enabled(true)
-                    .table_version_storage_enabled(true)
-                    .manifest_enabled(true)
-                    .build()
-                    .await
-                    .unwrap(),
-            )
-        }
-
-        /// Helper to create a table and get its staging manifest path
-        async fn create_table_and_get_staging(
-            namespace: Arc<dyn LanceNamespace>,
-            table_name: &str,
-        ) -> (Vec<String>, object_store::path::Path) {
-            let schema = create_test_schema();
-            let ipc_data = create_test_ipc_data(&schema);
-            let mut create_req = CreateTableRequest::new();
-            create_req.id = Some(vec![table_name.to_string()]);
-            namespace
-                .create_table(create_req, bytes::Bytes::from(ipc_data))
-                .await
-                .unwrap();
-
-            let table_id = vec![table_name.to_string()];
-            let dataset = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone())
-                .await
-                .unwrap()
-                .load()
-                .await
-                .unwrap();
-
-            // Find existing manifest and create a staging copy
-            let versions_path = dataset.versions_dir();
-            let manifest_metas: Vec<_> = dataset
-                .object_store(None)
-                .await
-                .unwrap()
-                .inner
-                .list(Some(&versions_path))
-                .try_collect()
-                .await
-                .unwrap();
-
-            let manifest_meta = manifest_metas
-                .iter()
-                .find(|m| {
-                    m.location
-                        .filename()
-                        .map(|f| f.ends_with(".manifest"))
-                        .unwrap_or(false)
-                })
-                .expect("No manifest file found");
-
-            let manifest_data = dataset
-                .object_store(None)
-                .await
-                .unwrap()
-                .inner
-                .get(&manifest_meta.location)
-                .await
-                .unwrap()
-                .bytes()
-                .await
-                .unwrap();
-
-            let staging_path = dataset
-                .versions_dir()
-                .join(format!("staging_{}", table_name));
-            dataset
-                .object_store(None)
-                .await
-                .unwrap()
-                .inner
-                .put(&staging_path, manifest_data.into())
-                .await
-                .unwrap();
-
-            (table_id, staging_path)
-        }
-
-        #[tokio::test]
-        async fn test_table_version_storage_enabled_requires_manifest() {
-            // table_version_storage_enabled=true requires manifest_enabled=true
-            let temp_dir = TempStdDir::default();
-            let temp_path = temp_dir.to_str().unwrap();
-
-            let result = DirectoryNamespaceBuilder::new(temp_path)
-                .table_version_storage_enabled(true)
-                .manifest_enabled(false)
-                .build()
-                .await;
-
-            assert!(
-                result.is_err(),
-                "Should fail when table_version_storage_enabled=true but manifest_enabled=false"
-            );
-        }
-
-        #[tokio::test]
-        async fn test_create_table_version_records_in_manifest() {
-            // When table_version_storage_enabled is enabled, single create_table_version
-            // should also record the version in __manifest
-            let temp_dir = TempStrDir::default();
-            let temp_path: &str = &temp_dir;
-
-            let namespace = create_managed_namespace(temp_path).await;
-            let ns: Arc<dyn LanceNamespace> = namespace.clone();
-
-            let (table_id, staging_path) =
-                create_table_and_get_staging(ns.clone(), "table_managed").await;
-
-            // Create version 2
-            let mut create_req = CreateTableVersionRequest::new(2, staging_path.to_string());
-            create_req.id = Some(table_id.clone());
-            create_req.naming_scheme = Some("V2".to_string());
-            let response = namespace.create_table_version(create_req).await.unwrap();
-
-            assert!(response.version.is_some());
-            let version = response.version.unwrap();
-            assert_eq!(version.version, 2);
-
-            // Verify the version is recorded in __manifest by querying it
-            let manifest_ns = namespace.manifest_ns.as_ref().unwrap();
-            let table_id_str = manifest::ManifestNamespace::str_object_id(&table_id);
-            let versions = manifest_ns
-                .query_table_versions(&table_id_str, false, None)
-                .await
-                .unwrap();
-
-            assert!(
-                !versions.is_empty(),
-                "Version should be recorded in __manifest"
-            );
-            let (ver, _path) = &versions[0];
-            assert_eq!(*ver, 2, "Recorded version should be 2");
-        }
-    }
-
     #[tokio::test]
     async fn test_list_all_tables() {
         use lance_namespace::models::ListTablesRequest;
@@ -11783,6 +11286,40 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_manifest_reload_observes_new_version_from_other_namespace() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+
+        let namespace_a = DirectoryNamespaceBuilder::new(temp_path)
+            .manifest_enabled(true)
+            .dir_listing_enabled(false)
+            .build()
+            .await
+            .unwrap();
+        create_scalar_table(&namespace_a, "alpha").await;
+
+        let namespace_b = DirectoryNamespaceBuilder::new(temp_path)
+            .manifest_enabled(true)
+            .dir_listing_enabled(false)
+            .build()
+            .await
+            .unwrap();
+        create_scalar_table(&namespace_b, "beta").await;
+
+        let response = namespace_a
+            .list_tables(ListTablesRequest {
+                id: Some(vec![]),
+                ..Default::default()
+            })
+            .await
+            .unwrap();
+
+        let mut tables = response.tables;
+        tables.sort();
+        assert_eq!(tables, vec!["alpha", "beta"]);
+    }
+
     #[tokio::test]
     async fn test_migration_not_found_errors_include_table_id() {
         let temp_dir = TempStdDir::default();
diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs
index 0e22f1e8b69..aae924378da 100644
--- a/rust/lance-namespace-impls/src/dir/manifest.rs
+++ b/rust/lance-namespace-impls/src/dir/manifest.rs
@@ -6,52 +6,72 @@
 //! This module provides a namespace implementation that uses a manifest table
 //! to track tables and nested namespaces.
 
+use super::manifest_feature_flags::{ensure_readable, ensure_writable};
 use arrow::array::builder::{ListBuilder, StringBuilder};
-use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray};
-use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use arrow::array::{Array, ListArray, RecordBatch, RecordBatchIterator, StringArray, UInt64Array};
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
 use arrow_ipc::reader::StreamReader;
 use async_trait::async_trait;
 use bytes::Bytes;
-use futures::{FutureExt, TryStreamExt, stream::StreamExt};
-use lance::dataset::optimize::{CompactionOptions, compact_files};
+use datafusion_common::DataFusionError;
+use datafusion_physical_plan::{
+    SendableRecordBatchStream,
+    stream::RecordBatchStreamAdapter as DatafusionRecordBatchStreamAdapter,
+};
+use futures::{
+    FutureExt, TryStreamExt,
+    stream::{self, StreamExt},
+};
+use lance::dataset::index::LanceIndexStoreExt;
+use lance::dataset::transaction::{Operation, Transaction};
 use lance::dataset::{
-    DeleteBuilder, MergeInsertBuilder, ReadParams, WhenMatched, WhenNotMatched, WriteMode,
-    WriteParams, builder::DatasetBuilder,
+    InsertBuilder, ReadParams, WhenMatched, WriteMode, WriteParams, builder::DatasetBuilder,
 };
-use lance::index::DatasetIndexExt;
 use lance::session::Session;
 use lance::{Dataset, dataset::scanner::Scanner};
 use lance_core::Error as LanceError;
 use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION;
-use lance_core::{Error, Result};
-use lance_index::IndexType;
-use lance_index::optimize::OptimizeOptions;
-use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams};
+use lance_core::{Error, ROW_ID, Result};
+use lance_index::progress::noop_progress;
+use lance_index::registry::IndexPluginRegistry;
+use lance_index::scalar::lance_format::LanceIndexStore;
+use lance_index::scalar::registry::VALUE_COLUMN_NAME;
+use lance_index::scalar::{BuiltinIndexType, CreatedIndex, ScalarIndexParams};
 use lance_io::object_store::{ObjectStore, ObjectStoreParams};
+use lance_io::stream::RecordBatchStream as LanceRecordBatchStream;
 use lance_namespace::LanceNamespace;
 use lance_namespace::error::NamespaceError;
 use lance_namespace::models::{
     CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse,
     DeclareTableRequest, DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse,
     DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest,
-    DescribeTableResponse, DescribeTableVersionResponse, DropNamespaceRequest,
-    DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest,
-    ListNamespacesResponse, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse,
-    NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest,
-    TableVersion,
+    DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest,
+    DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest,
+    ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse,
+    TableExistsRequest,
 };
 use lance_namespace::schema::arrow_schema_to_json;
+use lance_table::feature_flags::apply_feature_flags;
+use lance_table::format::{Fragment, IndexMetadata, Manifest};
+use lance_table::io::commit::{
+    CommitError, CommitHandler, commit_handler_from_url, write_manifest_file_to_path,
+};
 use object_store::{Error as ObjectStoreError, path::Path};
+use roaring::RoaringBitmap;
 use std::io::Cursor;
+use std::time::{SystemTime, UNIX_EPOCH};
 use std::{
-    collections::HashMap,
+    collections::{BTreeMap, HashMap, HashSet},
     hash::{DefaultHasher, Hash, Hasher},
     ops::{Deref, DerefMut},
-    sync::Arc,
+    sync::{Arc, Mutex as StdMutex, MutexGuard as StdMutexGuard},
 };
 use tokio::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};
+use uuid::Uuid;
 
 const MANIFEST_TABLE_NAME: &str = "__manifest";
+const LANCE_DATA_DIR: &str = "data";
+const LANCE_INDICES_DIR: &str = "_indices";
 const DELIMITER: &str = "$";
 /// Bounded concurrency for per-table `_versions/` probes when filtering declared tables.
 /// Higher values reduce latency but increase burst load against the object store.
@@ -64,24 +84,23 @@ const OBJECT_ID_INDEX_NAME: &str = "object_id_btree";
 const OBJECT_TYPE_INDEX_NAME: &str = "object_type_bitmap";
 /// LabelList index on the base_objects column for view dependencies
 const BASE_OBJECTS_INDEX_NAME: &str = "base_objects_label_list";
-/// Inline maintenance on the manifest table is expensive relative to a single-row mutation.
-/// Wait until enough fragments accumulate before compacting files or merging indices.
-const MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD: usize = 8;
+// Each retry reloads and rewrites the full manifest. Match the regular Lance
+// commit retry budget so multi-process namespace writes can make progress.
+const DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES: u32 = 20;
+const MANIFEST_INDEX_BATCH_SIZE: usize = 8192;
 
 /// Object types that can be stored in the manifest
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ObjectType {
     Namespace,
     Table,
-    TableVersion,
 }
 
 impl ObjectType {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Namespace => "namespace",
             Self::Table => "table",
-            Self::TableVersion => "table_version",
         }
     }
 
@@ -89,7 +108,6 @@ impl ObjectType {
         match s {
             "namespace" => Ok(Self::Namespace),
             "table" => Ok(Self::Table),
-            "table_version" => Ok(Self::TableVersion),
             _ => Err(NamespaceError::Internal {
                 message: format!("Invalid object type: {}", s),
             }
@@ -152,7 +170,7 @@ pub struct TableInfo {
 pub struct ManifestEntry {
     /// The unique object identifier (e.g., table name or version object_id)
     pub object_id: String,
-    /// The type of the object (Namespace, Table, or TableVersion)
+    /// The type of the object (Namespace or Table)
     pub object_type: ObjectType,
     /// The storage location (e.g., directory name for tables)
     pub location: Option<String>,
@@ -160,6 +178,401 @@ pub struct ManifestEntry {
     pub metadata: Option<String>,
 }
 
+struct CopyOnWriteMutation<T> {
+    result: T,
+    has_changes: bool,
+}
+
+impl<T> CopyOnWriteMutation<T> {
+    fn updated(result: T) -> Self {
+        Self {
+            result,
+            has_changes: true,
+        }
+    }
+
+    fn unchanged(result: T) -> Self {
+        Self {
+            result,
+            has_changes: false,
+        }
+    }
+}
+
+struct ManifestIndexBuildInput {
+    index_name: &'static str,
+    column_name: &'static str,
+    params: ScalarIndexParams,
+    field: Field,
+    stream: SendableRecordBatchStream,
+}
+
+struct ManifestTrainedIndex {
+    index_name: &'static str,
+    column_name: &'static str,
+    uuid: Uuid,
+    created_index: CreatedIndex,
+}
+
+struct ManifestRowValue {
+    object_id: String,
+    object_type: ObjectType,
+    location: Option<String>,
+    metadata: Option<String>,
+    base_objects: Option<Vec<String>>,
+}
+
+struct ManifestOutputRow<'a> {
+    object_id: &'a str,
+    object_type: ObjectType,
+    location: Option<&'a str>,
+    metadata: Option<&'a str>,
+    base_objects: Option<&'a [String]>,
+}
+
+#[derive(Default)]
+struct ManifestIndexAccumulator {
+    object_ids: BTreeMap<Arc<str>, u64>,
+    object_types: BTreeMap<&'static str, RoaringBitmap>,
+    base_objects_values: Vec<Option<Vec<String>>>,
+    base_objects_row_ids: Vec<u64>,
+    row_count: u64,
+}
+
+impl ManifestIndexAccumulator {
+    fn next_row_id(&self) -> Result<u64> {
+        if self.row_count >= u64::from(u32::MAX) {
+            return Err(NamespaceError::Internal {
+                message: format!(
+                    "Manifest rewrite exceeded maximum single-fragment row count: {}",
+                    self.row_count
+                ),
+            }
+            .into());
+        }
+        Ok(self.row_count)
+    }
+
+    fn push(&mut self, row: &ManifestOutputRow<'_>) -> Result<u64> {
+        let row_id = self.next_row_id()?;
+        if self
+            .object_ids
+            .insert(Arc::<str>::from(row.object_id), row_id)
+            .is_some()
+        {
+            return Err(NamespaceError::Internal {
+                message: format!("Manifest contains duplicate object_id '{}'", row.object_id),
+            }
+            .into());
+        }
+        self.object_types
+            .entry(row.object_type.as_str())
+            .or_default()
+            .insert(row_id as u32);
+        self.base_objects_values
+            .push(row.base_objects.map(|objects| objects.to_vec()));
+        self.base_objects_row_ids.push(row_id);
+        self.row_count += 1;
+        Ok(row_id)
+    }
+}
+
+struct ManifestBatchBuilder {
+    object_ids: Vec<String>,
+    object_types: Vec<&'static str>,
+    locations: Vec<Option<String>>,
+    metadatas: Vec<Option<String>>,
+    base_objects: Vec<Option<Vec<String>>>,
+}
+
+impl ManifestBatchBuilder {
+    fn new() -> Self {
+        Self {
+            object_ids: Vec::new(),
+            object_types: Vec::new(),
+            locations: Vec::new(),
+            metadatas: Vec::new(),
+            base_objects: Vec::new(),
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.object_ids.is_empty()
+    }
+
+    fn append(
+        &mut self,
+        index_data: &mut ManifestIndexAccumulator,
+        row: ManifestOutputRow<'_>,
+    ) -> Result<()> {
+        index_data.push(&row)?;
+        self.object_ids.push(row.object_id.to_string());
+        self.object_types.push(row.object_type.as_str());
+        self.locations.push(row.location.map(ToString::to_string));
+        self.metadatas.push(row.metadata.map(ToString::to_string));
+        self.base_objects
+            .push(row.base_objects.map(|objects| objects.to_vec()));
+        Ok(())
+    }
+
+    fn finish(self) -> Result<RecordBatch> {
+        let base_objects_array = ManifestNamespace::base_objects_array(&self.base_objects);
+        RecordBatch::try_new(
+            ManifestNamespace::manifest_schema(),
+            vec![
+                Arc::new(StringArray::from(self.object_ids)),
+                Arc::new(StringArray::from(self.object_types)),
+                Arc::new(StringArray::from(self.locations)),
+                Arc::new(StringArray::from(self.metadatas)),
+                Arc::new(base_objects_array),
+            ],
+        )
+        .map_err(|e| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!("Failed to create manifest snapshot batch: {:?}", e),
+            })
+        })
+    }
+}
+
+/// How to resolve a storage commit conflict (or an ambiguous commit error that did not land)
+/// against the latest catalog state, re-staging the full rewrite only when still required.
+enum ConflictResolution<O> {
+    /// Re-read the latest manifest and re-apply the mutation (upserts, version-range
+    /// deletes). The staged data/index files are discarded and a new rewrite is attempted.
+    Retry,
+    /// The mutation creates these object ids with fail-on-conflict semantics. If any of
+    /// them now exists in the latest manifest, the create lost the race and must fail with
+    /// a concurrent-modification error; otherwise retry the rewrite.
+    FailIfExists(Vec<String>),
+    /// The mutation deletes `object_id`. If it is already absent from the latest manifest,
+    /// the delete has effectively happened, so return `output` as success; otherwise retry.
+    SucceedIfAbsent { object_id: String, output: O },
+}
+
+trait ManifestStreamMutation: Send {
+    type Output: Clone + Send + 'static;
+
+    fn process_existing_row(
+        &mut self,
+        row: ManifestRowValue,
+        output: &mut ManifestBatchBuilder,
+        index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()>;
+
+    fn append_rows(
+        &mut self,
+        output: &mut ManifestBatchBuilder,
+        index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()>;
+
+    fn finish(&self) -> CopyOnWriteMutation<Self::Output>;
+
+    /// Declares how a storage commit conflict should be resolved against the latest
+    /// committed catalog state. Defaults to re-reading and re-applying.
+    fn conflict_resolution(&self) -> ConflictResolution<Self::Output> {
+        ConflictResolution::Retry
+    }
+}
+
+struct ManifestRewriteShared<M: ManifestStreamMutation> {
+    mutation: M,
+    index_data: Option<ManifestIndexAccumulator>,
+    result: Option<CopyOnWriteMutation<M::Output>>,
+    error: Option<LanceError>,
+}
+
+impl<M: ManifestStreamMutation> ManifestRewriteShared<M> {
+    fn new(mutation: M) -> Self {
+        Self {
+            mutation,
+            index_data: Some(ManifestIndexAccumulator::default()),
+            result: None,
+            error: None,
+        }
+    }
+}
+
+struct UpsertManifestMutation {
+    entries: Vec<ManifestEntry>,
+    base_objects: Vec<Option<Vec<String>>>,
+    entry_positions: HashMap<String, usize>,
+    matched: Vec<bool>,
+    when_matched: WhenMatched,
+}
+
+impl UpsertManifestMutation {
+    fn new(
+        entries: Vec<ManifestEntry>,
+        base_objects: Option<Vec<String>>,
+        when_matched: WhenMatched,
+    ) -> Self {
+        let entry_positions = entries
+            .iter()
+            .enumerate()
+            .map(|(index, entry)| (entry.object_id.clone(), index))
+            .collect();
+        let matched = vec![false; entries.len()];
+        let mut entry_base_objects = vec![None; entries.len()];
+        if !entry_base_objects.is_empty() {
+            entry_base_objects[0] = base_objects;
+        }
+        Self {
+            entries,
+            base_objects: entry_base_objects,
+            entry_positions,
+            matched,
+            when_matched,
+        }
+    }
+
+    fn entry_row(&self, index: usize) -> ManifestOutputRow<'_> {
+        let entry = &self.entries[index];
+        ManifestOutputRow {
+            object_id: &entry.object_id,
+            object_type: entry.object_type,
+            location: entry.location.as_deref(),
+            metadata: entry.metadata.as_deref(),
+            base_objects: self.base_objects[index].as_deref(),
+        }
+    }
+}
+
+impl ManifestStreamMutation for UpsertManifestMutation {
+    type Output = ();
+
+    fn process_existing_row(
+        &mut self,
+        row: ManifestRowValue,
+        output: &mut ManifestBatchBuilder,
+        index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()> {
+        if let Some(index) = self.entry_positions.get(&row.object_id).copied() {
+            match self.when_matched {
+                WhenMatched::Fail => {
+                    return Err(NamespaceError::ConcurrentModification {
+                        message: format!(
+                            "Object '{}' was concurrently created by another operation",
+                            row.object_id
+                        ),
+                    }
+                    .into());
+                }
+                WhenMatched::UpdateAll => {
+                    self.matched[index] = true;
+                    output.append(index_data, self.entry_row(index))?;
+                    return Ok(());
+                }
+                _ => {
+                    return Err(NamespaceError::Internal {
+                        message: format!(
+                            "Unsupported manifest rewrite matched action: {:?}",
+                            self.when_matched
+                        ),
+                    }
+                    .into());
+                }
+            }
+        }
+
+        output.append(
+            index_data,
+            ManifestOutputRow {
+                object_id: &row.object_id,
+                object_type: row.object_type,
+                location: row.location.as_deref(),
+                metadata: row.metadata.as_deref(),
+                base_objects: row.base_objects.as_deref(),
+            },
+        )
+    }
+
+    fn append_rows(
+        &mut self,
+        output: &mut ManifestBatchBuilder,
+        index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()> {
+        for index in 0..self.entries.len() {
+            if !self.matched[index] {
+                output.append(index_data, self.entry_row(index))?;
+            }
+        }
+        Ok(())
+    }
+
+    fn finish(&self) -> CopyOnWriteMutation<Self::Output> {
+        CopyOnWriteMutation::updated(())
+    }
+
+    fn conflict_resolution(&self) -> ConflictResolution<Self::Output> {
+        match self.when_matched {
+            // Fail-on-conflict create: a concurrent writer may have created one of these
+            // ids. Re-applying would still fail, so check directly instead of re-staging.
+            WhenMatched::Fail => ConflictResolution::FailIfExists(
+                self.entries.iter().map(|e| e.object_id.clone()).collect(),
+            ),
+            // Metadata upsert is last-writer-wins: re-read and re-apply.
+            _ => ConflictResolution::Retry,
+        }
+    }
+}
+
+struct DeleteObjectMutation {
+    object_id: String,
+    deleted: bool,
+}
+
+impl ManifestStreamMutation for DeleteObjectMutation {
+    type Output = ();
+
+    fn process_existing_row(
+        &mut self,
+        row: ManifestRowValue,
+        output: &mut ManifestBatchBuilder,
+        index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()> {
+        if row.object_id == self.object_id {
+            self.deleted = true;
+            return Ok(());
+        }
+
+        output.append(
+            index_data,
+            ManifestOutputRow {
+                object_id: &row.object_id,
+                object_type: row.object_type,
+                location: row.location.as_deref(),
+                metadata: row.metadata.as_deref(),
+                base_objects: row.base_objects.as_deref(),
+            },
+        )
+    }
+
+    fn append_rows(
+        &mut self,
+        _output: &mut ManifestBatchBuilder,
+        _index_data: &mut ManifestIndexAccumulator,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn finish(&self) -> CopyOnWriteMutation<Self::Output> {
+        if self.deleted {
+            CopyOnWriteMutation::updated(())
+        } else {
+            CopyOnWriteMutation::unchanged(())
+        }
+    }
+
+    fn conflict_resolution(&self) -> ConflictResolution<Self::Output> {
+        // If a concurrent writer already removed the object, the delete is satisfied.
+        ConflictResolution::SucceedIfAbsent {
+            object_id: self.object_id.clone(),
+            output: (),
+        }
+    }
+}
+
 /// Information about a namespace stored in the manifest
 #[derive(Debug, Clone)]
 pub struct NamespaceInfo {
@@ -171,13 +584,23 @@ pub struct NamespaceInfo {
 /// A wrapper around a Dataset that provides concurrent access.
 ///
 /// This can be cloned cheaply. It supports concurrent reads or exclusive writes.
-/// The manifest dataset is always kept strongly consistent by reloading on each read.
+/// The manifest dataset uses contiguous attached versions and this module never
+/// runs old-version cleanup on it, allowing reads to check only the immediate
+/// successor manifest before deciding whether a reload is needed.
 #[derive(Debug, Clone)]
 pub struct DatasetConsistencyWrapper(Arc<RwLock<Dataset>>);
 
 impl DatasetConsistencyWrapper {
     /// Create a new wrapper with the given dataset.
     pub fn new(dataset: Dataset) -> Self {
+        debug_assert!(
+            !dataset
+                .manifest()
+                .config
+                .keys()
+                .any(|key| key.starts_with("lance.auto_cleanup.")),
+            "the directory manifest dataset must not enable old-version cleanup"
+        );
         Self(Arc::new(RwLock::new(dataset)))
     }
 
@@ -185,18 +608,35 @@ impl DatasetConsistencyWrapper {
     /// Always reloads to ensure strong consistency.
     pub async fn get(&self) -> Result<DatasetReadGuard<'_>> {
         self.reload().await?;
-        Ok(DatasetReadGuard {
+        let guard = DatasetReadGuard {
             guard: self.0.read().await,
-        })
+        };
+        // Refuse manifests written with a reader feature flag this build does
+        // not understand instead of misreading them.
+        ensure_readable(guard.metadata())?;
+        Ok(guard)
+    }
+
+    /// Reload the dataset and return a read guard; currently behaves the same as `get`.
+    pub async fn get_refreshed(&self) -> Result<DatasetReadGuard<'_>> {
+        self.reload().await?;
+        let guard = DatasetReadGuard {
+            guard: self.0.read().await,
+        };
+        ensure_readable(guard.metadata())?;
+        Ok(guard)
     }
 
     /// Get a mutable reference to the dataset.
     /// Always reloads to ensure strong consistency.
     pub async fn get_mut(&self) -> Result<DatasetWriteGuard<'_>> {
         self.reload().await?;
-        Ok(DatasetWriteGuard {
+        let guard = DatasetWriteGuard {
             guard: self.0.write().await,
-        })
+        };
+        ensure_readable(guard.metadata())?;
+        ensure_writable(guard.metadata())?;
+        Ok(guard)
     }
 
     /// Provide a known latest version of the dataset.
@@ -221,21 +661,25 @@ impl DatasetConsistencyWrapper {
             dataset_uri,
             current_version
         );
-        let latest_version = read_guard.latest_version_id().await.map_err(|e| {
+        // The directory manifest table uses contiguous attached versions and
+        // does not run old-version cleanup, so the immediate successor probe is
+        // enough to detect changes without resolving or loading the latest
+        // manifest on every namespace read.
+        let has_successor_version = read_guard.has_successor_version().await.map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to get latest version: {:?}", e),
+                message: format!("Failed to check dataset staleness: {:?}", e),
             })
         })?;
         log::debug!(
-            "Reload got latest_version={} for uri={}, current_version={}",
-            latest_version,
+            "Reload checked successor_version_exists={} for uri={}, current_version={}",
+            has_successor_version,
             dataset_uri,
             current_version
         );
         drop(read_guard);
 
         // If already up-to-date, return early
-        if latest_version == current_version {
+        if !has_successor_version {
             log::debug!("Already up-to-date for uri={}", dataset_uri);
             return Ok(());
         }
@@ -244,13 +688,13 @@ impl DatasetConsistencyWrapper {
         let mut write_guard = self.0.write().await;
 
         // Double-check after acquiring write lock (someone else might have reloaded)
-        let latest_version = write_guard.latest_version_id().await.map_err(|e| {
+        let has_successor_version = write_guard.has_successor_version().await.map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to get latest version: {:?}", e),
+                message: format!("Failed to check dataset staleness: {:?}", e),
             })
         })?;
 
-        if latest_version != write_guard.version().version {
+        if has_successor_version {
             write_guard.checkout_latest().await.map_err(|e| {
                 lance_core::Error::from(NamespaceError::Internal {
                     message: format!("Failed to checkout latest: {:?}", e),
@@ -306,8 +750,8 @@ pub struct ManifestNamespace {
     /// If true, root namespace tables use {table_name}.lance naming
     /// If false, they use namespace-prefixed names
     dir_listing_enabled: bool,
-    /// Whether to perform inline optimization (compaction and indexing) on the __manifest table
-    /// after every write. Defaults to true.
+    /// Whether copy-on-write manifest rewrites should build replacement indices.
+    /// Defaults to true.
     inline_optimization_enabled: bool,
     /// Number of retries for commit operations on the manifest table.
     /// If None, defaults to [`lance_table::io::commit::CommitConfig`] default (20).
@@ -401,15 +845,10 @@ impl ManifestNamespace {
         dir_listing_enabled: bool,
         inline_optimization_enabled: bool,
         commit_retries: Option<u32>,
-        table_version_storage_enabled: bool,
     ) -> Result<Self> {
-        let manifest_dataset = Self::ensure_manifest_table_up_to_date(
-            &root,
-            &storage_options,
-            session.clone(),
-            table_version_storage_enabled,
-        )
-        .await?;
+        let manifest_dataset =
+            Self::ensure_manifest_table_up_to_date(&root, &storage_options, session.clone())
+                .await?;
 
         Ok(Self {
             root,
@@ -473,34 +912,6 @@ impl ManifestNamespace {
         format!("table id '{}'", Self::str_object_id(table_id))
     }
 
-    /// Format a version number as a zero-padded lexicographically sortable string.
-    ///
-    /// Versions are stored as 20-digit zero-padded integers (e.g., `00000000000000000001`
-    /// for version 1) so that string-based range queries and sorting work correctly.
-    pub fn format_table_version(version: i64) -> String {
-        format!("{:020}", version)
-    }
-
-    /// Build the object_id for a table version entry.
-    ///
-    /// Format: `{table_object_id}${zero_padded_version}`
-    pub fn build_version_object_id(table_object_id: &str, version: i64) -> String {
-        format!(
-            "{}{}{}",
-            table_object_id,
-            DELIMITER,
-            Self::format_table_version(version)
-        )
-    }
-
-    /// Parse a version number from the version suffix of a table version object_id.
-    ///
-    /// The object_id is formatted as `{table_id}${zero_padded_version}`.
-    pub fn parse_version_from_object_id(object_id: &str) -> Option<i64> {
-        let (_namespace, name) = Self::parse_object_id(object_id);
-        name.parse::<i64>().ok()
-    }
-
     /// Generate a new directory name in format: `<hash>_<object_id>`
     /// The hash is used to (1) optimize object store throughput,
     /// (2) have high enough entropy in a short period of time to prevent issues like
@@ -556,168 +967,392 @@ impl ManifestNamespace {
         Ok(full_url.to_string())
     }
 
-    /// Perform inline optimization on the __manifest table.
-    ///
-    /// This method:
-    /// 1. Creates three indexes on the manifest table:
-    ///    - BTREE index on object_id for fast lookups
-    ///    - Bitmap index on object_type for filtering by type
-    ///    - LabelList index on base_objects for view dependencies
-    /// 2. Runs file compaction to merge small files
-    /// 3. Optimizes existing indices
-    ///
-    /// This is called automatically after writes when inline_optimization_enabled is true.
-    async fn run_inline_optimization(&self) -> Result<()> {
-        if !self.inline_optimization_enabled {
-            return Ok(());
-        }
-
-        // Get a mutable reference to the dataset to perform optimization
-        let mut dataset_guard = self.manifest_dataset.get_mut().await?;
-        let dataset: &mut Dataset = &mut dataset_guard;
-
-        // Step 1: Create indexes if they don't already exist
-        let indices = dataset.load_indices().await?;
-
-        // Check which indexes already exist
-        let has_object_id_index = indices.iter().any(|idx| idx.name == OBJECT_ID_INDEX_NAME);
-        let has_object_type_index = indices.iter().any(|idx| idx.name == OBJECT_TYPE_INDEX_NAME);
-        let has_base_objects_index = indices
-            .iter()
-            .any(|idx| idx.name == BASE_OBJECTS_INDEX_NAME);
-
-        // Create BTREE index on object_id
-        if !has_object_id_index {
-            log::debug!(
-                "Creating BTREE index '{}' on object_id for __manifest table",
-                OBJECT_ID_INDEX_NAME
-            );
-            let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree);
-            if let Err(e) = dataset
-                .create_index(
-                    &["object_id"],
-                    IndexType::BTree,
-                    Some(OBJECT_ID_INDEX_NAME.to_string()),
-                    &params,
-                    true,
-                )
-                .await
-            {
-                log::warn!(
-                    "Failed to create BTREE index on object_id for __manifest table: {:?}. Query performance may be impacted.",
-                    e
-                );
-            } else {
-                log::info!(
-                    "Created BTREE index '{}' on object_id for __manifest table",
-                    OBJECT_ID_INDEX_NAME
-                );
+    /// Builds a list-of-strings array from optional string lists.
+    ///
+    /// `None` entries become null lists; the child field is nullable Utf8 named `child_name`.
+    fn string_list_array(values: &[Option<Vec<String>>], child_name: &str) -> ListArray {
+        let child_field = Arc::new(Field::new(child_name, DataType::Utf8, true));
+        let mut list_builder = ListBuilder::new(StringBuilder::new()).with_field(child_field);
+        for value in values {
+            if let Some(objects) = value {
+                // Push every string into the child builder, then close the list slot.
+                for object in objects {
+                    list_builder.values().append_value(object);
+                }
+                list_builder.append(true);
+            } else {
+                // Missing list: record a null entry.
+                list_builder.append_null();
             }
         }
+        list_builder.finish()
+    }
 
-        // Create Bitmap index on object_type
-        if !has_object_type_index {
-            log::debug!(
-                "Creating Bitmap index '{}' on object_type for __manifest table",
-                OBJECT_TYPE_INDEX_NAME
-            );
-            let params = ScalarIndexParams::default();
-            if let Err(e) = dataset
-                .create_index(
-                    &["object_type"],
-                    IndexType::Bitmap,
-                    Some(OBJECT_TYPE_INDEX_NAME.to_string()),
-                    &params,
-                    true,
-                )
-                .await
-            {
-                log::warn!(
-                    "Failed to create Bitmap index on object_type for __manifest table: {:?}. Query performance may be impacted.",
-                    e
-                );
-            } else {
-                log::info!(
-                    "Created Bitmap index '{}' on object_type for __manifest table",
-                    OBJECT_TYPE_INDEX_NAME
-                );
-            }
-        }
+    fn base_objects_array(values: &[Option<Vec<String>>]) -> ListArray {
+        Self::string_list_array(values, "object_id") // "object_id" names the list's child field
+    }
 
-        // Create LabelList index on base_objects
-        if !has_base_objects_index {
-            log::debug!(
-                "Creating LabelList index '{}' on base_objects for __manifest table",
-                BASE_OBJECTS_INDEX_NAME
-            );
-            let params = ScalarIndexParams::default();
-            if let Err(e) = dataset
-                .create_index(
-                    &["base_objects"],
-                    IndexType::LabelList,
-                    Some(BASE_OBJECTS_INDEX_NAME.to_string()),
-                    &params,
-                    true,
-                )
-                .await
-            {
-                log::warn!(
-                    "Failed to create LabelList index on base_objects for __manifest table: {:?}. Query performance may be impacted.",
-                    e
-                );
-            } else {
-                log::info!(
-                    "Created LabelList index '{}' on base_objects for __manifest table",
-                    BASE_OBJECTS_INDEX_NAME
-                );
-            }
-        }
+    /// Builds the two-column schema: `value_field`, then a non-nullable UInt64 `ROW_ID`.
+    fn value_row_id_schema(value_field: Field) -> SchemaRef {
+        let row_id_field = Field::new(ROW_ID, DataType::UInt64, false);
+        let columns = vec![value_field, row_id_field];
+        Arc::new(ArrowSchema::new(columns))
+    }
 
-        let should_compact_and_optimize =
-            dataset.count_fragments() >= MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD;
+    /// Assembles a record batch pairing string values with their row ids.
+    ///
+    /// Returns an error if the two columns cannot be combined under `schema`.
+    fn string_row_id_batch(
+        schema: SchemaRef,
+        values: Vec<String>,
+        row_ids: Vec<u64>,
+    ) -> Result<RecordBatch> {
+        let value_column = StringArray::from(values);
+        let row_id_column = UInt64Array::from(row_ids);
+        let batch =
+            RecordBatch::try_new(schema, vec![Arc::new(value_column), Arc::new(row_id_column)]);
+        batch.map_err(Into::into)
+    }
 
-        if !should_compact_and_optimize {
-            return Ok(());
-        }
+    /// Assembles a record batch pairing optional string lists with their row ids.
+    ///
+    /// The list column is encoded with a child field named "item".
+    fn list_row_id_batch(
+        schema: SchemaRef,
+        values: Vec<Option<Vec<String>>>,
+        row_ids: Vec<u64>,
+    ) -> Result<RecordBatch> {
+        let value_column = Self::string_list_array(&values, "item");
+        let row_id_column = UInt64Array::from(row_ids);
+        let batch =
+            RecordBatch::try_new(schema, vec![Arc::new(value_column), Arc::new(row_id_column)]);
+        batch.map_err(Into::into)
+    }
 
-        // Step 2: Run file compaction
-        log::debug!("Running file compaction on __manifest table");
-        match compact_files(dataset, CompactionOptions::default(), None).await {
-            Ok(compaction_metrics) => {
-                if compaction_metrics.fragments_removed > 0 {
-                    log::info!(
-                        "Compacted __manifest table: removed {} fragments, added {} fragments",
-                        compaction_metrics.fragments_removed,
-                        compaction_metrics.fragments_added
-                    );
+    /// Streams `(object_id, row_id)` pairs in key order, in batches of at most
+    /// `MANIFEST_INDEX_BATCH_SIZE` rows. An empty map still yields exactly one
+    /// (empty) batch.
+    fn object_id_index_stream(object_ids: BTreeMap<Arc<str>, u64>) -> SendableRecordBatchStream {
+        let schema =
+            Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false));
+        let stream_schema = schema.clone();
+        let stream = stream::unfold(
+            (object_ids.into_iter(), false, schema),
+            |(mut iter, emitted, schema)| async move {
+                let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                for (value, row_id) in iter.by_ref().take(MANIFEST_INDEX_BATCH_SIZE) {
+                    values.push(value.to_string());
+                    row_ids.push(row_id);
                 }
-            }
-            Err(e) => {
-                log::warn!(
-                    "Failed to compact files for __manifest table: {:?}. Continuing with optimization.",
-                    e
-                );
-            }
-        }
-
-        // Step 3: Optimize indices
-        log::debug!("Optimizing indices on __manifest table");
-        match dataset.optimize_indices(&OptimizeOptions::default()).await {
-            Ok(_) => {
-                log::info!("Successfully optimized indices on __manifest table");
-            }
-            Err(e) => {
-                log::warn!(
-                    "Failed to optimize indices on __manifest table: {:?}. Continuing anyway.",
-                    e
-                );
-            }
-        }
-
-        Ok(())
+                // `emitted` tracks whether a batch was already produced, so that an
+                // empty input still results in one zero-row batch before the end.
+                if values.is_empty() && emitted {
+                    // Input drained and at least one batch already produced: finish.
+                    None
+                } else {
+                    // Either a regular batch, or the single empty batch emitted for an
+                    // empty input.
+                    let batch = Self::string_row_id_batch(schema.clone(), values, row_ids)
+                        .map_err(|err| DataFusionError::External(Box::new(err)));
+                    Some((batch, (iter, true, schema)))
+                }
+            },
+        );
+        // `fuse` makes polling after completion safe; a finished `unfold` must not be polled.
+        Box::pin(DatafusionRecordBatchStreamAdapter::new(
+            stream_schema,
+            stream.fuse(),
+        ))
     }
 
-    /// Get the manifest schema
+    /// Streams `(object_type, row_id)` pairs as record batches, one value group at a time.
+    ///
+    /// Each bitmap is expanded into its row ids (widened to `u64`), paired with the map
+    /// key. Batches hold at most `MANIFEST_INDEX_BATCH_SIZE` rows and may span several
+    /// values; an empty map still yields exactly one (empty) batch.
+    fn object_type_index_stream(
+        object_types: BTreeMap<&'static str, RoaringBitmap>,
+    ) -> SendableRecordBatchStream {
+        let schema =
+            Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false));
+        let stream_schema = schema.clone();
+        let entries = object_types
+            .into_iter()
+            .map(|(value, bitmap)| {
+                (
+                    value,
+                    Box::new(bitmap.into_iter()) as Box<dyn Iterator<Item = u32> + Send>,
+                )
+            })
+            .collect::<Vec<_>>()
+            .into_iter();
+        let stream = stream::unfold(
+            (entries, None, false, schema),
+            |(mut entries, mut current, emitted, schema)| async move {
+                let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                while values.len() < MANIFEST_INDEX_BATCH_SIZE {
+                    if current.is_none() {
+                        current = entries.next();
+                    }
+                    let Some((value, iter)) = current.as_mut() else {
+                        break;
+                    };
+                    if let Some(row_id) = iter.next() {
+                        values.push((*value).to_string());
+                        row_ids.push(u64::from(row_id));
+                    } else {
+                        // Current value exhausted: move to the next entry on the next pass.
+                        current = None;
+                    }
+                }
+
+                if values.is_empty() && emitted {
+                    None
+                } else {
+                    let batch = Self::string_row_id_batch(schema.clone(), values, row_ids)
+                        .map_err(|err| DataFusionError::External(Box::new(err)));
+                    Some((batch, (entries, current, true, schema)))
+                }
+            },
+        );
+        Box::pin(DatafusionRecordBatchStreamAdapter::new(
+            stream_schema,
+            stream.fuse(),
+        ))
+    }
+
+    /// Streams `(base_objects, row_id)` pairs as record batches of list values.
+    ///
+    /// `base_objects_values` and `base_objects_row_ids` are zipped positionally, so any
+    /// surplus entries in the longer vector are ignored. Batches hold at most
+    /// `MANIFEST_INDEX_BATCH_SIZE` rows; an empty input still yields exactly one
+    /// (empty) batch.
+    fn base_objects_index_stream(
+        base_objects_values: Vec<Option<Vec<String>>>,
+        base_objects_row_ids: Vec<u64>,
+    ) -> SendableRecordBatchStream {
+        let schema = Self::value_row_id_schema(Field::new(
+            VALUE_COLUMN_NAME,
+            DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
+            true,
+        ));
+        let stream_schema = schema.clone();
+        let stream = stream::unfold(
+            (
+                base_objects_values.into_iter().zip(base_objects_row_ids),
+                false,
+                schema,
+            ),
+            |(mut iter, emitted, schema)| async move {
+                let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE);
+                for (value, row_id) in iter.by_ref().take(MANIFEST_INDEX_BATCH_SIZE) {
+                    values.push(value);
+                    row_ids.push(row_id);
+                }
+                // `emitted` tracks whether a batch was already produced, so that an
+                // empty input still results in one zero-row batch before the end.
+                if values.is_empty() && emitted {
+                    None
+                } else {
+                    let batch = Self::list_row_id_batch(schema.clone(), values, row_ids)
+                        .map_err(|err| DataFusionError::External(Box::new(err)));
+                    Some((batch, (iter, true, schema)))
+                }
+            },
+        );
+        // `fuse` makes polling after completion safe; a finished `unfold` must not be polled.
+        Box::pin(DatafusionRecordBatchStreamAdapter::new(
+            stream_schema,
+            stream.fuse(),
+        ))
+    }
+
+    /// Trains one manifest index into a new index store identified by `index_uuid`.
+    ///
+    /// The plugin is looked up by `input.params.index_type`; absent plugin parameters
+    /// default to `"{}"`. The returned value carries the index/column names from `input`
+    /// together with the plugin's created-index description.
+    async fn train_manifest_index(
+        dataset: &Dataset,
+        registry: Arc<IndexPluginRegistry>,
+        input: ManifestIndexBuildInput,
+        index_uuid: Uuid,
+    ) -> Result<ManifestTrainedIndex> {
+        let ManifestIndexBuildInput { index_name, column_name, params, field, stream } = input;
+        let index_store = LanceIndexStore::from_dataset_for_new(dataset, &index_uuid)?;
+        let plugin = registry.get_plugin_by_name(&params.index_type)?;
+        let plugin_params = params.params.as_deref().unwrap_or("{}");
+        let training_request = plugin.new_training_request(plugin_params, &field)?;
+        let created_index = plugin
+            .train_index(stream, &index_store, training_request, None, noop_progress())
+            .await?;
+        Ok(ManifestTrainedIndex {
+            index_name,
+            column_name,
+            uuid: index_uuid,
+            created_index,
+        })
+    }
+
+    fn manifest_index_metadata(
+        lance_schema: &lance_core::datatypes::Schema,
+        fragment_bitmap: &RoaringBitmap,
+        dataset_version: u64,
+        trained_index: ManifestTrainedIndex,
+    ) -> Result<IndexMetadata> { // the only fallible step is the `field_id` lookup below
+        Ok(IndexMetadata {
+            uuid: trained_index.uuid,
+            fields: vec![lance_schema.field_id(trained_index.column_name)?], // looked up by name
+            name: trained_index.index_name.to_string(),
+            dataset_version,
+            fragment_bitmap: Some(fragment_bitmap.clone()),
+            index_details: Some(Arc::new(trained_index.created_index.index_details)),
+            index_version: trained_index.created_index.index_version as i32, // unchecked cast
+            created_at: None,
+            base_id: None,
+            files: Some(trained_index.created_index.files),
+        })
+    }
+
+    /// Collects every fragment id of `manifest` into a bitmap; fails if an id exceeds `u32`.
+    fn manifest_fragment_bitmap(manifest: &Manifest) -> Result<RoaringBitmap> {
+        let ids = manifest.fragments.iter().map(|fragment| {
+            u32::try_from(fragment.id).map_err(|_| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: format!("Manifest fragment id {} exceeds u32", fragment.id),
+                })
+            })
+        });
+        // Collecting into `Result` stops at the first id that does not fit.
+        ids.collect()
+    }
+
+    /// Builds a manifest from `previous`, `schema` and a copy of `fragments`.
+    ///
+    /// Fragments whose id is 0 are numbered sequentially starting at 0, in input order;
+    /// all other ids are kept as given. The fragment list is then sorted by id.
+    fn manifest_from_overwrite_transaction(
+        previous: &Manifest,
+        schema: lance_core::datatypes::Schema,
+        fragments: &[Fragment],
+    ) -> Manifest {
+        let mut next_fragment_id = 0;
+        let mut assigned = fragments.to_vec();
+        // Only fragments still carrying id 0 receive a freshly assigned id.
+        for fragment in assigned.iter_mut().filter(|fragment| fragment.id == 0) {
+            fragment.id = next_fragment_id;
+            next_fragment_id += 1;
+        }
+        // Stable sort: fragments with equal ids keep their input order.
+        assigned.sort_by_key(|fragment| fragment.id);
+        Manifest::new_from_previous(previous, schema, Arc::new(assigned))
+    }
+
+    /// Trains the three manifest indices concurrently and returns their metadata.
+    ///
+    /// The result is ordered object_id (BTree), object_type (Bitmap), base_objects
+    /// (LabelList). Every index is recorded against all fragments of `manifest` and
+    /// against `manifest.version`.
+    async fn build_manifest_indices(
+        dataset: &Dataset,
+        manifest: &Manifest,
+        index_data: ManifestIndexAccumulator,
+        index_uuids: [Uuid; 3],
+    ) -> Result<Vec<IndexMetadata>> {
+        let fragment_bitmap = Self::manifest_fragment_bitmap(manifest)?;
+        let schema = &manifest.schema;
+        let dataset_version = manifest.version;
+        let [object_id_uuid, object_type_uuid, base_objects_uuid] = index_uuids;
+        let registry = IndexPluginRegistry::with_default_plugins();
+        let base_values = index_data.base_objects_values;
+        let base_row_ids = index_data.base_objects_row_ids;
+
+        let object_id_index_fut = Self::build_manifest_index(
+            dataset,
+            registry.clone(),
+            schema,
+            ManifestIndexBuildInput {
+                index_name: OBJECT_ID_INDEX_NAME,
+                column_name: "object_id",
+                params: ScalarIndexParams::for_builtin(BuiltinIndexType::BTree),
+                field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false),
+                stream: Self::object_id_index_stream(index_data.object_ids),
+            },
+            &fragment_bitmap,
+            dataset_version,
+            object_id_uuid,
+        );
+        let object_type_index_fut = Self::build_manifest_index(
+            dataset,
+            registry.clone(),
+            schema,
+            ManifestIndexBuildInput {
+                index_name: OBJECT_TYPE_INDEX_NAME,
+                column_name: "object_type",
+                params: ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap),
+                field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false),
+                stream: Self::object_type_index_stream(index_data.object_types),
+            },
+            &fragment_bitmap,
+            dataset_version,
+            object_type_uuid,
+        );
+        let base_objects_index_fut = Self::build_manifest_index(
+            dataset,
+            registry,
+            schema,
+            ManifestIndexBuildInput {
+                index_name: BASE_OBJECTS_INDEX_NAME,
+                column_name: "base_objects",
+                params: ScalarIndexParams::for_builtin(BuiltinIndexType::LabelList),
+                field: Field::new(
+                    VALUE_COLUMN_NAME,
+                    DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
+                    true,
+                ),
+                stream: Self::base_objects_index_stream(base_values, base_row_ids),
+            },
+            &fragment_bitmap,
+            dataset_version,
+            base_objects_uuid,
+        );
+
+        // `join!` drives all three builds to completion, even if one of them fails.
+        let (object_id_index, object_type_index, base_objects_index) = futures::join!(
+            object_id_index_fut,
+            object_type_index_fut,
+            base_objects_index_fut
+        );
+
+        // Collecting into `Result` returns the first error in the order listed here.
+        [object_id_index, object_type_index, base_objects_index]
+            .into_iter()
+            .collect()
+    }
+
+    /// Trains a single manifest index and wraps the outcome in its `IndexMetadata`.
+    ///
+    /// Training is delegated to `train_manifest_index`; the metadata is then assembled by
+    /// `manifest_index_metadata` from `lance_schema`, `fragment_bitmap` and
+    /// `dataset_version`. An error from either step is returned unchanged.
+    async fn build_manifest_index(
+        dataset: &Dataset,
+        registry: Arc<IndexPluginRegistry>,
+        lance_schema: &lance_core::datatypes::Schema,
+        input: ManifestIndexBuildInput,
+        fragment_bitmap: &RoaringBitmap,
+        dataset_version: u64,
+        index_uuid: Uuid,
+    ) -> Result<IndexMetadata> {
+        let trained_index =
+            Self::train_manifest_index(dataset, registry, input, index_uuid).await?;
+        Self::manifest_index_metadata(lance_schema, fragment_bitmap, dataset_version, trained_index)
+    }
+
+    /// Get the manifest schema
     fn manifest_schema() -> Arc<ArrowSchema> {
         Arc::new(ArrowSchema::new(vec![
             // Set unenforced primary key on object_id for bloom filter conflict detection
@@ -783,6 +1418,627 @@ impl ManifestNamespace {
             })
     }
 
+    /// Returns the string at `row`, or an internal error naming `column_name` if it is null.
+    fn required_string_value<'a>(
+        array: &'a StringArray,
+        row: usize,
+        column_name: &str,
+    ) -> Result<&'a str> {
+        if array.is_valid(row) {
+            return Ok(array.value(row));
+        }
+        // Null in a required column: report it with the column name and row.
+        let message = format!("Manifest column '{}' has null at row {}", column_name, row);
+        Err(NamespaceError::Internal { message }.into())
+    }
+
+    fn optional_string_value(array: &StringArray, row: usize) -> Option<String> {
+        array.is_valid(row).then(|| array.value(row).to_owned())
+    }
+
+    /// Decodes the `base_objects` list column of `batch` into owned values, row by row.
+    ///
+    /// A missing column yields `None` for every row, and a null list yields `None` for
+    /// that row. A non-list column, non-string list values, or a null list item is
+    /// reported as an internal error.
+    fn base_objects_column_values(batch: &RecordBatch) -> Result<Vec<Option<Vec<String>>>> {
+        let Some(column) = batch.column_by_name("base_objects") else {
+            return Ok(vec![None; batch.num_rows()]);
+        };
+        let array = column.as_any().downcast_ref::<ListArray>().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!(
+                    "Column 'base_objects' is not a list array: {:?}",
+                    column.data_type()
+                ),
+            })
+        })?;
+
+        let decode_row = |row: usize| -> Result<Option<Vec<String>>> {
+            if array.is_null(row) {
+                return Ok(None);
+            }
+            let items = array.value(row);
+            let items = items.as_any().downcast_ref::<StringArray>().ok_or_else(|| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: "Column 'base_objects' values are not strings".to_string(),
+                })
+            })?;
+            let mut objects = Vec::with_capacity(items.len());
+            for (value_index, item) in items.iter().enumerate() {
+                let Some(item) = item else {
+                    return Err(NamespaceError::Internal {
+                        message: format!(
+                            "Manifest column 'base_objects' has null item at row {} item {}",
+                            row, value_index
+                        ),
+                    }
+                    .into());
+                };
+                objects.push(item.to_string());
+            }
+            Ok(Some(objects))
+        };
+        (0..batch.num_rows()).map(decode_row).collect()
+    }
+
+    /// Opens a scan of `dataset` projected to the five manifest columns listed below, with
+    /// stream errors wrapped in `DataFusionError::External`.
+    async fn manifest_projected_stream(dataset: &Dataset) -> Result<SendableRecordBatchStream> {
+        const COLUMNS: [&str; 5] = [
+            "object_id",
+            "object_type",
+            "location",
+            "metadata",
+            "base_objects",
+        ];
+        let mut scanner = dataset.scan();
+        if let Err(e) = scanner.project(&COLUMNS) {
+            let message = format!("Failed to project manifest columns: {:?}", e);
+            return Err(NamespaceError::Internal { message }.into());
+        }
+        let scan = match scanner.try_into_stream().await {
+            Ok(scan) => scan,
+            Err(e) => {
+                let message = format!("Failed to create manifest stream: {:?}", e);
+                return Err(NamespaceError::Internal { message }.into());
+            }
+        };
+        let schema = scan.schema();
+        let adapted = scan.map_err(|err| DataFusionError::External(Box::new(err)));
+        let adapter = DatafusionRecordBatchStreamAdapter::new(schema, adapted.fuse());
+        Ok(Box::pin(adapter))
+    }
+
+    fn manifest_rewrite_commit_retries(&self) -> u32 {
+        let retries = self.commit_retries;
+        retries.unwrap_or(DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES)
+    }
+
+    /// Locks the shared rewrite state, turning a poisoned mutex into an internal error.
+    fn lock_manifest_rewrite_shared<M: ManifestStreamMutation>(
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> Result<StdMutexGuard<'_, ManifestRewriteShared<M>>> {
+        shared.lock().map_err(|_poisoned| {
+            let message = "Manifest rewrite state mutex was poisoned".to_string();
+            lance_core::Error::from(NamespaceError::Internal { message })
+        })
+    }
+
+    /// Stores `err` in the shared rewrite state, replacing any error stored before.
+    ///
+    /// Deliberately tolerates a poisoned mutex: the error is written through the
+    /// recovered guard instead of being dropped.
+    fn set_manifest_rewrite_error<M: ManifestStreamMutation>(
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+        err: LanceError,
+    ) {
+        // `into_inner` hands back the guard even when another holder panicked.
+        let mut guard = shared
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
+        guard.error = Some(err);
+    }
+
+    /// Removes and returns the error stored in the shared rewrite state, if any.
+    fn take_manifest_rewrite_error<M: ManifestStreamMutation>(
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> Result<Option<LanceError>> {
+        Self::lock_manifest_rewrite_shared(shared).map(|mut state| state.error.take())
+    }
+
+    /// Feeds every row of `batch` through the shared mutation and returns the rows it
+    /// emitted, or `Ok(None)` if it emitted none. The index accumulator is taken out of
+    /// the shared state while rows are processed and put back only on success.
+    fn process_manifest_rewrite_batch<M: ManifestStreamMutation>(
+        batch: RecordBatch,
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> Result<Option<RecordBatch>> {
+        let object_ids = Self::get_string_column(&batch, "object_id")?;
+        let object_types = Self::get_string_column(&batch, "object_type")?;
+        let locations = Self::get_string_column(&batch, "location")?;
+        let metadatas = Self::get_string_column(&batch, "metadata")?;
+        let base_objects = Self::base_objects_column_values(&batch)?;
+        let mut output = ManifestBatchBuilder::new();
+        let mut guard = Self::lock_manifest_rewrite_shared(shared)?;
+        let Some(mut index_data) = guard.index_data.take() else {
+            let message = "Manifest rewrite index state is unavailable".to_string();
+            return Err(NamespaceError::Internal { message }.into());
+        };
+        for (row, base_objects) in base_objects.into_iter().enumerate() {
+            let object_id = Self::required_string_value(object_ids, row, "object_id")?;
+            let object_type = Self::required_string_value(object_types, row, "object_type")?;
+            let row_value = ManifestRowValue {
+                object_id: object_id.to_string(),
+                object_type: ObjectType::parse(object_type)?,
+                location: Self::optional_string_value(locations, row),
+                metadata: Self::optional_string_value(metadatas, row),
+                base_objects,
+            };
+            guard
+                .mutation
+                .process_existing_row(row_value, &mut output, &mut index_data)?;
+        }
+        guard.index_data = Some(index_data);
+        if output.is_empty() {
+            return Ok(None);
+        }
+        Ok(Some(output.finish()?))
+    }
+
+    /// End-of-stream step: appends the mutation's rows and stores its result in `shared`.
+    /// Still returns a batch, even with nothing appended, when the accumulator's `row_count` is 0.
+    fn finish_manifest_rewrite_stream<M: ManifestStreamMutation>(
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> Result<Option<RecordBatch>> {
+        let mut output = ManifestBatchBuilder::new();
+        let mut guard = Self::lock_manifest_rewrite_shared(shared)?;
+        let Some(mut index_data) = guard.index_data.take() else {
+            let message = "Manifest rewrite index state is unavailable".to_string();
+            return Err(NamespaceError::Internal { message }.into());
+        };
+        guard.mutation.append_rows(&mut output, &mut index_data)?;
+        let result = guard.mutation.finish();
+        guard.result = Some(result);
+        let force_empty_batch = index_data.row_count == 0;
+        guard.index_data = Some(index_data);
+        if !force_empty_batch && output.is_empty() {
+            return Ok(None);
+        }
+        Ok(Some(output.finish()?))
+    }
+
+    /// Wraps `source` in the rewrite pipeline: existing rows first, appended rows last.
+    ///
+    /// Batches from `source` go through `process_manifest_rewrite_batch`; once `source`
+    /// ends, `finish_manifest_rewrite_stream` runs once. A failure of either step is
+    /// stored in `shared`, surfaced as a `DataFusionError`, and ends the stream.
+    fn manifest_rewrite_output_stream<M: ManifestStreamMutation + 'static>(
+        source: SendableRecordBatchStream,
+        shared: Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> SendableRecordBatchStream {
+        enum Phase {
+            Source,
+            Finish,
+            Done,
+        }
+
+        let schema = Self::manifest_schema();
+        let stream = stream::unfold(
+            (source, shared, Phase::Source),
+            |(mut source, shared, mut phase)| async move {
+                // The original error is stored in `shared`; the stream carries its message.
+                let external = |message: String| {
+                    DataFusionError::External(Box::new(std::io::Error::other(message)))
+                };
+                loop {
+                    match phase {
+                        Phase::Source => match source.next().await {
+                            Some(Ok(batch)) => {
+                                match Self::process_manifest_rewrite_batch(batch, &shared) {
+                                    Ok(Some(batch)) => {
+                                        return Some((Ok(batch), (source, shared, phase)));
+                                    }
+                                    // Nothing emitted for this batch: pull the next one.
+                                    Ok(None) => continue,
+                                    Err(err) => {
+                                        let message = err.to_string();
+                                        Self::set_manifest_rewrite_error(&shared, err);
+                                        let failure = Err(external(message));
+                                        return Some((failure, (source, shared, Phase::Done)));
+                                    }
+                                }
+                            }
+                            // Upstream errors are forwarded as-is, not stored in `shared`.
+                            Some(Err(err)) => {
+                                return Some((Err(err), (source, shared, Phase::Done)));
+                            }
+                            // Source drained: run the finish step on the next loop pass.
+                            None => phase = Phase::Finish,
+                        },
+                        Phase::Finish => {
+                            return match Self::finish_manifest_rewrite_stream(&shared) {
+                                Ok(Some(batch)) => Some((Ok(batch), (source, shared, Phase::Done))),
+                                Ok(None) => None,
+                                Err(err) => {
+                                    let message = err.to_string();
+                                    Self::set_manifest_rewrite_error(&shared, err);
+                                    Some((Err(external(message)), (source, shared, Phase::Done)))
+                                }
+                            };
+                        }
+                        Phase::Done => return None,
+                    }
+                }
+            },
+        );
+        Box::pin(DatafusionRecordBatchStreamAdapter::new(
+            schema,
+            stream.fuse(),
+        ))
+    }
+
+    fn take_manifest_rewrite_result<M: ManifestStreamMutation>(
+        shared: &Arc<StdMutex<ManifestRewriteShared<M>>>,
+    ) -> Result<(CopyOnWriteMutation<M::Output>, ManifestIndexAccumulator)> {
+        let mut guard = Self::lock_manifest_rewrite_shared(shared)?;
+        let result = guard.result.take().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: "Manifest rewrite stream did not finish".to_string(),
+            })
+        })?;
+        let index_data = guard.index_data.take().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: "Manifest rewrite index state is unavailable".to_string(),
+            })
+        })?;
+        Ok((result, index_data))
+    }
+
+    /// Best-effort delete (failures are only logged) of a rewrite's staged data files and index
+    /// directories. Only call this once the rewrite is known *not* to have landed (a
+    /// put-if-not-exists conflict, or an ambiguous error whose target version references none of
+    /// our data files) — otherwise it would delete files a committed manifest still references.
+    async fn cleanup_staged_manifest_files(
+        &self,
+        object_store: &ObjectStore,
+        data_files: &HashSet<String>,
+        index_uuids: &[Uuid],
+    ) {
+        let data_dir = self
+            .base_path
+            .clone()
+            .join(MANIFEST_TABLE_NAME)
+            .join(LANCE_DATA_DIR);
+        for path in data_files {
+            let data_path = data_dir.clone().join(path.as_str());
+            if let Err(err) = object_store.delete(&data_path).await {
+                log::warn!(
+                    "Failed to clean up uncommitted manifest rewrite data file '{}': {}",
+                    data_path,
+                    err
+                );
+            }
+        }
+        self.cleanup_uncommitted_manifest_index_dirs(object_store, index_uuids.iter().copied())
+            .await;
+    }
+
+    async fn cleanup_uncommitted_manifest_index_dirs(
+        &self,
+        object_store: &ObjectStore,
+        index_uuids: impl IntoIterator<Item = Uuid>,
+    ) {
+        for index_uuid in index_uuids {
+            let index_dir = self
+                .base_path
+                .clone()
+                .join(MANIFEST_TABLE_NAME)
+                .join(LANCE_INDICES_DIR)
+                .join(index_uuid.to_string());
+            if let Err(err) = object_store.remove_dir_all(index_dir.clone()).await
+                && !matches!(err, LanceError::NotFound { .. })
+            {
+                log::warn!(
+                    "Failed to clean up uncommitted manifest rewrite index directory '{}': {}",
+                    index_dir,
+                    err
+                );
+            }
+        }
+    }
+
+    /// Resolve the commit handler for the `__manifest` dataset's storage backend.
+    async fn manifest_commit_handler(&self) -> Result<Arc<dyn CommitHandler>> {
+        commit_handler_from_url(&self.root, &None)
+            .await
+            .map_err(|e| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: format!("Failed to resolve manifest commit handler: {:?}", e),
+                })
+            })
+    }
+
+    /// Directly write the rewritten `__manifest` as a new version using the storage
+    /// backend's atomic put-if-not-exists. The overwrite transaction is embedded inline
+    /// (no separate transaction file) and the commit handler writes the version hint.
+    async fn commit_manifest_overwrite(
+        &self,
+        dataset: &Dataset,
+        commit_handler: &dyn CommitHandler,
+        manifest: &mut Manifest,
+        indices: Option<Vec<IndexMetadata>>,
+        transaction: Transaction,
+    ) -> std::result::Result<(), CommitError> {
+        apply_feature_flags(manifest, false, false).map_err(CommitError::from)?;
+        let timestamp_nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|d| d.as_nanos())
+            .unwrap_or(0);
+        manifest.set_timestamp(timestamp_nanos);
+        manifest.update_max_fragment_id();
+
+        // Commit through the dataset's own object store, not `self.object_store`: for
+        // stores like `memory://` the namespace and the dataset can hold different
+        // instances, and a commit written to the wrong one is invisible to reads.
+        let object_store = dataset
+            .object_store(None)
+            .await
+            .map_err(CommitError::from)?;
+        let base_path = self.base_path.clone().join(MANIFEST_TABLE_NAME);
+        let naming_scheme = dataset.manifest_location().naming_scheme;
+        commit_handler
+            .commit(
+                manifest,
+                indices,
+                &base_path,
+                &object_store,
+                write_manifest_file_to_path,
+                naming_scheme,
+                Some((&transaction).into()),
+            )
+            .await
+            .map(|_location| ())
+    }
+
+    /// After a commit error, report whether our overwrite actually landed at `target_version`,
+    /// i.e. whether that version references any of `data_files`. A lost ack can leave the
+    /// manifest committed even though the client saw an error, and deleting the staged files
+    /// would then corrupt the catalog. A failed checkout is reported as "not landed" (`false`).
+    async fn manifest_commit_landed(
+        &self,
+        dataset: &Dataset,
+        target_version: u64,
+        data_files: &HashSet<String>,
+    ) -> bool {
+        let Ok(committed) = dataset.checkout_version(target_version).await else {
+            return false;
+        };
+        committed.manifest().fragments.iter().any(|fragment| {
+            fragment
+                .files
+                .iter()
+                .any(|file| data_files.contains(file.path.as_str()))
+        })
+    }
+
+    /// Resolve a storage commit conflict against the latest committed catalog state.
+    /// Returns `Ok(Some(output))` when the mutation's intent is already satisfied (no retry
+    /// needed), `Ok(None)` to retry the rewrite, or an error for a terminal conflict.
+    async fn resolve_manifest_conflict<O: Clone>(
+        &self,
+        resolution: &ConflictResolution<O>,
+    ) -> Result<Option<O>> {
+        match resolution {
+            ConflictResolution::Retry => Ok(None),
+            ConflictResolution::FailIfExists(object_ids) => {
+                for object_id in object_ids {
+                    if self.manifest_contains_object(object_id).await? {
+                        return Err(NamespaceError::ConcurrentModification {
+                            message: format!(
+                                "Object '{}' was concurrently created by another operation",
+                                object_id
+                            ),
+                        }
+                        .into());
+                    }
+                }
+                Ok(None)
+            }
+            ConflictResolution::SucceedIfAbsent { object_id, output } => {
+                if self.manifest_contains_object(object_id).await? {
+                    Ok(None)
+                } else {
+                    Ok(Some(output.clone()))
+                }
+            }
+        }
+    }
+
+    /// Validate that this build can write the current `__manifest` before a
+    /// mutating operation performs any side effect (e.g. writing table data), so
+    /// a refused write leaves nothing orphaned behind. The eventual
+    /// `rewrite_manifest` commit re-checks `ensure_writable` on each retry, so a
+    /// concurrent upgrade in between is still caught.
+    async fn ensure_manifest_writable(&self) -> Result<()> {
+        let dataset_guard = self.manifest_dataset.get().await?;
+        ensure_writable(dataset_guard.metadata())
+    }
+
+    async fn rewrite_manifest<M, F>(
+        &self,
+        operation: &str,
+        mut make_mutation: F,
+    ) -> Result<M::Output>
+    where
+        M: ManifestStreamMutation + 'static,
+        F: FnMut() -> M,
+    {
+        let _mutation_guard = self.manifest_mutation_lock.lock().await;
+        let max_retries = self.manifest_rewrite_commit_retries();
+        let mut retries = 0;
+        let build_indices = self.inline_optimization_enabled;
+        let commit_handler = self.manifest_commit_handler().await?;
+
+        loop {
+            let dataset_guard = self.manifest_dataset.get_refreshed().await?;
+            let dataset = Arc::new(dataset_guard.clone());
+            drop(dataset_guard);
+            // Refuse to mutate a manifest written with a writer feature flag this
+            // build does not understand.
+            ensure_writable(dataset.metadata())?;
+            // Staged files, indices, the commit, and cleanup must all use the dataset's
+            // own object store (see `commit_manifest_overwrite`).
+            let object_store = dataset.object_store(None).await?;
+
+            let source = Self::manifest_projected_stream(&dataset).await?;
+            let resolution = make_mutation().conflict_resolution();
+            let shared = Arc::new(StdMutex::new(ManifestRewriteShared::new(make_mutation())));
+            let output_stream = Self::manifest_rewrite_output_stream(source, shared.clone());
+            // Pin both limits so the overwrite never splits into multiple fragments: the
+            // replacement indices map each row to address `(0 << 32) | offset`, valid only
+            // for a single fragment with id 0. The row count is bounded below u32::MAX by
+            // `ManifestIndexAccumulator::next_row_id`.
+            let write_params = WriteParams {
+                mode: WriteMode::Overwrite,
+                session: self.session.clone(),
+                max_rows_per_file: u32::MAX as usize,
+                max_bytes_per_file: usize::MAX,
+                skip_auto_cleanup: true,
+                ..WriteParams::default()
+            };
+
+            let transaction = match InsertBuilder::new(dataset.clone())
+                .with_params(&write_params)
+                .execute_uncommitted_stream(output_stream)
+                .await
+            {
+                Ok(transaction) => transaction,
+                Err(err) => {
+                    if let Some(stream_err) = Self::take_manifest_rewrite_error(&shared)? {
+                        return Err(stream_err);
+                    }
+                    return Err(convert_lance_commit_error(&err, operation, None));
+                }
+            };
+
+            let (mutation, index_data) = Self::take_manifest_rewrite_result(&shared)?;
+
+            let Operation::Overwrite {
+                fragments, schema, ..
+            } = &transaction.operation
+            else {
+                return Err(NamespaceError::Internal {
+                    message: "Manifest rewrite transaction is not an overwrite".to_string(),
+                }
+                .into());
+            };
+            // Unique data files this attempt staged. Used to clean up orphans and to
+            // attribute an ambiguous commit error back to us.
+            let staged_data_files = fragments
+                .iter()
+                .flat_map(|fragment| fragment.files.iter())
+                .filter(|file| file.base_id.is_none())
+                .map(|file| file.path.clone())
+                .collect::<HashSet<_>>();
+
+            if !mutation.has_changes {
+                self.cleanup_staged_manifest_files(&object_store, &staged_data_files, &[])
+                    .await;
+                return Ok(mutation.result);
+            }
+
+            let mut manifest = Self::manifest_from_overwrite_transaction(
+                dataset.manifest(),
+                schema.clone(),
+                fragments,
+            );
+            let target_version = manifest.version;
+
+            let index_uuids = [Uuid::new_v4(), Uuid::new_v4(), Uuid::new_v4()];
+            let indices = if build_indices {
+                match Self::build_manifest_indices(&dataset, &manifest, index_data, index_uuids)
+                    .await
+                {
+                    Ok(indices) => Some(indices),
+                    Err(err) => {
+                        self.cleanup_staged_manifest_files(
+                            &object_store,
+                            &staged_data_files,
+                            &index_uuids,
+                        )
+                        .await;
+                        return Err(err);
+                    }
+                }
+            } else {
+                None
+            };
+            let staged_index_uuids: &[Uuid] = if build_indices { &index_uuids } else { &[] };
+
+            let commit_result = self
+                .commit_manifest_overwrite(
+                    &dataset,
+                    commit_handler.as_ref(),
+                    &mut manifest,
+                    indices,
+                    transaction,
+                )
+                .await;
+
+            match commit_result {
+                Ok(()) => {
+                    let _ = self.manifest_dataset.get_refreshed().await;
+                    return Ok(mutation.result);
+                }
+                Err(err) => {
+                    // The put may have landed even though the client saw an error (lost
+                    // ack). Verify before deleting anything so we never remove files that a
+                    // committed manifest still references.
+                    if self
+                        .manifest_commit_landed(&dataset, target_version, &staged_data_files)
+                        .await
+                    {
+                        let _ = self.manifest_dataset.get_refreshed().await;
+                        return Ok(mutation.result);
+                    }
+                    self.cleanup_staged_manifest_files(
+                        &object_store,
+                        &staged_data_files,
+                        staged_index_uuids,
+                    )
+                    .await;
+                    match err {
+                        CommitError::CommitConflict => {
+                            if let Some(output) =
+                                self.resolve_manifest_conflict(&resolution).await?
+                            {
+                                return Ok(output);
+                            }
+                            if retries >= max_retries {
+                                return Err(NamespaceError::ConcurrentModification {
+                                    message: format!(
+                                        "{}: still conflicting after {} retries",
+                                        operation, max_retries
+                                    ),
+                                }
+                                .into());
+                            }
+                            retries += 1;
+                            tokio::time::sleep(std::time::Duration::from_millis(
+                                10 * u64::from(retries),
+                            ))
+                            .await;
+                        }
+                        CommitError::OtherError(err) => {
+                            return Err(convert_lance_commit_error(&err, operation, None));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
     /// Check if the manifest contains an object with the given ID
     async fn manifest_contains_object(&self, object_id: &str) -> Result<bool> {
         let escaped_id = object_id.replace('\'', "''");
@@ -999,7 +2255,6 @@ impl ManifestNamespace {
     /// Insert one or more entries into the manifest table with metadata and base_objects.
     ///
     /// This is the unified entry point for both single and batch inserts.
-    /// Uses a single MergeInsert operation to insert all entries at once.
     /// If any entry already exists (matching object_id), the entire batch fails.
     pub async fn insert_into_manifest_with_metadata(
         &self,
@@ -1029,181 +2284,55 @@ impl ManifestNamespace {
             return Ok(());
         }
 
-        let schema = Self::manifest_schema();
-
-        let mut object_ids = Vec::with_capacity(entries.len());
-        let mut object_types = Vec::with_capacity(entries.len());
-        let mut locations: Vec<Option<String>> = Vec::with_capacity(entries.len());
-        let mut metadatas: Vec<Option<String>> = Vec::with_capacity(entries.len());
+        self.rewrite_manifest("Failed to overwrite manifest", || {
+            UpsertManifestMutation::new(entries.clone(), base_objects.clone(), when_matched.clone())
+        })
+        .await
+    }
 
-        let string_builder = StringBuilder::new();
-        let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new(
-            "object_id",
-            DataType::Utf8,
-            true,
-        )));
+    /// Delete an entry from the manifest table
+    pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> {
+        let object_id = object_id.to_string();
+        self.rewrite_manifest("Failed to delete from manifest", || DeleteObjectMutation {
+            object_id: object_id.clone(),
+            deleted: false,
+        })
+        .await
+    }
 
-        for (i, entry) in entries.iter().enumerate() {
-            object_ids.push(entry.object_id.as_str());
-            object_types.push(entry.object_type.as_str());
-            locations.push(entry.location.clone());
-            metadatas.push(entry.metadata.clone());
-
-            // Only the first entry gets the base_objects (for single-entry inserts
-            // with base_objects like view creation); batch entries use null.
-            if i == 0 {
-                match &base_objects {
-                    Some(objects) => {
-                        for obj in objects {
-                            list_builder.values().append_value(obj);
-                        }
-                        list_builder.append(true);
-                    }
-                    None => {
-                        list_builder.append_null();
-                    }
-                }
-            } else {
-                list_builder.append_null();
+    /// Register a table in the manifest without creating the physical table (internal helper for migration)
+    pub async fn register_table(&self, name: &str, location: String) -> Result<()> {
+        let object_id = Self::build_object_id(&[], name);
+        if self.manifest_contains_object(&object_id).await? {
+            return Err(NamespaceError::Internal {
+                message: format!("Table '{}' already exists", name),
             }
+            .into());
         }
 
-        let base_objects_array = list_builder.finish();
-
-        let location_array: Arc<dyn Array> = Arc::new(StringArray::from(
-            locations.iter().map(|l| l.as_deref()).collect::<Vec<_>>(),
-        ));
-
-        let metadata_array: Arc<dyn Array> = Arc::new(StringArray::from(
-            metadatas.iter().map(|m| m.as_deref()).collect::<Vec<_>>(),
-        ));
-
-        let batch = RecordBatch::try_new(
-            schema.clone(),
-            vec![
-                Arc::new(StringArray::from(object_ids)),
-                Arc::new(StringArray::from(object_types.to_vec())),
-                location_array,
-                metadata_array,
-                Arc::new(base_objects_array),
-            ],
-        )
-        .map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to create manifest entries: {:?}", e),
-            })
-        })?;
-
-        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
-
-        // Use MergeInsert so callers can choose fail-on-existing inserts or metadata upserts.
-        let _mutation_guard = self.manifest_mutation_lock.lock().await;
-        let dataset_guard = self.manifest_dataset.get().await?;
-        let dataset_arc = Arc::new(dataset_guard.clone());
-        drop(dataset_guard); // Drop read guard before merge insert
-
-        let mut merge_builder =
-            MergeInsertBuilder::try_new(dataset_arc, vec!["object_id".to_string()]).map_err(
-                |e| {
-                    lance_core::Error::from(NamespaceError::Internal {
-                        message: format!("Failed to create merge builder: {:?}", e),
-                    })
-                },
-            )?;
-        merge_builder.when_matched(when_matched);
-        merge_builder.when_not_matched(WhenNotMatched::InsertAll);
-        // Use conflict_retries to handle cross-process races on manifest mutations.
-        merge_builder.conflict_retries(5);
-        // TODO: after BTREE index creation on object_id, has_scalar_index=true causes
-        // MergeInsert to use V1 path which lacks bloom filters for conflict detection. This
-        // results in (Some, None) filter mismatch when rebasing against V2 operations.
-        // Setting use_index=false ensures all operations consistently use V2 path.
-        merge_builder.use_index(false);
-        if let Some(retries) = self.commit_retries {
-            merge_builder.commit_retries(retries);
-        }
-
-        let (new_dataset_arc, _merge_stats) = merge_builder
-            .try_build()
-            .map_err(|e| {
-                lance_core::Error::from(NamespaceError::Internal {
-                    message: format!("Failed to build merge: {:?}", e),
-                })
-            })?
-            .execute_reader(Box::new(reader))
+        self.insert_into_manifest(object_id, ObjectType::Table, Some(location))
             .await
-            .map_err(|e| {
-                convert_lance_commit_error(&e, "Failed to execute merge insert into manifest", None)
-            })?;
-
-        let new_dataset = Arc::try_unwrap(new_dataset_arc).unwrap_or_else(|arc| (*arc).clone());
-        self.manifest_dataset.set_latest(new_dataset).await;
-
-        // Run inline optimization after write
-        if let Err(e) = self.run_inline_optimization().await {
-            log::warn!(
-                "Unexpected failure when running inline optimization: {:?}",
-                e
-            );
-        }
-
-        Ok(())
     }
 
-    /// Delete an entry from the manifest table
-    pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> {
-        let predicate = format!("object_id = '{}'", object_id);
-
-        // Get dataset and use DeleteBuilder with configured retries
-        let _mutation_guard = self.manifest_mutation_lock.lock().await;
-        let dataset_guard = self.manifest_dataset.get().await?;
-        let dataset = Arc::new(dataset_guard.clone());
-        drop(dataset_guard); // Drop read guard before delete
-
-        let new_dataset = DeleteBuilder::new(dataset, &predicate)
-            .execute()
-            .await
-            .map_err(|e| convert_lance_commit_error(&e, "Failed to delete", None))?;
-
-        // Update the wrapper with the new dataset
-        self.manifest_dataset
-            .set_latest(
-                Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()),
-            )
-            .await;
-
-        // Run inline optimization after delete
-        if let Err(e) = self.run_inline_optimization().await {
-            log::warn!(
-                "Unexpected failure when running inline optimization: {:?}",
-                e
-            );
+    /// Validate that all levels of a namespace path exist
+    async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> {
+        for i in 1..=namespace_path.len() {
+            let partial_path = &namespace_path[..i];
+            let object_id = partial_path.join(DELIMITER);
+            if !self.manifest_contains_object(&object_id).await? {
+                return Err(NamespaceError::NamespaceNotFound {
+                    message: format!("parent namespace '{}'", object_id),
+                }
+                .into());
+            }
         }
-
         Ok(())
     }
 
-    /// Query the manifest for all versions of a table, sorted by version.
-    ///
-    /// Returns a list of (version, metadata_json_string) tuples where metadata_json_string
-    /// contains the full metadata JSON stored in the manifest (manifest_path, manifest_size,
-    /// e_tag, naming_scheme).
-    ///
-    /// **Known limitation**: All matching rows are loaded into memory, sorted in Rust,
-    /// and then truncated. For tables with a very large number of versions this may be
-    /// expensive. Pushing sort/limit into the scan is not yet supported by Lance.
-    pub async fn query_table_versions(
-        &self,
-        object_id: &str,
-        descending: bool,
-        limit: Option<i32>,
-    ) -> Result<Vec<(i64, String)>> {
+    /// Query the manifest for a namespace with the given object ID
+    async fn query_manifest_for_namespace(&self, object_id: &str) -> Result<Option<NamespaceInfo>> {
         let escaped_id = object_id.replace('\'', "''");
-        // table_version object_ids are formatted as "{object_id}${zero_padded_version}"
-        let filter = format!(
-            "object_type = 'table_version' AND starts_with(object_id, '{}{}')",
-            escaped_id, DELIMITER
-        );
+        let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id);
         let mut scanner = self.manifest_scanner().await?;
         scanner.filter(&filter).map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
@@ -1217,200 +2346,285 @@ impl ManifestNamespace {
         })?;
         let batches = Self::execute_scanner(scanner).await?;
 
-        let mut versions: Vec<(i64, String)> = Vec::new();
+        let mut found_result: Option<NamespaceInfo> = None;
+        let mut total_rows = 0;
+
         for batch in batches {
             if batch.num_rows() == 0 {
                 continue;
             }
-            let object_id_array = Self::get_string_column(&batch, "object_id")?;
-            let metadata_array = Self::get_string_column(&batch, "metadata")?;
-            for i in 0..batch.num_rows() {
-                let oid = object_id_array.value(i);
-                // Parse version from object_id
-                if let Some(version) = Self::parse_version_from_object_id(oid) {
-                    let metadata_str = metadata_array.value(i).to_string();
-                    versions.push((version, metadata_str));
+
+            total_rows += batch.num_rows();
+            if total_rows > 1 {
+                return Err(NamespaceError::Internal {
+                    message: format!(
+                        "Expected exactly 1 namespace with id '{}', found {}",
+                        object_id, total_rows
+                    ),
                 }
+                .into());
             }
-        }
 
-        if descending {
-            versions.sort_by(|a, b| b.0.cmp(&a.0));
-        } else {
-            versions.sort_by(|a, b| a.0.cmp(&b.0));
-        }
+            let object_id_array = Self::get_string_column(&batch, "object_id")?;
+            let metadata_array = Self::get_string_column(&batch, "metadata")?;
+
+            let object_id_str = object_id_array.value(0);
+            let metadata = if !metadata_array.is_null(0) {
+                let metadata_str = metadata_array.value(0);
+                match serde_json::from_str::<HashMap<String, String>>(metadata_str) {
+                    Ok(map) => Some(map),
+                    Err(e) => {
+                        return Err(NamespaceError::Internal {
+                            message: format!(
+                                "Failed to deserialize metadata for namespace '{}': {}",
+                                object_id, e
+                            ),
+                        }
+                        .into());
+                    }
+                }
+            } else {
+                None
+            };
 
-        if let Some(limit) = limit {
-            versions.truncate(limit as usize);
+            let (namespace, name) = Self::parse_object_id(object_id_str);
+            found_result = Some(NamespaceInfo {
+                namespace,
+                name,
+                metadata,
+            });
         }
 
-        Ok(versions)
+        Ok(found_result)
     }
 
-    /// Query the manifest for a specific version of a table.
-    ///
-    /// Returns the full metadata JSON string if found, which contains
-    /// manifest_path, manifest_size, e_tag, and naming_scheme.
+    /// Create or load the manifest dataset, ensuring it has the latest schema setup.
     ///
-    pub async fn query_table_version(
-        &self,
-        object_id: &str,
-        version: i64,
-    ) -> Result<Option<String>> {
-        let version_object_id = Self::build_version_object_id(object_id, version);
-        self.query_table_version_by_object_id(&version_object_id)
-            .await
-    }
+    /// This function will:
+    /// 1. Try to load an existing manifest table
+    /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata)
+    /// 3. If it doesn't exist, create a new manifest table with the current schema
+    async fn ensure_manifest_table_up_to_date(
+        root: &str,
+        storage_options: &Option<HashMap<String, String>>,
+        session: Option<Arc<Session>>,
+    ) -> Result<DatasetConsistencyWrapper> {
+        let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME);
+        log::debug!("Attempting to load manifest from {}", manifest_path);
+        let store_options = ObjectStoreParams {
+            storage_options_accessor: storage_options.as_ref().map(|opts| {
+                Arc::new(
+                    lance_io::object_store::StorageOptionsAccessor::with_static_options(
+                        opts.clone(),
+                    ),
+                )
+            }),
+            ..Default::default()
+        };
+        let read_params = ReadParams {
+            session: session.clone(),
+            store_options: Some(store_options.clone()),
+            ..Default::default()
+        };
+        let dataset_result = DatasetBuilder::from_uri(&manifest_path)
+            .with_read_params(read_params)
+            .load()
+            .await;
+        if let Ok(mut dataset) = dataset_result {
+            // Reject a manifest written with a reader feature flag this build
+            // does not understand before touching it.
+            ensure_readable(dataset.metadata())?;
 
-    /// Query a specific table version by its exact object_id.
-    async fn query_table_version_by_object_id(
-        &self,
-        version_object_id: &str,
-    ) -> Result<Option<String>> {
-        let escaped_id = version_object_id.replace('\'', "''");
-        let filter = format!(
-            "object_id = '{}' AND object_type = 'table_version'",
-            escaped_id
-        );
-        let mut scanner = self.manifest_scanner().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project(&["metadata"]).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
-            })
-        })?;
-        let batches = Self::execute_scanner(scanner).await?;
+            // Check if the object_id field has primary key metadata, migrate if not
+            let needs_pk_migration = dataset
+                .schema()
+                .field("object_id")
+                .map(|f| {
+                    !f.metadata
+                        .contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION)
+                })
+                .unwrap_or(false);
 
-        for batch in batches {
-            if batch.num_rows() == 0 {
-                continue;
+            if needs_pk_migration {
+                // This legacy migration writes to the manifest, so confirm this
+                // build is allowed to write the current format first.
+                ensure_writable(dataset.metadata())?;
+                log::info!("Migrating __manifest table to add primary key metadata on object_id");
+                dataset
+                    .update_field_metadata()
+                    .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")])
+                    .map_err(|e| {
+                        lance_core::Error::from(NamespaceError::Internal {
+                            message: format!(
+                                "Failed to find object_id field for migration: {:?}",
+                                e
+                            ),
+                        })
+                    })?
+                    .await
+                    .map_err(|e| {
+                        lance_core::Error::from(NamespaceError::Internal {
+                            message: format!("Failed to migrate primary key metadata: {:?}", e),
+                        })
+                    })?;
             }
-            let metadata_array = Self::get_string_column(&batch, "metadata")?;
-            return Ok(Some(metadata_array.value(0).to_string()));
-        }
-
-        Ok(None)
-    }
-
-    /// Delete table version entries from the manifest for a given table and version ranges.
-    ///
-    /// Each range is (start_version, end_version) inclusive. Deletes all matching
-    /// `object_type = 'table_version'` entries whose object_id matches
-    /// `{object_id}${zero_padded_version}`.
-    ///
-    /// Builds a single filter expression covering all version ranges and executes
-    /// one bulk delete operation instead of deleting versions one at a time.
-    pub async fn delete_table_versions(
-        &self,
-        object_id: &str,
-        ranges: &[(i64, i64)],
-    ) -> Result<i64> {
-        if ranges.is_empty() {
-            return Ok(0);
-        }
 
-        // Collect all object_ids to delete (both new zero-padded and legacy formats)
-        let mut object_id_conditions: Vec<String> = Vec::new();
-        for (start, end) in ranges {
-            for version in *start..=*end {
-                let oid = Self::build_version_object_id(object_id, version);
-                let escaped = oid.replace('\'', "''");
-                object_id_conditions.push(format!("'{}'", escaped));
-            }
-        }
+            Ok(DatasetConsistencyWrapper::new(dataset))
+        } else {
+            log::info!("Creating new manifest table at {}", manifest_path);
+            let schema = Self::manifest_schema();
+            let empty_batch = RecordBatch::new_empty(schema.clone());
+            let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone());
 
-        if object_id_conditions.is_empty() {
-            return Ok(0);
-        }
+            let store_params = ObjectStoreParams {
+                storage_options_accessor: storage_options.as_ref().map(|opts| {
+                    Arc::new(
+                        lance_io::object_store::StorageOptionsAccessor::with_static_options(
+                            opts.clone(),
+                        ),
+                    )
+                }),
+                ..Default::default()
+            };
+            let write_params = WriteParams {
+                session: session.clone(),
+                store_params: Some(store_params),
+                ..Default::default()
+            };
 
-        // First, count how many entries exist so we can report the deleted count
-        let in_list = object_id_conditions.join(", ");
-        let filter = format!(
-            "object_type = 'table_version' AND object_id IN ({})",
-            in_list
-        );
+            let dataset =
+                Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await;
 
-        let mut scanner = self.manifest_scanner().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project(&["object_id", "location"]).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
-            })
-        })?;
-        let batches = Self::execute_scanner(scanner).await?;
-        let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum();
-
-        if deleted_count == 0 {
-            return Ok(0);
+            // Handle race condition where another process created the manifest concurrently
+            match dataset {
+                Ok(dataset) => {
+                    log::info!(
+                        "Successfully created manifest table at {}, version={}, uri={}",
+                        manifest_path,
+                        dataset.version().version,
+                        dataset.uri()
+                    );
+                    Ok(DatasetConsistencyWrapper::new(dataset))
+                }
+                Err(ref e)
+                    if matches!(
+                        e,
+                        LanceError::DatasetAlreadyExists { .. }
+                            | LanceError::CommitConflict { .. }
+                            | LanceError::IncompatibleTransaction { .. }
+                            | LanceError::RetryableCommitConflict { .. }
+                    ) =>
+                {
+                    // Another process created the manifest concurrently, try to load it
+                    log::info!(
+                        "Manifest table was created by another process, loading it: {}",
+                        manifest_path
+                    );
+                    let recovery_store_options = ObjectStoreParams {
+                        storage_options_accessor: storage_options.as_ref().map(|opts| {
+                            Arc::new(
+                                lance_io::object_store::StorageOptionsAccessor::with_static_options(
+                                    opts.clone(),
+                                ),
+                            )
+                        }),
+                        ..Default::default()
+                    };
+                    let recovery_read_params = ReadParams {
+                        session,
+                        store_options: Some(recovery_store_options),
+                        ..Default::default()
+                    };
+                    let dataset = DatasetBuilder::from_uri(&manifest_path)
+                        .with_read_params(recovery_read_params)
+                        .load()
+                        .await
+                        .map_err(|e| {
+                            lance_core::Error::from(NamespaceError::Internal {
+                                message: format!(
+                                    "Failed to load manifest dataset after creation conflict: {}",
+                                    e
+                                ),
+                            })
+                        })?;
+                    Ok(DatasetConsistencyWrapper::new(dataset))
+                }
+                Err(e) => Err(lance_core::Error::from(NamespaceError::Internal {
+                    message: format!("Failed to create manifest dataset: {:?}", e),
+                })),
+            }
         }
+    }
 
-        // Execute a single bulk delete with the combined filter
-        let _mutation_guard = self.manifest_mutation_lock.lock().await;
-        let dataset_guard = self.manifest_dataset.get().await?;
-        let dataset = Arc::new(dataset_guard.clone());
-        drop(dataset_guard);
-
-        let new_dataset = DeleteBuilder::new(dataset, &filter)
-            .execute()
-            .await
-            .map_err(|e| {
-                convert_lance_commit_error(&e, "Failed to batch delete table versions", None)
-            })?;
+    /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit.
+    ///
+    /// Returns the next page token (last item in this page) if more results exist beyond the limit,
+    /// or `None` if this is the last page.
+    fn apply_pagination(
+        names: &mut Vec<String>,
+        page_token: Option<String>,
+        limit: Option<i32>,
+    ) -> Option<String> {
+        names.sort();
 
-        self.manifest_dataset
-            .set_latest(
-                Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()),
-            )
-            .await;
+        if let Some(start_after) = page_token {
+            if let Some(index) = names
+                .iter()
+                .position(|name| name.as_str() > start_after.as_str())
+            {
+                names.drain(0..index);
+            } else {
+                names.clear();
+            }
+        }
 
-        if let Err(e) = self.run_inline_optimization().await {
-            log::warn!(
-                "Unexpected failure when running inline optimization: {:?}",
-                e
-            );
+        if let Some(limit) = limit
+            && limit >= 0
+        {
+            let limit = limit as usize;
+            if names.len() > limit {
+                let next_page_token = if limit > 0 {
+                    Some(names[limit - 1].clone())
+                } else {
+                    None
+                };
+                names.truncate(limit);
+                return next_page_token;
+            }
         }
 
-        Ok(deleted_count)
+        None
     }
+}
 
-    /// Atomically delete table version entries from the manifest by their object_ids.
-    ///
-    /// This method supports multi-table transactional deletion: all specified
-    /// object_ids (which may span multiple tables) are deleted in a single atomic
-    /// `DeleteBuilder` operation. Either all entries are removed or none are.
-    ///
-    /// Object IDs are formatted as `{table_id}${version}`.
-    pub async fn batch_delete_table_versions_by_object_ids(
-        &self,
-        object_ids: &[String],
-    ) -> Result<i64> {
-        if object_ids.is_empty() {
-            return Ok(0);
-        }
+#[async_trait]
+impl LanceNamespace for ManifestNamespace {
+    fn namespace_id(&self) -> String {
+        self.root.clone()
+    }
 
-        let in_list: String = object_ids
-            .iter()
-            .map(|oid| {
-                let escaped = oid.replace('\'', "''");
-                format!("'{}'", escaped)
+    async fn list_tables(&self, request: ListTablesRequest) -> Result<ListTablesResponse> {
+        let namespace_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Namespace ID is required".to_string(),
             })
-            .collect::<Vec<_>>()
-            .join(", ");
+        })?;
 
-        let filter = format!(
-            "object_type = 'table_version' AND object_id IN ({})",
-            in_list
-        );
+        // Build filter to find tables in this namespace
+        let filter = if namespace_id.is_empty() {
+            // Root namespace: find tables without a namespace prefix
+            "object_type = 'table' AND NOT contains(object_id, '$')".to_string()
+        } else {
+            // Namespaced: find tables that start with namespace$ but have no additional $
+            let prefix = namespace_id.join(DELIMITER);
+            format!(
+                "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')",
+                prefix,
+                DELIMITER,
+                prefix.len() + 2
+            )
+        };
 
-        // Count how many entries exist so we can report the deleted count
         let mut scanner = self.manifest_scanner().await?;
         scanner.filter(&filter).map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
@@ -1422,576 +2636,420 @@ impl ManifestNamespace {
                 message: format!("Failed to project: {:?}", e),
             })
         })?;
-        let batches = Self::execute_scanner(scanner).await?;
-        let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum();
-
-        if deleted_count == 0 {
-            return Ok(0);
-        }
-
-        // Execute a single atomic bulk delete covering all tables
-        let _mutation_guard = self.manifest_mutation_lock.lock().await;
-        let dataset_guard = self.manifest_dataset.get().await?;
-        let dataset = Arc::new(dataset_guard.clone());
-        drop(dataset_guard);
 
-        let new_dataset = DeleteBuilder::new(dataset, &filter)
-            .execute()
-            .await
-            .map_err(|e| {
-                convert_lance_commit_error(
-                    &e,
-                    "Failed to batch delete table versions across multiple tables",
-                    None,
-                )
-            })?;
+        let batches = Self::execute_scanner(scanner).await?;
 
-        self.manifest_dataset
-            .set_latest(
-                Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()),
-            )
-            .await;
+        let mut table_entries = Vec::new();
+        for batch in batches {
+            if batch.num_rows() == 0 {
+                continue;
+            }
 
-        if let Err(e) = self.run_inline_optimization().await {
-            log::warn!(
-                "Unexpected failure when running inline optimization: {:?}",
-                e
-            );
+            let object_id_array = Self::get_string_column(&batch, "object_id")?;
+            let location_array = Self::get_string_column(&batch, "location")?;
+            for i in 0..batch.num_rows() {
+                let object_id = object_id_array.value(i);
+                let location = location_array.value(i);
+                let (_namespace, name) = Self::parse_object_id(object_id);
+                table_entries.push((name, location.to_string()));
+            }
         }
 
-        Ok(deleted_count)
-    }
+        let mut tables: Vec<String> = if request.include_declared.unwrap_or(true) {
+            table_entries.into_iter().map(|(name, _)| name).collect()
+        } else {
+            let mut stream = futures::stream::iter(table_entries.into_iter().map(
+                |(name, location)| async move {
+                    // `include_declared=false` is an explicit opt-in. We still pay one
+                    // `_versions/` probe per table so declared-state is derived from actual
+                    // manifests. This is linear in the total number of listed tables, and we do
+                    // the probes with bounded concurrency before pagination.
+                    if self.location_has_actual_manifests(&location).await? {
+                        Ok::<Option<String>, Error>(Some(name))
+                    } else {
+                        Ok::<Option<String>, Error>(None)
+                    }
+                },
+            ))
+            .buffered(DECLARED_FILTER_CONCURRENCY);
 
-    /// Set a property flag in the __manifest table's metadata key-value map.
-    ///
-    /// This uses `dataset.update_metadata()` to persist the flag in the
-    /// __manifest dataset's table metadata, rather than inserting a row.
-    /// If the property already exists with the same value, this is a no-op.
-    pub async fn set_property(&self, name: &str, value: &str) -> Result<()> {
-        let _mutation_guard = self.manifest_mutation_lock.lock().await;
-        let dataset_guard = self.manifest_dataset.get().await?;
-        if dataset_guard.metadata().get(name) == Some(&value.to_string()) {
-            return Ok(());
-        }
-        drop(dataset_guard);
+            let mut filtered = Vec::new();
+            while let Some(result) = stream.next().await {
+                if let Some(name) = result? {
+                    filtered.push(name);
+                }
+            }
+            filtered
+        };
 
-        let mut dataset_guard = self.manifest_dataset.get_mut().await?;
-        dataset_guard
-            .update_metadata([(name, value)])
-            .await
-            .map_err(|e| {
-                lance_core::Error::from(NamespaceError::Internal {
-                    message: format!(
-                        "Failed to set property '{}' in __manifest metadata: {}",
-                        name, e
-                    ),
-                })
-            })?;
-        Ok(())
+        let next_page_token =
+            Self::apply_pagination(&mut tables, request.page_token, request.limit);
+        let mut response = ListTablesResponse::new(tables);
+        response.page_token = next_page_token;
+        Ok(response)
     }
 
-    /// Check if a property flag exists in the __manifest table's metadata key-value map.
-    pub async fn has_property(&self, name: &str) -> Result<bool> {
-        let dataset_guard = self.manifest_dataset.get().await?;
-        Ok(dataset_guard.metadata().contains_key(name))
-    }
+    async fn describe_table(&self, request: DescribeTableRequest) -> Result<DescribeTableResponse> {
+        let table_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Table ID is required".to_string(),
+            })
+        })?;
 
-    /// Parse metadata JSON into a `TableVersion`.
-    ///
-    /// Returns `None` if metadata is invalid or missing required fields.
-    fn parse_table_version(version: i64, metadata_str: &str) -> Option<TableVersion> {
-        let meta: serde_json::Value = match serde_json::from_str(metadata_str) {
-            Ok(v) => v,
-            Err(e) => {
-                log::warn!(
-                    "Skipping version {} due to invalid metadata JSON: {}",
-                    version,
-                    e
-                );
-                return None;
-            }
-        };
-        let manifest_path = match meta.get("manifest_path").and_then(|v| v.as_str()) {
-            Some(p) => p.to_string(),
-            None => {
-                log::warn!(
-                    "Skipping version {} due to missing 'manifest_path' in metadata — \
-                     this may indicate data corruption",
-                    version
-                );
-                return None;
+        if table_id.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Table ID cannot be empty".to_string(),
             }
-        };
-        let manifest_size = meta.get("manifest_size").and_then(|v| v.as_i64());
-        let e_tag = meta
-            .get("e_tag")
-            .and_then(|v| v.as_str())
-            .map(|s| s.to_string());
-        Some(TableVersion {
-            version,
-            manifest_path,
-            manifest_size,
-            e_tag,
-            timestamp_millis: None,
-            metadata: None,
-        })
-    }
+            .into());
+        }
 
-    /// List table versions from the __manifest table.
-    ///
-    /// Queries the manifest for all versions of the given table and returns
-    /// them as a `ListTableVersionsResponse`.
-    pub async fn list_table_versions(
-        &self,
-        table_id: &[String],
-        descending: bool,
-        limit: Option<i32>,
-    ) -> Result<ListTableVersionsResponse> {
         let object_id = Self::str_object_id(table_id);
-        let manifest_versions = self
-            .query_table_versions(&object_id, descending, limit)
-            .await?;
-
-        let table_versions: Vec<TableVersion> = manifest_versions
-            .into_iter()
-            .filter_map(|(version, metadata_str)| Self::parse_table_version(version, &metadata_str))
-            .collect();
+        let table_info = self.query_manifest_for_table(&object_id).boxed().await?;
 
-        Ok(ListTableVersionsResponse {
-            versions: table_versions,
-            page_token: None,
-        })
-    }
+        // Extract table name and namespace from table_id
+        let table_name = table_id.last().cloned().unwrap_or_default();
+        let namespace_id: Vec<String> = if table_id.len() > 1 {
+            table_id[..table_id.len() - 1].to_vec()
+        } else {
+            vec![]
+        };
 
-    /// Describe a specific table version from the __manifest table.
-    ///
-    /// Queries the manifest for a specific version and returns it as a
-    /// `DescribeTableVersionResponse`. Returns an error if the version is not found.
-    pub async fn describe_table_version(
-        &self,
-        table_id: &[String],
-        version: i64,
-    ) -> Result<DescribeTableVersionResponse> {
-        let object_id = Self::str_object_id(table_id);
-        if let Some(metadata_str) = self.query_table_version(&object_id, version).await?
-            && let Some(tv) = Self::parse_table_version(version, &metadata_str)
-        {
-            return Ok(DescribeTableVersionResponse {
-                version: Box::new(tv),
-            });
-        }
-        Err(NamespaceError::TableVersionNotFound {
-            message: format!("version {} for table {:?}", version, table_id),
-        }
-        .into())
-    }
+        let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false);
+        let should_check_declared =
+            load_detailed_metadata || request.check_declared.unwrap_or(false);
+        // For backwards compatibility, only skip vending credentials when explicitly set to false
+        let vend_credentials = request.vend_credentials.unwrap_or(true);
 
-    /// Register a table in the manifest without creating the physical table (internal helper for migration)
-    pub async fn register_table(&self, name: &str, location: String) -> Result<()> {
-        let object_id = Self::build_object_id(&[], name);
-        if self.manifest_contains_object(&object_id).await? {
-            return Err(NamespaceError::Internal {
-                message: format!("Table '{}' already exists", name),
-            }
-            .into());
-        }
+        match table_info {
+            Some(info) => {
+                // Construct full URI from relative location
+                let table_uri = Self::construct_full_uri(&self.root, &info.location)?;
 
-        self.insert_into_manifest(object_id, ObjectType::Table, Some(location))
-            .await
-    }
+                let storage_options = if vend_credentials {
+                    self.storage_options.clone()
+                } else {
+                    None
+                };
+                let is_only_declared = if should_check_declared {
+                    Some(!self.location_has_actual_manifests(&info.location).await?)
+                } else {
+                    None
+                };
 
-    /// Validate that all levels of a namespace path exist
-    async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> {
-        for i in 1..=namespace_path.len() {
-            let partial_path = &namespace_path[..i];
-            let object_id = partial_path.join(DELIMITER);
-            if !self.manifest_contains_object(&object_id).await? {
-                return Err(NamespaceError::NamespaceNotFound {
-                    message: format!("parent namespace '{}'", object_id),
+                if !load_detailed_metadata {
+                    return Ok(DescribeTableResponse {
+                        table: Some(table_name),
+                        namespace: Some(namespace_id),
+                        location: Some(table_uri.clone()),
+                        table_uri: Some(table_uri),
+                        storage_options,
+                        properties: info.metadata,
+                        is_only_declared,
+                        ..Default::default()
+                    });
+                }
+
+                if is_only_declared == Some(true) {
+                    return Ok(DescribeTableResponse {
+                        table: Some(table_name),
+                        namespace: Some(namespace_id),
+                        location: Some(table_uri.clone()),
+                        table_uri: Some(table_uri),
+                        storage_options,
+                        properties: info.metadata,
+                        is_only_declared,
+                        ..Default::default()
+                    });
+                }
+
+                let mut builder = DatasetBuilder::from_uri(&table_uri);
+                if let Some(opts) = &self.storage_options {
+                    builder = builder.with_storage_options(opts.clone());
+                }
+                if let Some(session) = &self.session {
+                    builder = builder.with_session(session.clone());
+                }
+
+                match builder.load().await {
+                    Ok(mut dataset) => {
+                        // If a specific version is requested, checkout that version
+                        if let Some(requested_version) = request.version {
+                            dataset = dataset.checkout_version(requested_version as u64).await?;
+                        }
+
+                        let version = dataset.version().version;
+                        let lance_schema = dataset.schema();
+                        let arrow_schema: arrow_schema::Schema = lance_schema.into();
+                        let json_schema = arrow_schema_to_json(&arrow_schema)?;
+
+                        Ok(DescribeTableResponse {
+                            table: Some(table_name.clone()),
+                            namespace: Some(namespace_id.clone()),
+                            version: Some(version as i64),
+                            location: Some(table_uri.clone()),
+                            table_uri: Some(table_uri),
+                            schema: Some(Box::new(json_schema)),
+                            storage_options,
+                            properties: info.metadata.clone(),
+                            is_only_declared,
+                            ..Default::default()
+                        })
+                    }
+                    Err(err) => Err(NamespaceError::Internal {
+                        message: format!(
+                            "Table exists in manifest but failed to load dataset '{}': {}",
+                            object_id, err
+                        ),
+                    }
+                    .into()),
                 }
-                .into());
             }
+            None => Err(NamespaceError::TableNotFound {
+                message: Self::format_table_id(table_id),
+            }
+            .into()),
         }
-        Ok(())
     }
 
-    /// Query the manifest for a namespace with the given object ID
-    async fn query_manifest_for_namespace(&self, object_id: &str) -> Result<Option<NamespaceInfo>> {
-        let escaped_id = object_id.replace('\'', "''");
-        let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id);
-        let mut scanner = self.manifest_scanner().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project(&["object_id", "metadata"]).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
+    async fn table_exists(&self, request: TableExistsRequest) -> Result<()> {
+        let table_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Table ID is required".to_string(),
             })
         })?;
-        let batches = Self::execute_scanner(scanner).await?;
-
-        let mut found_result: Option<NamespaceInfo> = None;
-        let mut total_rows = 0;
 
-        for batch in batches {
-            if batch.num_rows() == 0 {
-                continue;
+        if table_id.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Table ID cannot be empty".to_string(),
             }
+            .into());
+        }
 
-            total_rows += batch.num_rows();
-            if total_rows > 1 {
-                return Err(NamespaceError::Internal {
-                    message: format!(
-                        "Expected exactly 1 namespace with id '{}', found {}",
-                        object_id, total_rows
-                    ),
-                }
-                .into());
+        let object_id = Self::str_object_id(table_id);
+        let exists = self.manifest_contains_object(&object_id).await?;
+        if exists {
+            Ok(())
+        } else {
+            Err(NamespaceError::TableNotFound {
+                message: Self::format_table_id(table_id),
             }
-
-            let object_id_array = Self::get_string_column(&batch, "object_id")?;
-            let metadata_array = Self::get_string_column(&batch, "metadata")?;
-
-            let object_id_str = object_id_array.value(0);
-            let metadata = if !metadata_array.is_null(0) {
-                let metadata_str = metadata_array.value(0);
-                match serde_json::from_str::<HashMap<String, String>>(metadata_str) {
-                    Ok(map) => Some(map),
-                    Err(e) => {
-                        return Err(NamespaceError::Internal {
-                            message: format!(
-                                "Failed to deserialize metadata for namespace '{}': {}",
-                                object_id, e
-                            ),
-                        }
-                        .into());
-                    }
-                }
-            } else {
-                None
-            };
-
-            let (namespace, name) = Self::parse_object_id(object_id_str);
-            found_result = Some(NamespaceInfo {
-                namespace,
-                name,
-                metadata,
-            });
+            .into())
         }
-
-        Ok(found_result)
     }
 
-    /// Create or load the manifest dataset, ensuring it has the latest schema setup.
-    ///
-    /// This function will:
-    /// 1. Try to load an existing manifest table
-    /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata)
-    /// 3. If it doesn't exist, create a new manifest table with the current schema
-    /// 4. Persist feature flags (e.g., table_version_storage_enabled) if requested
-    async fn ensure_manifest_table_up_to_date(
-        root: &str,
-        storage_options: &Option<HashMap<String, String>>,
-        session: Option<Arc<Session>>,
-        table_version_storage_enabled: bool,
-    ) -> Result<DatasetConsistencyWrapper> {
-        let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME);
-        log::debug!("Attempting to load manifest from {}", manifest_path);
-        let store_options = ObjectStoreParams {
-            storage_options_accessor: storage_options.as_ref().map(|opts| {
-                Arc::new(
-                    lance_io::object_store::StorageOptionsAccessor::with_static_options(
-                        opts.clone(),
-                    ),
-                )
-            }),
-            ..Default::default()
-        };
-        let read_params = ReadParams {
-            session: session.clone(),
-            store_options: Some(store_options.clone()),
-            ..Default::default()
-        };
-        let dataset_result = DatasetBuilder::from_uri(&manifest_path)
-            .with_read_params(read_params)
-            .load()
-            .await;
-        if let Ok(mut dataset) = dataset_result {
-            // Check if the object_id field has primary key metadata, migrate if not
-            let needs_pk_migration = dataset
-                .schema()
-                .field("object_id")
-                .map(|f| {
-                    !f.metadata
-                        .contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION)
-                })
-                .unwrap_or(false);
+    async fn create_table(
+        &self,
+        request: CreateTableRequest,
+        data: Bytes,
+    ) -> Result<CreateTableResponse> {
+        let table_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Table ID is required".to_string(),
+            })
+        })?;
 
-            if needs_pk_migration {
-                log::info!("Migrating __manifest table to add primary key metadata on object_id");
-                dataset
-                    .update_field_metadata()
-                    .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")])
-                    .map_err(|e| {
-                        lance_core::Error::from(NamespaceError::Internal {
-                            message: format!(
-                                "Failed to find object_id field for migration: {:?}",
-                                e
-                            ),
-                        })
-                    })?
-                    .await
-                    .map_err(|e| {
-                        lance_core::Error::from(NamespaceError::Internal {
-                            message: format!("Failed to migrate primary key metadata: {:?}", e),
-                        })
-                    })?;
+        if table_id.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Table ID cannot be empty".to_string(),
             }
+            .into());
+        }
 
-            // Persist table_version_storage_enabled flag in __manifest so that once
-            // enabled, it becomes a permanent property of this namespace.
-            if table_version_storage_enabled {
-                let needs_flag = dataset
-                    .metadata()
-                    .get("table_version_storage_enabled")
-                    .map(|v| v != "true")
-                    .unwrap_or(true);
+        let (namespace, table_name) = Self::split_object_id(table_id);
+        let object_id = Self::build_object_id(&namespace, &table_name);
 
-                if needs_flag
-                    && let Err(e) = dataset
-                        .update_metadata([("table_version_storage_enabled", "true")])
-                        .await
-                {
-                    log::warn!(
-                        "Failed to persist table_version_storage_enabled flag in __manifest: {:?}",
-                        e
-                    );
-                }
-            }
+        // Refuse before writing any table data if this build cannot write the
+        // manifest, so a refused create leaves no orphaned dataset behind.
+        self.ensure_manifest_writable().await?;
 
-            Ok(DatasetConsistencyWrapper::new(dataset))
+        let existing_table = self.query_manifest_for_table(&object_id).await?;
+        let existing_has_manifests = if let Some(existing_table) = &existing_table {
+            Some(
+                self.location_has_actual_manifests(&existing_table.location)
+                    .await?,
+            )
         } else {
-            log::info!("Creating new manifest table at {}", manifest_path);
-            let schema = Self::manifest_schema();
-            let empty_batch = RecordBatch::new_empty(schema.clone());
-            let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone());
+            None
+        };
 
-            let store_params = ObjectStoreParams {
-                storage_options_accessor: storage_options.as_ref().map(|opts| {
-                    Arc::new(
-                        lance_io::object_store::StorageOptionsAccessor::with_static_options(
-                            opts.clone(),
-                        ),
-                    )
-                }),
-                ..Default::default()
-            };
-            let write_params = WriteParams {
-                session: session.clone(),
-                store_params: Some(store_params),
-                ..Default::default()
-            };
+        if existing_has_manifests == Some(false)
+            && request
+                .properties
+                .as_ref()
+                .is_some_and(|properties| !properties.is_empty())
+        {
+            return Err(NamespaceError::InvalidInput {
+                message: format!(
+                    "create_table cannot set properties for already declared table '{}'",
+                    object_id
+                ),
+            }
+            .into());
+        }
 
-            let dataset =
-                Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await;
+        let create_mode = if existing_has_manifests == Some(false) {
+            CreateTableMode::Create
+        } else {
+            CreateTableMode::parse(request.mode.as_deref())?
+        };
+        let dir_name = if let Some(existing_table) = &existing_table {
+            existing_table.location.clone()
+        } else if namespace.is_empty() && self.dir_listing_enabled {
+            format!("{}.lance", table_name)
+        } else {
+            Self::generate_dir_name(&object_id)
+        };
+        let table_uri = Self::construct_full_uri(&self.root, &dir_name)?;
+        let overwriting_existing_table =
+            existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite;
 
-            // Handle race condition where another process created the manifest concurrently
-            match dataset {
-                Ok(dataset) => {
-                    log::info!(
-                        "Successfully created manifest table at {}, version={}, uri={}",
-                        manifest_path,
-                        dataset.version().version,
-                        dataset.uri()
-                    );
-                    Ok(DatasetConsistencyWrapper::new(dataset))
+        if existing_has_manifests == Some(true) {
+            match create_mode {
+                CreateTableMode::Create => {
+                    return Err(NamespaceError::TableAlreadyExists {
+                        message: table_name.clone(),
+                    }
+                    .into());
                 }
-                Err(ref e)
-                    if matches!(
-                        e,
-                        LanceError::DatasetAlreadyExists { .. }
-                            | LanceError::CommitConflict { .. }
-                            | LanceError::IncompatibleTransaction { .. }
-                            | LanceError::RetryableCommitConflict { .. }
-                    ) =>
-                {
-                    // Another process created the manifest concurrently, try to load it
-                    log::info!(
-                        "Manifest table was created by another process, loading it: {}",
-                        manifest_path
-                    );
-                    let recovery_store_options = ObjectStoreParams {
-                        storage_options_accessor: storage_options.as_ref().map(|opts| {
-                            Arc::new(
-                                lance_io::object_store::StorageOptionsAccessor::with_static_options(
-                                    opts.clone(),
-                                ),
-                            )
-                        }),
-                        ..Default::default()
-                    };
-                    let recovery_read_params = ReadParams {
-                        session,
-                        store_options: Some(recovery_store_options),
+                CreateTableMode::ExistOk => {
+                    let properties = existing_table
+                        .as_ref()
+                        .and_then(|table| table.metadata.clone());
+                    return Ok(CreateTableResponse {
+                        location: Some(table_uri),
+                        storage_options: self.storage_options.clone(),
+                        properties,
                         ..Default::default()
-                    };
-                    let dataset = DatasetBuilder::from_uri(&manifest_path)
-                        .with_read_params(recovery_read_params)
-                        .load()
-                        .await
-                        .map_err(|e| {
-                            lance_core::Error::from(NamespaceError::Internal {
-                                message: format!(
-                                    "Failed to load manifest dataset after creation conflict: {}",
-                                    e
-                                ),
-                            })
-                        })?;
-                    Ok(DatasetConsistencyWrapper::new(dataset))
+                    });
                 }
-                Err(e) => Err(lance_core::Error::from(NamespaceError::Internal {
-                    message: format!("Failed to create manifest dataset: {:?}", e),
-                })),
-            }
-        }
-    }
-
-    /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit.
-    ///
-    /// Returns the next page token (last item in this page) if more results exist beyond the limit,
-    /// or `None` if this is the last page.
-    fn apply_pagination(
-        names: &mut Vec<String>,
-        page_token: Option<String>,
-        limit: Option<i32>,
-    ) -> Option<String> {
-        names.sort();
-
-        if let Some(start_after) = page_token {
-            if let Some(index) = names
-                .iter()
-                .position(|name| name.as_str() > start_after.as_str())
-            {
-                names.drain(0..index);
-            } else {
-                names.clear();
+                CreateTableMode::Overwrite => {}
             }
         }
 
-        if let Some(limit) = limit
-            && limit >= 0
-        {
-            let limit = limit as usize;
-            if names.len() > limit {
-                let next_page_token = if limit > 0 {
-                    Some(names[limit - 1].clone())
-                } else {
-                    None
-                };
-                names.truncate(limit);
-                return next_page_token;
+        // Validate that request_data is provided
+        if data.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Request data (Arrow IPC stream) is required for create_table".to_string(),
             }
+            .into());
         }
 
-        None
-    }
-}
-
-#[async_trait]
-impl LanceNamespace for ManifestNamespace {
-    fn namespace_id(&self) -> String {
-        self.root.clone()
-    }
-
-    async fn list_tables(&self, request: ListTablesRequest) -> Result<ListTablesResponse> {
-        let namespace_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
-            })
-        })?;
-
-        // Build filter to find tables in this namespace
-        let filter = if namespace_id.is_empty() {
-            // Root namespace: find tables without a namespace prefix
-            "object_type = 'table' AND NOT contains(object_id, '$')".to_string()
-        } else {
-            // Namespaced: find tables that start with namespace$ but have no additional $
-            let prefix = namespace_id.join(DELIMITER);
-            format!(
-                "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')",
-                prefix,
-                DELIMITER,
-                prefix.len() + 2
-            )
-        };
-
-        let mut scanner = self.manifest_scanner().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project(&["object_id", "location"]).map_err(|e| {
+        // Write the data using Lance Dataset
+        let cursor = Cursor::new(data.to_vec());
+        let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
+                message: format!("Failed to read IPC stream: {:?}", e),
             })
         })?;
 
-        let batches = Self::execute_scanner(scanner).await?;
+        let batches: Vec<RecordBatch> = stream_reader
+            .collect::<std::result::Result<Vec<_>, _>>()
+            .map_err(|e| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!("Failed to collect batches: {:?}", e),
+            })
+        })?;
 
-        let mut table_entries = Vec::new();
-        for batch in batches {
-            if batch.num_rows() == 0 {
-                continue;
+        if batches.is_empty() {
+            return Err(NamespaceError::Internal {
+                message: "No data provided for table creation".to_string(),
             }
+            .into());
+        }
 
-            let object_id_array = Self::get_string_column(&batch, "object_id")?;
-            let location_array = Self::get_string_column(&batch, "location")?;
-            for i in 0..batch.num_rows() {
-                let object_id = object_id_array.value(i);
-                let location = location_array.value(i);
-                let (_namespace, name) = Self::parse_object_id(object_id);
-                table_entries.push((name, location.to_string()));
-            }
+        let schema = batches[0].schema();
+        let batch_results: Vec<std::result::Result<RecordBatch, arrow_schema::ArrowError>> =
+            batches.into_iter().map(Ok).collect();
+        let reader = RecordBatchIterator::new(batch_results, schema);
+
+        let mut write_storage_options = self.storage_options.clone().unwrap_or_default();
+        if let Some(request_storage_options) = request.storage_options.as_ref() {
+            write_storage_options.extend(request_storage_options.clone());
         }
 
-        let mut tables: Vec<String> = if request.include_declared.unwrap_or(true) {
-            table_entries.into_iter().map(|(name, _)| name).collect()
+        let store_params = ObjectStoreParams {
+            storage_options_accessor: (!write_storage_options.is_empty()).then(|| {
+                Arc::new(
+                    lance_io::object_store::StorageOptionsAccessor::with_static_options(
+                        write_storage_options,
+                    ),
+                )
+            }),
+            ..Default::default()
+        };
+        let write_params = WriteParams {
+            mode: create_mode.write_mode(),
+            session: self.session.clone(),
+            store_params: Some(store_params),
+            ..Default::default()
+        };
+        let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params))
+            .await
+            .map_err(|e| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: format!("Failed to write dataset: {:?}", e),
+                })
+            })?;
+        let version = dataset.version().version as i64;
+
+        if overwriting_existing_table {
+            let metadata =
+                Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
+            self.upsert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id,
+                    object_type: ObjectType::Table,
+                    location: Some(dir_name),
+                    metadata,
+                }],
+                None,
+            )
+            .await?;
+
+            Ok(CreateTableResponse {
+                version: Some(version),
+                location: Some(table_uri),
+                storage_options: self.storage_options.clone(),
+                properties: request.properties,
+                ..Default::default()
+            })
         } else {
-            let mut stream = futures::stream::iter(table_entries.into_iter().map(
-                |(name, location)| async move {
-                    // `include_declared=false` is an explicit opt-in. We still pay one
-                    // `_versions/` probe per table so declared-state is derived from actual
-                    // manifests. This is linear in the total number of listed tables, and we do
-                    // the probes with bounded concurrency before pagination.
-                    if self.location_has_actual_manifests(&location).await? {
-                        Ok::<Option<String>, Error>(Some(name))
-                    } else {
-                        Ok::<Option<String>, Error>(None)
-                    }
-                },
-            ))
-            .buffered(DECLARED_FILTER_CONCURRENCY);
+            match existing_table {
+                Some(existing_table) => Ok(CreateTableResponse {
+                    version: Some(version),
+                    location: Some(table_uri),
+                    storage_options: self.storage_options.clone(),
+                    properties: existing_table.metadata,
+                    ..Default::default()
+                }),
+                None => {
+                    let metadata =
+                        Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
+                    // Register in manifest (store dir_name, not full URI)
+                    self.insert_into_manifest_with_metadata(
+                        vec![ManifestEntry {
+                            object_id,
+                            object_type: ObjectType::Table,
+                            location: Some(dir_name.clone()),
+                            metadata,
+                        }],
+                        None,
+                    )
+                    .await?;
 
-            let mut filtered = Vec::new();
-            while let Some(result) = stream.next().await {
-                if let Some(name) = result? {
-                    filtered.push(name);
+                    Ok(CreateTableResponse {
+                        version: Some(version),
+                        location: Some(table_uri),
+                        storage_options: self.storage_options.clone(),
+                        properties: request.properties,
+                        ..Default::default()
+                    })
                 }
             }
-            filtered
-        };
-
-        let next_page_token =
-            Self::apply_pagination(&mut tables, request.page_token, request.limit);
-        let mut response = ListTablesResponse::new(tables);
-        response.page_token = next_page_token;
-        Ok(response)
+        }
     }
 
-    async fn describe_table(&self, request: DescribeTableRequest) -> Result<DescribeTableResponse> {
+    async fn drop_table(&self, request: DropTableRequest) -> Result<DropTableResponse> {
         let table_id = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
                 message: "Table ID is required".to_string(),
@@ -2005,349 +3063,276 @@ impl LanceNamespace for ManifestNamespace {
             .into());
         }
 
-        let object_id = Self::str_object_id(table_id);
-        let table_info = self.query_manifest_for_table(&object_id).boxed().await?;
-
-        // Extract table name and namespace from table_id
-        let table_name = table_id.last().cloned().unwrap_or_default();
-        let namespace_id: Vec<String> = if table_id.len() > 1 {
-            table_id[..table_id.len() - 1].to_vec()
-        } else {
-            vec![]
-        };
+        let (namespace, table_name) = Self::split_object_id(table_id);
+        let object_id = Self::build_object_id(&namespace, &table_name);
 
-        let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false);
-        let should_check_declared =
-            load_detailed_metadata || request.check_declared.unwrap_or(false);
-        // For backwards compatibility, only skip vending credentials when explicitly set to false
-        let vend_credentials = request.vend_credentials.unwrap_or(true);
+        // Query manifest for table location
+        let table_info = self.query_manifest_for_table(&object_id).boxed().await?;
 
         match table_info {
             Some(info) => {
-                // Construct full URI from relative location
-                let table_uri = Self::construct_full_uri(&self.root, &info.location)?;
-
-                let storage_options = if vend_credentials {
-                    self.storage_options.clone()
-                } else {
-                    None
-                };
-                let is_only_declared = if should_check_declared {
-                    Some(!self.location_has_actual_manifests(&info.location).await?)
-                } else {
-                    None
-                };
-
-                if !load_detailed_metadata {
-                    return Ok(DescribeTableResponse {
-                        table: Some(table_name),
-                        namespace: Some(namespace_id),
-                        location: Some(table_uri.clone()),
-                        table_uri: Some(table_uri),
-                        storage_options,
-                        properties: info.metadata,
-                        is_only_declared,
-                        ..Default::default()
-                    });
-                }
-
-                if is_only_declared == Some(true) {
-                    return Ok(DescribeTableResponse {
-                        table: Some(table_name),
-                        namespace: Some(namespace_id),
-                        location: Some(table_uri.clone()),
-                        table_uri: Some(table_uri),
-                        storage_options,
-                        properties: info.metadata,
-                        is_only_declared,
-                        ..Default::default()
-                    });
-                }
-
-                let mut builder = DatasetBuilder::from_uri(&table_uri);
-                if let Some(opts) = &self.storage_options {
-                    builder = builder.with_storage_options(opts.clone());
-                }
-                if let Some(session) = &self.session {
-                    builder = builder.with_session(session.clone());
-                }
-
-                match builder.load().await {
-                    Ok(mut dataset) => {
-                        // If a specific version is requested, checkout that version
-                        if let Some(requested_version) = request.version {
-                            dataset = dataset.checkout_version(requested_version as u64).await?;
-                        }
+                // Delete from manifest first
+                self.delete_from_manifest(&object_id).boxed().await?;
 
-                        let version = dataset.version().version;
-                        let lance_schema = dataset.schema();
-                        let arrow_schema: arrow_schema::Schema = lance_schema.into();
-                        let json_schema = arrow_schema_to_json(&arrow_schema)?;
+                // Delete physical data directory using the dir_name from manifest
+                let table_path = self.base_path.clone().join(info.location.as_str());
+                let table_uri = Self::construct_full_uri(&self.root, &info.location)?;
 
-                        Ok(DescribeTableResponse {
-                            table: Some(table_name.clone()),
-                            namespace: Some(namespace_id.clone()),
-                            version: Some(version as i64),
-                            location: Some(table_uri.clone()),
-                            table_uri: Some(table_uri),
-                            schema: Some(Box::new(json_schema)),
-                            storage_options,
-                            properties: info.metadata.clone(),
-                            is_only_declared,
-                            ..Default::default()
+                // Remove the table directory
+                self.object_store
+                    .remove_dir_all(table_path)
+                    .boxed()
+                    .await
+                    .map_err(|e| {
+                        lance_core::Error::from(NamespaceError::Internal {
+                            message: format!("Failed to delete table directory: {:?}", e),
                         })
-                    }
-                    Err(err) => Err(NamespaceError::Internal {
-                        message: format!(
-                            "Table exists in manifest but failed to load dataset '{}': {}",
-                            object_id, err
-                        ),
-                    }
-                    .into()),
-                }
+                    })?;
+
+                Ok(DropTableResponse {
+                    id: request.id.clone(),
+                    location: Some(table_uri),
+                    ..Default::default()
+                })
             }
             None => Err(NamespaceError::TableNotFound {
-                message: Self::format_table_id(table_id),
+                message: table_name.to_string(),
             }
             .into()),
         }
     }
 
-    async fn table_exists(&self, request: TableExistsRequest) -> Result<()> {
-        let table_id = request.id.as_ref().ok_or_else(|| {
+    async fn list_namespaces(
+        &self,
+        request: ListNamespacesRequest,
+    ) -> Result<ListNamespacesResponse> {
+        let parent_namespace = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Table ID is required".to_string(),
+                message: "Namespace ID is required".to_string(),
             })
         })?;
 
-        if table_id.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Table ID cannot be empty".to_string(),
+        // Build filter to find direct child namespaces
+        let filter = if parent_namespace.is_empty() {
+            // Root namespace: find all namespaces without a parent
+            "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string()
+        } else {
+            // Non-root: find namespaces that start with parent$ but have no additional $
+            let prefix = parent_namespace.join(DELIMITER);
+            format!(
+                "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')",
+                prefix,
+                DELIMITER,
+                prefix.len() + 2
+            )
+        };
+
+        let mut scanner = self.manifest_scanner().await?;
+        scanner.filter(&filter).map_err(|e| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!("Failed to filter: {:?}", e),
+            })
+        })?;
+        scanner.project(&["object_id"]).map_err(|e| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!("Failed to project: {:?}", e),
+            })
+        })?;
+
+        let batches = Self::execute_scanner(scanner).await?;
+        let mut namespaces = Vec::new();
+
+        for batch in batches {
+            if batch.num_rows() == 0 {
+                continue;
+            }
+
+            let object_id_array = Self::get_string_column(&batch, "object_id")?;
+            for i in 0..batch.num_rows() {
+                let object_id = object_id_array.value(i);
+                let (_namespace, name) = Self::parse_object_id(object_id);
+                namespaces.push(name);
             }
-            .into());
         }
 
-        let object_id = Self::str_object_id(table_id);
-        let exists = self.manifest_contains_object(&object_id).await?;
-        if exists {
-            Ok(())
-        } else {
-            Err(NamespaceError::TableNotFound {
-                message: Self::format_table_id(table_id),
+        let next_page_token =
+            Self::apply_pagination(&mut namespaces, request.page_token, request.limit);
+        let mut response = ListNamespacesResponse::new(namespaces);
+        response.page_token = next_page_token;
+        Ok(response)
+    }
+
+    async fn describe_namespace(
+        &self,
+        request: DescribeNamespaceRequest,
+    ) -> Result<DescribeNamespaceResponse> {
+        let namespace_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Namespace ID is required".to_string(),
+            })
+        })?;
+
+        // Root namespace always exists
+        if namespace_id.is_empty() {
+            #[allow(clippy::needless_update)]
+            return Ok(DescribeNamespaceResponse {
+                properties: Some(HashMap::new()),
+                ..Default::default()
+            });
+        }
+
+        // Check if namespace exists in manifest
+        let object_id = namespace_id.join(DELIMITER);
+        let namespace_info = self.query_manifest_for_namespace(&object_id).await?;
+
+        match namespace_info {
+            #[allow(clippy::needless_update)]
+            Some(info) => Ok(DescribeNamespaceResponse {
+                properties: info.metadata,
+                ..Default::default()
+            }),
+            None => Err(NamespaceError::NamespaceNotFound {
+                message: object_id.to_string(),
             }
-            .into())
+            .into()),
         }
     }
 
-    async fn create_table(
+    async fn create_namespace(
         &self,
-        request: CreateTableRequest,
-        data: Bytes,
-    ) -> Result<CreateTableResponse> {
-        let table_id = request.id.as_ref().ok_or_else(|| {
+        request: CreateNamespaceRequest,
+    ) -> Result<CreateNamespaceResponse> {
+        let namespace_id = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Table ID is required".to_string(),
+                message: "Namespace ID is required".to_string(),
             })
         })?;
 
-        if table_id.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Table ID cannot be empty".to_string(),
+        // Root namespace always exists and cannot be created
+        if namespace_id.is_empty() {
+            return Err(NamespaceError::NamespaceAlreadyExists {
+                message: "root namespace".to_string(),
             }
             .into());
         }
 
-        let (namespace, table_name) = Self::split_object_id(table_id);
-        let object_id = Self::build_object_id(&namespace, &table_name);
-
-        let existing_table = self.query_manifest_for_table(&object_id).await?;
-        let existing_has_manifests = if let Some(existing_table) = &existing_table {
-            Some(
-                self.location_has_actual_manifests(&existing_table.location)
-                    .await?,
-            )
-        } else {
-            None
-        };
+        // Validate parent namespaces exist (but not the namespace being created)
+        if namespace_id.len() > 1 {
+            self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1])
+                .await?;
+        }
 
-        if existing_has_manifests == Some(false)
-            && request
-                .properties
-                .as_ref()
-                .is_some_and(|properties| !properties.is_empty())
-        {
-            return Err(NamespaceError::InvalidInput {
-                message: format!(
-                    "create_table cannot set properties for already declared table '{}'",
-                    object_id
-                ),
+        let object_id = namespace_id.join(DELIMITER);
+        if self.manifest_contains_object(&object_id).await? {
+            return Err(NamespaceError::NamespaceAlreadyExists {
+                message: object_id.to_string(),
             }
             .into());
         }
 
-        let create_mode = if existing_has_manifests == Some(false) {
-            CreateTableMode::Create
-        } else {
-            CreateTableMode::parse(request.mode.as_deref())?
-        };
-        let dir_name = if let Some(existing_table) = &existing_table {
-            existing_table.location.clone()
-        } else if namespace.is_empty() && self.dir_listing_enabled {
-            format!("{}.lance", table_name)
-        } else {
-            Self::generate_dir_name(&object_id)
-        };
-        let table_uri = Self::construct_full_uri(&self.root, &dir_name)?;
-        let overwriting_existing_table =
-            existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite;
+        let metadata =
+            Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?;
 
-        if existing_has_manifests == Some(true) {
-            match create_mode {
-                CreateTableMode::Create => {
-                    return Err(NamespaceError::TableAlreadyExists {
-                        message: table_name.clone(),
-                    }
-                    .into());
-                }
-                CreateTableMode::ExistOk => {
-                    let properties = existing_table
-                        .as_ref()
-                        .and_then(|table| table.metadata.clone());
-                    return Ok(CreateTableResponse {
-                        location: Some(table_uri),
-                        storage_options: self.storage_options.clone(),
-                        properties,
-                        ..Default::default()
-                    });
-                }
-                CreateTableMode::Overwrite => {}
+        self.insert_into_manifest_with_metadata(
+            vec![ManifestEntry {
+                object_id,
+                object_type: ObjectType::Namespace,
+                location: None,
+                metadata,
+            }],
+            None,
+        )
+        .await?;
+
+        Ok(CreateNamespaceResponse {
+            properties: request.properties,
+            ..Default::default()
+        })
+    }
+
+    async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<DropNamespaceResponse> {
+        let namespace_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Namespace ID is required".to_string(),
+            })
+        })?;
+
+        // Root namespace always exists and cannot be dropped
+        if namespace_id.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Root namespace cannot be dropped".to_string(),
             }
+            .into());
         }
 
-        // Validate that request_data is provided
-        if data.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Request data (Arrow IPC stream) is required for create_table".to_string(),
+        let object_id = namespace_id.join(DELIMITER);
+
+        // Check if namespace exists
+        if !self.manifest_contains_object(&object_id).boxed().await? {
+            return Err(NamespaceError::NamespaceNotFound {
+                message: object_id.to_string(),
             }
             .into());
         }
 
-        // Write the data using Lance Dataset
-        let cursor = Cursor::new(data.to_vec());
-        let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| {
+        // Check for any child objects (the prefix filter is not limited to namespaces)
+        let escaped_id = object_id.replace('\'', "''");
+        let prefix = format!("{}{}", escaped_id, DELIMITER);
+        let filter = format!("starts_with(object_id, '{}')", prefix);
+        let mut scanner = self.manifest_scanner().boxed().await?;
+        scanner.filter(&filter).map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to read IPC stream: {:?}", e),
+                message: format!("Failed to filter: {:?}", e),
             })
         })?;
-
-        let batches: Vec<RecordBatch> = stream_reader
-            .collect::<std::result::Result<Vec<_>, _>>()
-            .map_err(|e| {
+        scanner.project::<&str>(&[]).map_err(|e| {
             lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to collect batches: {:?}", e),
+                message: format!("Failed to project: {:?}", e),
             })
         })?;
-
-        if batches.is_empty() {
-            return Err(NamespaceError::Internal {
-                message: "No data provided for table creation".to_string(),
+        scanner.with_row_id();
+        let count = scanner.count_rows().boxed().await.map_err(|e| {
+            lance_core::Error::from(NamespaceError::Internal {
+                message: format!("Failed to count rows: {:?}", e),
+            })
+        })?;
+
+        if count > 0 {
+            return Err(NamespaceError::NamespaceNotEmpty {
+                message: format!("'{}' (contains {} child objects)", object_id, count),
             }
             .into());
         }
 
-        let schema = batches[0].schema();
-        let batch_results: Vec<std::result::Result<RecordBatch, arrow_schema::ArrowError>> =
-            batches.into_iter().map(Ok).collect();
-        let reader = RecordBatchIterator::new(batch_results, schema);
+        self.delete_from_manifest(&object_id).boxed().await?;
 
-        let mut write_storage_options = self.storage_options.clone().unwrap_or_default();
-        if let Some(request_storage_options) = request.storage_options.as_ref() {
-            write_storage_options.extend(request_storage_options.clone());
-        }
+        Ok(DropNamespaceResponse::default())
+    }
 
-        let store_params = ObjectStoreParams {
-            storage_options_accessor: (!write_storage_options.is_empty()).then(|| {
-                Arc::new(
-                    lance_io::object_store::StorageOptionsAccessor::with_static_options(
-                        write_storage_options,
-                    ),
-                )
-            }),
-            ..Default::default()
-        };
-        let write_params = WriteParams {
-            mode: create_mode.write_mode(),
-            session: self.session.clone(),
-            store_params: Some(store_params),
-            ..Default::default()
-        };
-        let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params))
-            .await
-            .map_err(|e| {
-                lance_core::Error::from(NamespaceError::Internal {
-                    message: format!("Failed to write dataset: {:?}", e),
-                })
-            })?;
-        let version = dataset.version().version as i64;
+    async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> {
+        let namespace_id = request.id.as_ref().ok_or_else(|| {
+            lance_core::Error::from(NamespaceError::InvalidInput {
+                message: "Namespace ID is required".to_string(),
+            })
+        })?;
 
-        if overwriting_existing_table {
-            let metadata =
-                Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
-            self.upsert_into_manifest_with_metadata(
-                vec![ManifestEntry {
-                    object_id,
-                    object_type: ObjectType::Table,
-                    location: Some(dir_name),
-                    metadata,
-                }],
-                None,
-            )
-            .await?;
+        // Root namespace always exists
+        if namespace_id.is_empty() {
+            return Ok(());
+        }
 
-            Ok(CreateTableResponse {
-                version: Some(version),
-                location: Some(table_uri),
-                storage_options: self.storage_options.clone(),
-                properties: request.properties,
-                ..Default::default()
-            })
+        let object_id = namespace_id.join(DELIMITER);
+        if self.manifest_contains_object(&object_id).await? {
+            Ok(())
         } else {
-            match existing_table {
-                Some(existing_table) => Ok(CreateTableResponse {
-                    version: Some(version),
-                    location: Some(table_uri),
-                    storage_options: self.storage_options.clone(),
-                    properties: existing_table.metadata,
-                    ..Default::default()
-                }),
-                None => {
-                    let metadata =
-                        Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
-                    // Register in manifest (store dir_name, not full URI)
-                    self.insert_into_manifest_with_metadata(
-                        vec![ManifestEntry {
-                            object_id,
-                            object_type: ObjectType::Table,
-                            location: Some(dir_name.clone()),
-                            metadata,
-                        }],
-                        None,
-                    )
-                    .await?;
-
-                    Ok(CreateTableResponse {
-                        version: Some(version),
-                        location: Some(table_uri),
-                        storage_options: self.storage_options.clone(),
-                        properties: request.properties,
-                        ..Default::default()
-                    })
-                }
+            Err(NamespaceError::NamespaceNotFound {
+                message: object_id.to_string(),
             }
+            .into())
         }
     }
 
-    async fn drop_table(&self, request: DropTableRequest) -> Result<DropTableResponse> {
+    async fn declare_table(&self, request: DeclareTableRequest) -> Result<DeclareTableResponse> {
         let table_id = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
                 message: "Table ID is required".to_string(),
@@ -2364,546 +3349,1046 @@ impl LanceNamespace for ManifestNamespace {
         let (namespace, table_name) = Self::split_object_id(table_id);
         let object_id = Self::build_object_id(&namespace, &table_name);
 
-        // Query manifest for table location
-        let table_info = self.query_manifest_for_table(&object_id).boxed().await?;
-
-        match table_info {
-            Some(info) => {
-                // Delete from manifest first
-                self.delete_from_manifest(&object_id).boxed().await?;
-
-                // Delete physical data directory using the dir_name from manifest
-                let table_path = self.base_path.clone().join(info.location.as_str());
-                let table_uri = Self::construct_full_uri(&self.root, &info.location)?;
-
-                // Remove the table directory
-                self.object_store
-                    .remove_dir_all(table_path)
-                    .boxed()
-                    .await
-                    .map_err(|e| {
-                        lance_core::Error::from(NamespaceError::Internal {
-                            message: format!("Failed to delete table directory: {:?}", e),
-                        })
-                    })?;
-
-                Ok(DropTableResponse {
-                    id: request.id.clone(),
-                    location: Some(table_uri),
-                    ..Default::default()
-                })
-            }
-            None => Err(NamespaceError::TableNotFound {
+        // Check if table already exists in manifest
+        let existing = self.query_manifest_for_table(&object_id).await?;
+        if existing.is_some() {
+            return Err(NamespaceError::TableAlreadyExists {
                 message: table_name.to_string(),
             }
-            .into()),
+            .into());
         }
-    }
-
-    async fn list_namespaces(
-        &self,
-        request: ListNamespacesRequest,
-    ) -> Result<ListNamespacesResponse> {
-        let parent_namespace = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
-            })
-        })?;
 
-        // Build filter to find direct child namespaces
-        let filter = if parent_namespace.is_empty() {
-            // Root namespace: find all namespaces without a parent
-            "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string()
+        // Choose the directory name used for the table location
+        // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance
+        // Otherwise, use hash-based naming: {hash}_{object_id}
+        let dir_name = if namespace.is_empty() && self.dir_listing_enabled {
+            // Root table with directory listing enabled: use {table_name}.lance
+            format!("{}.lance", table_name)
         } else {
-            // Non-root: find namespaces that start with parent$ but have no additional $
-            let prefix = parent_namespace.join(DELIMITER);
-            format!(
-                "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')",
-                prefix,
-                DELIMITER,
-                prefix.len() + 2
-            )
+            // Child namespace table or dir listing disabled: use hash-based naming
+            Self::generate_dir_name(&object_id)
         };
+        let table_path = self.base_path.clone().join(dir_name.as_str());
+        let table_uri = Self::construct_full_uri(&self.root, &dir_name)?;
 
-        let mut scanner = self.manifest_scanner().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project(&["object_id"]).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
-            })
-        })?;
+        // Validate location if provided
+        if let Some(req_location) = &request.location {
+            let req_location = req_location.trim_end_matches('/');
+            if req_location != table_uri {
+                return Err(NamespaceError::InvalidInput {
+                    message: format!(
+                        "Cannot declare table {} at location {}, must be at location {}",
+                        table_name, req_location, table_uri
+                    ),
+                }
+                .into());
+            }
+        }
 
-        let batches = Self::execute_scanner(scanner).await?;
-        let mut namespaces = Vec::new();
+        // Create the .lance-reserved file to mark the table as existing
+        let reserved_file_path = table_path.clone().join(".lance-reserved");
 
-        for batch in batches {
-            if batch.num_rows() == 0 {
-                continue;
-            }
+        self.object_store
+            .create(&reserved_file_path)
+            .await
+            .map_err(|e| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: format!(
+                        "Failed to create .lance-reserved file for table {}: {}",
+                        table_name, e
+                    ),
+                })
+            })?
+            .shutdown()
+            .await
+            .map_err(|e| {
+                lance_core::Error::from(NamespaceError::Internal {
+                    message: format!(
+                        "Failed to finalize .lance-reserved file for table {}: {}",
+                        table_name, e
+                    ),
+                })
+            })?;
 
-            let object_id_array = Self::get_string_column(&batch, "object_id")?;
-            for i in 0..batch.num_rows() {
-                let object_id = object_id_array.value(i);
-                let (_namespace, name) = Self::parse_object_id(object_id);
-                namespaces.push(name);
-            }
-        }
+        let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
+
+        // Add entry to manifest marking this as a declared table (store dir_name, not full path)
+        self.insert_into_manifest_with_metadata(
+            vec![ManifestEntry {
+                object_id,
+                object_type: ObjectType::Table,
+                location: Some(dir_name),
+                metadata,
+            }],
+            None,
+        )
+        .await?;
+
+        log::info!(
+            "Declared table '{}' in manifest at {}",
+            table_name,
+            table_uri
+        );
 
-        let next_page_token =
-            Self::apply_pagination(&mut namespaces, request.page_token, request.limit);
-        let mut response = ListNamespacesResponse::new(namespaces);
-        response.page_token = next_page_token;
-        Ok(response)
+        // For backwards compatibility, only skip vending credentials when explicitly set to false
+        let vend_credentials = request.vend_credentials.unwrap_or(true);
+        let storage_options = if vend_credentials {
+            self.storage_options.clone()
+        } else {
+            None
+        };
+
+        Ok(DeclareTableResponse {
+            location: Some(table_uri),
+            storage_options,
+            properties: request.properties,
+            ..Default::default()
+        })
     }
 
-    async fn describe_namespace(
-        &self,
-        request: DescribeNamespaceRequest,
-    ) -> Result<DescribeNamespaceResponse> {
-        let namespace_id = request.id.as_ref().ok_or_else(|| {
+    async fn register_table(&self, request: RegisterTableRequest) -> Result<RegisterTableResponse> {
+        let table_id = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
+                message: "Table ID is required".to_string(),
             })
         })?;
 
-        // Root namespace always exists
-        if namespace_id.is_empty() {
-            #[allow(clippy::needless_update)]
-            return Ok(DescribeNamespaceResponse {
-                properties: Some(HashMap::new()),
-                ..Default::default()
-            });
+        if table_id.is_empty() {
+            return Err(NamespaceError::InvalidInput {
+                message: "Table ID cannot be empty".to_string(),
+            }
+            .into());
         }
 
-        // Check if namespace exists in manifest
-        let object_id = namespace_id.join(DELIMITER);
-        let namespace_info = self.query_manifest_for_namespace(&object_id).await?;
+        let location = request.location.clone();
 
-        match namespace_info {
-            #[allow(clippy::needless_update)]
-            Some(info) => Ok(DescribeNamespaceResponse {
-                properties: info.metadata,
-                ..Default::default()
-            }),
-            None => Err(NamespaceError::NamespaceNotFound {
-                message: object_id.to_string(),
+        // Validate that location is a relative path within the root directory
+        // We don't allow absolute URIs or paths that escape the root
+        if location.contains("://") {
+            return Err(NamespaceError::InvalidInput {
+                message: format!(
+                    "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}",
+                    location
+                ),
             }
-            .into()),
+            .into());
         }
-    }
 
-    async fn create_namespace(
-        &self,
-        request: CreateNamespaceRequest,
-    ) -> Result<CreateNamespaceResponse> {
-        let namespace_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
-            })
-        })?;
+        if location.starts_with('/') {
+            return Err(NamespaceError::InvalidInput {
+                message: format!(
+                    "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}",
+                    location
+                ),
+            }
+            .into());
+        }
 
-        // Root namespace always exists and cannot be created
-        if namespace_id.is_empty() {
-            return Err(NamespaceError::NamespaceAlreadyExists {
-                message: "root namespace".to_string(),
+        // Check for path traversal attempts
+        if location.contains("..") {
+            return Err(NamespaceError::InvalidInput {
+                message: format!(
+                    "Path traversal is not allowed. Location must be a relative path within the root directory: {}",
+                    location
+                ),
             }
             .into());
         }
 
-        // Validate parent namespaces exist (but not the namespace being created)
-        if namespace_id.len() > 1 {
-            self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1])
-                .await?;
+        let (namespace, table_name) = Self::split_object_id(table_id);
+        let object_id = Self::build_object_id(&namespace, &table_name);
+
+        // Validate that parent namespaces exist (if not root)
+        if !namespace.is_empty() {
+            self.validate_namespace_levels_exist(&namespace).await?;
         }
 
-        let object_id = namespace_id.join(DELIMITER);
+        // Check if table already exists
         if self.manifest_contains_object(&object_id).await? {
-            return Err(NamespaceError::NamespaceAlreadyExists {
+            return Err(NamespaceError::TableAlreadyExists {
                 message: object_id.to_string(),
             }
             .into());
         }
 
-        let metadata =
-            Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?;
-
-        self.insert_into_manifest_with_metadata(
-            vec![ManifestEntry {
-                object_id,
-                object_type: ObjectType::Namespace,
-                location: None,
-                metadata,
-            }],
-            None,
-        )
-        .await?;
+        // Register the table with its location in the manifest
+        self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone()))
+            .await?;
 
-        Ok(CreateNamespaceResponse {
-            properties: request.properties,
+        Ok(RegisterTableResponse {
+            location: Some(location),
             ..Default::default()
         })
     }
 
-    async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<DropNamespaceResponse> {
-        let namespace_id = request.id.as_ref().ok_or_else(|| {
+    async fn deregister_table(
+        &self,
+        request: DeregisterTableRequest,
+    ) -> Result<DeregisterTableResponse> {
+        let table_id = request.id.as_ref().ok_or_else(|| {
             lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
+                message: "Table ID is required".to_string(),
             })
         })?;
 
-        // Root namespace always exists and cannot be dropped
-        if namespace_id.is_empty() {
+        if table_id.is_empty() {
             return Err(NamespaceError::InvalidInput {
-                message: "Root namespace cannot be dropped".to_string(),
+                message: "Table ID cannot be empty".to_string(),
             }
             .into());
         }
 
-        let object_id = namespace_id.join(DELIMITER);
+        let (namespace, table_name) = Self::split_object_id(table_id);
+        let object_id = Self::build_object_id(&namespace, &table_name);
 
-        // Check if namespace exists
-        if !self.manifest_contains_object(&object_id).boxed().await? {
-            return Err(NamespaceError::NamespaceNotFound {
-                message: object_id.to_string(),
+        // Get table info before deleting
+        let table_info = self.query_manifest_for_table(&object_id).await?;
+
+        let table_uri = match table_info {
+            Some(info) => {
+                // Delete from manifest only (leave physical data intact)
+                self.delete_from_manifest(&object_id).boxed().await?;
+                Self::construct_full_uri(&self.root, &info.location)?
             }
-            .into());
-        }
+            None => {
+                return Err(NamespaceError::TableNotFound {
+                    message: object_id.to_string(),
+                }
+                .into());
+            }
+        };
 
-        // Check for child namespaces
-        let escaped_id = object_id.replace('\'', "''");
-        let prefix = format!("{}{}", escaped_id, DELIMITER);
-        let filter = format!("starts_with(object_id, '{}')", prefix);
-        let mut scanner = self.manifest_scanner().boxed().await?;
-        scanner.filter(&filter).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to filter: {:?}", e),
-            })
-        })?;
-        scanner.project::<&str>(&[]).map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to project: {:?}", e),
-            })
-        })?;
-        scanner.with_row_id();
-        let count = scanner.count_rows().boxed().await.map_err(|e| {
-            lance_core::Error::from(NamespaceError::Internal {
-                message: format!("Failed to count rows: {:?}", e),
-            })
-        })?;
+        Ok(DeregisterTableResponse {
+            id: request.id.clone(),
+            location: Some(table_uri),
+            ..Default::default()
+        })
+    }
+}
 
-        if count > 0 {
-            return Err(NamespaceError::NamespaceNotEmpty {
-                message: format!("'{}' (contains {} child objects)", object_id, count),
-            }
-            .into());
-        }
+#[cfg(test)]
+mod tests {
+    use super::{
+        BASE_OBJECTS_INDEX_NAME, ConflictResolution, CopyOnWriteMutation, DeleteObjectMutation,
+        LANCE_DATA_DIR, LANCE_INDICES_DIR, MANIFEST_TABLE_NAME, ManifestBatchBuilder,
+        ManifestEntry, ManifestIndexAccumulator, ManifestNamespace, ManifestOutputRow,
+        ManifestRowValue, ManifestStreamMutation, OBJECT_ID_INDEX_NAME, OBJECT_TYPE_INDEX_NAME,
+        ObjectType,
+    };
+    use crate::DirectoryNamespaceBuilder;
+    use arrow::datatypes::DataType;
+    use bytes::Bytes;
+    use futures::StreamExt;
+    use lance::index::DatasetIndexExt;
+    use lance_core::utils::tempfile::TempStdDir;
+    use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry};
+    use lance_namespace::LanceNamespace;
+    use lance_namespace::models::{
+        CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest,
+        ListTablesRequest, TableExistsRequest,
+    };
+    use lance_table::format::Fragment;
+    use rstest::rstest;
+    use std::collections::{HashMap, HashSet};
+    use std::sync::Arc;
 
-        self.delete_from_manifest(&object_id).boxed().await?;
+    async fn create_manifest_namespace(
+        root: &str,
+        inline_optimization_enabled: bool,
+    ) -> ManifestNamespace {
+        create_manifest_namespace_with_retries(root, inline_optimization_enabled, None).await
+    }
 
-        Ok(DropNamespaceResponse::default())
+    /// Opens the object store for `root` and builds a `ManifestNamespace` on top of it,
+    /// forwarding the inline-optimization switch and the optional commit retry count.
+    async fn create_manifest_namespace_with_retries(
+        root: &str,
+        inline_optimization_enabled: bool,
+        commit_retries: Option<u32>,
+    ) -> ManifestNamespace {
+        let registry = Arc::new(ObjectStoreRegistry::default());
+        let store_params = ObjectStoreParams::default();
+        let (object_store, base_path) =
+            ObjectStore::from_uri_and_params(registry, root, &store_params)
+                .await
+                .unwrap();
+        let pending = ManifestNamespace::from_directory(
+            root.to_string(),
+            None,
+            None,
+            object_store,
+            base_path,
+            true,
+            inline_optimization_enabled,
+            commit_retries,
+        );
+        pending.await.unwrap()
     }
 
-    async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> {
-        let namespace_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Namespace ID is required".to_string(),
+    struct CommitConflictAfterRewriteMutation {
+        root: String,
+        conflict_object_id: String,
+    }
+
+    impl ManifestStreamMutation for CommitConflictAfterRewriteMutation {
+        type Output = ();
+
+        fn process_existing_row(
+            &mut self,
+            row: ManifestRowValue,
+            output: &mut ManifestBatchBuilder,
+            index_data: &mut ManifestIndexAccumulator,
+        ) -> lance_core::Result<()> {
+            output.append(
+                index_data,
+                ManifestOutputRow {
+                    object_id: &row.object_id,
+                    object_type: row.object_type,
+                    location: row.location.as_deref(),
+                    metadata: row.metadata.as_deref(),
+                    base_objects: row.base_objects.as_deref(),
+                },
+            )
+        }
+
+        fn append_rows(
+            &mut self,
+            output: &mut ManifestBatchBuilder,
+            index_data: &mut ManifestIndexAccumulator,
+        ) -> lance_core::Result<()> {
+            output.append(
+                index_data,
+                ManifestOutputRow {
+                    object_id: "attempted_table",
+                    object_type: ObjectType::Table,
+                    location: Some("attempted_table.lance"),
+                    metadata: None,
+                    base_objects: None,
+                },
+            )
+        }
+
+        fn finish(&self) -> CopyOnWriteMutation<Self::Output> {
+            let root = self.root.clone();
+            let object_id = self.conflict_object_id.clone();
+            std::thread::spawn(move || {
+                let runtime = tokio::runtime::Runtime::new().unwrap();
+                runtime.block_on(async move {
+                    let writer = create_manifest_namespace(&root, false).await;
+                    writer
+                        .insert_into_manifest_with_metadata(
+                            vec![ManifestEntry {
+                                object_id,
+                                object_type: ObjectType::Table,
+                                location: Some("conflicting_table.lance".to_string()),
+                                metadata: None,
+                            }],
+                            None,
+                        )
+                        .await
+                        .unwrap();
+                });
             })
-        })?;
+            .join()
+            .unwrap();
+            CopyOnWriteMutation::updated(())
+        }
+    }
+
+    /// A delete mutation that, during staging, has a concurrent writer delete the same
+    /// object and commit first, so our own commit hits a conflict while the object is
+    /// already gone — exercising `ConflictResolution::SucceedIfAbsent`.
+    struct ConcurrentDeleteBeforeCommitMutation {
+        inner: DeleteObjectMutation,
+        root: String,
+        target: String,
+    }
+
+    impl ManifestStreamMutation for ConcurrentDeleteBeforeCommitMutation {
+        type Output = ();
+
+        fn process_existing_row(
+            &mut self,
+            row: ManifestRowValue,
+            output: &mut ManifestBatchBuilder,
+            index_data: &mut ManifestIndexAccumulator,
+        ) -> lance_core::Result<()> {
+            self.inner.process_existing_row(row, output, index_data)
+        }
 
-        // Root namespace always exists
-        if namespace_id.is_empty() {
-            return Ok(());
+        fn append_rows(
+            &mut self,
+            output: &mut ManifestBatchBuilder,
+            index_data: &mut ManifestIndexAccumulator,
+        ) -> lance_core::Result<()> {
+            self.inner.append_rows(output, index_data)
         }
 
-        let object_id = namespace_id.join(DELIMITER);
-        if self.manifest_contains_object(&object_id).await? {
-            Ok(())
-        } else {
-            Err(NamespaceError::NamespaceNotFound {
-                message: object_id.to_string(),
+        fn finish(&self) -> CopyOnWriteMutation<Self::Output> {
+            let root = self.root.clone();
+            let target = self.target.clone();
+            std::thread::spawn(move || {
+                let runtime = tokio::runtime::Runtime::new().unwrap();
+                runtime.block_on(async move {
+                    let writer = create_manifest_namespace(&root, false).await;
+                    writer.delete_from_manifest(&target).await.unwrap();
+                });
+            })
+            .join()
+            .unwrap();
+            self.inner.finish()
+        }
+
+        fn conflict_resolution(&self) -> ConflictResolution<Self::Output> {
+            ConflictResolution::SucceedIfAbsent {
+                object_id: self.target.clone(),
+                output: (),
             }
-            .into())
         }
     }
 
-    async fn declare_table(&self, request: DeclareTableRequest) -> Result<DeclareTableResponse> {
-        let table_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Table ID is required".to_string(),
-            })
-        })?;
-
-        if table_id.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Table ID cannot be empty".to_string(),
+    async fn manifest_base_objects(
+        manifest_ns: &ManifestNamespace,
+    ) -> HashMap<String, Option<Vec<String>>> {
+        let mut scanner = manifest_ns.manifest_scanner().await.unwrap();
+        scanner.project(&["object_id", "base_objects"]).unwrap();
+        let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap();
+        let mut rows = HashMap::new();
+        for batch in batches {
+            let object_ids = ManifestNamespace::get_string_column(&batch, "object_id").unwrap();
+            let base_objects = ManifestNamespace::base_objects_column_values(&batch).unwrap();
+            for (row, value) in base_objects.into_iter().enumerate() {
+                rows.insert(object_ids.value(row).to_string(), value);
             }
-            .into());
         }
+        rows
+    }
 
-        let (namespace, table_name) = Self::split_object_id(table_id);
-        let object_id = Self::build_object_id(&namespace, &table_name);
+    /// Collects the location of every object under the manifest table's data directory.
+    async fn manifest_data_paths(manifest_ns: &ManifestNamespace) -> HashSet<String> {
+        let table_root = manifest_ns.base_path.clone().join(MANIFEST_TABLE_NAME);
+        let data_root = table_root.join(LANCE_DATA_DIR);
+        let store = &manifest_ns.object_store;
+        let mut listing = store.read_dir_all(&data_root, None);
+        let mut found = HashSet::new();
+        // Any listing error is a test failure, hence the unwrap on each entry.
+        while let Some(entry) = listing.next().await.transpose().unwrap() {
+            found.insert(entry.location.to_string());
+        }
+        found
+    }
 
-        // Check if table already exists in manifest
-        let existing = self.query_manifest_for_table(&object_id).await?;
-        if existing.is_some() {
-            return Err(NamespaceError::TableAlreadyExists {
-                message: table_name.to_string(),
-            }
-            .into());
+    /// Collects the location of every object under the manifest table's index directory.
+    async fn manifest_index_paths(manifest_ns: &ManifestNamespace) -> HashSet<String> {
+        let table_root = manifest_ns.base_path.clone().join(MANIFEST_TABLE_NAME);
+        let index_root = table_root.join(LANCE_INDICES_DIR);
+        let store = &manifest_ns.object_store;
+        let mut listing = store.read_dir_all(&index_root, None);
+        let mut found = HashSet::new();
+        // Any listing error is a test failure, hence the unwrap on each entry.
+        while let Some(entry) = listing.next().await.transpose().unwrap() {
+            found.insert(entry.location.to_string());
         }
+        found
+    }
 
-        // Create table location path with hash-based naming
-        // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance
-        // Otherwise, use hash-based naming: {hash}_{object_id}
-        let dir_name = if namespace.is_empty() && self.dir_listing_enabled {
-            // Root table with directory listing enabled: use {table_name}.lance
-            format!("{}.lance", table_name)
-        } else {
-            // Child namespace table or dir listing disabled: use hash-based naming
-            Self::generate_dir_name(&object_id)
-        };
-        let table_path = self.base_path.clone().join(dir_name.as_str());
-        let table_uri = Self::construct_full_uri(&self.root, &dir_name)?;
+    fn create_test_ipc_data() -> Vec<u8> {
+        use arrow::array::{Int32Array, StringArray};
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::ipc::writer::StreamWriter;
+        use arrow::record_batch::RecordBatch;
+        use std::sync::Arc;
 
-        // Validate location if provided
-        if let Some(req_location) = &request.location {
-            let req_location = req_location.trim_end_matches('/');
-            if req_location != table_uri {
-                return Err(NamespaceError::InvalidInput {
-                    message: format!(
-                        "Cannot declare table {} at location {}, must be at location {}",
-                        table_name, req_location, table_uri
-                    ),
-                }
-                .into());
-            }
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, false),
+        ]));
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StringArray::from(vec!["a", "b", "c"])),
+            ],
+        )
+        .unwrap();
+
+        let mut buffer = Vec::new();
+        {
+            let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
         }
+        buffer
+    }
 
-        // Create the .lance-reserved file to mark the table as existing
-        let reserved_file_path = table_path.clone().join(".lance-reserved");
+    /// Writes `key = value` into the table metadata of the manifest dataset, bypassing
+    /// the namespace API, to mimic a newer Lance client persisting a feature flag.
+    async fn set_manifest_table_metadata(temp_path: &str, key: &str, value: &str) {
+        use lance::dataset::builder::DatasetBuilder;
+        let manifest_uri = format!("{}/{}", temp_path, MANIFEST_TABLE_NAME);
+        let builder = DatasetBuilder::from_uri(manifest_uri);
+        let mut manifest = builder.load().await.unwrap();
+        let entries = [(key, value)];
+        manifest.update_metadata(entries).await.unwrap();
+    }
 
-        self.object_store
-            .create(&reserved_file_path)
+    async fn create_namespace_with_one_table(temp_path: &str) {
+        let ns = DirectoryNamespaceBuilder::new(temp_path)
+            .build()
             .await
-            .map_err(|e| {
-                lance_core::Error::from(NamespaceError::Internal {
-                    message: format!(
-                        "Failed to create .lance-reserved file for table {}: {}",
-                        table_name, e
-                    ),
-                })
-            })?
-            .shutdown()
+            .unwrap();
+        let mut create_request = CreateTableRequest::new();
+        create_request.id = Some(vec!["t1".to_string()]);
+        ns.create_table(create_request, Bytes::from(create_test_ipc_data()))
             .await
-            .map_err(|e| {
-                lance_core::Error::from(NamespaceError::Internal {
-                    message: format!(
-                        "Failed to finalize .lance-reserved file for table {}: {}",
-                        table_name, e
-                    ),
-                })
-            })?;
+            .unwrap();
+    }
 
-        let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?;
+    /// This is a forward-compatibility checker only: it must not set any feature
+    /// flag, so existing clients keep treating the manifest as compatible.
+    #[tokio::test]
+    async fn test_manifest_has_no_feature_flags_by_default() {
+        use lance::dataset::builder::DatasetBuilder;
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        create_namespace_with_one_table(temp_path).await;
 
-        // Add entry to manifest marking this as a declared table (store dir_name, not full path)
-        self.insert_into_manifest_with_metadata(
-            vec![ManifestEntry {
-                object_id,
-                object_type: ObjectType::Table,
-                location: Some(dir_name),
-                metadata,
-            }],
-            None,
+        let ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME))
+            .load()
+            .await
+            .unwrap();
+        assert!(
+            !ds.metadata()
+                .contains_key(crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY)
+        );
+        assert!(
+            !ds.metadata()
+                .contains_key(crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY)
+        );
+    }
+
+    /// An unknown reader feature flag must block opening the catalog with a clear
+    /// "please upgrade" error rather than silently degrading to directory listing.
+    #[tokio::test]
+    async fn test_unknown_reader_flag_blocks_access() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        create_namespace_with_one_table(temp_path).await;
+        set_manifest_table_metadata(
+            temp_path,
+            crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY,
+            "1",
         )
-        .await?;
+        .await;
 
-        log::info!(
-            "Declared table '{}' in manifest at {}",
-            table_name,
-            table_uri
+        let err = DirectoryNamespaceBuilder::new(temp_path)
+            .build()
+            .await
+            .expect_err("opening a manifest with an unknown reader flag should fail");
+        assert!(
+            err.to_string().to_lowercase().contains("upgrade"),
+            "expected an upgrade error, got: {err}"
         );
+    }
 
-        // For backwards compatibility, only skip vending credentials when explicitly set to false
-        let vend_credentials = request.vend_credentials.unwrap_or(true);
-        let storage_options = if vend_credentials {
-            self.storage_options.clone()
-        } else {
-            None
-        };
+    /// An unknown writer feature flag must still allow reads but block writes.
+    #[tokio::test]
+    async fn test_unknown_writer_flag_blocks_writes_but_allows_reads() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        create_namespace_with_one_table(temp_path).await;
+        set_manifest_table_metadata(
+            temp_path,
+            crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY,
+            "1",
+        )
+        .await;
 
-        Ok(DeclareTableResponse {
-            location: Some(table_uri),
-            storage_options,
-            properties: request.properties,
-            ..Default::default()
-        })
+        let ns = DirectoryNamespaceBuilder::new(temp_path)
+            .build()
+            .await
+            .expect("reads should still be allowed with only a writer flag set");
+        let mut list_request = ListTablesRequest::new();
+        list_request.id = Some(vec![]);
+        assert_eq!(ns.list_tables(list_request).await.unwrap().tables.len(), 1);
+
+        // A refused write must not leave an orphaned table dataset behind.
+        let entries_before = dir_entry_names(temp_path);
+        let mut create_request = CreateTableRequest::new();
+        create_request.id = Some(vec!["t2".to_string()]);
+        let err = ns
+            .create_table(create_request, Bytes::from(create_test_ipc_data()))
+            .await
+            .expect_err("writing through an unknown writer flag should fail");
+        assert!(
+            err.to_string().to_lowercase().contains("upgrade"),
+            "expected an upgrade error, got: {err}"
+        );
+        assert_eq!(
+            entries_before,
+            dir_entry_names(temp_path),
+            "a refused create_table must not create an orphaned table directory"
+        );
+
+        // Mutations that go straight through rewrite_manifest (no early
+        // create_table check) must also be refused: an insert (create_namespace)
+        // and a delete (drop_table). This proves the writer check is enforced at
+        // the single copy-on-write chokepoint, not just on the create_table path.
+        let mut create_ns = CreateNamespaceRequest::new();
+        create_ns.id = Some(vec!["ns1".to_string()]);
+        let err = ns
+            .create_namespace(create_ns)
+            .await
+            .expect_err("create_namespace through an unknown writer flag should fail");
+        assert!(
+            err.to_string().to_lowercase().contains("upgrade"),
+            "expected an upgrade error, got: {err}"
+        );
+
+        let mut drop_request = DropTableRequest::new();
+        drop_request.id = Some(vec!["t1".to_string()]);
+        let err = ns
+            .drop_table(drop_request)
+            .await
+            .expect_err("drop_table through an unknown writer flag should fail");
+        assert!(
+            err.to_string().to_lowercase().contains("upgrade"),
+            "expected an upgrade error, got: {err}"
+        );
+    }
+
+    /// Returns the sorted set of entry names directly inside `path` (non-recursive).
+    fn dir_entry_names(path: &str) -> std::collections::BTreeSet<String> {
+        let entries = std::fs::read_dir(path).unwrap();
+        let names = entries.map(|e| e.unwrap().file_name().to_string_lossy().into_owned());
+        names.collect()
+    }
+
+    /// Manifest rewrites must keep `metadata` as Utf8 and round-trip `base_objects`.
+    #[tokio::test]
+    async fn test_manifest_rewrite_preserves_utf8_metadata_and_base_objects() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, true).await;
+
+        // A table row carrying both JSON metadata and a base-object list.
+        let view_bases = vec!["base_a".to_string(), "base_b".to_string()];
+        let view_entry = ManifestEntry {
+            object_id: "view".to_string(),
+            object_type: ObjectType::Table,
+            location: Some("view.lance".to_string()),
+            metadata: Some(r#"{"kind":"view"}"#.to_string()),
+        };
+        manifest_ns
+            .insert_into_manifest_with_metadata(vec![view_entry], Some(view_bases.clone()))
+            .await
+            .unwrap();
+
+        // A namespace row with metadata but no base objects.
+        let namespace_entry = ManifestEntry {
+            object_id: "other".to_string(),
+            object_type: ObjectType::Namespace,
+            location: None,
+            metadata: Some(r#"{"kind":"namespace"}"#.to_string()),
+        };
+        manifest_ns
+            .insert_into_manifest_with_metadata(vec![namespace_entry], None)
+            .await
+            .unwrap();
+
+        {
+            // Scope the dataset guard so it is released before scanning the manifest.
+            let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap();
+            let metadata_field = dataset_guard.schema().field("metadata").unwrap();
+            assert_eq!(metadata_field.data_type(), DataType::Utf8);
+        }
+
+        let base_objects = manifest_base_objects(&manifest_ns).await;
+        assert_eq!(base_objects.get("view").cloned().unwrap(), Some(view_bases));
+        assert_eq!(base_objects.get("other").cloned().unwrap(), None);
+    }
+
+    /// Every replacement index must be stamped with the rewritten dataset version.
+    #[tokio::test]
+    async fn test_manifest_rewrite_replacement_indices_are_versioned() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, true).await;
+
+        let table_entry = ManifestEntry {
+            object_id: "table".to_string(),
+            object_type: ObjectType::Table,
+            location: Some("table.lance".to_string()),
+            metadata: None,
+        };
+        manifest_ns
+            .insert_into_manifest_with_metadata(vec![table_entry], Some(vec!["base".to_string()]))
+            .await
+            .unwrap();
+
+        let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap();
+        let dataset_version = dataset_guard.version().version;
+        let indices = dataset_guard.load_indices().await.unwrap();
+
+        // All three manifest indices must be present after the rewrite.
+        let names: HashSet<&str> = indices.iter().map(|index| index.name.as_str()).collect();
+        assert!(names.contains(OBJECT_ID_INDEX_NAME));
+        assert!(names.contains(OBJECT_TYPE_INDEX_NAME));
+        assert!(names.contains(BASE_OBJECTS_INDEX_NAME));
+
+        // Each index must match the current version and cover at least one fragment.
+        for index in indices.iter() {
+            assert_eq!(index.dataset_version, dataset_version);
+            assert!(!index.fragment_bitmap.as_ref().unwrap().is_empty());
+        }
+    }
+
+    /// Deleting the last row must leave an empty manifest whose indices are still current.
+    #[tokio::test]
+    async fn test_manifest_rewrite_empty_manifest_keeps_replacement_indices_valid() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, true).await;
+
+        let table_entry = ManifestEntry {
+            object_id: "table".to_string(),
+            object_type: ObjectType::Table,
+            location: Some("table.lance".to_string()),
+            metadata: None,
+        };
+        manifest_ns
+            .insert_into_manifest_with_metadata(vec![table_entry], None)
+            .await
+            .unwrap();
+        manifest_ns.delete_from_manifest("table").await.unwrap();
+        assert!(!manifest_ns.manifest_contains_object("table").await.unwrap());
+
+        // The manifest should now hold zero rows.
+        let mut scanner = manifest_ns.manifest_scanner().await.unwrap();
+        scanner.project(&["object_id"]).unwrap();
+        let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap();
+        let mut rows = 0;
+        for batch in &batches {
+            rows += batch.num_rows();
+        }
+        assert_eq!(rows, 0);
+
+        let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap();
+        let dataset_version = dataset_guard.version().version;
+        let indices = dataset_guard.load_indices().await.unwrap();
+
+        // All three manifest indices must survive the rewrite to an empty manifest.
+        let names: HashSet<&str> = indices.iter().map(|index| index.name.as_str()).collect();
+        assert!(names.contains(OBJECT_ID_INDEX_NAME));
+        assert!(names.contains(OBJECT_TYPE_INDEX_NAME));
+        assert!(names.contains(BASE_OBJECTS_INDEX_NAME));
+
+        // Each index must be stamped with the current dataset version.
+        for index in indices.iter() {
+            assert_eq!(index.dataset_version, dataset_version);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_manifest_rewrite_fragment_bitmap_uses_overwrite_fragment_ids() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, false).await;
+        let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap();
+        let fragments = vec![Fragment::new(0), Fragment::new(0), Fragment::new(7)];
+
+        let manifest = ManifestNamespace::manifest_from_overwrite_transaction(
+            dataset_guard.manifest(),
+            dataset_guard.manifest().schema.clone(),
+            &fragments,
+        );
+
+        let fragment_ids = manifest
+            .fragments
+            .iter()
+            .map(|fragment| fragment.id)
+            .collect::<Vec<_>>();
+        assert_eq!(fragment_ids, vec![0, 1, 7]);
+        assert_eq!(
+            ManifestNamespace::manifest_fragment_bitmap(&manifest)
+                .unwrap()
+                .into_iter()
+                .collect::<Vec<_>>(),
+            vec![0, 1, 7]
+        );
     }
 
-    async fn register_table(&self, request: RegisterTableRequest) -> Result<RegisterTableResponse> {
-        let table_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Table ID is required".to_string(),
-            })
-        })?;
+    #[tokio::test]
+    async fn test_manifest_noop_delete_uses_latest_snapshot() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let stale_ns = create_manifest_namespace(temp_path, false).await;
+        let writer_ns = create_manifest_namespace(temp_path, false).await;
 
-        if table_id.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Table ID cannot be empty".to_string(),
-            }
-            .into());
-        }
+        writer_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "late_table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("late_table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
 
-        let location = request.location.clone();
+        stale_ns.delete_from_manifest("late_table").await.unwrap();
 
-        // Validate that location is a relative path within the root directory
-        // We don't allow absolute URIs or paths that escape the root
-        if location.contains("://") {
-            return Err(NamespaceError::InvalidInput {
-                message: format!(
-                    "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}",
-                    location
-                ),
-            }
-            .into());
-        }
+        let check_ns = create_manifest_namespace(temp_path, false).await;
+        assert!(
+            !check_ns
+                .manifest_contains_object("late_table")
+                .await
+                .unwrap()
+        );
+    }
 
-        if location.starts_with('/') {
-            return Err(NamespaceError::InvalidInput {
-                message: format!(
-                    "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}",
-                    location
-                ),
-            }
-            .into());
-        }
+    #[tokio::test]
+    async fn test_manifest_noop_delete_cleans_uncommitted_data_file() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, false).await;
 
-        // Check for path traversal attempts
-        if location.contains("..") {
-            return Err(NamespaceError::InvalidInput {
-                message: format!(
-                    "Path traversal is not allowed. Location must be a relative path within the root directory: {}",
-                    location
-                ),
-            }
-            .into());
-        }
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
 
-        let (namespace, table_name) = Self::split_object_id(table_id);
-        let object_id = Self::build_object_id(&namespace, &table_name);
+        let before = manifest_data_paths(&manifest_ns).await;
+        assert!(!before.is_empty());
 
-        // Validate that parent namespaces exist (if not root)
-        if !namespace.is_empty() {
-            self.validate_namespace_levels_exist(&namespace).await?;
-        }
+        manifest_ns
+            .delete_from_manifest("missing_table")
+            .await
+            .unwrap();
 
-        // Check if table already exists
-        if self.manifest_contains_object(&object_id).await? {
-            return Err(NamespaceError::TableAlreadyExists {
-                message: object_id.to_string(),
-            }
-            .into());
-        }
+        let after = manifest_data_paths(&manifest_ns).await;
+        assert_eq!(after, before);
+    }
 
-        // Register the table with its location in the manifest
-        self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone()))
-            .await?;
+    #[tokio::test]
+    async fn test_manifest_final_commit_failure_cleans_uncommitted_rewrite_files() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace_with_retries(temp_path, true, Some(0)).await;
 
-        Ok(RegisterTableResponse {
-            location: Some(location),
-            ..Default::default()
-        })
-    }
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
 
-    async fn deregister_table(
-        &self,
-        request: DeregisterTableRequest,
-    ) -> Result<DeregisterTableResponse> {
-        let table_id = request.id.as_ref().ok_or_else(|| {
-            lance_core::Error::from(NamespaceError::InvalidInput {
-                message: "Table ID is required".to_string(),
+        let before_data_paths = manifest_data_paths(&manifest_ns).await;
+        let before_index_paths = manifest_index_paths(&manifest_ns).await;
+
+        let result = manifest_ns
+            .rewrite_manifest("Failed to test manifest cleanup", || {
+                CommitConflictAfterRewriteMutation {
+                    root: temp_path.to_string(),
+                    conflict_object_id: "conflicting_table".to_string(),
+                }
             })
-        })?;
+            .await;
+        assert!(result.is_err());
 
-        if table_id.is_empty() {
-            return Err(NamespaceError::InvalidInput {
-                message: "Table ID cannot be empty".to_string(),
-            }
-            .into());
-        }
+        let after_data_paths = manifest_data_paths(&manifest_ns).await;
+        assert!(before_data_paths.is_subset(&after_data_paths));
+        assert_eq!(after_data_paths.len(), before_data_paths.len() + 1);
+        assert_eq!(manifest_index_paths(&manifest_ns).await, before_index_paths);
+        assert!(
+            manifest_ns
+                .manifest_contains_object("conflicting_table")
+                .await
+                .unwrap()
+        );
+        assert!(
+            !manifest_ns
+                .manifest_contains_object("attempted_table")
+                .await
+                .unwrap()
+        );
+    }
 
-        let (namespace, table_name) = Self::split_object_id(table_id);
-        let object_id = Self::build_object_id(&namespace, &table_name);
+    #[tokio::test]
+    async fn test_manifest_commit_visible_on_memory_store() {
+        // Regression: the commit must use the same object store the manifest dataset reads
+        // from. On `memory://` the namespace store and the dataset store can be different
+        // in-memory instances, so a commit written to the wrong one is invisible to reads
+        // (manifests as stale version -> endless conflict / "not found").
+        let manifest_ns = create_manifest_namespace("memory://test_commit_visible", false).await;
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
+        assert!(manifest_ns.manifest_contains_object("table").await.unwrap());
+        // A second sequential commit must not falsely conflict.
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table2".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table2.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
+        assert!(
+            manifest_ns
+                .manifest_contains_object("table2")
+                .await
+                .unwrap()
+        );
+    }
 
-        // Get table info before deleting
-        let table_info = self.query_manifest_for_table(&object_id).await?;
+    #[tokio::test]
+    async fn test_manifest_commit_uses_inline_transaction() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, false).await;
 
-        let table_uri = match table_info {
-            Some(info) => {
-                // Delete from manifest only (leave physical data intact)
-                self.delete_from_manifest(&object_id).boxed().await?;
-                Self::construct_full_uri(&self.root, &info.location)?
-            }
-            None => {
-                return Err(NamespaceError::TableNotFound {
-                    message: object_id.to_string(),
-                }
-                .into());
-            }
-        };
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
 
-        Ok(DeregisterTableResponse {
-            id: request.id.clone(),
-            location: Some(table_uri),
-            ..Default::default()
-        })
+        let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap();
+        let manifest = dataset_guard.manifest();
+        // The overwrite transaction is embedded inline in the manifest, never written as a
+        // separate _transactions/*.txn file.
+        assert!(manifest.transaction_section.is_some());
+        assert!(manifest.transaction_file.is_none());
     }
-}
 
-#[cfg(test)]
-mod tests {
-    use crate::{DirectoryNamespaceBuilder, ManifestNamespace};
-    use bytes::Bytes;
-    use lance_core::utils::tempfile::TempStdDir;
-    use lance_namespace::LanceNamespace;
-    use lance_namespace::models::{
-        CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest,
-        ListTablesRequest, TableExistsRequest,
-    };
-    use rstest::rstest;
+    #[tokio::test]
+    async fn test_manifest_commit_landed_attributes_data_file() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace(temp_path, false).await;
 
-    fn create_test_ipc_data() -> Vec<u8> {
-        use arrow::array::{Int32Array, StringArray};
-        use arrow::datatypes::{DataType, Field, Schema};
-        use arrow::ipc::writer::StreamWriter;
-        use arrow::record_batch::RecordBatch;
-        use std::sync::Arc;
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
 
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, false),
-        ]));
+        let dataset = Arc::new(manifest_ns.manifest_dataset.get().await.unwrap().clone());
+        let version = dataset.manifest().version;
+        let our_files = dataset
+            .manifest()
+            .fragments
+            .iter()
+            .flat_map(|fragment| fragment.files.iter())
+            .map(|file| file.path.clone())
+            .collect::<HashSet<_>>();
+        assert!(!our_files.is_empty());
 
-        let batch = RecordBatch::try_new(
-            schema.clone(),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 2, 3])),
-                Arc::new(StringArray::from(vec!["a", "b", "c"])),
-            ],
-        )
-        .unwrap();
+        // The committed version references our data file => attributed to us (a lost-ack
+        // commit must be treated as success, not cleaned up).
+        assert!(
+            manifest_ns
+                .manifest_commit_landed(&dataset, version, &our_files)
+                .await
+        );
+        // A different file set is not attributed to us.
+        let other = HashSet::from(["missing.lance".to_string()]);
+        assert!(
+            !manifest_ns
+                .manifest_commit_landed(&dataset, version, &other)
+                .await
+        );
+        // A version that does not exist did not land.
+        assert!(
+            !manifest_ns
+                .manifest_commit_landed(&dataset, version + 100, &our_files)
+                .await
+        );
+    }
 
-        let mut buffer = Vec::new();
-        {
-            let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap();
-            writer.write(&batch).unwrap();
-            writer.finish().unwrap();
-        }
-        buffer
+    #[tokio::test]
+    async fn test_manifest_delete_conflict_with_concurrent_delete_succeeds() {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace_with_retries(temp_path, false, Some(0)).await;
+
+        manifest_ns
+            .insert_into_manifest_with_metadata(
+                vec![ManifestEntry {
+                    object_id: "table".to_string(),
+                    object_type: ObjectType::Table,
+                    location: Some("table.lance".to_string()),
+                    metadata: None,
+                }],
+                None,
+            )
+            .await
+            .unwrap();
+        assert!(manifest_ns.manifest_contains_object("table").await.unwrap());
+
+        // A concurrent writer deletes "table" and commits first, so our own delete commit
+        // conflicts while "table" is already gone. Native resolution treats the goal as
+        // achieved and succeeds instead of erroring or retrying forever.
+        let result = manifest_ns
+            .rewrite_manifest("Failed to delete from manifest", || {
+                ConcurrentDeleteBeforeCommitMutation {
+                    inner: DeleteObjectMutation {
+                        object_id: "table".to_string(),
+                        deleted: false,
+                    },
+                    root: temp_path.to_string(),
+                    target: "table".to_string(),
+                }
+            })
+            .await;
+
+        assert!(result.is_ok(), "delete should succeed: {result:?}");
+        assert!(!manifest_ns.manifest_contains_object("table").await.unwrap());
     }
 
     #[rstest]
@@ -3939,9 +5424,9 @@ mod tests {
     /// Test that concurrent create_table calls for the same table name don't
     /// create duplicate entries in the manifest. Uses two independent
     /// ManifestNamespace instances pointing at the same directory to simulate
-    /// two separate OS processes racing on table creation. The conflict_retries
-    /// setting on the MergeInsert ensures the second operation properly detects
-    /// the duplicate via WhenMatched::Fail after retrying against the latest data.
+    /// two separate OS processes racing on table creation. Copy-on-write rewrite
+    /// retries ensure the second operation detects the duplicate after retrying
+    /// against the latest data.
     #[tokio::test]
     async fn test_concurrent_create_table_no_duplicates() {
         let temp_dir = TempStdDir::default();
diff --git a/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs
new file mode 100644
index 00000000000..d0849ceda4f
--- /dev/null
+++ b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Reader/writer feature flags for the directory-catalog `__manifest` dataset.
+//!
+//! Forward-compatibility infrastructure for the `__manifest` Lance dataset,
+//! analogous to the Lance table format's `reader_feature_flags` /
+//! `writer_feature_flags` but describing the *catalog manifest* format (schema
+//! and semantics) rather than the underlying Lance file format. The flags are
+//! persisted in the `__manifest` dataset's `table_metadata` map.
+//!
+//! Each manifest feature owns one bit in a `u64` bitmask. A build may read a
+//! `__manifest` only if it understands every set reader-flag bit, and may write
+//! it only if it understands every set writer-flag bit; otherwise it fails fast
+//! with a clear "please upgrade" error instead of silently misreading data. The
+//! set of bits a build understands is `READER_KNOWN_FLAGS` / `WRITER_KNOWN_FLAGS`.
+//!
+//! This is the mechanism only: no manifest feature is defined yet, so the known
+//! masks are `0` and nothing is ever set — every current manifest reads and
+//! writes unchanged. The first format change that needs forward-compatibility
+//! protection adds its bit to the known masks and stamps it on write; from then
+//! on, builds without that bit refuse the new format rather than misreading it.
+//! Manifests written before this mechanism carry no flag keys, which parse as
+//! `0` and stay compatible with every build.
+
+use std::collections::HashMap;
+
+use lance_core::{Error, Result};
+use lance_namespace::error::NamespaceError;
+
+/// `table_metadata` key holding the reader feature-flag bitmask (decimal `u64`).
+pub const READER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.reader_feature_flags";
+/// `table_metadata` key holding the writer feature-flag bitmask (decimal `u64`).
+pub const WRITER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.writer_feature_flags";
+
+/// Reader feature-flag bits this build understands. No manifest feature is
+/// defined yet, so this build understands none and refuses any non-zero reader
+/// flag. A future format change adds its bit here.
+const READER_KNOWN_FLAGS: u64 = 0;
+/// Writer feature-flag bits this build understands.
+const WRITER_KNOWN_FLAGS: u64 = 0;
+
+/// Whether this build can read a `__manifest` whose persisted reader feature
+/// flags are `reader_flags` — i.e. no bit outside `READER_KNOWN_FLAGS` is set.
+pub fn can_read_manifest(reader_flags: u64) -> bool {
+    (reader_flags & !READER_KNOWN_FLAGS) == 0
+}
+
+/// Whether this build can write a `__manifest` whose persisted writer feature
+/// flags are `writer_flags` — i.e. no bit outside `WRITER_KNOWN_FLAGS` is set.
+pub fn can_write_manifest(writer_flags: u64) -> bool {
+    (writer_flags & !WRITER_KNOWN_FLAGS) == 0
+}
+
+fn parse_flags(table_metadata: &HashMap<String, String>, key: &str) -> Result<u64> {
+    match table_metadata.get(key) {
+        None => Ok(0),
+        Some(raw) => raw.parse::<u64>().map_err(|e| {
+            Error::from(NamespaceError::Unsupported {
+                message: format!(
+                    "The __manifest dataset has an unparsable feature-flag value '{raw}' for \
+                     '{key}': {e}. This likely means it was written by a newer, incompatible \
+                     version of Lance; please upgrade Lance to use this catalog."
+                ),
+            })
+        }),
+    }
+}
+
+/// Reader feature flags in the `__manifest` `table_metadata`: `0` if absent, `Err` if unparsable.
+pub fn reader_flags(table_metadata: &HashMap<String, String>) -> Result<u64> {
+    parse_flags(table_metadata, READER_FEATURE_FLAGS_KEY)
+}
+
+/// Writer feature flags in the `__manifest` `table_metadata`: `0` if absent, `Err` if unparsable.
+pub fn writer_flags(table_metadata: &HashMap<String, String>) -> Result<u64> {
+    parse_flags(table_metadata, WRITER_FEATURE_FLAGS_KEY)
+}
+
+/// Check that this build can READ the `__manifest` described by `table_metadata`; an
+/// unknown or unparsable reader flag yields an `Unsupported` "please upgrade" error.
+pub fn ensure_readable(table_metadata: &HashMap<String, String>) -> Result<()> {
+    let flags = reader_flags(table_metadata)?;
+    if !can_read_manifest(flags) {
+        return Err(Error::from(NamespaceError::Unsupported {
+            message: format!(
+                "The __manifest dataset was written with reader feature flags {flags}, which this \
+                 version of Lance does not understand (known reader flags: {READER_KNOWN_FLAGS}). \
+                 Please upgrade Lance to read this catalog."
+            ),
+        }));
+    }
+    Ok(())
+}
+
+/// Check that this build can WRITE the `__manifest` described by `table_metadata`; an
+/// unknown or unparsable writer flag yields an `Unsupported` "please upgrade" error.
+pub fn ensure_writable(table_metadata: &HashMap<String, String>) -> Result<()> {
+    let flags = writer_flags(table_metadata)?;
+    if !can_write_manifest(flags) {
+        return Err(Error::from(NamespaceError::Unsupported {
+            message: format!(
+                "The __manifest dataset was written with writer feature flags {flags}, which this \
+                 version of Lance does not understand (known writer flags: {WRITER_KNOWN_FLAGS}). \
+                 Please upgrade Lance to modify this catalog."
+            ),
+        }));
+    }
+    Ok(())
+}
+
+/// Whether `err` indicates the `__manifest` is in a format this build cannot
+/// handle — i.e. an unknown or unparsable reader/writer feature flag, surfaced
+/// by [`ensure_readable`] / [`ensure_writable`] as a [`NamespaceError::Unsupported`].
+///
+/// Catalog initialization uses this to refuse opening such a manifest rather
+/// than silently degrading to a directory-listing view that ignores it. The
+/// `__manifest` open path raises no other `Unsupported` error, so matching the
+/// variant is sufficient and avoids brittle message matching.
+pub fn is_incompatible_manifest_error(err: &Error) -> bool {
+    matches!(
+        err,
+        Error::Namespace { source, .. }
+            if source
+                .downcast_ref::<NamespaceError>()
+                .is_some_and(|e| matches!(e, NamespaceError::Unsupported { .. }))
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn meta(pairs: &[(&str, &str)]) -> HashMap<String, String> {
+        pairs
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect()
+    }
+
+    #[test]
+    fn unflagged_is_compatible() {
+        assert!(can_read_manifest(0));
+        assert!(can_write_manifest(0));
+        let empty = HashMap::new();
+        assert!(ensure_readable(&empty).is_ok());
+        assert!(ensure_writable(&empty).is_ok());
+        assert_eq!(reader_flags(&empty).unwrap(), 0);
+        assert_eq!(writer_flags(&empty).unwrap(), 0);
+        // Explicit zeroes are also compatible.
+        let zeroed = meta(&[
+            (READER_FEATURE_FLAGS_KEY, "0"),
+            (WRITER_FEATURE_FLAGS_KEY, "0"),
+        ]);
+        assert!(ensure_readable(&zeroed).is_ok());
+        assert!(ensure_writable(&zeroed).is_ok());
+    }
+
+    #[test]
+    fn any_unknown_flag_is_refused() {
+        // This build understands no feature flags, so any non-zero bit is refused.
+        assert!(!can_read_manifest(1));
+        assert!(!can_write_manifest(1));
+        assert!(!can_read_manifest(1 << 30));
+        assert!(!can_write_manifest(1 << 63));
+
+        let reader = meta(&[(READER_FEATURE_FLAGS_KEY, "1")]);
+        let err = ensure_readable(&reader).unwrap_err();
+        assert!(err.to_string().to_lowercase().contains("upgrade"));
+        assert!(is_incompatible_manifest_error(&err));
+        // Only the reader key is set; the absent writer key parses as `0`, so writing is allowed.
+        assert!(ensure_writable(&reader).is_ok());
+
+        let writer = meta(&[(WRITER_FEATURE_FLAGS_KEY, "2")]);
+        let err = ensure_writable(&writer).unwrap_err();
+        assert!(err.to_string().to_lowercase().contains("upgrade"));
+        assert!(is_incompatible_manifest_error(&err));
+    }
+
+    #[test]
+    fn unparsable_value_is_refused() {
+        let m = meta(&[(READER_FEATURE_FLAGS_KEY, "not-a-number")]);
+        assert!(reader_flags(&m).is_err());
+        assert!(ensure_readable(&m).is_err());
+    }
+
+    #[test]
+    fn unrelated_error_is_not_an_incompatibility() {
+        let other = Error::from(NamespaceError::TableNotFound {
+            message: "x".to_string(),
+        });
+        assert!(!is_incompatible_manifest_error(&other));
+    }
+}
diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs
index 7324ab0bb0e..44ebd866810 100644
--- a/rust/lance-namespace-impls/src/rest_adapter.rs
+++ b/rust/lance-namespace-impls/src/rest_adapter.rs
@@ -1527,8 +1527,7 @@ mod tests {
             }
 
             /// Like [`Self::new`], with managed versioning (table version
-            /// tracking through the `__manifest` catalog) enabled on the
-            /// backend.
+            /// tracking) enabled on the backend.
             async fn new_managed() -> Self {
                 Self::build(true).await
             }
@@ -1540,9 +1539,7 @@ mod tests {
                 // Create DirectoryNamespace backend with manifest enabled
                 let mut builder = DirectoryNamespaceBuilder::new(&temp_path).manifest_enabled(true);
                 if managed_versioning {
-                    builder = builder
-                        .table_version_tracking_enabled(true)
-                        .table_version_storage_enabled(true);
+                    builder = builder.table_version_tracking_enabled(true);
                 }
                 let backend = builder.build().await.unwrap();
                 let backend = Arc::new(backend);
diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs
index ecacc118074..0fea6498fc9 100644
--- a/rust/lance-select/src/mask.rs
+++ b/rust/lance-select/src/mask.rs
@@ -13,7 +13,7 @@ use itertools::Itertools;
 use lance_core::deepsize::DeepSizeOf;
 use roaring::{MultiOps, RoaringBitmap, RoaringTreemap};
 
-use lance_core::cache::CacheCodecImpl;
+use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter};
 use lance_core::utils::address::RowAddress;
 use lance_core::{Error, Result};
 
@@ -697,12 +697,17 @@ impl RowAddrTreeMap {
 }
 
 impl CacheCodecImpl for RowAddrTreeMap {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        self.serialize_into(writer)
+    const TYPE_ID: &'static str = "lance.RowAddrTreeMap";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        // A roaring bitmap has its own stable, portable serialization; it is
+        // the whole body, so write it raw rather than length-prefixed.
+        self.serialize_into(w.raw_writer())
     }
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self> {
-        Self::deserialize_from(data.as_ref())
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        Self::deserialize_from(r.body().as_ref())
     }
 }
 
diff --git a/rust/lance-table/src/format/index.rs b/rust/lance-table/src/format/index.rs
index 33ee464fe76..f603536a3eb 100644
--- a/rust/lance-table/src/format/index.rs
+++ b/rust/lance-table/src/format/index.rs
@@ -15,6 +15,7 @@ use roaring::RoaringBitmap;
 use uuid::Uuid;
 
 use super::pb;
+use lance_core::cache::{CacheEntryReader, CacheEntryWriter};
 use lance_core::{Error, Result};
 
 /// Metadata about a single file within an index segment.
@@ -235,24 +236,26 @@ impl From<&IndexMetadata> for pb::IndexMetadata {
 /// orphan rule prevents `impl CacheCodecImpl for Vec<IndexMetadata>`.
 type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
 
+/// Stable type identifier for the `Vec<IndexMetadata>` cache entry.
+const INDEX_METADATA_TYPE_ID: &str = "lance.table.IndexMetadataList";
+/// Body schema version written by this build.
+const INDEX_METADATA_VERSION: u32 = 1;
+
 fn serialize_index_metadata(
     any: &ArcAny,
-    writer: &mut dyn std::io::Write,
+    writer: &mut CacheEntryWriter<'_>,
 ) -> lance_core::Result<()> {
-    use prost::Message;
     let vec = any
         .downcast_ref::<Vec<IndexMetadata>>()
         .expect("index_metadata_codec: wrong type (this is a bug in the cache layer)");
     let section = pb::IndexSection {
         indices: vec.iter().map(pb::IndexMetadata::from).collect(),
     };
-    writer.write_all(&section.encode_to_vec())?;
-    Ok(())
+    writer.write_header(&section)
 }
 
-fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result<ArcAny> {
-    use prost::Message;
-    let section = pb::IndexSection::decode(data.as_ref())?;
+fn deserialize_index_metadata(reader: &mut CacheEntryReader<'_>) -> lance_core::Result<ArcAny> {
+    let section: pb::IndexSection = reader.read_header()?;
     let indices: Vec<IndexMetadata> = section
         .indices
         .into_iter()
@@ -262,7 +265,12 @@ fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result<ArcAny>
 }
 
 pub fn index_metadata_codec() -> lance_core::cache::CacheCodec {
-    lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata)
+    lance_core::cache::CacheCodec::new(
+        INDEX_METADATA_TYPE_ID,
+        INDEX_METADATA_VERSION,
+        serialize_index_metadata,
+        deserialize_index_metadata,
+    )
 }
 
 /// List all files in an index directory with their sizes.
@@ -348,7 +356,8 @@ mod tests {
         let bytes = store.get(&key).unwrap();
         let recovered = codec
             .deserialize(&bytes::Bytes::copy_from_slice(bytes))
-            .unwrap();
+            .hit()
+            .expect("entry should decode as a hit");
         let recovered = recovered
             .downcast::<Vec<IndexMetadata>>()
             .expect("downcast should succeed");
diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs
index 3784e84a785..e1a4086730b 100644
--- a/rust/lance-table/src/io/commit.rs
+++ b/rust/lance-table/src/io/commit.rs
@@ -798,6 +798,26 @@ pub trait CommitHandler: Debug + Send + Sync {
         default_resolve_version(base_path, version, object_store).await
     }
 
+    /// Check whether an attached manifest version exists without loading it.
+    ///
+    /// The default implementation probes the deterministic manifest path for
+    /// the given naming scheme. Commit handlers with an external source of
+    /// truth should override this method.
+    async fn version_exists(
+        &self,
+        base_path: &Path,
+        version: u64,
+        object_store: &dyn OSObjectStore,
+        naming_scheme: ManifestNamingScheme,
+    ) -> Result<bool> {
+        let path = naming_scheme.manifest_path(base_path, version);
+        match object_store.head(&path).await {
+            Ok(_) => Ok(true),
+            Err(ObjectStoreError::NotFound { .. }) => Ok(false),
+            Err(e) => Err(e.into()),
+        }
+    }
+
     /// List detached manifest locations.
     ///
     /// Returns a stream of detached manifest locations in arbitrary order.
diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs
index 75993ca8d1f..a6c9bbaa90d 100644
--- a/rust/lance-table/src/io/commit/external_manifest.rs
+++ b/rust/lance-table/src/io/commit/external_manifest.rs
@@ -456,6 +456,31 @@ impl CommitHandler for ExternalManifestCommitHandler {
         .await
     }
 
+    async fn version_exists(
+        &self,
+        base_path: &Path,
+        version: u64,
+        object_store: &dyn OSObjectStore,
+        naming_scheme: ManifestNamingScheme,
+    ) -> Result<bool> {
+        match self
+            .external_manifest_store
+            .get_manifest_location(base_path.as_ref(), version)
+            .await
+        {
+            Ok(_) => Ok(true),
+            Err(Error::NotFound { .. }) => {
+                let path = naming_scheme.manifest_path(base_path, version);
+                match object_store.head(&path).await {
+                    Ok(_) => Ok(true),
+                    Err(ObjectStoreError::NotFound { .. }) => Ok(false),
+                    Err(e) => Err(e.into()),
+                }
+            }
+            Err(e) => Err(e),
+        }
+    }
+
     async fn commit(
         &self,
         manifest: &mut Manifest,
diff --git a/rust/lance-tokenizer/Cargo.toml b/rust/lance-tokenizer/Cargo.toml
index 5edfe4a9f16..e1006cd93c7 100644
--- a/rust/lance-tokenizer/Cargo.toml
+++ b/rust/lance-tokenizer/Cargo.toml
@@ -17,6 +17,7 @@ jieba-rs = { workspace = true, optional = true }
 lindera = { workspace = true, optional = true }
 rust-stemmers = "1.2.0"
 serde = { workspace = true, features = ["derive"] }
+stop-words = { version = "0.10.0", default-features = false, features = ["iso", "nltk"] }
 unicode-normalization = "0.1.25"
 
 [features]
diff --git a/rust/lance-tokenizer/src/stop_word_filter.rs b/rust/lance-tokenizer/src/stop_word_filter.rs
index 0c49330a619..2acf0b3dbd5 100644
--- a/rust/lance-tokenizer/src/stop_word_filter.rs
+++ b/rust/lance-tokenizer/src/stop_word_filter.rs
@@ -12,6 +12,34 @@ use std::sync::Arc;
 
 use crate::{Language, Token, TokenFilter, TokenStream, Tokenizer};
 
+fn all_stop_words() -> impl Iterator<Item = &'static str> {
+    [
+        stop_words::get("ar"),
+        stopwords::DANISH,
+        stopwords::DUTCH,
+        stopwords::ENGLISH,
+        stopwords::FINNISH,
+        stopwords::FRENCH,
+        stopwords::GERMAN,
+        stop_words::get("el"),
+        stopwords::HUNGARIAN,
+        stopwords::ITALIAN,
+        stopwords::NORWEGIAN,
+        stopwords::PORTUGUESE,
+        stop_words::get("ro"),
+        stopwords::RUSSIAN,
+        stopwords::SPANISH,
+        stopwords::SWEDISH,
+        stop_words::get("ta"),
+        stop_words::get("tr"),
+        stop_words::get("zh"),
+        stop_words::get("ja"),
+        stop_words::get("ko"),
+    ]
+    .into_iter()
+    .flat_map(|words| words.iter().copied())
+}
+
 #[derive(Clone)]
 pub struct StopWordFilter {
     words: Arc<HashSet<String>>,
@@ -20,28 +48,32 @@ pub struct StopWordFilter {
 impl StopWordFilter {
     pub fn new(language: Language) -> Option<Self> {
         let words = match language {
+            Language::Arabic => stop_words::get("ar"),
             Language::Danish => stopwords::DANISH,
             Language::Dutch => stopwords::DUTCH,
-            Language::English => &[
-                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
-                "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
-                "there", "these", "they", "this", "to", "was", "will", "with",
-            ],
+            Language::English => stopwords::ENGLISH,
             Language::Finnish => stopwords::FINNISH,
             Language::French => stopwords::FRENCH,
             Language::German => stopwords::GERMAN,
+            Language::Greek => stop_words::get("el"),
             Language::Hungarian => stopwords::HUNGARIAN,
             Language::Italian => stopwords::ITALIAN,
             Language::Norwegian => stopwords::NORWEGIAN,
             Language::Portuguese => stopwords::PORTUGUESE,
+            Language::Romanian => stop_words::get("ro"),
             Language::Russian => stopwords::RUSSIAN,
             Language::Spanish => stopwords::SPANISH,
             Language::Swedish => stopwords::SWEDISH,
-            _ => return None,
+            Language::Tamil => stop_words::get("ta"),
+            Language::Turkish => stop_words::get("tr"),
         };
         Some(Self::remove(words.iter().map(|word| (*word).to_owned())))
     }
 
+    pub fn all() -> Self {
+        Self::remove(all_stop_words().map(str::to_owned))
+    }
+
     pub fn remove<W: IntoIterator<Item = String>>(words: W) -> Self {
         Self {
             words: Arc::new(words.into_iter().collect()),
@@ -49,6 +81,42 @@ impl StopWordFilter {
     }
 }
 
+#[cfg(test)]
+mod tests {
+    use super::all_stop_words;
+    use crate::StopWordFilter;
+    use std::collections::HashSet;
+
+    #[test]
+    fn test_external_stop_word_lists_are_available() {
+        let words = all_stop_words().collect::<HashSet<_>>();
+        for word in ["إلى", "και", "acesta", "அவர்", "ama", "的", "ある", "그리고"]
+        {
+            assert!(
+                words.contains(word),
+                "built-in stop words should contain {word}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_language_stop_word_lists_are_available() {
+        for (language, word) in [
+            (crate::Language::Arabic, "إلى"),
+            (crate::Language::Greek, "και"),
+            (crate::Language::Romanian, "acesta"),
+            (crate::Language::Tamil, "அவர்"),
+            (crate::Language::Turkish, "ama"),
+        ] {
+            let filter = StopWordFilter::new(language).unwrap();
+            assert!(
+                filter.words.contains(word),
+                "{language:?} should contain {word}"
+            );
+        }
+    }
+}
+
 impl TokenFilter for StopWordFilter {
     type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
 
diff --git a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs
index 2ac3f4a28aa..227556ba527 100644
--- a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs
+++ b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs
@@ -37,6 +37,12 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+pub const ENGLISH: &[&str] = &[
+    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with",
+];
+
 pub const DANISH: &[&str] = &[
     "og", "i", "jeg", "det", "at", "en", "den", "til", "er", "som", "på", "de", "med", "han", "af",
     "for", "ikke", "der", "var", "mig", "sig", "men", "et", "har", "om", "vi", "min", "havde",
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 74e6faf5c07..6586c928de7 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -175,6 +175,10 @@ required-features = ["cli"]
 name = "scalar_index"
 harness = false
 
+[[bench]]
+name = "regex_ngram"
+harness = false
+
 [[bench]]
 name = "merge_insert"
 harness = false
@@ -296,5 +300,9 @@ harness = false
 name = "concurrent_append"
 harness = false
 
+[[bench]]
+name = "hamming"
+harness = false
+
 [lints]
 workspace = true
diff --git a/rust/lance/benches/hamming.rs b/rust/lance/benches/hamming.rs
new file mode 100644
index 00000000000..7e926a795db
--- /dev/null
+++ b/rust/lance/benches/hamming.rs
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Benchmark for hamming distance clustering.
+//!
+//! This benchmark tests the pairwise hamming distance computation and clustering
+//! performance at various scales.
+//!
+//! Run with: cargo bench -p lance --bench hamming
+//!
+//! Environment variables:
+//!   - DATASET_URI: Path to a dataset with a hash column (optional, generates random if not set)
+//!   - HASH_COLUMN: Name of the hash column (default: "hash"; only used with DATASET_URI)
+//!   - SAMPLE_SIZE: Number of rows to sample (default: 10000; only used with DATASET_URI)
+//!   - THRESHOLD: Hamming distance threshold (default: 10)
+
+#![allow(clippy::print_stdout)]
+
+use std::env;
+use std::sync::Arc;
+use std::time::Instant;
+
+use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array};
+use arrow_schema::{DataType, Field, FieldRef, Schema};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use lance_arrow::FixedSizeListArrayExt;
+use rand::Rng;
+
+use lance::index::vector::hamming::{
+    hamming_clustering_for_sample, hamming_clustering_from_hashes,
+};
+use lance::{Dataset, dataset::WriteParams};
+use lance_linalg::distance::pairwise_hamming_distance_parallel;
+
+#[cfg(target_os = "linux")]
+use lance_testing::pprof::{Output, PProfProfiler};
+
+/// Generate random 64-bit hashes.
+fn generate_random_hashes(n: usize) -> Vec<u64> {
+    let mut rng = rand::rng();
+    (0..n).map(|_| rng.random()).collect()
+}
+
+/// Generate random hash dataset as Arrow arrays.
+fn generate_hash_batch(num_rows: usize) -> RecordBatch {
+    let mut rng = rand::rng();
+
+    // Generate random bytes for the hashes (8 bytes per hash)
+    let bytes: Vec<u8> = (0..num_rows * 8).map(|_| rng.random()).collect();
+    let values = UInt8Array::from(bytes);
+
+    let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "hash",
+        DataType::FixedSizeList(FieldRef::new(Field::new("item", DataType::UInt8, true)), 8),
+        false,
+    )]));
+
+    RecordBatch::try_new(schema, vec![Arc::new(hash_array)]).unwrap()
+}
+
+/// Create a test dataset with random hashes.
+async fn create_hash_dataset(path: &std::path::Path, num_rows: usize) {
+    let batch = generate_hash_batch(num_rows);
+    let schema = batch.schema();
+
+    let write_params = WriteParams {
+        max_rows_per_file: num_rows,
+        max_rows_per_group: 10_000,
+        ..Default::default()
+    };
+
+    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+    Dataset::write(reader, path.to_str().unwrap(), Some(write_params))
+        .await
+        .unwrap();
+}
+
+/// Benchmark pure pairwise hamming computation (no I/O).
+fn bench_pairwise_compute(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hamming_pairwise_compute");
+
+    for size in [1_000, 5_000, 10_000, 20_000] {
+        let hashes = generate_random_hashes(size);
+        let total_pairs = (size as u64) * (size as u64 - 1) / 2;
+
+        group.throughput(Throughput::Elements(total_pairs));
+        group.bench_with_input(BenchmarkId::new("parallel", size), &hashes, |b, hashes| {
+            // Return the result instead of discarding it: criterion black-boxes the
+            // closure's return value, so the optimizer cannot drop the computation.
+            b.iter(|| pairwise_hamming_distance_parallel(hashes, None, Some(10)));
+        });
+    }
+
+    group.finish();
+}
+
+/// Benchmark full clustering pipeline (compute + cluster).
+fn bench_cluster_hashes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hamming_cluster");
+
+    for size in [1_000, 5_000, 10_000] {
+        let hashes = generate_random_hashes(size);
+
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline", size),
+            &hashes,
+            |b, hashes| {
+                // Drain the reader (as `run_quick_bench` does) so lazily produced batches are
+                // measured too; criterion black-boxes the returned batch count.
+                b.iter(|| hamming_clustering_from_hashes(hashes, None, 10).count());
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmark with dataset I/O: the external DATASET_URI dataset if set, else a generated one.
+fn bench_dataset_cluster(c: &mut Criterion) {
+    let rt = tokio::runtime::Runtime::new().unwrap();
+
+    // Check if we should use an external dataset
+    let dataset_uri = env::var("DATASET_URI").ok();
+    let hash_column = env::var("HASH_COLUMN").unwrap_or_else(|_| "hash".to_string());
+    let sample_size: usize = env::var("SAMPLE_SIZE")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(10_000);
+    let threshold: u32 = env::var("THRESHOLD")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(10);
+
+    let mut group = c.benchmark_group("hamming_dataset");
+
+    if let Some(uri) = dataset_uri {
+        // Use external dataset
+        println!("Using external dataset: {}", uri);
+        println!(
+            "Column: {}, Sample: {}, Threshold: {}",
+            hash_column, sample_size, threshold
+        );
+
+        let dataset = rt.block_on(async { Dataset::open(&uri).await.unwrap() });
+
+        group.bench_function(format!("external_sample_{}", sample_size), |b| {
+            b.to_async(&rt).iter(|| async {
+                hamming_clustering_for_sample(&dataset, &hash_column, Some(sample_size), threshold)
+                    .await
+                    .unwrap()
+            });
+        });
+    } else {
+        // Create temporary dataset with random hashes; the THRESHOLD override still applies
+        let temp_dir = tempfile::tempdir().unwrap();
+        let uri = temp_dir.path().join("bench_hashes.lance");
+
+        rt.block_on(async {
+            create_hash_dataset(&uri, 100_000).await;
+        });
+
+        let dataset = rt.block_on(async { Dataset::open(uri.to_str().unwrap()).await.unwrap() });
+
+        for sample in [1_000, 5_000, 10_000] {
+            group.bench_function(format!("generated_sample_{}", sample), |b| {
+                let ds = dataset.clone();
+                b.to_async(&rt).iter(|| {
+                    let ds = ds.clone();
+                    async move {
+                        hamming_clustering_for_sample(&ds, "hash", Some(sample), threshold)
+                            .await
+                            .unwrap()
+                    }
+                });
+            });
+        }
+    }
+
+    group.finish();
+}
+
+/// Quick standalone benchmark that prints results; it is not registered as a criterion target.
+#[allow(dead_code)]
+fn run_quick_bench() {
+    println!("=== Hamming Distance Clustering Benchmark ===\n");
+
+    let sizes = [1_000, 5_000, 10_000, 20_000];
+
+    for &size in &sizes {
+        let hashes = generate_random_hashes(size);
+        let total_pairs = (size as u64) * (size as u64 - 1) / 2;
+
+        println!("Size: {} rows, {} pairs", size, total_pairs);
+        let start = Instant::now();
+        let reader = hamming_clustering_from_hashes(&hashes, None, 10);
+        // Consume the reader to count clusters
+        let cluster_count: usize = reader.map(|b| b.unwrap().num_rows()).sum();
+        let elapsed = start.elapsed();
+
+        let pairs_per_sec = total_pairs as f64 / elapsed.as_secs_f64();
+        println!(
+            "  Total time: {:?} ({:.2}M pairs/sec)",
+            elapsed,
+            pairs_per_sec / 1_000_000.0
+        );
+        println!("  Total clusters: {}", cluster_count);
+        println!();
+    }
+}
+
+#[cfg(target_os = "linux")]
+criterion_group! {
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_pairwise_compute, bench_cluster_hashes, bench_dataset_cluster
+}
+
+#[cfg(not(target_os = "linux"))]
+criterion_group!(
+    benches,
+    bench_pairwise_compute,
+    bench_cluster_hashes,
+    bench_dataset_cluster
+);
+
+criterion_main!(benches);
diff --git a/rust/lance/benches/mem_wal/write/mem_wal_write.rs b/rust/lance/benches/mem_wal/write/mem_wal_write.rs
index 24f3a0d7c8f..9a5fc71ab17 100644
--- a/rust/lance/benches/mem_wal/write/mem_wal_write.rs
+++ b/rust/lance/benches/mem_wal/write/mem_wal_write.rs
@@ -649,8 +649,10 @@ fn bench_lance_memwal_write(c: &mut Criterion) {
                                     backpressure_log_interval: default_config
                                         .backpressure_log_interval,
                                     stats_log_interval: default_config.stats_log_interval,
+                                    frozen_memtable_grace: default_config.frozen_memtable_grace,
                                     enable_memtable,
                                     hnsw_params: default_config.hnsw_params,
+                                    warmer: None,
                                 };
 
                                 // Get writer through Dataset API (index configs loaded automatically)
diff --git a/rust/lance/benches/regex_ngram.rs b/rust/lance/benches/regex_ngram.rs
new file mode 100644
index 00000000000..76f597ad9cb
--- /dev/null
+++ b/rust/lance/benches/regex_ngram.rs
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Benchmark: regex predicate scans over an ngram-indexed string column.
+//!
+//! Each query is a `regexp_match(doc, '...')` filter against a dataset that has
+//! an NGram index on `doc`. The query set spans a selective AND pattern, an
+//! alternation, a plain literal (rewritten to an infix LIKE before it reaches
+//! the index), and a deliberately non-accelerable pattern (`a.b`, which yields
+//! no trigram) that serves as a regression guard.
+//!
+//! On `main` none of these use the index (regex falls through to a full scan +
+//! recheck); with the ngram-regex acceleration the index prunes candidates for
+//! the first three while `a.b` stays a full scan. Capture a baseline on `main`
+//! with `--save-baseline before_7130`, then compare after the change with
+//! `--baseline before_7130`.
+
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+use arrow::array::AsArray;
+use arrow_array::{RecordBatch, RecordBatchIterator, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use criterion::{Criterion, criterion_group, criterion_main};
+use futures::TryStreamExt;
+use lance::Dataset;
+use lance::index::DatasetIndexExt;
+use lance_core::utils::tempfile::TempStrDir;
+use lance_datagen::{RowCount, array};
+use lance_index::IndexType;
+use lance_index::scalar::ScalarIndexParams;
+#[cfg(target_os = "linux")]
+use lance_testing::pprof::{Output, PProfProfiler};
+
+const TOTAL: usize = 200_000;
+
+/// Build the `doc` column: random sentences with rare markers injected into a
+/// small fraction of rows so the regex queries have controlled selectivity.
+/// The markers (`zqxwvu`, `needlexyz`, `qwerasdf`) are unlikely to appear in
+/// the generated English-word sentences.
+fn build_docs() -> StringArray {
+    let mut sentence_gen = array::random_sentence(1, 30, false);
+    let base = sentence_gen
+        .generate_default(RowCount::from(TOTAL as u64))
+        .unwrap();
+    let base = base.as_string::<i32>();
+    let docs = (0..TOTAL).map(|i| {
+        let sentence = base.value(i);
+        if i % 200 == 0 {
+            // ~0.5% of rows match `zqxwvu.*needlexyz` and `zqxwvu`.
+            format!("{sentence} zqxwvu needlexyz")
+        } else if i % 211 == 0 {
+            // A second marker for the alternation query.
+            format!("{sentence} qwerasdf")
+        } else {
+            sentence.to_string()
+        }
+    });
+    StringArray::from_iter_values(docs)
+}
+
+async fn build_dataset(tempdir: &TempStrDir) -> Arc<Dataset> {
+    let schema = Arc::new(Schema::new(vec![Field::new("doc", DataType::Utf8, false)]));
+    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(build_docs())]).unwrap();
+    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+    let mut dataset = Dataset::write(reader, tempdir.as_str(), None)
+        .await
+        .unwrap();
+    dataset
+        .create_index(
+            &["doc"],
+            IndexType::NGram,
+            None,
+            &ScalarIndexParams::default(),
+            true,
+        )
+        .await
+        .unwrap();
+    Arc::new(dataset)
+}
+
+async fn scan_filter(dataset: &Dataset, filter: &str) -> usize {
+    let mut scanner = dataset.scan();
+    scanner.filter(filter).unwrap();
+    let stream = scanner.try_into_stream().await.unwrap();
+    let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
+    batches.iter().map(|b| b.num_rows()).sum()
+}
+
+fn bench_regex_ngram(c: &mut Criterion) {
+    let rt = tokio::runtime::Runtime::new().unwrap();
+    let tempdir = TempStrDir::default();
+    let dataset = rt.block_on(build_dataset(&tempdir));
+
+    let queries = [
+        ("selective_and", "regexp_match(doc, 'zqxwvu.*needlexyz')"),
+        (
+            "alternation",
+            "regexp_match(doc, '(zqxwvu|qwerasdf|needlexyz)')",
+        ),
+        ("plain_literal", "regexp_match(doc, 'zqxwvu')"),
+        ("non_accelerable_a_dot_b", "regexp_match(doc, 'a.b')"),
+    ];
+
+    let mut group = c.benchmark_group("regex_ngram");
+    group
+        .sample_size(10)
+        .measurement_time(Duration::from_secs(15));
+    for (name, filter) in queries {
+        group.bench_function(name, |b| {
+            b.iter(|| black_box(rt.block_on(scan_filter(&dataset, filter))));
+        });
+    }
+    group.finish();
+}
+
+#[cfg(target_os = "linux")]
+criterion_group!(
+    name = benches;
+    config = Criterion::default()
+        .significance_level(0.1)
+        .sample_size(10)
+        .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_regex_ngram);
+
+#[cfg(not(target_os = "linux"))]
+criterion_group!(
+    name = benches;
+    config = Criterion::default().significance_level(0.1).sample_size(10);
+    targets = bench_regex_ngram);
+
+criterion_main!(benches);
diff --git a/rust/lance/src/blob.rs b/rust/lance/src/blob.rs
index 322bf67a04c..58df42b5cd3 100644
--- a/rust/lance/src/blob.rs
+++ b/rust/lance/src/blob.rs
@@ -7,12 +7,16 @@
 //! tagged with `ARROW:extension:name = "lance.blob.v2"`. This module offers a
 //! type-safe builder to construct that struct without manually wiring metadata
 
+use std::num::NonZeroUsize;
 use std::sync::Arc;
 
 use arrow_array::{ArrayRef, StructArray, builder::LargeBinaryBuilder, builder::StringBuilder};
 use arrow_buffer::NullBufferBuilder;
 use arrow_schema::{DataType, Field};
-use lance_arrow::{ARROW_EXT_NAME_KEY, BLOB_V2_EXT_NAME};
+use lance_arrow::{
+    ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+    BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME,
+};
 
 use crate::{Error, Result};
 
@@ -21,9 +25,71 @@ use crate::{Error, Result};
 /// Blob v2 expects a column shaped as `Struct<data: LargeBinary?, uri: Utf8?>` and
 /// tagged with `ARROW:extension:name = "lance.blob.v2"`.
 pub fn blob_field(name: &str, nullable: bool) -> Field {
-    let metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]
+    blob_field_with_options(name, nullable, BlobFieldOptions::default())
+}
+
+/// Options for constructing a blob v2 field.
+#[derive(Clone, Debug, Default)]
+pub struct BlobFieldOptions {
+    /// Maximum payload size to keep inline in the data file before using packed blob storage.
+    pub inline_size_threshold: Option<usize>,
+    /// Maximum payload size to store in packed blob storage before using dedicated blob storage.
+    ///
+    /// A zero threshold is invalid because dedicated blob storage is selected when
+    /// the payload size is greater than this value.
+    pub dedicated_size_threshold: Option<NonZeroUsize>,
+}
+
+impl BlobFieldOptions {
+    /// Set the maximum payload size to keep inline in the data file.
+    pub fn with_inline_size_threshold(mut self, threshold: usize) -> Self {
+        self.inline_size_threshold = Some(threshold);
+        self
+    }
+
+    /// Set the maximum payload size to store in packed blob storage.
+    pub fn with_dedicated_size_threshold(mut self, threshold: NonZeroUsize) -> Self {
+        self.dedicated_size_threshold = Some(threshold);
+        self
+    }
+}
+
+/// Construct the Arrow field for a blob v2 column with storage layout options.
+///
+/// Blob v2 expects a column shaped as `Struct<data: LargeBinary?, uri: Utf8?>` and
+/// tagged with `ARROW:extension:name = "lance.blob.v2"`.
+///
+/// ```
+/// # use lance::{BlobFieldOptions, blob_field_with_options};
+/// let field = blob_field_with_options(
+///     "blob",
+///     true,
+///     BlobFieldOptions::default().with_inline_size_threshold(16 * 1024),
+/// );
+/// assert_eq!(
+///     field
+///         .metadata()
+///         .get("lance-encoding:blob-inline-size-threshold")
+///         .map(String::as_str),
+///     Some("16384"),
+/// );
+/// ```
+pub fn blob_field_with_options(name: &str, nullable: bool, options: BlobFieldOptions) -> Field {
+    let mut metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]
         .into_iter()
-        .collect();
+        .collect::<std::collections::HashMap<_, _>>();
+    if let Some(threshold) = options.inline_size_threshold {
+        metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            threshold.to_string(),
+        );
+    }
+    if let Some(threshold) = options.dedicated_size_threshold {
+        metadata.insert(
+            BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(),
+            threshold.get().to_string(),
+        );
+    }
     Field::new(
         name,
         DataType::Struct(
@@ -142,6 +208,8 @@ impl BlobArrayBuilder {
 
 #[cfg(test)]
 mod tests {
+    use std::num::NonZeroUsize;
+
     use super::*;
     use arrow_array::Array;
     use arrow_array::cast::AsArray;
@@ -156,6 +224,31 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_field_metadata_with_options() {
+        let field = blob_field_with_options(
+            "blob",
+            true,
+            BlobFieldOptions::default()
+                .with_inline_size_threshold(16 * 1024)
+                .with_dedicated_size_threshold(NonZeroUsize::new(2 * 1024 * 1024).unwrap()),
+        );
+        assert_eq!(
+            field
+                .metadata()
+                .get(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)
+                .unwrap(),
+            "16384"
+        );
+        assert_eq!(
+            field
+                .metadata()
+                .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY)
+                .unwrap(),
+            "2097152"
+        );
+    }
+
     #[test]
     fn test_builder_basic() {
         let mut b = BlobArrayBuilder::new(4);
diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs
index c9cc356aaa6..3e0d77704da 100644
--- a/rust/lance/src/dataset.rs
+++ b/rust/lance/src/dataset.rs
@@ -24,8 +24,7 @@ use lance_core::datatypes::{OnMissing, OnTypeMismatch, Projectable, Projection};
 use lance_core::traits::DatasetTakeRows;
 use lance_core::utils::address::RowAddress;
 use lance_core::utils::tracing::{
-    DATASET_CLEANING_EVENT, DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT,
-    TRACE_DATASET_EVENTS,
+    DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, TRACE_DATASET_EVENTS,
 };
 use lance_datafusion::projection::ProjectionPlan;
 use lance_file::datatypes::populate_schema_dictionary;
@@ -104,7 +103,7 @@ use self::scanner::{DatasetRecordBatchStream, Scanner};
 use self::transaction::{Operation, Transaction, TransactionBuilder, UpdateMapEntry};
 use self::write::{cleanup_data_fragments, write_fragments_internal};
 use crate::dataset::branch_location::BranchLocation;
-use crate::dataset::cleanup::{CleanupPolicy, CleanupPolicyBuilder};
+use crate::dataset::cleanup::{CleanupOperation, CleanupPolicy, CleanupPolicyBuilder};
 use crate::dataset::refs::{BranchContents, BranchIdentifier, Branches, Tags};
 use crate::dataset::sql::SqlQueryBuilder;
 use crate::datatypes::Schema;
@@ -514,7 +513,10 @@ impl Dataset {
         let transaction = Transaction::new(version_number, clone_op, None);
 
         let builder = CommitBuilder::new(WriteDestination::Uri(branch_location.uri.as_str()))
-            .with_store_params(store_params.unwrap_or_default())
+            // Fall back to the dataset's own store params (cloned only when none are supplied)
+            .with_store_params(store_params.unwrap_or_else(|| {
+                self.store_params.as_deref().cloned().unwrap_or_default()
+            }))
             .with_object_store(Arc::new(self.object_store.as_ref().clone()))
             .with_commit_handler(self.commit_handler.clone())
             .with_storage_format(self.manifest.data_storage_format.lance_file_version()?);
@@ -1283,8 +1285,15 @@ impl Dataset {
         &self,
         policy: CleanupPolicy,
     ) -> BoxFuture<'_, Result<RemovalStats>> {
-        info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.uri);
-        cleanup::cleanup_old_versions(self, policy).boxed()
+        async move { self.cleanup(policy).execute().await }.boxed()
+    }
+
+    /// Creates a cleanup operation for this dataset.
+    ///
+    /// The returned operation can be explained without deleting files, or
+    /// executed to re-evaluate the current dataset state and remove files.
+    pub fn cleanup(&self, policy: CleanupPolicy) -> CleanupOperation<'_> {
+        CleanupOperation::new(self, policy)
     }
 
     #[allow(clippy::too_many_arguments)]
@@ -2232,6 +2241,39 @@ impl Dataset {
             .version)
     }
 
+    /// Return whether the dataset has a newer committed version, i.e. whether
+    /// the latest version id differs from the version of the loaded manifest.
+    pub async fn is_stale(&self) -> Result<bool> {
+        Ok(self.latest_version_id().await? != self.manifest.version)
+    }
+
+    /// Return whether the immediate attached successor manifest exists.
+    ///
+    /// This is a fast contiguous-history probe. It does not resolve the latest
+    /// version and may return `false` if intermediate manifests have been
+    /// removed. Callers that need a general freshness check should use
+    /// [`Self::is_stale`].
+    #[doc(hidden)]
+    pub async fn has_successor_version(&self) -> Result<bool> {
+        // `version + 1` must neither overflow nor land on a detached version.
+        let successor = match self.manifest.version.checked_add(1) {
+            Some(next) if !lance_table::format::is_detached_version(next) => next,
+            _ => return Ok(false),
+        };
+
+        let naming_scheme = self.manifest_location.naming_scheme;
+        let found = self
+            .commit_handler
+            .version_exists(
+                &self.base,
+                successor,
+                self.object_store.inner.as_ref(),
+                naming_scheme,
+            )
+            .await?;
+        Ok(found)
+    }
+
     pub fn count_fragments(&self) -> usize {
         self.manifest.fragments.len()
     }
diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs
index f2c243367ce..8cdde543e4e 100644
--- a/rust/lance/src/dataset/blob.rs
+++ b/rust/lance/src/dataset/blob.rs
@@ -12,14 +12,18 @@ use std::{
 
 use arrow::array::AsArray;
 use arrow::datatypes::{UInt8Type, UInt32Type, UInt64Type};
-use arrow_array::Array;
 use arrow_array::RecordBatch;
 use arrow_array::builder::{LargeBinaryBuilder, PrimitiveBuilder, StringBuilder};
-use arrow_schema::DataType as ArrowDataType;
+use arrow_array::{Array, ArrayRef};
+use arrow_schema::{DataType as ArrowDataType, Field as ArrowField};
 use bytes::Bytes;
+use futures::future::BoxFuture;
 use futures::stream::BoxStream;
 use futures::{FutureExt, StreamExt, TryStreamExt, stream};
-use lance_arrow::{BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, FieldExt};
+use lance_arrow::{
+    BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_INLINE_SIZE_THRESHOLD_META_KEY, FieldExt,
+    r#struct::StructArrayExt,
+};
 use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry};
 use lance_io::scheduler::{FileScheduler, ScanScheduler, SchedulerConfig};
 use object_store::path::Path;
@@ -40,6 +44,58 @@ use lance_io::utils::CachedFileSize;
 const INLINE_MAX: usize = 64 * 1024; // 64KB inline cutoff
 const DEDICATED_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB dedicated cutoff
 const PACK_FILE_MAX_SIZE: usize = 1024 * 1024 * 1024; // 1GiB per .pack sidecar
+
+/// Resolve the inline-size threshold for the blob field `field_name`.
+///
+/// Looks up `BLOB_INLINE_SIZE_THRESHOLD_META_KEY` in `metadata` and falls back
+/// to `INLINE_MAX` when the key is absent. A threshold of zero is accepted.
+pub(super) fn blob_inline_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+) -> Result<usize> {
+    let key = BLOB_INLINE_SIZE_THRESHOLD_META_KEY;
+    // `true`: zero is a permitted inline threshold.
+    blob_threshold_from_metadata(metadata, field_name, key, INLINE_MAX, true)
+}
+
+/// Resolve the dedicated-file threshold for the blob field `field_name`.
+///
+/// Looks up `BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY` in `metadata` and falls
+/// back to `DEDICATED_THRESHOLD` when absent. A threshold of zero is rejected.
+pub(super) fn blob_dedicated_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+) -> Result<usize> {
+    let key = BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY;
+    // `false`: a zero dedicated threshold is reported as invalid input.
+    blob_threshold_from_metadata(metadata, field_name, key, DEDICATED_THRESHOLD, false)
+}
+
+/// Read the threshold stored under `key` in `metadata`, or `default_value` if
+/// absent; it must parse as `usize` and be non-zero unless `allow_zero` is set.
+fn blob_threshold_from_metadata(
+    metadata: &HashMap<String, String>,
+    field_name: &str,
+    key: &str,
+    default_value: usize,
+    allow_zero: bool,
+) -> Result<usize> {
+    let Some(value) = metadata.get(key) else {
+        return Ok(default_value);
+    };
+    match value.parse::<usize>() {
+        Ok(0) if !allow_zero => Err(Error::invalid_input(format!(
+            "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \
+             expected a positive integer"
+        ))),
+        Ok(threshold) => Ok(threshold),
+        Err(_) => Err(Error::invalid_input(format!(
+            "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \
+             expected a non-negative integer that fits in usize"
+        ))),
+    }
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(super) struct ResolvedExternalBase {
     pub base_id: u32,
@@ -205,9 +261,7 @@ pub struct BlobPreprocessor {
     data_file_key: String,
     local_counter: u32,
     pack_writer: PackWriter,
-    blob_v2_cols: Vec<bool>,
-    dedicated_thresholds: Vec<usize>,
-    writer_metadata: Vec<HashMap<String, String>>,
+    field_processors: Vec<BlobPreprocessField>,
     external_base_resolver: Option<Arc<ExternalBaseResolver>>,
     allow_external_blob_outside_bases: bool,
     external_blob_mode: ExternalBlobMode,
@@ -232,6 +286,64 @@ enum BlobWriteSource<'a> {
     External(&'a ExternalBlobSource),
 }
 
+#[derive(Clone, Debug)]
+struct BlobPreprocessField {
+    kind: BlobPreprocessFieldKind,
+}
+
+#[derive(Clone, Debug)]
+enum BlobPreprocessFieldKind {
+    BlobV2 {
+        inline_threshold: usize,
+        dedicated_threshold: usize,
+        writer_metadata: HashMap<String, String>,
+    },
+    Struct {
+        children: Vec<BlobPreprocessField>,
+    },
+    Passthrough,
+}
+
+impl BlobPreprocessField {
+    /// Build the preprocessing plan for `field`.
+    ///
+    /// * A blob v2 field becomes `BlobV2`, with its thresholds parsed from the
+    ///   field metadata (invalid threshold metadata is returned as an error).
+    /// * A struct is planned recursively and kept as `Struct` only when at
+    ///   least one child requires preprocessing.
+    /// * Everything else is `Passthrough`.
+    fn new(field: &ArrowField) -> Result<Self> {
+        let kind = if field.is_blob_v2() {
+            let (metadata, name) = (field.metadata(), field.name());
+            BlobPreprocessFieldKind::BlobV2 {
+                inline_threshold: blob_inline_threshold_from_metadata(metadata, name)?,
+                dedicated_threshold: blob_dedicated_threshold_from_metadata(metadata, name)?,
+                writer_metadata: metadata.clone(),
+            }
+        } else if let ArrowDataType::Struct(child_fields) = field.data_type() {
+            let children = child_fields
+                .iter()
+                .map(|child| Self::new(child.as_ref()))
+                .collect::<Result<Vec<_>>>()?;
+            // Keep the struct plan only when a descendant needs rewriting;
+            // otherwise the whole subtree is passed through untouched.
+            if children.iter().any(|child| child.requires_preprocessing()) {
+                BlobPreprocessFieldKind::Struct { children }
+            } else {
+                BlobPreprocessFieldKind::Passthrough
+            }
+        } else {
+            BlobPreprocessFieldKind::Passthrough
+        };
+        Ok(Self { kind })
+    }
+
+    /// Whether this field is anything other than `Passthrough`.
+    fn requires_preprocessing(&self) -> bool {
+        !matches!(self.kind, BlobPreprocessFieldKind::Passthrough)
+    }
+}
+
 impl ExternalBlobSource {
     /// Return the logical payload size after applying any external slice.
     fn size(&self) -> u64 {
@@ -313,7 +425,7 @@ impl BlobPreprocessor {
         source_store_registry: Arc<ObjectStoreRegistry>,
         source_store_params: ObjectStoreParams,
         pack_file_size_threshold: Option<usize>,
-    ) -> Self {
+    ) -> Result<Self> {
         let mut pack_writer = PackWriter::new(
             object_store.clone(),
             data_dir.clone(),
@@ -323,32 +435,25 @@ impl BlobPreprocessor {
             pack_writer.max_pack_size = max_bytes;
         }
         let arrow_schema = arrow_schema::Schema::from(schema);
-        let fields = arrow_schema.fields();
-        let blob_v2_cols = fields.iter().map(|field| field.is_blob_v2()).collect();
-        let dedicated_thresholds = fields
-            .iter()
-            .map(|field| dedicated_threshold_from_metadata(field.as_ref()))
-            .collect();
-        let writer_metadata = fields
+        let field_processors = arrow_schema
+            .fields()
             .iter()
-            .map(|field| field.metadata().clone())
-            .collect();
-        Self {
+            .map(|field| BlobPreprocessField::new(field.as_ref()))
+            .collect::<Result<Vec<_>>>()?;
+        Ok(Self {
             object_store,
             data_dir,
             data_file_key,
             // Start at 1 to avoid a potential all-zero blob_id value.
             local_counter: 1,
             pack_writer,
-            blob_v2_cols,
-            dedicated_thresholds,
-            writer_metadata,
+            field_processors,
             external_base_resolver,
             allow_external_blob_outside_bases,
             external_blob_mode,
             source_store_registry,
             source_store_params,
-        }
+        })
     }
 
     fn next_blob_id(&mut self) -> u32 {
@@ -443,7 +548,7 @@ impl BlobPreprocessor {
     }
 
     pub(crate) async fn preprocess_batch(&mut self, batch: &RecordBatch) -> Result<RecordBatch> {
-        let expected_columns = self.blob_v2_cols.len();
+        let expected_columns = self.field_processors.len();
         if batch.num_columns() != expected_columns {
             return Err(Error::invalid_input(format!(
                 "Unexpected number of columns: expected {}, got {}",
@@ -454,245 +559,340 @@ impl BlobPreprocessor {
 
         let batch_schema = batch.schema();
         let batch_fields = batch_schema.fields();
+        let field_processors = self.field_processors.clone();
 
         let mut new_columns = Vec::with_capacity(batch.num_columns());
         let mut new_fields = Vec::with_capacity(batch.num_columns());
 
-        for idx in 0..batch.num_columns() {
-            let array = batch.column(idx);
-            let field = &batch_fields[idx];
-            if !self.blob_v2_cols[idx] {
-                new_columns.push(array.clone());
-                new_fields.push(field.clone());
+        for ((processor, array), field) in field_processors
+            .iter()
+            .zip(batch.columns().iter())
+            .zip(batch_fields.iter())
+        {
+            let (new_column, new_field) = self
+                .preprocess_field(processor, array.clone(), field)
+                .await?;
+            new_columns.push(new_column);
+            new_fields.push(new_field);
+        }
+
+        let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata(
+            new_fields
+                .iter()
+                .map(|f| f.as_ref().clone())
+                .collect::<Vec<_>>(),
+            batch_schema.metadata().clone(),
+        ));
+
+        RecordBatch::try_new(new_schema, new_columns)
+            .map_err(|e| Error::invalid_input(e.to_string()))
+    }
+
+    fn preprocess_field<'a>(
+        &'a mut self,
+        processor: &'a BlobPreprocessField,
+        array: ArrayRef,
+        field: &'a Arc<ArrowField>,
+    ) -> BoxFuture<'a, Result<(ArrayRef, Arc<ArrowField>)>> {
+        async move {
+            match &processor.kind {
+                BlobPreprocessFieldKind::Passthrough => Ok((array, field.clone())),
+                BlobPreprocessFieldKind::BlobV2 {
+                    inline_threshold,
+                    dedicated_threshold,
+                    writer_metadata,
+                } => {
+                    self.preprocess_blob_array(
+                        array,
+                        field.as_ref(),
+                        *inline_threshold,
+                        *dedicated_threshold,
+                        writer_metadata,
+                    )
+                    .await
+                }
+                BlobPreprocessFieldKind::Struct { children } => {
+                    self.preprocess_struct_array(array, field.as_ref(), children)
+                        .await
+                }
+            }
+        }
+        .boxed()
+    }
+
+    /// Preprocess a struct column whose children include blob v2 fields.
+    ///
+    /// The array is passed through `normalize_slicing` and `pushdown_nulls`,
+    /// each child is run through its processor, and the struct is rebuilt with
+    /// the parent's nulls, name, nullability and field metadata.
+    async fn preprocess_struct_array(
+        &mut self,
+        array: ArrayRef,
+        field: &ArrowField,
+        children: &[BlobPreprocessField],
+    ) -> Result<(ArrayRef, Arc<ArrowField>)> {
+        let struct_arr = array
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .ok_or_else(|| Error::invalid_input("Struct field was not a struct array"))?;
+        if struct_arr.num_columns() != children.len() {
+            return Err(Error::invalid_input(format!(
+                "Struct field '{}' expected {} children, got {}",
+                field.name(),
+                children.len(),
+                struct_arr.num_columns()
+            )));
+        }
+
+        let struct_arr = struct_arr.normalize_slicing()?;
+        let parent_nulls = struct_arr.nulls().cloned();
+        let pushed_down = struct_arr.pushdown_nulls()?;
+        let child_fields = pushed_down.fields().clone();
+        let child_columns = pushed_down.columns().to_vec();
+
+        let mut new_columns = Vec::with_capacity(children.len());
+        let mut new_fields = Vec::with_capacity(children.len());
+        for ((child_processor, child_array), child_field) in children
+            .iter()
+            .zip(child_columns)
+            .zip(child_fields.iter())
+        {
+            let (new_column, new_field) = self
+                .preprocess_field(child_processor, child_array, child_field)
+                .await?;
+            new_columns.push(new_column);
+            new_fields.push(new_field);
+        }
+
+        let new_fields: arrow_schema::Fields = new_fields.into();
+        let struct_array = StructArray::try_new(new_fields.clone(), new_columns, parent_nulls)?;
+        let data_type = ArrowDataType::Struct(new_fields);
+        let field = ArrowField::new(field.name(), data_type, field.is_nullable())
+            .with_metadata(field.metadata().clone());
+        Ok((Arc::new(struct_array), Arc::new(field)))
+    }
+
+    async fn preprocess_blob_array(
+        &mut self,
+        array: ArrayRef,
+        field: &ArrowField,
+        inline_threshold: usize,
+        dedicated_threshold: usize,
+        writer_metadata: &HashMap<String, String>,
+    ) -> Result<(ArrayRef, Arc<ArrowField>)> {
+        let struct_arr = array
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?;
+
+        let data_col = struct_arr
+            .column_by_name("data")
+            .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))?
+            .as_binary::<i64>();
+        let uri_col = struct_arr
+            .column_by_name("uri")
+            .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))?
+            .as_string::<i32>();
+        let position_col = struct_arr
+            .column_by_name("position")
+            .map(|col| col.as_primitive::<UInt64Type>());
+        let size_col = struct_arr
+            .column_by_name("size")
+            .map(|col| col.as_primitive::<UInt64Type>());
+
+        let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0);
+        let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0);
+        let mut blob_id_builder =
+            PrimitiveBuilder::<arrow_array::types::UInt32Type>::with_capacity(struct_arr.len());
+        let mut blob_size_builder =
+            PrimitiveBuilder::<arrow_array::types::UInt64Type>::with_capacity(struct_arr.len());
+        let mut kind_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(struct_arr.len());
+        let mut position_builder =
+            PrimitiveBuilder::<arrow_array::types::UInt64Type>::with_capacity(struct_arr.len());
+
+        let struct_nulls = struct_arr.nulls();
+
+        for i in 0..struct_arr.len() {
+            if struct_arr.is_null(i) {
+                data_builder.append_null();
+                uri_builder.append_null();
+                blob_id_builder.append_null();
+                blob_size_builder.append_null();
+                kind_builder.append_null();
+                position_builder.append_null();
                 continue;
             }
 
-            let struct_arr = array
-                .as_any()
-                .downcast_ref::<arrow_array::StructArray>()
-                .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?;
-
-            let data_col = struct_arr
-                .column_by_name("data")
-                .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))?
-                .as_binary::<i64>();
-            let uri_col = struct_arr
-                .column_by_name("uri")
-                .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))?
-                .as_string::<i32>();
-            let position_col = struct_arr
-                .column_by_name("position")
-                .map(|col| col.as_primitive::<UInt64Type>());
-            let size_col = struct_arr
-                .column_by_name("size")
-                .map(|col| col.as_primitive::<UInt64Type>());
-
-            let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0);
-            let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0);
-            let mut blob_id_builder =
-                PrimitiveBuilder::<arrow_array::types::UInt32Type>::with_capacity(struct_arr.len());
-            let mut blob_size_builder =
-                PrimitiveBuilder::<arrow_array::types::UInt64Type>::with_capacity(struct_arr.len());
-            let mut kind_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(struct_arr.len());
-            let mut position_builder =
-                PrimitiveBuilder::<arrow_array::types::UInt64Type>::with_capacity(struct_arr.len());
-
-            let struct_nulls = struct_arr.nulls();
-
-            for i in 0..struct_arr.len() {
-                if struct_arr.is_null(i) {
-                    data_builder.append_null();
-                    uri_builder.append_null();
-                    blob_id_builder.append_null();
-                    blob_size_builder.append_null();
-                    kind_builder.append_null();
-                    position_builder.append_null();
-                    continue;
-                }
+            let has_data = !data_col.is_null(i);
+            let has_uri = !uri_col.is_null(i);
+            let has_position = position_col
+                .as_ref()
+                .map(|col| !col.is_null(i))
+                .unwrap_or(false);
+            let has_size = size_col
+                .as_ref()
+                .map(|col| !col.is_null(i))
+                .unwrap_or(false);
+            let data_len = if has_data { data_col.value(i).len() } else { 0 };
 
-                let has_data = !data_col.is_null(i);
-                let has_uri = !uri_col.is_null(i);
-                let has_position = position_col
-                    .as_ref()
-                    .map(|col| !col.is_null(i))
-                    .unwrap_or(false);
-                let has_size = size_col
-                    .as_ref()
-                    .map(|col| !col.is_null(i))
-                    .unwrap_or(false);
-                let data_len = if has_data { data_col.value(i).len() } else { 0 };
-
-                let dedicated_threshold = self.dedicated_thresholds[idx];
-                if has_data && data_len > dedicated_threshold {
-                    let blob_id = self.next_blob_id();
-                    self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i)))
-                        .await?;
-
-                    kind_builder.append_value(BlobKind::Dedicated as u8);
-                    data_builder.append_null();
-                    uri_builder.append_null();
-                    blob_id_builder.append_value(blob_id);
-                    blob_size_builder.append_value(data_len as u64);
-                    position_builder.append_null();
-                    continue;
-                }
+            if has_data && data_len > dedicated_threshold {
+                let blob_id = self.next_blob_id();
+                self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i)))
+                    .await?;
 
-                if has_data && data_len > INLINE_MAX {
-                    let (pack_blob_id, position) = self
-                        .write_packed(BlobWriteSource::Bytes(data_col.value(i)))
-                        .await?;
+                kind_builder.append_value(BlobKind::Dedicated as u8);
+                data_builder.append_null();
+                uri_builder.append_null();
+                blob_id_builder.append_value(blob_id);
+                blob_size_builder.append_value(data_len as u64);
+                position_builder.append_null();
+                continue;
+            }
 
-                    kind_builder.append_value(BlobKind::Packed as u8);
-                    data_builder.append_null();
-                    uri_builder.append_null();
-                    blob_id_builder.append_value(pack_blob_id);
-                    blob_size_builder.append_value(data_len as u64);
-                    position_builder.append_value(position);
-                    continue;
-                }
+            if has_data && data_len > inline_threshold {
+                let (pack_blob_id, position) = self
+                    .write_packed(BlobWriteSource::Bytes(data_col.value(i)))
+                    .await?;
 
-                if has_uri {
-                    let uri_val = uri_col.value(i);
-                    if self.external_blob_mode == ExternalBlobMode::Ingest {
-                        let position = if has_position {
-                            Some(
-                                position_col
-                                    .as_ref()
-                                    .expect("position column must exist")
-                                    .value(i),
-                            )
-                        } else {
-                            None
-                        };
-                        let size = if has_size {
-                            Some(size_col.as_ref().expect("size column must exist").value(i))
-                        } else {
-                            None
-                        };
-                        let source = self.open_external_source(uri_val, position, size).await?;
-                        let data_len = source.size();
-
-                        if data_len > dedicated_threshold as u64 {
-                            let blob_id = self.next_blob_id();
-                            self.write_dedicated(blob_id, BlobWriteSource::External(&source))
-                                .await?;
-
-                            kind_builder.append_value(BlobKind::Dedicated as u8);
-                            data_builder.append_null();
-                            uri_builder.append_null();
-                            blob_id_builder.append_value(blob_id);
-                            blob_size_builder.append_value(data_len);
-                            position_builder.append_null();
-                            continue;
-                        }
+                kind_builder.append_value(BlobKind::Packed as u8);
+                data_builder.append_null();
+                uri_builder.append_null();
+                blob_id_builder.append_value(pack_blob_id);
+                blob_size_builder.append_value(data_len as u64);
+                position_builder.append_value(position);
+                continue;
+            }
 
-                        if data_len > INLINE_MAX as u64 {
-                            let (pack_blob_id, position) = self
-                                .write_packed(BlobWriteSource::External(&source))
-                                .await?;
-
-                            kind_builder.append_value(BlobKind::Packed as u8);
-                            data_builder.append_null();
-                            uri_builder.append_null();
-                            blob_id_builder.append_value(pack_blob_id);
-                            blob_size_builder.append_value(data_len);
-                            position_builder.append_value(position);
-                            continue;
-                        }
+            if has_uri {
+                let uri_val = uri_col.value(i);
+                if self.external_blob_mode == ExternalBlobMode::Ingest {
+                    let position = if has_position {
+                        Some(
+                            position_col
+                                .as_ref()
+                                .expect("position column must exist")
+                                .value(i),
+                        )
+                    } else {
+                        None
+                    };
+                    let size = if has_size {
+                        Some(size_col.as_ref().expect("size column must exist").value(i))
+                    } else {
+                        None
+                    };
+                    let source = self.open_external_source(uri_val, position, size).await?;
+                    let data_len = source.size();
 
-                        let data = source.read_all().await?;
+                    if data_len > dedicated_threshold as u64 {
+                        let blob_id = self.next_blob_id();
+                        self.write_dedicated(blob_id, BlobWriteSource::External(&source))
+                            .await?;
 
-                        kind_builder.append_value(BlobKind::Inline as u8);
-                        data_builder.append_value(data.as_ref());
+                        kind_builder.append_value(BlobKind::Dedicated as u8);
+                        data_builder.append_null();
                         uri_builder.append_null();
-                        blob_id_builder.append_null();
-                        blob_size_builder.append_null();
+                        blob_id_builder.append_value(blob_id);
+                        blob_size_builder.append_value(data_len);
                         position_builder.append_null();
                         continue;
                     }
 
-                    let (external_base_id, external_uri_or_path) =
-                        self.resolve_external_reference(uri_val).await?;
-                    kind_builder.append_value(BlobKind::External as u8);
-                    data_builder.append_null();
-                    uri_builder.append_value(external_uri_or_path);
-                    blob_id_builder.append_value(external_base_id);
-                    if has_position && has_size {
-                        let position = position_col
-                            .as_ref()
-                            .expect("position column must exist")
-                            .value(i);
-                        let size = size_col.as_ref().expect("size column must exist").value(i);
-                        blob_size_builder.append_value(size);
+                    if data_len > inline_threshold as u64 {
+                        let (pack_blob_id, position) = self
+                            .write_packed(BlobWriteSource::External(&source))
+                            .await?;
+
+                        kind_builder.append_value(BlobKind::Packed as u8);
+                        data_builder.append_null();
+                        uri_builder.append_null();
+                        blob_id_builder.append_value(pack_blob_id);
+                        blob_size_builder.append_value(data_len);
                         position_builder.append_value(position);
-                    } else {
-                        blob_size_builder.append_null();
-                        position_builder.append_null();
+                        continue;
                     }
-                    continue;
-                }
 
-                if has_data {
+                    let data = source.read_all().await?;
+
                     kind_builder.append_value(BlobKind::Inline as u8);
-                    let value = data_col.value(i);
-                    data_builder.append_value(value);
+                    data_builder.append_value(data.as_ref());
                     uri_builder.append_null();
                     blob_id_builder.append_null();
                     blob_size_builder.append_null();
                     position_builder.append_null();
+                    continue;
+                }
+
+                let (external_base_id, external_uri_or_path) =
+                    self.resolve_external_reference(uri_val).await?;
+                kind_builder.append_value(BlobKind::External as u8);
+                data_builder.append_null();
+                uri_builder.append_value(external_uri_or_path);
+                blob_id_builder.append_value(external_base_id);
+                if has_position && has_size {
+                    let position = position_col
+                        .as_ref()
+                        .expect("position column must exist")
+                        .value(i);
+                    let size = size_col.as_ref().expect("size column must exist").value(i);
+                    blob_size_builder.append_value(size);
+                    position_builder.append_value(position);
                 } else {
-                    data_builder.append_null();
-                    uri_builder.append_null();
-                    blob_id_builder.append_null();
                     blob_size_builder.append_null();
-                    kind_builder.append_null();
                     position_builder.append_null();
                 }
+                continue;
             }
 
-            let child_fields = vec![
-                arrow_schema::Field::new("kind", ArrowDataType::UInt8, true),
-                arrow_schema::Field::new("data", ArrowDataType::LargeBinary, true),
-                arrow_schema::Field::new("uri", ArrowDataType::Utf8, true),
-                arrow_schema::Field::new("blob_id", ArrowDataType::UInt32, true),
-                arrow_schema::Field::new("blob_size", ArrowDataType::UInt64, true),
-                arrow_schema::Field::new("position", ArrowDataType::UInt64, true),
-            ];
-
-            let struct_array = arrow_array::StructArray::try_new(
-                child_fields.clone().into(),
-                vec![
-                    Arc::new(kind_builder.finish()),
-                    Arc::new(data_builder.finish()),
-                    Arc::new(uri_builder.finish()),
-                    Arc::new(blob_id_builder.finish()),
-                    Arc::new(blob_size_builder.finish()),
-                    Arc::new(position_builder.finish()),
-                ],
-                struct_nulls.cloned(),
-            )?;
-
-            new_columns.push(Arc::new(struct_array));
-            new_fields.push(Arc::new(
-                arrow_schema::Field::new(
-                    field.name(),
-                    ArrowDataType::Struct(child_fields.into()),
-                    field.is_nullable(),
-                )
-                .with_metadata(self.writer_metadata[idx].clone()),
-            ));
+            if has_data {
+                kind_builder.append_value(BlobKind::Inline as u8);
+                let value = data_col.value(i);
+                data_builder.append_value(value);
+                uri_builder.append_null();
+                blob_id_builder.append_null();
+                blob_size_builder.append_null();
+                position_builder.append_null();
+            } else {
+                data_builder.append_null();
+                uri_builder.append_null();
+                blob_id_builder.append_null();
+                blob_size_builder.append_null();
+                kind_builder.append_null();
+                position_builder.append_null();
+            }
         }
 
-        let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata(
-            new_fields
-                .iter()
-                .map(|f| f.as_ref().clone())
-                .collect::<Vec<_>>(),
-            batch_schema.metadata().clone(),
-        ));
+        let child_fields = vec![
+            ArrowField::new("kind", ArrowDataType::UInt8, true),
+            ArrowField::new("data", ArrowDataType::LargeBinary, true),
+            ArrowField::new("uri", ArrowDataType::Utf8, true),
+            ArrowField::new("blob_id", ArrowDataType::UInt32, true),
+            ArrowField::new("blob_size", ArrowDataType::UInt64, true),
+            ArrowField::new("position", ArrowDataType::UInt64, true),
+        ];
 
-        RecordBatch::try_new(new_schema, new_columns)
-            .map_err(|e| Error::invalid_input(e.to_string()))
+        let struct_array = StructArray::try_new(
+            child_fields.clone().into(),
+            vec![
+                Arc::new(kind_builder.finish()),
+                Arc::new(data_builder.finish()),
+                Arc::new(uri_builder.finish()),
+                Arc::new(blob_id_builder.finish()),
+                Arc::new(blob_size_builder.finish()),
+                Arc::new(position_builder.finish()),
+            ],
+            struct_nulls.cloned(),
+        )?;
+
+        let field = Arc::new(
+            ArrowField::new(
+                field.name(),
+                ArrowDataType::Struct(child_fields.into()),
+                field.is_nullable(),
+            )
+            .with_metadata(writer_metadata.clone()),
+        );
+        Ok((Arc::new(struct_array), field))
     }
 
     pub(crate) async fn finish(&mut self) -> Result<()> {
@@ -700,16 +900,6 @@ impl BlobPreprocessor {
     }
 }
 
-fn dedicated_threshold_from_metadata(field: &arrow_schema::Field) -> usize {
-    field
-        .metadata()
-        .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY)
-        .and_then(|value| value.parse::<i64>().ok())
-        .filter(|value| *value > 0)
-        .and_then(|value| usize::try_from(value).ok())
-        .unwrap_or(DEDICATED_THRESHOLD)
-}
-
 pub async fn preprocess_blob_batches(
     batches: &[RecordBatch],
     pre: &mut BlobPreprocessor,
@@ -2103,7 +2293,7 @@ mod tests {
     };
     use arrow_array::RecordBatch;
     use arrow_array::{
-        ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array,
+        Array, ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array,
     };
     use arrow_schema::{DataType, Field, Schema};
     use async_trait::async_trait;
@@ -2111,7 +2301,8 @@ mod tests {
     use chrono::Utc;
     use futures::{StreamExt, TryStreamExt, future::try_join_all};
     use lance_arrow::{
-        ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt,
+        ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+        BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt,
     };
     use lance_core::datatypes::BlobKind;
     use lance_io::object_store::{
@@ -2142,7 +2333,7 @@ mod tests {
     use crate::{
         Dataset,
         blob::{BlobArrayBuilder, blob_field},
-        dataset::{ExternalBlobMode, WriteParams},
+        dataset::{ExternalBlobMode, WriteMode, WriteParams},
         utils::test::TestDatasetGenerator,
     };
 
@@ -2158,6 +2349,32 @@ mod tests {
         expected: Vec<u8>,
     }
 
+    fn nested_blob_v2_batch(blob_array: ArrayRef) -> (Arc<Schema>, RecordBatch) {
+        let blob_field = blob_field("blob", true);
+        let info_fields = vec![Field::new("name", DataType::Utf8, false), blob_field];
+        let info_array: ArrayRef = Arc::new(
+            StructArray::try_new(
+                info_fields.clone().into(),
+                vec![
+                    Arc::new(StringArray::from_iter_values(
+                        (0..blob_array.len()).map(|idx| format!("name-{idx}")),
+                    )) as ArrayRef,
+                    blob_array,
+                ],
+                None,
+            )
+            .unwrap(),
+        );
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "info",
+            DataType::Struct(info_fields.into()),
+            true,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![info_array]).unwrap();
+        (schema, batch)
+    }
+
     #[cfg(feature = "azure")]
     fn azure_store_params(account_name: &str) -> ObjectStoreParams {
         ObjectStoreParams {
@@ -3045,6 +3262,114 @@ mod tests {
         assert_eq!(second.as_ref(), b"world");
     }
 
+    #[tokio::test]
+    async fn test_write_and_take_nested_blob_v2() {
+        let test_dir = TempStrDir::default();
+        let packed_payload = vec![0x4A; super::INLINE_MAX + 1024];
+
+        let mut blob_builder = BlobArrayBuilder::new(3);
+        blob_builder.push_bytes(b"hello").unwrap();
+        blob_builder.push_bytes(&packed_payload).unwrap();
+        blob_builder.push_null().unwrap();
+        let blob_array: ArrayRef = blob_builder.finish().unwrap();
+
+        let (schema, batch) = nested_blob_v2_batch(blob_array);
+        let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+
+        let dataset = Arc::new(
+            Dataset::write(
+                reader,
+                &test_dir,
+                Some(WriteParams {
+                    data_storage_version: Some(LanceFileVersion::V2_2),
+                    ..Default::default()
+                }),
+            )
+            .await
+            .unwrap(),
+        );
+
+        let info_batch = dataset
+            .scan()
+            .project(&["info"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        let blob_desc = info_batch
+            .column(0)
+            .as_struct()
+            .column_by_name("blob")
+            .unwrap()
+            .as_struct();
+        assert_eq!(
+            blob_desc
+                .column_by_name("kind")
+                .unwrap()
+                .as_primitive::<UInt8Type>()
+                .value(0),
+            BlobKind::Inline as u8
+        );
+        assert_eq!(
+            blob_desc
+                .column_by_name("kind")
+                .unwrap()
+                .as_primitive::<UInt8Type>()
+                .value(1),
+            BlobKind::Packed as u8
+        );
+
+        let blobs = dataset
+            .take_blobs_by_indices(&[0, 1], "info.blob")
+            .await
+            .unwrap();
+        assert_eq!(blobs.len(), 2);
+        assert_eq!(blobs[0].read().await.unwrap().as_ref(), b"hello");
+        assert_eq!(
+            blobs[1].read().await.unwrap().as_ref(),
+            packed_payload.as_slice()
+        );
+
+        let null_blobs = dataset
+            .take_blobs_by_indices(&[2], "info.blob")
+            .await
+            .unwrap();
+        assert!(null_blobs.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_nested_blob_v2_requires_v2_2() {
+        let test_dir = TempStrDir::default();
+
+        let mut blob_builder = BlobArrayBuilder::new(1);
+        blob_builder.push_bytes(b"hello").unwrap();
+        let blob_array: ArrayRef = blob_builder.finish().unwrap();
+
+        let (schema, batch) = nested_blob_v2_batch(blob_array);
+        let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+
+        let result = Dataset::write(
+            reader,
+            &test_dir,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_1),
+                ..Default::default()
+            }),
+        )
+        .await;
+
+        assert!(
+            result.is_err(),
+            "Nested blob v2 should be rejected for file version 2.1"
+        );
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Blob v2 requires file version >= 2.2")
+        );
+    }
+
     #[tokio::test]
     async fn test_blob_file_read_empty_range_returns_empty_bytes() {
         let store = reject_empty_range_store();
@@ -3621,6 +3946,50 @@ mod tests {
         assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice());
     }
 
+    #[tokio::test]
+    async fn test_blob_v2_external_ingest_respects_inline_threshold() {
+        let dataset_dir = TempDir::default();
+        let external_dir = TempDir::default();
+        let external_path = external_dir.std_path().join("external.bin");
+        let payload = vec![0x5A; 2048];
+        std::fs::write(&external_path, &payload).unwrap();
+        let external_uri = format!("file://{}", external_path.display());
+
+        let mut blob_builder = BlobArrayBuilder::new(1);
+        blob_builder.push_uri(external_uri).unwrap();
+        let blob_array: arrow_array::ArrayRef = blob_builder.finish().unwrap();
+
+        let mut field = blob_field("blob", true);
+        let mut metadata = field.metadata().clone();
+        metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "1024".to_string(),
+        );
+        field = field.with_metadata(metadata);
+        let schema = Arc::new(Schema::new(vec![field]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![blob_array]).unwrap();
+        let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+
+        let dataset = Arc::new(
+            Dataset::write(
+                reader,
+                &dataset_dir.path_str(),
+                Some(WriteParams {
+                    data_storage_version: Some(LanceFileVersion::V2_2),
+                    external_blob_mode: ExternalBlobMode::Ingest,
+                    ..Default::default()
+                }),
+            )
+            .await
+            .unwrap(),
+        );
+
+        let blobs = dataset.take_blobs_by_indices(&[0], "blob").await.unwrap();
+        assert_eq!(blobs.len(), 1);
+        assert_eq!(blobs[0].kind(), BlobKind::Packed);
+        assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice());
+    }
+
     #[tokio::test]
     async fn test_blob_v2_external_ingest_dedicated() {
         let dataset_dir = TempDir::default();
@@ -3713,7 +4082,10 @@ mod tests {
         );
     }
 
-    async fn preprocess_kind_with_schema_metadata(metadata_value: &str, data_len: usize) -> u8 {
+    async fn try_preprocess_kind_with_blob_metadata(
+        metadata_entries: Vec<(&'static str, String)>,
+        data_len: usize,
+    ) -> Result<u8> {
         let (object_store, base_path) = ObjectStore::from_uri_and_params(
             Arc::new(ObjectStoreRegistry::default()),
             "memory://blob_preprocessor",
@@ -3726,10 +4098,9 @@ mod tests {
 
         let mut field = blob_field("blob", true);
         let mut metadata = field.metadata().clone();
-        metadata.insert(
-            BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(),
-            metadata_value.to_string(),
-        );
+        for (key, value) in metadata_entries {
+            metadata.insert(key.to_string(), value);
+        }
         field = field.with_metadata(metadata);
 
         let writer_arrow_schema = Schema::new(vec![field.clone()]);
@@ -3746,7 +4117,7 @@ mod tests {
             Arc::new(ObjectStoreRegistry::default()),
             ObjectStoreParams::default(),
             None,
-        );
+        )?;
 
         let mut blob_builder = BlobArrayBuilder::new(1);
         blob_builder.push_bytes(vec![0u8; data_len]).unwrap();
@@ -3757,36 +4128,442 @@ mod tests {
         let batch_schema = Arc::new(Schema::new(vec![field_without_metadata]));
         let batch = RecordBatch::try_new(batch_schema, vec![blob_array]).unwrap();
 
-        let out = preprocessor.preprocess_batch(&batch).await.unwrap();
+        let out = preprocessor.preprocess_batch(&batch).await?;
         let struct_arr = out
             .column(0)
             .as_any()
             .downcast_ref::<arrow_array::StructArray>()
             .unwrap();
-        struct_arr
+        Ok(struct_arr
             .column_by_name("kind")
             .unwrap()
             .as_primitive::<arrow::datatypes::UInt8Type>()
-            .value(0)
+            .value(0))
+    }
+
+    async fn preprocess_kind_with_blob_metadata(
+        metadata_entries: Vec<(&'static str, String)>,
+        data_len: usize,
+    ) -> u8 {
+        try_preprocess_kind_with_blob_metadata(metadata_entries, data_len)
+            .await
+            .unwrap()
     }
 
     #[tokio::test]
-    async fn test_blob_v2_dedicated_threshold_ignores_non_positive_metadata() {
-        let kind = preprocess_kind_with_schema_metadata("0", 256 * 1024).await;
-        assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8);
+    async fn test_blob_v2_dedicated_threshold_rejects_non_positive_metadata() {
+        let err = try_preprocess_kind_with_blob_metadata(
+            vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "0".to_string())],
+            256 * 1024,
+        )
+        .await
+        .unwrap_err();
+        assert!(err.to_string().contains("expected a positive integer"));
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_rejects_invalid_metadata() {
+        let err = try_preprocess_kind_with_blob_metadata(
+            vec![(
+                BLOB_INLINE_SIZE_THRESHOLD_META_KEY,
+                "not-a-number".to_string(),
+            )],
+            256 * 1024,
+        )
+        .await
+        .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("expected a non-negative integer that fits in usize")
+        );
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_write_rejects_invalid_inline_threshold_metadata() {
+        let dataset_dir = TempDir::default();
+        let mut field = blob_field("blob", true);
+        let mut metadata = field.metadata().clone();
+        metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "not-a-number".to_string(),
+        );
+        field = field.with_metadata(metadata);
+        let schema = Arc::new(Schema::new(vec![field]));
+
+        let mut blob_builder = BlobArrayBuilder::new(1);
+        blob_builder.push_bytes(vec![0u8; 256]).unwrap();
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(blob_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+        let result = Dataset::write(
+            reader,
+            &dataset_dir.path_str(),
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await;
+        let Err(err) = result else {
+            panic!("write with invalid blob threshold metadata should fail");
+        };
+        assert!(
+            err.to_string()
+                .contains("expected a non-negative integer that fits in usize")
+        );
     }
 
     #[tokio::test]
     async fn test_blob_v2_dedicated_threshold_respects_smaller_metadata() {
-        let kind = preprocess_kind_with_schema_metadata("131072", 256 * 1024).await;
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "131072".to_string())],
+            256 * 1024,
+        )
+        .await;
         assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8);
     }
 
     #[tokio::test]
     async fn test_blob_v2_dedicated_threshold_respects_larger_metadata() {
-        let kind =
-            preprocess_kind_with_schema_metadata("8388608", super::DEDICATED_THRESHOLD + 1024)
-                .await;
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![(
+                BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+                "8388608".to_string(),
+            )],
+            super::DEDICATED_THRESHOLD + 1024,
+        )
+        .await;
+        assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_respects_smaller_metadata() {
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())],
+            2048,
+        )
+        .await;
         assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8);
     }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_respects_larger_metadata() {
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![(
+                BLOB_INLINE_SIZE_THRESHOLD_META_KEY,
+                (super::INLINE_MAX + 8192).to_string(),
+            )],
+            super::INLINE_MAX + 4096,
+        )
+        .await;
+        assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_uses_strict_greater_than() {
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())],
+            1024,
+        )
+        .await;
+        assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_dedicated_threshold_uses_strict_greater_than() {
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![
+                (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "2048".to_string()),
+                (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "1024".to_string()),
+            ],
+            1024,
+        )
+        .await;
+        assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_does_not_override_dedicated_threshold() {
+        let kind = preprocess_kind_with_blob_metadata(
+            vec![
+                (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "8192".to_string()),
+                (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "4096".to_string()),
+            ],
+            6144,
+        )
+        .await;
+        assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_inline_threshold_is_per_column() {
+        let (object_store, base_path) = ObjectStore::from_uri_and_params(
+            Arc::new(ObjectStoreRegistry::default()),
+            "memory://blob_preprocessor",
+            &ObjectStoreParams::default(),
+        )
+        .await
+        .unwrap();
+        let object_store = object_store.as_ref().clone();
+        let data_dir = base_path.clone().join("data");
+
+        let mut inline_field = blob_field("inline_blob", true);
+        let mut inline_metadata = inline_field.metadata().clone();
+        inline_metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "4096".to_string(),
+        );
+        inline_field = inline_field.with_metadata(inline_metadata);
+
+        let mut packed_field = blob_field("packed_blob", true);
+        let mut packed_metadata = packed_field.metadata().clone();
+        packed_metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "1024".to_string(),
+        );
+        packed_field = packed_field.with_metadata(packed_metadata);
+
+        let writer_arrow_schema = Schema::new(vec![inline_field.clone(), packed_field.clone()]);
+        let writer_schema = lance_core::datatypes::Schema::try_from(&writer_arrow_schema).unwrap();
+
+        let mut preprocessor = super::BlobPreprocessor::new(
+            object_store.clone(),
+            data_dir,
+            "data_file_key".to_string(),
+            &writer_schema,
+            None,
+            false,
+            ExternalBlobMode::Reference,
+            Arc::new(ObjectStoreRegistry::default()),
+            ObjectStoreParams::default(),
+            None,
+        )
+        .unwrap();
+
+        let mut inline_builder = BlobArrayBuilder::new(1);
+        inline_builder.push_bytes(vec![0u8; 2048]).unwrap();
+        let inline_array: arrow_array::ArrayRef = inline_builder.finish().unwrap();
+
+        let mut packed_builder = BlobArrayBuilder::new(1);
+        packed_builder.push_bytes(vec![0u8; 2048]).unwrap();
+        let packed_array: arrow_array::ArrayRef = packed_builder.finish().unwrap();
+
+        let batch_schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "inline_blob",
+                inline_field.data_type().clone(),
+                inline_field.is_nullable(),
+            ),
+            Field::new(
+                "packed_blob",
+                packed_field.data_type().clone(),
+                packed_field.is_nullable(),
+            ),
+        ]));
+        let batch = RecordBatch::try_new(batch_schema, vec![inline_array, packed_array]).unwrap();
+
+        let out = preprocessor.preprocess_batch(&batch).await.unwrap();
+        let inline_kind = out
+            .column(0)
+            .as_any()
+            .downcast_ref::<arrow_array::StructArray>()
+            .unwrap()
+            .column_by_name("kind")
+            .unwrap()
+            .as_primitive::<arrow::datatypes::UInt8Type>()
+            .value(0);
+        let packed_kind = out
+            .column(1)
+            .as_any()
+            .downcast_ref::<arrow_array::StructArray>()
+            .unwrap()
+            .column_by_name("kind")
+            .unwrap()
+            .as_primitive::<arrow::datatypes::UInt8Type>()
+            .value(0);
+
+        assert_eq!(inline_kind, lance_core::datatypes::BlobKind::Inline as u8);
+        assert_eq!(packed_kind, lance_core::datatypes::BlobKind::Packed as u8);
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_append_rejects_explicit_inline_threshold_mismatch() {
+        let dataset_dir = TempDir::default();
+        let payload = vec![0u8; 2048];
+
+        let schema = Arc::new(Schema::new(vec![blob_field("blob", true)]));
+        let mut initial_builder = BlobArrayBuilder::new(1);
+        initial_builder.push_bytes(payload.clone()).unwrap();
+        let initial_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema);
+        let dataset = Dataset::write(
+            initial_reader,
+            &dataset_dir.path_str(),
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let mut append_field = blob_field("blob", true);
+        let mut append_metadata = append_field.metadata().clone();
+        append_metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "1024".to_string(),
+        );
+        append_field = append_field.with_metadata(append_metadata);
+        let append_schema = Arc::new(Schema::new(vec![append_field]));
+        let mut append_builder = BlobArrayBuilder::new(1);
+        append_builder.push_bytes(payload).unwrap();
+        let append_batch = RecordBatch::try_new(
+            append_schema.clone(),
+            vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema);
+
+        let result = Dataset::write(
+            append_reader,
+            Arc::new(dataset),
+            Some(WriteParams {
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await;
+        let Err(err) = result else {
+            panic!("append with explicit blob threshold mismatch should fail");
+        };
+        let message = err.to_string();
+        assert!(message.contains("Cannot append data with blob threshold metadata"));
+        assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY));
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_append_rejects_threshold_mismatch_with_non_blob_input_extension() {
+        let dataset_dir = TempDir::default();
+        let payload = vec![0u8; 2048];
+
+        let schema = Arc::new(Schema::new(vec![blob_field("blob", true)]));
+        let mut initial_builder = BlobArrayBuilder::new(1);
+        initial_builder.push_bytes(payload.clone()).unwrap();
+        let initial_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema);
+        let dataset = Dataset::write(
+            initial_reader,
+            &dataset_dir.path_str(),
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let mut append_field = blob_field("blob", true);
+        let mut append_metadata = append_field.metadata().clone();
+        append_metadata.insert(
+            ARROW_EXT_NAME_KEY.to_string(),
+            "some.other.extension".to_string(),
+        );
+        append_metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            "1024".to_string(),
+        );
+        append_field = append_field.with_metadata(append_metadata);
+        let append_schema = Arc::new(Schema::new(vec![append_field]));
+        let mut append_builder = BlobArrayBuilder::new(1);
+        append_builder.push_bytes(payload).unwrap();
+        let append_batch = RecordBatch::try_new(
+            append_schema.clone(),
+            vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema);
+
+        let result = Dataset::write(
+            append_reader,
+            Arc::new(dataset),
+            Some(WriteParams {
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await;
+        let Err(err) = result else {
+            panic!("append with ignored blob threshold metadata should fail");
+        };
+        let message = err.to_string();
+        assert!(message.contains("Cannot append data with blob threshold metadata"));
+        assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY));
+    }
+
+    #[tokio::test]
+    async fn test_blob_v2_append_accepts_explicit_default_inline_threshold() {
+        let dataset_dir = TempDir::default();
+        let payload = vec![0u8; 2048];
+
+        let schema = Arc::new(Schema::new(vec![blob_field("blob", true)]));
+        let mut initial_builder = BlobArrayBuilder::new(1);
+        initial_builder.push_bytes(payload.clone()).unwrap();
+        let initial_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema);
+        let dataset = Dataset::write(
+            initial_reader,
+            &dataset_dir.path_str(),
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let mut append_field = blob_field("blob", true);
+        let mut append_metadata = append_field.metadata().clone();
+        append_metadata.insert(
+            BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(),
+            super::INLINE_MAX.to_string(),
+        );
+        append_field = append_field.with_metadata(append_metadata);
+        let append_schema = Arc::new(Schema::new(vec![append_field]));
+        let mut append_builder = BlobArrayBuilder::new(1);
+        append_builder.push_bytes(payload).unwrap();
+        let append_batch = RecordBatch::try_new(
+            append_schema.clone(),
+            vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef],
+        )
+        .unwrap();
+        let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema);
+
+        let dataset = Dataset::write(
+            append_reader,
+            Arc::new(dataset),
+            Some(WriteParams {
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+        assert_eq!(dataset.count_rows(None).await.unwrap(), 2);
+    }
 }
diff --git a/rust/lance/src/dataset/branch_location.rs b/rust/lance/src/dataset/branch_location.rs
index 3a1185c8cf8..7ebce36ec86 100644
--- a/rust/lance/src/dataset/branch_location.rs
+++ b/rust/lance/src/dataset/branch_location.rs
@@ -31,14 +31,20 @@ impl BranchLocation {
     }
 
     fn get_root_path(path_str: &str, branch_name: &str) -> Result<String> {
+        // A URI may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`);
+        // the branch suffix sits on the path part, before the query.
+        let (path_part, query) = match path_str.split_once('?') {
+            Some((path, query)) => (path, Some(query)),
+            None => (path_str, None),
+        };
         let branch_suffix = format!("{}/{}", BRANCH_DIR, branch_name);
         let branch_suffix = branch_suffix.as_str();
-        let root_path_str = path_str
+        let root_path_str = path_part
             .strip_suffix(branch_suffix)
             .or_else(|| {
                 if cfg!(windows) {
                     let windows_suffix = branch_suffix.replace('/', "\\");
-                    path_str.strip_suffix(&windows_suffix)
+                    path_part.strip_suffix(&windows_suffix)
                 } else {
                     None
                 }
@@ -59,7 +65,10 @@ impl BranchLocation {
                 root_path_str, path_str,
             )));
         };
-        Ok(root_path_str)
+        Ok(match query {
+            Some(query) => format!("{}?{}", root_path_str, query),
+            None => root_path_str,
+        })
     }
 
     /// The branch a location under `root` targets: the inverse of
@@ -132,13 +141,23 @@ impl BranchLocation {
     }
 
     fn join_str(base: &str, segment: &str) -> Result<String> {
+        // A URI may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`);
+        // path segments must be appended before it.
+        let (path_part, query) = match base.split_once('?') {
+            Some((path, query)) => (path, Some(query)),
+            None => (base, None),
+        };
         let normalized_segment = segment.trim_start_matches('/');
-        let is_base_dir = base.ends_with("/");
-        if is_base_dir {
-            Ok(format!("{}{}", base, normalized_segment))
+        let is_base_dir = path_part.ends_with("/");
+        let joined = if is_base_dir {
+            format!("{}{}", path_part, normalized_segment)
         } else {
-            Ok(format!("{}/{}", base, normalized_segment))
-        }
+            format!("{}/{}", path_part, normalized_segment)
+        };
+        Ok(match query {
+            Some(query) => format!("{}?{}", joined, query),
+            None => joined,
+        })
     }
 }
 
@@ -255,6 +274,30 @@ mod tests {
         assert!(fs::create_dir_all(std::path::Path::new(new_location.uri.as_str())).is_ok());
     }
 
+    #[test]
+    fn test_branch_location_with_query_uri() {
+        // URIs like `s3+ddb://...?ddbTableName=t` carry the commit handler
+        // config in the query string; branch path segments must be inserted
+        // before it and the query must survive the round trip.
+        let location = BranchLocation {
+            path: Path::parse("bucket/table.lance").unwrap(),
+            uri: "s3+ddb://bucket/table.lance?ddbTableName=t".to_string(),
+            branch: None,
+        };
+        let dev = location.find_branch(Some("dev")).unwrap();
+        assert_eq!(
+            dev.uri,
+            "s3+ddb://bucket/table.lance/tree/dev?ddbTableName=t"
+        );
+        assert_eq!(dev.path.as_ref(), "bucket/table.lance/tree/dev");
+        assert_eq!(dev.branch.as_deref(), Some("dev"));
+
+        let main = dev.find_main().unwrap();
+        assert_eq!(main.uri, "s3+ddb://bucket/table.lance?ddbTableName=t");
+        assert_eq!(main.path.as_ref(), "bucket/table.lance");
+        assert_eq!(main.branch, None);
+    }
+
     #[test]
     fn test_branch_of() {
         let derive = |root: &str, location: &str| BranchLocation::branch_of(root, location);
diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs
index b3ca60cfa0f..65928038cea 100644
--- a/rust/lance/src/dataset/cleanup.rs
+++ b/rust/lance/src/dataset/cleanup.rs
@@ -46,7 +46,8 @@ use lance_core::{
     Error, Result,
     utils::tracing::{
         AUDIT_MODE_DELETE, AUDIT_MODE_DELETE_UNVERIFIED, AUDIT_TYPE_DATA, AUDIT_TYPE_DELETION,
-        AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT,
+        AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, DATASET_CLEANING_EVENT, TRACE_DATASET_EVENTS,
+        TRACE_FILE_AUDIT,
     },
 };
 use lance_table::{
@@ -78,7 +79,7 @@ struct ReferencedFiles {
     index_uuids: HashSet<String>,
 }
 
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct RemovalStats {
     pub bytes_removed: u64,
     pub old_versions: u64,
@@ -88,12 +89,194 @@ pub struct RemovalStats {
     pub deletion_files_removed: u64,
 }
 
-#[derive(Clone, Copy, Debug)]
-enum RemovedFileType {
+/// A read-only explanation of what a cleanup operation would remove.
+///
+/// This is an explanation, not a deletion plan.  Calling
+/// [`CleanupOperation::execute`] re-evaluates the current dataset and reference
+/// state before deleting files.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct CleanupExplanation {
+    /// Dataset version observed when the explanation was produced.
+    pub read_version: u64,
+    /// Aggregate statistics for files that would be removed.
+    pub stats: RemovalStats,
+    /// Candidate files that would be removed, capped by `candidate_file_limit`.
+    pub candidate_files: Vec<CleanupCandidateFile>,
+    /// True if more candidate files were found than are included.
+    pub candidate_files_truncated: bool,
+    /// Maximum number of candidate files included in this explanation.
+    pub candidate_file_limit: usize,
+    /// Referenced child branches and whether cleanup would cascade into them.
+    pub referenced_branches: Vec<CleanupReferencedBranch>,
+    /// Non-fatal warnings about the explanation.
+    pub warnings: Vec<String>,
+}
+
+/// A file that cleanup identified as removable.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct CleanupCandidateFile {
+    /// Object-store path of the candidate file, including the dataset base (not dataset-relative).
+    pub path: String,
+    /// Kind of file identified by cleanup.
+    pub kind: CleanupFileKind,
+    /// True if the file is removable only because it aged past the unverified
+    /// retention threshold or `delete_unverified` is enabled.
+    pub unverified: bool,
+    /// Candidate file size in bytes.
+    pub size_bytes: u64,
+}
+
+/// A branch that references the current branch lineage.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct CleanupReferencedBranch {
+    /// Branch name.
+    pub name: String,
+    /// Version of the current lineage referenced by this branch.
+    pub referenced_version: u64,
+    /// True if the policy sets `clean_referenced_branches`, making this branch a cascade target.
+    pub cleanup_candidate: bool,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum CleanupFileKind {
+    Manifest,
     Data,
     Transaction,
     Index,
     Deletion,
+    /// A leftover `_versions/.tmp` manifest from a failed transaction.  These
+    /// are deleted but excluded from per-kind `RemovalStats` counts and audit
+    /// logs to match the long-standing cleanup behavior.  Their bytes
+    /// are still included in `bytes_removed`.
+    TemporaryManifest,
+}
+
+impl CleanupCandidateFile {
+    fn from_cleanup_file(file: &CleanupFile) -> Self {
+        Self {
+            path: file.path.to_string(),
+            kind: file.kind,
+            unverified: file.unverified,
+            size_bytes: file.size_bytes,
+        }
+    }
+}
+
+fn cleanup_file(
+    path: Path,
+    kind: CleanupFileKind,
+    unverified: bool,
+    size_bytes: u64,
+) -> Option<CleanupFile> {
+    Some(CleanupFile {
+        path,
+        kind,
+        unverified,
+        size_bytes,
+    })
+}
+
+#[derive(Clone, Debug)]
+struct CleanupFile {
+    path: Path,
+    kind: CleanupFileKind,
+    /// True when the file was kept on disk past its referenced lifetime
+    /// because we could not verify it was safe to remove (e.g. produced by an
+    /// unfinished commit) and is being deleted only because it has aged past
+    /// the unverified-retention threshold or `delete_unverified` is set.
+    unverified: bool,
+    size_bytes: u64,
+}
+
+impl RemovalStats {
+    fn record_file(&mut self, file: &CleanupFile) {
+        self.bytes_removed += file.size_bytes;
+        match file.kind {
+            CleanupFileKind::Manifest => self.old_versions += 1,
+            CleanupFileKind::Data => self.data_files_removed += 1,
+            CleanupFileKind::Transaction => self.transaction_files_removed += 1,
+            CleanupFileKind::Index => self.index_files_removed += 1,
+            CleanupFileKind::Deletion => self.deletion_files_removed += 1,
+            CleanupFileKind::TemporaryManifest => {}
+        }
+    }
+
+    fn merge(&mut self, other: &Self) {
+        self.bytes_removed += other.bytes_removed;
+        self.old_versions += other.old_versions;
+        self.data_files_removed += other.data_files_removed;
+        self.transaction_files_removed += other.transaction_files_removed;
+        self.index_files_removed += other.index_files_removed;
+        self.deletion_files_removed += other.deletion_files_removed;
+    }
+}
+
+#[derive(Debug, Default)]
+struct CleanupRunResult {
+    stats: RemovalStats,
+    removed_manifests: HashSet<Path>,
+    candidate_files: Vec<CleanupCandidateFile>,
+    candidate_files_truncated: bool,
+    referenced_branches: Vec<CleanupReferencedBranch>,
+}
+
+impl CleanupRunResult {
+    fn record_file(
+        &mut self,
+        file: &CleanupFile,
+        candidate_file_limit: Option<usize>,
+        track_removed_manifests: bool,
+    ) {
+        self.stats.record_file(file);
+        if track_removed_manifests && matches!(file.kind, CleanupFileKind::Manifest) {
+            self.removed_manifests.insert(file.path.clone());
+        }
+        if let Some(limit) = candidate_file_limit {
+            if self.candidate_files.len() < limit {
+                self.candidate_files
+                    .push(CleanupCandidateFile::from_cleanup_file(file));
+            } else {
+                self.candidate_files_truncated = true;
+            }
+        }
+    }
+
+    fn merge(&mut self, other: Self, candidate_file_limit: Option<usize>) {
+        self.stats.merge(&other.stats);
+        self.removed_manifests.extend(other.removed_manifests);
+        self.referenced_branches.extend(other.referenced_branches);
+        if let Some(limit) = candidate_file_limit {
+            for file in other.candidate_files {
+                if self.candidate_files.len() < limit {
+                    self.candidate_files.push(file);
+                } else {
+                    self.candidate_files_truncated = true;
+                }
+            }
+            self.candidate_files_truncated |= other.candidate_files_truncated;
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+enum CleanupAction {
+    Execute,
+    Explain { max_candidate_files: usize },
+}
+
+impl CleanupAction {
+    fn deletes_files(self) -> bool {
+        matches!(self, Self::Execute)
+    }
+
+    fn candidate_file_limit(self) -> Option<usize> {
+        match self {
+            Self::Execute => None,
+            Self::Explain {
+                max_candidate_files,
+            } => Some(max_candidate_files),
+        }
+    }
 }
 
 fn remove_prefix(path: &Path, prefix: &Path) -> Path {
@@ -108,6 +291,11 @@ fn remove_prefix(path: &Path, prefix: &Path) -> Path {
 struct CleanupTask<'a> {
     dataset: &'a Dataset,
     policy: CleanupPolicy,
+    action: CleanupAction,
+    read_version: u64,
+    ignored_manifests: HashSet<Path>,
+    track_removed_manifests: bool,
+    include_referenced_branches: bool,
 }
 
 /// Information about the dataset that we learn by inspecting all of the manifests
@@ -131,21 +319,131 @@ struct CleanupInspection {
 const UNVERIFIED_THRESHOLD_DAYS: i64 = 7;
 const S3_DELETE_STREAM_BATCH_SIZE: u64 = 1_000;
 const AZURE_DELETE_STREAM_BATCH_SIZE: u64 = 256;
+const DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES: usize = 1_000;
+
+/// Builder-style cleanup operation.
+///
+/// Call [`Self::explain`] for a read-only explanation of what cleanup would
+/// remove, or [`Self::execute`] to re-evaluate the current dataset state and
+/// delete files.
+pub struct CleanupOperation<'a> {
+    dataset: &'a Dataset,
+    policy: CleanupPolicy,
+    max_candidate_files: usize,
+}
+
+impl<'a> CleanupOperation<'a> {
+    pub(crate) fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self {
+        Self {
+            dataset,
+            policy,
+            max_candidate_files: DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES,
+        }
+    }
+
+    /// Set the maximum number of candidate files included in explanations.
+    ///
+    /// The aggregate [`RemovalStats`] in [`CleanupExplanation`] still include
+    /// all files that would be removed.
+    pub fn with_max_candidate_files(mut self, max_candidate_files: usize) -> Self {
+        self.max_candidate_files = max_candidate_files;
+        self
+    }
+
+    /// Explain what cleanup would remove without deleting files.
+    pub async fn explain(&self) -> Result<CleanupExplanation> {
+        let cleanup = CleanupTask::new(
+            self.dataset,
+            self.policy.clone(),
+            CleanupAction::Explain {
+                max_candidate_files: self.max_candidate_files,
+            },
+        );
+        let read_version = cleanup.read_version;
+        let result = cleanup.run().await?;
+        let warnings = if result.candidate_files_truncated {
+            vec![format!(
+                "candidate_files truncated to {} entries",
+                self.max_candidate_files
+            )]
+        } else {
+            Vec::new()
+        };
+        Ok(CleanupExplanation {
+            read_version,
+            stats: result.stats,
+            candidate_files: result.candidate_files,
+            candidate_files_truncated: result.candidate_files_truncated,
+            candidate_file_limit: self.max_candidate_files,
+            referenced_branches: result.referenced_branches,
+            warnings,
+        })
+    }
+
+    /// Execute cleanup: re-evaluate the current dataset state, then delete removable files.
+    pub async fn execute(&self) -> Result<RemovalStats> {
+        info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.dataset.uri);
+        let cleanup = CleanupTask::new(self.dataset, self.policy.clone(), CleanupAction::Execute);
+        Ok(cleanup.run().await?.stats)
+    }
+}
 
 impl<'a> CleanupTask<'a> {
-    fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self {
-        Self { dataset, policy }
+    fn new(dataset: &'a Dataset, policy: CleanupPolicy, action: CleanupAction) -> Self {
+        let track_removed_manifests = policy.clean_referenced_branches;
+        let include_referenced_branches = action.candidate_file_limit().is_some();
+        Self::new_with_ignored_manifests(
+            dataset,
+            policy,
+            action,
+            HashSet::new(),
+            track_removed_manifests,
+            include_referenced_branches,
+        )
+    }
+
+    fn new_with_ignored_manifests(
+        dataset: &'a Dataset,
+        policy: CleanupPolicy,
+        action: CleanupAction,
+        ignored_manifests: HashSet<Path>,
+        track_removed_manifests: bool,
+        include_referenced_branches: bool,
+    ) -> Self {
+        Self {
+            dataset,
+            policy,
+            action,
+            read_version: dataset.version().version,
+            ignored_manifests,
+            track_removed_manifests,
+            include_referenced_branches,
+        }
     }
 
-    async fn run(self) -> Result<RemovalStats> {
-        let mut final_stats = RemovalStats::default();
+    async fn run(self) -> Result<CleanupRunResult> {
+        let mut final_result = CleanupRunResult::default();
+        let candidate_file_limit = self.action.candidate_file_limit();
         // First check if we need to clean referenced branches
         // For cases that referenced branches never clean and the current cleanup cannot clean anything
         // This must happen before cleaning the current branch if the setting is enabled.
 
         let referenced_branches: Vec<(String, u64)> = self.find_referenced_branches().await?;
+        if self.include_referenced_branches {
+            final_result.referenced_branches = referenced_branches
+                .iter()
+                .map(|(name, referenced_version)| CleanupReferencedBranch {
+                    name: name.clone(),
+                    referenced_version: *referenced_version,
+                    cleanup_candidate: self.policy.clean_referenced_branches,
+                })
+                .collect();
+        }
         if self.policy.clean_referenced_branches {
-            self.clean_referenced_branches(&referenced_branches).await?;
+            final_result.merge(
+                self.clean_referenced_branches(&referenced_branches).await?,
+                candidate_file_limit,
+            );
         }
 
         // we process all manifest files in parallel to figure
@@ -179,19 +477,21 @@ impl<'a> CleanupTask<'a> {
         }
 
         if !referenced_branches.is_empty() {
+            let ignored_manifests: HashSet<_> = final_result
+                .removed_manifests
+                .union(&self.ignored_manifests)
+                .cloned()
+                .collect();
             inspection = self
-                .retain_branch_lineage_files(inspection, &referenced_branches)
+                .retain_branch_lineage_files(inspection, &referenced_branches, &ignored_manifests)
                 .await?
         };
 
-        let stats = self.delete_unreferenced_files(inspection).await?;
-        final_stats.bytes_removed += stats.bytes_removed;
-        final_stats.old_versions += stats.old_versions;
-        final_stats.data_files_removed += stats.data_files_removed;
-        final_stats.transaction_files_removed += stats.transaction_files_removed;
-        final_stats.index_files_removed += stats.index_files_removed;
-        final_stats.deletion_files_removed += stats.deletion_files_removed;
-        Ok(final_stats)
+        final_result.merge(
+            self.delete_unreferenced_files(inspection).await?,
+            candidate_file_limit,
+        );
+        Ok(final_result)
     }
 
     #[instrument(level = "debug", skip_all)]
@@ -203,6 +503,7 @@ impl<'a> CleanupTask<'a> {
         self.dataset
             .commit_handler
             .list_manifest_locations(&self.dataset.base, &self.dataset.object_store, false)
+            .try_filter(|location| future::ready(!self.ignored_manifests.contains(&location.path)))
             .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| {
                 self.process_manifest_file(location, &inspection, tagged_versions)
             })
@@ -224,12 +525,10 @@ impl<'a> CleanupTask<'a> {
 
         let manifest =
             read_manifest(&self.dataset.object_store, &location.path, location.size).await?;
-        let dataset_version = self.dataset.version().version;
-
         // Don't delete the latest version, even if it is old. Don't delete tagged versions,
         // regardless of age. Don't delete manifests if their version is newer than the dataset
         // version.  These are either in-progress or newly added since we started.
-        let is_latest = dataset_version <= manifest.version;
+        let is_latest = self.read_version <= manifest.version;
         let is_tagged = tagged_versions.contains(&manifest.version);
         let in_working_set = is_latest || !self.policy.should_clean(&manifest) || is_tagged;
         let indexes =
@@ -319,8 +618,10 @@ impl<'a> CleanupTask<'a> {
     async fn delete_unreferenced_files(
         &self,
         inspection: CleanupInspection,
-    ) -> Result<RemovalStats> {
-        let removal_stats = Mutex::new(RemovalStats::default());
+    ) -> Result<CleanupRunResult> {
+        let cleanup_result = Mutex::new(CleanupRunResult::default());
+        let deletes_files = self.action.deletes_files();
+        let candidate_file_limit = self.action.candidate_file_limit();
         let verification_threshold = utc_now()
             - TimeDelta::try_days(UNVERIFIED_THRESHOLD_DAYS).expect("TimeDelta::try_days");
 
@@ -335,9 +636,8 @@ impl<'a> CleanupTask<'a> {
             )
         };
         // Build stream for a managed subtree
-        let build_listing_stream = |dir: Path, file_type: Option<RemovedFileType>| {
+        let build_listing_stream = |dir: Path| {
             let inspection_ref = &inspection;
-            let removal_stats_ref = &removal_stats;
             self.dataset
                 .object_store
                 .read_dir_all(&dir, inspection.earliest_retained_manifest_time)
@@ -356,118 +656,133 @@ impl<'a> CleanupTask<'a> {
                     // delete it if we can verify it is part of an old version.
                     let maybe_in_progress = !self.policy.delete_unverified
                         && obj_meta.last_modified >= verification_threshold;
-                    let path_to_remove = self.path_if_not_referenced(
-                        obj_meta.location,
+                    let file_to_remove = self.cleanup_file_if_not_referenced(
+                        obj_meta,
                         maybe_in_progress,
                         inspection_ref,
                     );
-                    if matches!(path_to_remove, Ok(Some(..))) {
-                        let mut stats = removal_stats_ref.lock().unwrap();
-                        stats.bytes_removed += obj_meta.size;
-                        if let Some(file_type) = file_type {
-                            match file_type {
-                                RemovedFileType::Data => stats.data_files_removed += 1,
-                                RemovedFileType::Transaction => {
-                                    stats.transaction_files_removed += 1
-                                }
-                                RemovedFileType::Index => stats.index_files_removed += 1,
-                                RemovedFileType::Deletion => stats.deletion_files_removed += 1,
-                            }
-                        }
-                    }
-                    future::ready(path_to_remove)
+                    future::ready(file_to_remove)
                 })
                 .boxed()
         };
 
         // Restrict scanning to Lance-managed subtrees for safety and performance.
         let streams = vec![
-            build_listing_stream(self.dataset.versions_dir(), None),
-            build_listing_stream(
-                self.dataset.transactions_dir(),
-                Some(RemovedFileType::Transaction),
-            ),
-            build_listing_stream(self.dataset.data_dir(), Some(RemovedFileType::Data)),
-            build_listing_stream(self.dataset.indices_dir(), Some(RemovedFileType::Index)),
-            build_listing_stream(
-                self.dataset.deletions_dir(),
-                Some(RemovedFileType::Deletion),
-            ),
+            build_listing_stream(self.dataset.versions_dir()),
+            build_listing_stream(self.dataset.transactions_dir()),
+            build_listing_stream(self.dataset.data_dir()),
+            build_listing_stream(self.dataset.indices_dir()),
+            build_listing_stream(self.dataset.deletions_dir()),
         ];
-        let unreferenced_paths = stream::iter(streams).flatten().boxed();
+        let unreferenced_files = stream::iter(streams).flatten().boxed();
 
         let old_manifests = inspection.old_manifests.clone();
-        let num_old_manifests = old_manifests.len();
-
-        // Ideally this collect shouldn't be needed here but it seems necessary
-        // to avoid https://github.com/rust-lang/rust/issues/102211
-        let manifest_bytes_removed = stream::iter(old_manifests.keys())
-            .map(|path| self.dataset.object_store.size(path))
-            .collect::<Vec<_>>()
-            .await;
-        let manifest_bytes_removed = stream::iter(manifest_bytes_removed)
-            .buffer_unordered(self.dataset.object_store.io_parallelism())
-            .try_fold(0, |acc, size| async move { Ok(acc + (size)) })
-            .await;
-
-        let old_manifests_stream = stream::iter(old_manifests.into_keys())
-            .map(|path| {
-                info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = path.as_ref());
-                Ok(path)
+        let manifest_files = stream::iter(old_manifests)
+            .map(|(path, _version)| async move {
+                let size_bytes = self.dataset.object_store.size(&path).await?;
+                Ok::<CleanupFile, Error>(CleanupFile {
+                    path,
+                    kind: CleanupFileKind::Manifest,
+                    unverified: false,
+                    size_bytes,
+                })
             })
+            .buffer_unordered(self.dataset.object_store.io_parallelism())
             .boxed();
-        let all_paths_to_remove =
-            stream::iter(vec![unreferenced_paths, old_manifests_stream]).flatten();
-
-        let paths_to_delete: BoxStream<Result<Path>> = if let Some(rate) =
-            self.policy.delete_rate_limit
-        {
-            let duration = calculate_duration(self.dataset.object_store.scheme().to_string(), rate);
-            let mut ticker = interval(duration);
-            ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
-            IntervalStream::new(ticker)
-                .zip(all_paths_to_remove)
-                .map(|(_, path)| path)
-                .boxed()
-        } else {
-            all_paths_to_remove.boxed()
-        };
 
-        let delete_fut = self
-            .dataset
-            .object_store
-            .remove_stream(paths_to_delete)
-            .try_for_each(|_| future::ready(Ok(())));
+        let all_files = stream::iter(vec![unreferenced_files, manifest_files]).flatten();
+        let all_paths_to_remove = all_files.map(|file| {
+            let file = file?;
+            if deletes_files {
+                let mode = if file.unverified {
+                    AUDIT_MODE_DELETE_UNVERIFIED
+                } else {
+                    AUDIT_MODE_DELETE
+                };
+                let path_str = file.path.as_ref();
+                match file.kind {
+                    CleanupFileKind::Manifest => {
+                        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = path_str);
+                    }
+                    CleanupFileKind::Data => {
+                        info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DATA, path = path_str);
+                    }
+                    CleanupFileKind::Deletion => {
+                        info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DELETION, path = path_str);
+                    }
+                    CleanupFileKind::Index => {
+                        info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_INDEX, path = path_str);
+                    }
+                    CleanupFileKind::Transaction | CleanupFileKind::TemporaryManifest => {}
+                }
+            }
+            cleanup_result
+                .lock()
+                .unwrap()
+                .record_file(&file, candidate_file_limit, self.track_removed_manifests);
+            Ok(file.path)
+        });
+
+        if deletes_files {
+            let paths_to_delete: BoxStream<Result<Path>> =
+                if let Some(rate) = self.policy.delete_rate_limit {
+                    let duration =
+                        calculate_duration(self.dataset.object_store.scheme().to_string(), rate);
+                    let mut ticker = interval(duration);
+                    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
+                    IntervalStream::new(ticker)
+                        .zip(all_paths_to_remove)
+                        .map(|(_, path)| path)
+                        .boxed()
+                } else {
+                    all_paths_to_remove.boxed()
+                };
 
-        delete_fut.await?;
+            self.dataset
+                .object_store
+                .remove_stream(paths_to_delete)
+                .try_for_each(|_| future::ready(Ok(())))
+                .await?;
+        } else {
+            // Drain the stream to populate stats, but do not call remove_stream.
+            all_paths_to_remove
+                .try_for_each(|_| future::ready(Ok(())))
+                .await?;
+        }
 
-        let mut removal_stats = removal_stats.into_inner().unwrap();
-        removal_stats.old_versions = num_old_manifests as u64;
-        removal_stats.bytes_removed += manifest_bytes_removed?;
+        let cleanup_result = cleanup_result.into_inner().unwrap();
 
         let span = Span::current();
-        span.record("bytes_removed", removal_stats.bytes_removed);
-        span.record("data_files_removed", removal_stats.data_files_removed);
+        span.record("bytes_removed", cleanup_result.stats.bytes_removed);
+        span.record(
+            "data_files_removed",
+            cleanup_result.stats.data_files_removed,
+        );
         span.record(
             "transaction_files_removed",
-            removal_stats.transaction_files_removed,
+            cleanup_result.stats.transaction_files_removed,
+        );
+        span.record(
+            "index_files_removed",
+            cleanup_result.stats.index_files_removed,
         );
-        span.record("index_files_removed", removal_stats.index_files_removed);
         span.record(
             "deletion_files_removed",
-            removal_stats.deletion_files_removed,
+            cleanup_result.stats.deletion_files_removed,
         );
 
-        Ok(removal_stats)
+        Ok(cleanup_result)
     }
 
-    fn path_if_not_referenced(
+    fn cleanup_file_if_not_referenced(
         &self,
-        path: Path,
+        obj_meta: ObjectMeta,
         maybe_in_progress: bool,
         inspection: &CleanupInspection,
-    ) -> Result<Option<Path>> {
+    ) -> Result<Option<CleanupFile>> {
+        let path = obj_meta.location;
         let relative_path = remove_prefix(&path, &self.dataset.base);
+        let size_bytes = obj_meta.size;
         if relative_path.as_ref().starts_with("_versions/.tmp") {
             // This is a temporary manifest file.
             //
@@ -476,7 +791,12 @@ impl<'a> CleanupTask<'a> {
             if maybe_in_progress {
                 return Ok(None);
             } else {
-                return Ok(Some(path));
+                return Ok(cleanup_file(
+                    path,
+                    CleanupFileKind::TemporaryManifest,
+                    true,
+                    size_bytes,
+                ));
             }
         }
         if relative_path.as_ref().starts_with("_indices") {
@@ -490,15 +810,18 @@ impl<'a> CleanupTask<'a> {
                 {
                     return Ok(None);
                 } else if !maybe_in_progress {
-                    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_INDEX, path = path.to_string());
-                    return Ok(Some(path));
+                    return Ok(cleanup_file(path, CleanupFileKind::Index, true, size_bytes));
                 } else if inspection
                     .verified_files
                     .index_uuids
                     .contains(uuid.as_ref())
                 {
-                    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_INDEX, path = path.to_string());
-                    return Ok(Some(path));
+                    return Ok(cleanup_file(
+                        path,
+                        CleanupFileKind::Index,
+                        false,
+                        size_bytes,
+                    ));
                 }
             } else {
                 return Ok(None);
@@ -514,15 +837,13 @@ impl<'a> CleanupTask<'a> {
                     {
                         Ok(None)
                     } else if !maybe_in_progress {
-                        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string());
-                        Ok(Some(path))
+                        Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes))
                     } else if inspection
                         .verified_files
                         .data_paths
                         .contains(&relative_path)
                     {
-                        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string());
-                        Ok(Some(path))
+                        Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes))
                     } else {
                         Ok(None)
                     }
@@ -587,15 +908,13 @@ impl<'a> CleanupTask<'a> {
                 {
                     Ok(None)
                 } else if !maybe_in_progress {
-                    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string());
-                    Ok(Some(path))
+                    Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes))
                 } else if inspection
                     .verified_files
                     .data_paths
                     .contains(&parent_data_path)
                 {
-                    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string());
-                    Ok(Some(path))
+                    Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes))
                 } else {
                     Ok(None)
                 }
@@ -613,15 +932,23 @@ impl<'a> CleanupTask<'a> {
                     {
                         Ok(None)
                     } else if !maybe_in_progress {
-                        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
-                        Ok(Some(path))
+                        Ok(cleanup_file(
+                            path,
+                            CleanupFileKind::Deletion,
+                            true,
+                            size_bytes,
+                        ))
                     } else if inspection
                         .verified_files
                         .delete_paths
                         .contains(&relative_path)
                     {
-                        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
-                        Ok(Some(path))
+                        Ok(cleanup_file(
+                            path,
+                            CleanupFileKind::Deletion,
+                            false,
+                            size_bytes,
+                        ))
                     } else {
                         Ok(None)
                     }
@@ -640,7 +967,14 @@ impl<'a> CleanupTask<'a> {
                     } else if !maybe_in_progress
                         || inspection.verified_files.tx_paths.contains(&relative_path)
                     {
-                        Ok(Some(path))
+                        let unverified =
+                            !inspection.verified_files.tx_paths.contains(&relative_path);
+                        Ok(cleanup_file(
+                            path,
+                            CleanupFileKind::Transaction,
+                            unverified,
+                            size_bytes,
+                        ))
                     } else {
                         Ok(None)
                     }
@@ -709,8 +1043,8 @@ impl<'a> CleanupTask<'a> {
     async fn clean_referenced_branches(
         &self,
         referenced_branches: &[(String, u64)],
-    ) -> Result<RemovalStats> {
-        let final_stats = Mutex::new(RemovalStats::default());
+    ) -> Result<CleanupRunResult> {
+        let final_result = Mutex::new(CleanupRunResult::default());
 
         // Group branches by their lineage identifier (BranchIdentifier).
         // Branches with the same identifier share a lineage and must be cleaned sequentially
@@ -722,30 +1056,32 @@ impl<'a> CleanupTask<'a> {
                 .or_insert_with(Vec::new)
                 .push(branch.clone());
         }
+        let action = self.action;
+        let candidate_file_limit = self.action.candidate_file_limit();
         let tasks: Vec<_> = branches_chains
             .values()
             .map(|branch_chain| {
-                let final_stats = &final_stats;
+                let final_result = &final_result;
                 async move {
                     for branch in branch_chain {
                         let branch_dataset = self
                             .dataset
                             .checkout_version((branch.as_str(), None))
                             .await?;
-                        if let Some(stats) = cleanup_cascade_branch(
+                        let ignored_manifests =
+                            final_result.lock().unwrap().removed_manifests.clone();
+                        if let Some(result) = cleanup_cascade_branch_run(
                             &branch_dataset,
                             branch_dataset.manifest.as_ref(),
+                            action,
+                            ignored_manifests,
                         )
                         .await?
                         {
-                            let mut stats_guard = final_stats.lock().unwrap();
-                            stats_guard.bytes_removed += stats.bytes_removed;
-                            stats_guard.old_versions += stats.old_versions;
-                            stats_guard.data_files_removed += stats.data_files_removed;
-                            stats_guard.transaction_files_removed +=
-                                stats.transaction_files_removed;
-                            stats_guard.index_files_removed += stats.index_files_removed;
-                            stats_guard.deletion_files_removed += stats.deletion_files_removed;
+                            final_result
+                                .lock()
+                                .unwrap()
+                                .merge(result, candidate_file_limit);
                         }
                     }
                     Ok::<(), Error>(())
@@ -753,7 +1089,7 @@ impl<'a> CleanupTask<'a> {
             })
             .collect();
         try_join_all(tasks).await?;
-        Ok(final_stats.into_inner().unwrap())
+        Ok(final_result.into_inner().unwrap())
     }
 
     // Retain manifests containing files referenced by descendant branches.
@@ -762,6 +1098,7 @@ impl<'a> CleanupTask<'a> {
         &self,
         inspection: CleanupInspection,
         referenced_branches: &[(String, u64)],
+        removed_branch_manifests: &HashSet<Path>,
     ) -> Result<CleanupInspection> {
         let inspection = Mutex::new(inspection);
         for (branch, root_version_number) in referenced_branches {
@@ -772,6 +1109,9 @@ impl<'a> CleanupTask<'a> {
             self.dataset
                 .commit_handler
                 .list_manifest_locations(&branch_location.path, &self.dataset.object_store, false)
+                .try_filter(|location| {
+                    future::ready(!removed_branch_manifests.contains(&location.path))
+                })
                 .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| {
                     self.process_branch_referenced_manifests(
                         location,
@@ -1020,8 +1360,7 @@ pub async fn cleanup_old_versions(
     dataset: &Dataset,
     policy: CleanupPolicy,
 ) -> Result<RemovalStats> {
-    let cleanup = CleanupTask::new(dataset, policy);
-    cleanup.run().await
+    CleanupOperation::new(dataset, policy).execute().await
 }
 
 /// If the dataset config has `lance.auto_cleanup` parameters set,
@@ -1048,11 +1387,35 @@ pub async fn cleanup_cascade_branch(
     dataset: &Dataset,
     manifest: &Manifest,
 ) -> Result<Option<RemovalStats>> {
+    Ok(
+        cleanup_cascade_branch_run(dataset, manifest, CleanupAction::Execute, HashSet::new())
+            .await?
+            .map(|result| result.stats),
+    )
+}
+
+async fn cleanup_cascade_branch_run(
+    dataset: &Dataset,
+    manifest: &Manifest,
+    action: CleanupAction,
+    ignored_manifests: HashSet<Path>,
+) -> Result<Option<CleanupRunResult>> {
     let policy = build_cleanup_policy(dataset, manifest).await?;
     if let Some(mut policy) = policy {
         policy.clean_referenced_branches = false;
         policy.error_if_tagged_old_versions = false;
-        Ok(Some(dataset.cleanup_with_policy(policy).await?))
+        if action.deletes_files() {
+            info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&dataset.uri);
+        }
+        let cleanup = CleanupTask::new_with_ignored_manifests(
+            dataset,
+            policy,
+            action,
+            ignored_manifests,
+            true,
+            false,
+        );
+        Ok(Some(cleanup.run().await?))
     } else {
         Ok(None)
     }
@@ -1443,6 +1806,14 @@ mod tests {
             cleanup_old_versions(&db, policy).await
         }
 
+        async fn explain_cleanup_with_policy(
+            &self,
+            policy: CleanupPolicy,
+        ) -> Result<CleanupExplanation> {
+            let db = self.open().await?;
+            db.cleanup(policy).explain().await
+        }
+
         async fn run_cleanup_with_override(
             &self,
             before: DateTime<Utc>,
@@ -1670,6 +2041,51 @@ mod tests {
         assert_gt!(after_count.num_tx_files, 0);
     }
 
+    #[tokio::test]
+    async fn explain_cleanup_does_not_delete_files() {
+        let fixture = MockDatasetFixture::try_new().unwrap();
+        fixture.create_some_data().await.unwrap();
+        MockClock::set_system_time(TimeDelta::try_seconds(1).unwrap().to_std().unwrap());
+        fixture.overwrite_some_data().await.unwrap();
+
+        let before_count = fixture.count_files().await.unwrap();
+        let policy = CleanupPolicyBuilder::default()
+            .before_timestamp(utc_now())
+            .build();
+
+        let explanation = fixture
+            .explain_cleanup_with_policy(policy.clone())
+            .await
+            .unwrap();
+        let after_preview_count = fixture.count_files().await.unwrap();
+
+        // Files are not actually removed when explaining cleanup.
+        assert_eq!(before_count, after_preview_count);
+        assert_eq!(explanation.read_version, 2);
+        assert_eq!(explanation.stats.old_versions, 1);
+        assert_eq!(explanation.stats.data_files_removed, 1);
+        assert_eq!(explanation.stats.transaction_files_removed, 1);
+        assert_gt!(explanation.stats.bytes_removed, 0);
+        assert!(!explanation.candidate_files.is_empty());
+        assert!(!explanation.candidate_files_truncated);
+
+        // Running cleanup with the same policy should remove the same files the
+        // explanation reported for this unchanged dataset.
+        let removed = fixture.run_cleanup_with_policy(policy).await.unwrap();
+        let after_cleanup_count = fixture.count_files().await.unwrap();
+
+        assert_eq!(
+            removed.bytes_removed,
+            before_count.num_bytes - after_cleanup_count.num_bytes
+        );
+        assert_eq!(removed.old_versions, explanation.stats.old_versions);
+        assert_eq!(
+            removed.data_files_removed,
+            explanation.stats.data_files_removed
+        );
+        assert_eq!(removed.bytes_removed, explanation.stats.bytes_removed);
+    }
+
     #[tokio::test]
     async fn cleanup_blob_v2_sidecar_files() {
         let fixture = MockDatasetFixture::try_new().unwrap();
@@ -3073,6 +3489,17 @@ mod tests {
             self.run_cleanup_inner(policy).await
         }
 
+        async fn explain_cleanup_with_referenced_branches(&mut self) -> Result<CleanupExplanation> {
+            let policy = CleanupPolicyBuilder::default()
+                .error_if_tagged_old_versions(false)
+                .clean_referenced_branches(true)
+                .retain_n_versions(&self.dataset, 1)
+                .await?
+                .build();
+            self.dataset.checkout_latest().await?;
+            self.dataset.cleanup(policy).explain().await
+        }
+
         async fn run_cleanup_inner(&mut self, policy: CleanupPolicy) -> Result<RemovalStats> {
             let pre_count = self.count_data().await?;
             self.dataset.checkout_latest().await?;
@@ -3653,6 +4080,74 @@ mod tests {
         setup.assert_unchanged(&["branch4"]).await;
     }
 
+    #[tokio::test]
+    async fn explain_cleanup_with_referenced_branches_matches_cleanup() {
+        let mut setup = build_lineage_datasets().await.unwrap();
+
+        setup.enable_auto_cleanup().await.unwrap();
+        setup.main.write_data().await.unwrap();
+        setup.main.compact().await.unwrap();
+        setup.branch4.compact().await.unwrap();
+        setup.branch1.write_data().await.unwrap();
+        setup.branch1.compact().await.unwrap();
+        setup.branch2.write_data().await.unwrap();
+        setup.branch2.compact().await.unwrap();
+        setup.branch3.write_data().await.unwrap();
+        setup.branch3.compact().await.unwrap();
+
+        setup.main.refresh().await.unwrap();
+        setup.branch1.refresh().await.unwrap();
+        setup.branch2.refresh().await.unwrap();
+        setup.branch3.refresh().await.unwrap();
+        setup.branch4.refresh().await.unwrap();
+        let main_counts_before = setup.main.counts;
+        let branch1_counts_before = setup.branch1.counts;
+        let branch2_counts_before = setup.branch2.counts;
+        let branch3_counts_before = setup.branch3.counts;
+        let branch4_counts_before = setup.branch4.counts;
+
+        let explanation = setup
+            .main
+            .explain_cleanup_with_referenced_branches()
+            .await
+            .unwrap();
+
+        setup.main.refresh().await.unwrap();
+        setup.branch1.refresh().await.unwrap();
+        setup.branch2.refresh().await.unwrap();
+        setup.branch3.refresh().await.unwrap();
+        setup.branch4.refresh().await.unwrap();
+        assert_eq!(setup.main.counts, main_counts_before);
+        assert_eq!(setup.branch1.counts, branch1_counts_before);
+        assert_eq!(setup.branch2.counts, branch2_counts_before);
+        assert_eq!(setup.branch3.counts, branch3_counts_before);
+        assert_eq!(setup.branch4.counts, branch4_counts_before);
+
+        let removed = setup
+            .main
+            .run_cleanup_with_referenced_branches()
+            .await
+            .unwrap();
+
+        assert!(!explanation.referenced_branches.is_empty());
+        assert!(
+            explanation
+                .referenced_branches
+                .iter()
+                .any(|branch| branch.cleanup_candidate)
+        );
+        assert_eq!(explanation.stats, removed);
+        setup.branch1.refresh().await.unwrap();
+        setup.branch2.refresh().await.unwrap();
+        setup.branch3.refresh().await.unwrap();
+        setup.branch4.refresh().await.unwrap();
+        assert_eq!(setup.main.counts.num_manifest_files, 1);
+        assert_eq!(setup.branch1.counts.num_manifest_files, 1);
+        assert_eq!(setup.branch2.counts.num_manifest_files, 1);
+        assert_eq!(setup.branch3.counts.num_manifest_files, 1);
+        assert_eq!(setup.branch4.counts.num_manifest_files, 1);
+    }
+
     #[tokio::test]
     async fn auto_clean_referenced_branches_with_tags() {
         let mut setup = build_lineage_datasets().await.unwrap();
diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs
index 11851e8846e..eb165e5f612 100644
--- a/rust/lance/src/dataset/fragment.rs
+++ b/rust/lance/src/dataset/fragment.rs
@@ -1792,7 +1792,7 @@ impl FileFragment {
         read_columns: Option<Vec<String>>,
         batch_size: Option<u32>,
     ) -> Result<(Fragment, Schema)> {
-        let (fragments, schema) = schema_evolution::add_columns_to_fragments(
+        let (fragments, schema, _) = schema_evolution::add_columns_to_fragments(
             self.dataset.as_ref(),
             transforms,
             read_columns,
diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs
index 4fbefcd4725..ceebe456bbf 100644
--- a/rust/lance/src/dataset/index/frag_reuse.rs
+++ b/rust/lance/src/dataset/index/frag_reuse.rs
@@ -243,4 +243,198 @@ mod tests {
             Err(Error::RetryableCommitConflict { .. })
         ));
     }
+
+    /// With more than one index on the table, remapping every index must catch
+    /// all of them up so the reuse index can be trimmed.
+    ///
+    /// Regression: `remap_column_index` used to decide whether to remap an
+    /// index's data from the presence of the old fragments in its fragment
+    /// bitmap. But `load_indices` coverage-remaps the bitmap onto the new
+    /// fragments in memory, and remapping the *first* index commits a manifest
+    /// that persists that cleaned bitmap for the others — so remapping the
+    /// remaining indexes became a silent no-op (their data was never remapped
+    /// and their `dataset_version` never advanced), and the reuse index could
+    /// never be trimmed.
+    #[tokio::test]
+    async fn test_cleanup_frag_reuse_index_multiple_indices() {
+        let mut dataset = lance_datagen::gen_batch()
+            .col("i", lance_datagen::array::step::<Int32Type>())
+            .col("j", lance_datagen::array::step::<Int32Type>())
+            .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+
+        for col in ["i", "j"] {
+            dataset
+                .create_index(
+                    &[col],
+                    IndexType::Scalar,
+                    Some(format!("{col}_idx")),
+                    &ScalarIndexParams::default(),
+                    false,
+                )
+                .await
+                .unwrap();
+        }
+
+        compact_files(
+            &mut dataset,
+            CompactionOptions {
+                target_rows_per_fragment: 2_000,
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+
+        let frag_reuse_index_meta = dataset
+            .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+            .await
+            .unwrap()
+            .expect("Fragment reuse index must be available");
+        let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta)
+            .await
+            .unwrap();
+        assert_eq!(frag_reuse_details.versions.len(), 1);
+
+        for col in ["i", "j"] {
+            remapping::remap_column_index(&mut dataset, &[col], Some(format!("{col}_idx")))
+                .await
+                .unwrap();
+        }
+
+        // Every index must now be caught up (data remapped, version advanced).
+        let indices = dataset.load_indices().await.unwrap();
+        for col in ["i", "j"] {
+            let index = indices
+                .iter()
+                .find(|idx| idx.name == format!("{col}_idx"))
+                .unwrap();
+            assert!(
+                is_index_remap_caught_up(&frag_reuse_details.versions[0], index).unwrap(),
+                "index {col}_idx was not caught up after remap"
+            );
+        }
+
+        // ... so the reuse index trims down to zero versions.
+        cleanup_frag_reuse_index(&mut dataset).await.unwrap();
+        let frag_reuse_index_meta = dataset
+            .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+            .await
+            .unwrap()
+            .expect("Fragment reuse index must be available");
+        let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta)
+            .await
+            .unwrap();
+        assert_eq!(frag_reuse_details.versions.len(), 0);
+
+        // Data correctness, not just version bookkeeping: with the reuse index
+        // trimmed there is no auto-remap safety net, so each index must resolve
+        // to LIVE rows. An index whose data was not actually remapped (e.g. one
+        // whose bitmap was coverage-remapped by a sibling's commit before its
+        // own data remap) points at compacted-away fragments and errors on take.
+        use futures::TryStreamExt;
+        for col in ["i", "j"] {
+            let rows: usize = dataset
+                .scan()
+                .filter(&format!("{col} >= 2000 AND {col} < 3000"))
+                .unwrap()
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap()
+                .iter()
+                .map(|b| b.num_rows())
+                .sum();
+            assert_eq!(
+                rows, 1000,
+                "index {col}_idx must resolve to live rows after remap+trim"
+            );
+        }
+    }
+
+    /// When the reuse index has accumulated several versions, a single remap
+    /// must compose them and rebuild + commit the index exactly ONCE, not once
+    /// per version.
+    #[tokio::test]
+    async fn test_remap_index_batches_multiple_reuse_versions() {
+        let mut dataset = lance_datagen::gen_batch()
+            .col("i", lance_datagen::array::step::<Int32Type>())
+            .into_ram_dataset(FragmentCount::from(8), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+        dataset
+            .create_index(
+                &["i"],
+                IndexType::Scalar,
+                Some("i_idx".into()),
+                &ScalarIndexParams::default(),
+                false,
+            )
+            .await
+            .unwrap();
+
+        // Accumulate multiple reuse versions: each round deletes a prefix, which
+        // shrinks fragments below target and forces another deferred compaction.
+        let options = CompactionOptions {
+            target_rows_per_fragment: 4_000,
+            defer_index_remap: true,
+            ..Default::default()
+        };
+        for round in 0..4 {
+            dataset
+                .delete(&format!("i < {}", 1_000 * (round + 1)))
+                .await
+                .unwrap();
+            compact_files(&mut dataset, options.clone(), None)
+                .await
+                .unwrap();
+        }
+
+        let frag_reuse_index_meta = dataset
+            .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+            .await
+            .unwrap()
+            .expect("Fragment reuse index must be available");
+        let num_versions = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta)
+            .await
+            .unwrap()
+            .versions
+            .len();
+        assert!(
+            num_versions >= 2,
+            "test needs multiple reuse versions to exercise batching, got {num_versions}"
+        );
+
+        // A single remap must commit exactly once, regardless of version count.
+        let version_before = dataset.manifest.version;
+        remapping::remap_column_index(&mut dataset, &["i"], Some("i_idx".into()))
+            .await
+            .unwrap();
+        let commits = dataset.manifest.version - version_before;
+        assert_eq!(
+            commits, 1,
+            "batched remap must commit once, not once per reuse version ({num_versions})"
+        );
+
+        // ... and the reuse index then trims to zero.
+        cleanup_frag_reuse_index(&mut dataset).await.unwrap();
+        let frag_reuse_index_meta = dataset
+            .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+            .await
+            .unwrap()
+            .expect("Fragment reuse index must be available");
+        assert_eq!(
+            load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta)
+                .await
+                .unwrap()
+                .versions
+                .len(),
+            0
+        );
+    }
 }
diff --git a/rust/lance/src/dataset/mem_wal/api.rs b/rust/lance/src/dataset/mem_wal/api.rs
index b67f6434c9c..79184c13ec8 100644
--- a/rust/lance/src/dataset/mem_wal/api.rs
+++ b/rust/lance/src/dataset/mem_wal/api.rs
@@ -26,7 +26,7 @@ use crate::index::mem_wal::{load_mem_wal_index_details, new_mem_wal_index_meta};
 
 use super::ShardWriterConfig;
 use super::scanner::flushed_cache::open_flushed_dataset;
-use super::scanner::{FlushedMemTableCache, ShardSnapshot};
+use super::scanner::{DatasetCache, ShardSnapshot};
 use super::write::MemIndexConfig;
 use super::write::ShardWriter;
 
@@ -500,7 +500,7 @@ pub trait DatasetMemWalExt {
     async fn prewarm_mem_wal(
         &self,
         _snapshots: &[ShardSnapshot],
-        _cache: Option<&Arc<FlushedMemTableCache>>,
+        _cache: Option<&Arc<dyn DatasetCache>>,
     ) -> Result<()> {
         Ok(())
     }
@@ -586,7 +586,7 @@ impl DatasetMemWalExt for Dataset {
     async fn prewarm_mem_wal(
         &self,
         snapshots: &[ShardSnapshot],
-        cache: Option<&Arc<FlushedMemTableCache>>,
+        cache: Option<&Arc<dyn DatasetCache>>,
     ) -> Result<()> {
         let session = self.session();
         // Resolve flushed paths exactly as the LSM collector does, so the
@@ -601,7 +601,8 @@ impl DatasetMemWalExt for Dataset {
                 snapshot.flushed_generations.iter().map(move |flushed| {
                     let path = format!("{}/_mem_wal/{}/{}", base_path, shard_id, flushed.path);
                     async move {
-                        let dataset = open_flushed_dataset(&path, Some(session), cache).await?;
+                        let dataset =
+                            open_flushed_dataset(&path, Some(session), cache, None).await?;
                         prewarm_all_indexes(&dataset).await
                     }
                 })
@@ -762,6 +763,7 @@ async fn load_vector_index_config(
 
 #[cfg(test)]
 mod tests {
+    use super::super::scanner::FlushedMemTableCache;
     use super::*;
 
     use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
@@ -831,7 +833,7 @@ mod tests {
             .with_current_generation(2)
             .with_flushed_generation(1, folder.to_string());
 
-        let cache = Arc::new(FlushedMemTableCache::new(4));
+        let cache: Arc<dyn DatasetCache> = Arc::new(FlushedMemTableCache::new(4));
         base.prewarm_mem_wal(std::slice::from_ref(&snapshot), Some(&cache))
             .await
             .expect("prewarm must open the generation and warm its index");
diff --git a/rust/lance/src/dataset/mem_wal/index.rs b/rust/lance/src/dataset/mem_wal/index.rs
index 116ea6c60ce..208971f7be6 100644
--- a/rust/lance/src/dataset/mem_wal/index.rs
+++ b/rust/lance/src/dataset/mem_wal/index.rs
@@ -18,10 +18,14 @@ mod arena_skiplist;
 mod btree;
 mod fts;
 mod hnsw;
+mod pk_key;
 
 use std::collections::HashMap;
+use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
+use datafusion::common::ScalarValue;
+
 use super::memtable::batch_store::StoredBatch;
 use arrow_array::RecordBatch;
 use lance_core::datatypes::Schema as LanceSchema;
@@ -44,6 +48,32 @@ pub type RowPosition = u64;
 pub use btree::{BTreeIndexConfig, BTreeMemIndex};
 pub use fts::{FtsIndexConfig, FtsMemIndex, FtsQueryExpr, SearchOptions};
 pub use hnsw::{HnswIndexConfig, HnswMemIndex};
+pub use pk_key::encode_pk_tuple;
+
+use pk_key::encode_pk_batch;
+
+/// Synthetic column the composite PK index is keyed on: the order-preserving
+/// encoded tuple (see [`encode_pk_tuple`]), stored as `Binary` so a
+/// [`BTreeMemIndex`]'s byte backend indexes it directly.
+const PK_KEY_COLUMN: &str = "__pk_key__";
+
+/// The memtable's primary-key index, used to answer "newest visible version of
+/// this key" for dedup. Single-column PKs reuse the column's compact typed
+/// [`BTreeMemIndex`] (no second copy); composite PKs key a `BTreeMemIndex` on
+/// the order-preserving encoded tuple ([`encode_pk_tuple`]) instead. Either way
+/// the lookup is a single seek on one `BTreeMemIndex`.
+enum PkIndex {
+    /// Arity 1: aliases a `btree_indexes` entry, so the insert loop maintains it.
+    Single(Arc<BTreeMemIndex>),
+    /// Arity >= 2: a `BTreeMemIndex` over the encoded-tuple `Binary` key,
+    /// maintained explicitly in the insert paths (the original batch lacks the
+    /// synthetic key column). `columns` are the PK columns in order, resolved
+    /// against each batch's schema at insert time.
+    Composite {
+        index: Arc<BTreeMemIndex>,
+        columns: Vec<String>,
+    },
+}
 
 // ============================================================================
 // Index Store
@@ -195,12 +225,17 @@ impl MemIndexConfig {
 /// therefore safe for scanners to read. Scanners snapshot this at plan
 /// construction time so every plan keys on a stable MVCC cursor.
 pub struct IndexStore {
-    /// BTree indexes keyed by index name.
-    btree_indexes: HashMap<String, BTreeMemIndex>,
+    /// BTree indexes keyed by index name. `Arc` so the primary-key BTree can be
+    /// shared into [`Self::pk_index`] without a second copy or a second insert.
+    btree_indexes: HashMap<String, Arc<BTreeMemIndex>>,
     /// HNSW vector indexes keyed by index name.
     hnsw_indexes: HashMap<String, HnswMemIndex>,
     /// FTS indexes keyed by index name.
     fts_indexes: HashMap<String, FtsMemIndex>,
+    /// The primary-key index (single-column or composite), or `None` without a
+    /// primary key. Queried via [`Self::pk_newest_visible`] (see
+    /// [`Self::enable_pk_index`]).
+    pk_index: Option<PkIndex>,
     /// Maximum batch position that is durable in the WAL and therefore
     /// visible to scanners. Advanced unconditionally after a WAL append
     /// succeeds; not gated on whether any indexes are configured.
@@ -213,6 +248,7 @@ impl Default for IndexStore {
             btree_indexes: HashMap::new(),
             hnsw_indexes: HashMap::new(),
             fts_indexes: HashMap::new(),
+            pk_index: None,
             max_visible_batch_position: AtomicUsize::new(0),
         }
     }
@@ -230,6 +266,16 @@ impl std::fmt::Debug for IndexStore {
                 &self.hnsw_indexes.keys().collect::<Vec<_>>(),
             )
             .field("fts_indexes", &self.fts_indexes.keys().collect::<Vec<_>>())
+            .field(
+                "pk_index",
+                &match &self.pk_index {
+                    None => "none".to_string(),
+                    Some(PkIndex::Single(b)) => format!("single({})", b.column_name()),
+                    Some(PkIndex::Composite { columns, .. }) => {
+                        format!("composite({})", columns.join(", "))
+                    }
+                },
+            )
             .field(
                 "max_visible_batch_position",
                 &self.max_visible_batch_position.load(Ordering::Acquire),
@@ -264,7 +310,7 @@ impl IndexStore {
         for config in configs {
             match config {
                 MemIndexConfig::BTree(c) => {
-                    let index = BTreeMemIndex::new(c.field_id, c.column.clone());
+                    let index = Arc::new(BTreeMemIndex::new(c.field_id, c.column.clone()));
                     registry.btree_indexes.insert(c.name.clone(), index);
                 }
                 MemIndexConfig::Hnsw(c) => {
@@ -293,7 +339,7 @@ impl IndexStore {
     /// the production memtable path goes through [`Self::from_configs`].
     pub fn add_btree(&mut self, name: String, field_id: i32, column: String) {
         self.btree_indexes
-            .insert(name, BTreeMemIndex::new(field_id, column));
+            .insert(name, Arc::new(BTreeMemIndex::new(field_id, column)));
     }
 
     /// Add an HNSW vector index with default build parameters.
@@ -362,6 +408,158 @@ impl IndexStore {
             .insert(name, FtsMemIndex::with_params(field_id, column, params));
     }
 
+    /// Maintain a primary-key index so the memtable can answer "newest visible
+    /// version of this key" (see [`Self::pk_newest_visible`]).
+    ///
+    /// Single-column PKs reuse an existing BTree on the field, else auto-create
+    /// one under a `__pk__*` name so the normal insert loop maintains it (no
+    /// second copy). Composite (arity >= 2) PKs key a `BTreeMemIndex` on the
+    /// order-preserving encoded tuple (synthetic `PK_KEY_COLUMN`), maintained
+    /// explicitly in the insert paths. Call once at construction, after
+    /// [`Self::from_configs`] and before any inserts; an empty `pk_columns`
+    /// leaves the store without a primary-key index.
+    pub fn enable_pk_index(&mut self, pk_columns: &[(String, i32)]) {
+        self.pk_index = match pk_columns {
+            [] => None,
+            [(column, field_id)] => {
+                let btree = match self
+                    .btree_indexes
+                    .values()
+                    .find(|b| b.field_id() == *field_id)
+                {
+                    Some(existing) => existing.clone(),
+                    None => {
+                        let btree = Arc::new(BTreeMemIndex::new(*field_id, column.clone()));
+                        self.btree_indexes
+                            .insert(format!("__pk__{column}"), btree.clone());
+                        btree
+                    }
+                };
+                Some(PkIndex::Single(btree))
+            }
+            multi => Some(PkIndex::Composite {
+                // Synthetic field id (-1): the composite index is held directly,
+                // never resolved by field id.
+                index: Arc::new(BTreeMemIndex::new(-1, PK_KEY_COLUMN.to_string())),
+                columns: multi.iter().map(|(c, _)| c.clone()).collect(),
+            }),
+        };
+    }
+
+    /// Whether the memtable has a primary-key index.
+    pub fn has_pk_index(&self) -> bool {
+        self.pk_index.is_some()
+    }
+
+    /// Sorted `(value, row_id)` training batches for the flushed on-disk PK
+    /// BTree (the sidecar dedup index). Single-column emits the typed PK value;
+    /// composite emits the order-preserving `Binary` encoded tuple. Empty when
+    /// there is no primary key. Row positions line up 1:1 with the forward-
+    /// written data file, so they are the flushed row ids directly.
+    pub fn pk_training_batches(&self, batch_size: usize) -> Result<Vec<RecordBatch>> {
+        match &self.pk_index {
+            None => Ok(Vec::new()),
+            Some(PkIndex::Single(btree)) => btree.to_training_batches(batch_size),
+            Some(PkIndex::Composite { index, .. }) => index.to_training_batches(batch_size),
+        }
+    }
+
+    /// Resolve the PK columns' positions in `batch` (composite insert helper).
+    fn pk_batch_indices(batch: &RecordBatch, columns: &[String]) -> Result<Vec<usize>> {
+        columns
+            .iter()
+            .map(|c| {
+                batch
+                    .schema()
+                    .column_with_name(c)
+                    .map(|(i, _)| i)
+                    .ok_or_else(|| {
+                        Error::invalid_input(format!("PK column '{c}' not found in batch"))
+                    })
+            })
+            .collect()
+    }
+
+    /// Maintain the composite PK index for `batch` (no-op for single/no PK):
+    /// encode the PK columns into the synthetic `PK_KEY_COLUMN` `Binary` column
+    /// and feed that to the keyed `BTreeMemIndex`.
+    fn insert_composite_pk(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> {
+        if let Some(PkIndex::Composite { index, columns }) = &self.pk_index {
+            let pk_indices = Self::pk_batch_indices(batch, columns)?;
+            let encoded = encode_pk_batch(batch, &pk_indices)?;
+            let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
+                PK_KEY_COLUMN,
+                arrow_schema::DataType::Binary,
+                false,
+            )]));
+            let key_batch = RecordBatch::try_new(schema, vec![Arc::new(encoded)])
+                .map_err(|e| Error::invalid_input(e.to_string()))?;
+            index.insert(&key_batch, row_offset)?;
+        }
+        Ok(())
+    }
+
+    /// The newest row position of the primary-key tuple `values` (in PK order)
+    /// visible at `max_visible_row`, or `None`. A single seek either way: single-column
+    /// probes the typed BTree with `values[0]` (so `values` must be non-empty); composite
+    /// probes the encoded-tuple index. Collision-free, since `position` is the row identity.
+    pub fn pk_newest_visible(
+        &self,
+        values: &[ScalarValue],
+        max_visible_row: RowPosition,
+    ) -> Option<RowPosition> {
+        match &self.pk_index {
+            None => None,
+            Some(PkIndex::Single(btree)) => btree.get_newest_visible(&values[0], max_visible_row),
+            Some(PkIndex::Composite { index, .. }) => {
+                // An unsupported PK type would have failed at insert, so the
+                // index can't hold a tuple this fails to encode. The probe key is
+                // the same `Binary`-encoded tuple the insert path indexed.
+                let key = encode_pk_tuple(values).ok()?;
+                index.get_newest_visible(&ScalarValue::Binary(Some(key)), max_visible_row)
+            }
+        }
+    }
+
+    /// Whether `position` is the newest visible row of `values` — the recency
+    /// check the active index-search arms apply to drop predicate-crossing
+    /// stale hits. Callers gate on [`Self::has_pk_index`] first, since this is
+    /// `false` (drop) when the memtable has no primary-key index.
+    pub fn pk_is_newest(
+        &self,
+        values: &[ScalarValue],
+        position: RowPosition,
+        max_visible_row: RowPosition,
+    ) -> bool {
+        self.pk_newest_visible(values, max_visible_row) == Some(position)
+    }
+
+    /// Whether `key` has any version visible at `max_visible_row` — the
+    /// cross-source block-list's existence query, snapshot-bounded so a
+    /// not-yet-visible write can't shadow an older visible copy.
+    ///
+    /// `key` is already in the index's key space: the typed PK value for a
+    /// single-column key, the `Binary`-encoded tuple for a composite one (built
+    /// by `block_list::on_disk_pk_key`, the same key the flushed on-disk index is
+    /// probed with). Both arities forward it straight to the keyed BTree.
+    pub fn pk_contains_key(&self, key: &ScalarValue, max_visible_row: RowPosition) -> bool {
+        match &self.pk_index {
+            None => false,
+            Some(PkIndex::Single(btree)) | Some(PkIndex::Composite { index: btree, .. }) => {
+                btree.get_newest_visible(key, max_visible_row).is_some()
+            }
+        }
+    }
+
+    /// Whether the primary-key index holds no rows (or doesn't exist).
+    pub fn pk_is_empty(&self) -> bool {
+        match &self.pk_index {
+            None => true,
+            Some(PkIndex::Single(btree)) => btree.is_empty(),
+            Some(PkIndex::Composite { index, .. }) => index.is_empty(),
+        }
+    }
+
     /// Insert a batch into all indexes.
     pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> {
         self.insert_with_batch_position(batch, row_offset, None)
@@ -384,6 +582,9 @@ impl IndexStore {
         for index in self.fts_indexes.values() {
             index.insert(batch, row_offset)?;
         }
+        // Single-column PK aliases a `btree_indexes` entry (maintained above);
+        // a composite PK has its own index, maintained here.
+        self.insert_composite_pk(batch, row_offset)?;
 
         // Update global watermark after all indexes have been updated
         if let Some(bp) = batch_position {
@@ -440,6 +641,12 @@ impl IndexStore {
             }
         }
 
+        // Single-column PK aliases a `btree_indexes` entry (maintained above);
+        // a composite PK has its own index, maintained here.
+        for stored in batches {
+            self.insert_composite_pk(&stored.data, stored.row_offset)?;
+        }
+
         // Update global watermark to the max batch position
         let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap();
         self.advance_max_visible_batch_position(max_bp);
@@ -552,6 +759,14 @@ impl IndexStore {
                 .map(|(name, _idx_type, duration)| (name.to_string(), duration))
                 .collect();
 
+            // Single-column PK aliases a `btree_indexes` entry — its thread above
+            // already maintained it (and joined). A composite PK has its own
+            // index; maintain it here before the watermark advances so the
+            // visible prefix is fully indexed.
+            for stored in batches {
+                self.insert_composite_pk(&stored.data, stored.row_offset)?;
+            }
+
             // Update global watermark to the max batch position
             let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap();
             self.advance_max_visible_batch_position(max_bp);
@@ -562,7 +777,7 @@ impl IndexStore {
 
     /// Get a BTree index by name.
     pub fn get_btree(&self, name: &str) -> Option<&BTreeMemIndex> {
-        self.btree_indexes.get(name)
+        self.btree_indexes.get(name).map(Arc::as_ref)
     }
 
     /// Get an HNSW vector index by name.
@@ -583,6 +798,7 @@ impl IndexStore {
         self.btree_indexes
             .values()
             .find(|idx| idx.field_id() == field_id)
+            .map(Arc::as_ref)
     }
 
     /// Get an HNSW vector index by field ID.
@@ -607,6 +823,7 @@ impl IndexStore {
         self.btree_indexes
             .values()
             .find(|idx| idx.column_name() == column)
+            .map(Arc::as_ref)
     }
 
     /// Get an HNSW vector index by column name.
@@ -694,6 +911,73 @@ mod tests {
         .unwrap()
     }
 
+    /// Build a single-column, non-nullable `id: Int32` batch for primary-key lookup tests.
+    fn id_batch(ids: &[i32]) -> RecordBatch {
+        RecordBatch::try_new(
+            Arc::new(ArrowSchema::new(vec![Field::new(
+                "id",
+                DataType::Int32,
+                false,
+            )])),
+            vec![Arc::new(Int32Array::from(ids.to_vec()))],
+        )
+        .unwrap()
+    }
+
+    #[test]
+    fn pk_newest_visible_single_column() {
+        let mut store = IndexStore::new();
+        store.enable_pk_index(&[("id".to_string(), 0)]);
+        // Insert id=1 at row positions 0 and 2 (the second is an update), id=2 at position 1.
+        store.insert(&id_batch(&[1, 2]), 0).unwrap();
+        store.insert(&id_batch(&[1]), 2).unwrap();
+
+        let one = [ScalarValue::Int32(Some(1))];
+        // A watermark past the update (5) sees position 2; one before it (1) sees position 0.
+        assert_eq!(store.pk_newest_visible(&one, 5), Some(2));
+        assert_eq!(store.pk_newest_visible(&one, 1), Some(0));
+        assert!(store.pk_is_newest(&one, 2, 5));
+        assert!(!store.pk_is_newest(&one, 0, 5));
+        // id=9 was never inserted, so the typed-value probe (as the block-list does) misses.
+        assert!(!store.pk_contains_key(&ScalarValue::Int32(Some(9)), 5));
+    }
+
+    #[test]
+    fn pk_newest_visible_composite_seeks_encoded_tuple() {
+        let mut store = IndexStore::new();
+        store.enable_pk_index(&[("id".to_string(), 0), ("name".to_string(), 1)]);
+        // Rows: (1,"a")@0, (1,"b")@1, (1,"a")@2 — row 2 is an update of (1,"a").
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int32Array::from(vec![1, 1, 1])),
+                Arc::new(StringArray::from(vec!["a", "b", "a"])),
+            ],
+        )
+        .unwrap();
+        store.insert(&batch, 0).unwrap();
+
+        let tuple_1a = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")];
+        let tuple_1b = [ScalarValue::Int32(Some(1)), ScalarValue::from("b")];
+        // With the watermark at 5, (1,"a")'s newest visible row is its re-write at position 2.
+        assert_eq!(store.pk_newest_visible(&tuple_1a, 5), Some(2));
+        assert!(store.pk_is_newest(&tuple_1a, 2, 5));
+        assert!(!store.pk_is_newest(&tuple_1a, 0, 5));
+        // (1,"b") only exists at position 1.
+        assert_eq!(store.pk_newest_visible(&tuple_1b, 5), Some(1));
+        // Watermark below the re-write: the older (1,"a")@0 is the newest visible.
+        assert_eq!(store.pk_newest_visible(&tuple_1a, 1), Some(0));
+        // (2,"a") was never inserted: probing its Binary-encoded key (as the
+        // block-list does) must miss.
+        let tuple_2a = [ScalarValue::Int32(Some(2)), ScalarValue::from("a")];
+        let key_2a = ScalarValue::Binary(Some(encode_pk_tuple(&tuple_2a).unwrap()));
+        assert!(!store.pk_contains_key(&key_2a, 5));
+    }
+
     #[test]
     fn test_index_registry() {
         let schema = create_test_schema();
diff --git a/rust/lance/src/dataset/mem_wal/index/pk_key.rs b/rust/lance/src/dataset/mem_wal/index/pk_key.rs
new file mode 100644
index 00000000000..b31fe42c995
--- /dev/null
+++ b/rust/lance/src/dataset/mem_wal/index/pk_key.rs
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Composite primary-key encoding for MemWAL dedup.
+//!
+//! A multi-column primary key is reduced to a single order-preserving byte
+//! string ([`encode_pk_tuple`]) so the whole tuple is one comparable key:
+//! lexicographic byte order equals tuple order, and distinct tuples never
+//! collide. Encoded as a `Binary` value, the tuple is indexed directly by a
+//! [`super::BTreeMemIndex`] (its byte backend) — both in memory and, after
+//! flush, as the on-disk BTree's `Binary` value column — so a probe builds
+//! `ScalarValue::Binary(key)` and every layer agrees.
+//!
+//! Single-column primary keys do **not** use this — they key the typed
+//! `BTreeMemIndex` on the column value directly.
+
+use arrow_array::{BinaryArray, RecordBatch};
+use datafusion::common::ScalarValue;
+use lance_core::{Error, Result};
+
+/// Flip the sign bit of `v` so unsigned order matches signed order (the same
+/// scheme as the fixed-int BTree backend); the big-endian bytes then sort like `v`.
+#[inline]
+fn encode_signed(v: i64) -> u64 {
+    (v ^ i64::MIN) as u64
+}
+
+/// Append an order-preserving encoding of one non-null byte string to `out`:
+/// every `0x00` byte is escaped as `0x00 0xFF`, and the value is closed with a
+/// `0x00 0x00` terminator. The terminator sorts below any escaped content, so a
+/// prefix orders before its extensions and no value can forge a column boundary.
+fn encode_bytes(out: &mut Vec<u8>, bytes: &[u8]) {
+    for &byte in bytes {
+        match byte {
+            0x00 => out.extend_from_slice(&[0x00, 0xFF]),
+            other => out.push(other),
+        }
+    }
+    out.extend_from_slice(&[0x00, 0x00]);
+}
+
+/// Append the order-preserving encoding of a single PK column value. A leading
+/// tag (`0x00` null / `0x01` non-null) makes nulls sort first, and the payload is
+/// self-delimiting (8 big-endian bytes for ints/dates, 1 for booleans, terminated
+/// for bytes), so concatenated columns stay injective and ordered; other types error.
+fn encode_value(out: &mut Vec<u8>, value: &ScalarValue) -> Result<()> {
+    if value.is_null() {
+        out.push(0x00);
+        return Ok(());
+    }
+    out.push(0x01);
+    macro_rules! be_signed {
+        ($v:expr) => {
+            out.extend_from_slice(&encode_signed($v as i64).to_be_bytes())
+        };
+    }
+    match value {
+        ScalarValue::Int8(Some(v)) => be_signed!(*v),
+        ScalarValue::Int16(Some(v)) => be_signed!(*v),
+        ScalarValue::Int32(Some(v)) => be_signed!(*v),
+        ScalarValue::Int64(Some(v)) => be_signed!(*v),
+        ScalarValue::Date32(Some(v)) => be_signed!(*v),
+        ScalarValue::Date64(Some(v)) => be_signed!(*v),
+        ScalarValue::UInt8(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()),
+        ScalarValue::UInt16(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()),
+        ScalarValue::UInt32(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()),
+        ScalarValue::UInt64(Some(v)) => out.extend_from_slice(&v.to_be_bytes()),
+        ScalarValue::Boolean(Some(b)) => out.push(*b as u8),
+        ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => {
+            encode_bytes(out, s.as_bytes())
+        }
+        ScalarValue::Binary(Some(b))
+        | ScalarValue::LargeBinary(Some(b))
+        | ScalarValue::FixedSizeBinary(_, Some(b)) => encode_bytes(out, b),
+        other => {
+            return Err(Error::invalid_input(format!(
+                "Unsupported primary-key column type for composite key: {other:?}"
+            )));
+        }
+    }
+    Ok(())
+}
+
+/// Encode a PK tuple to one order-preserving key by concatenating the encoding
+/// of each value; `values` must be given in PK column order.
+pub fn encode_pk_tuple(values: &[ScalarValue]) -> Result<Vec<u8>> {
+    // 9 bytes per column: 1 tag byte + 8 payload bytes (exact for integer columns).
+    let mut key = Vec::with_capacity(values.len() * 9);
+    values.iter().try_for_each(|v| encode_value(&mut key, v))?;
+    Ok(key)
+}
+
+/// Encode the PK columns (at `pk_indices`) of row `row` in `batch` to one key.
+fn encode_pk_row(batch: &RecordBatch, pk_indices: &[usize], row: usize) -> Result<Vec<u8>> {
+    let mut key = Vec::with_capacity(pk_indices.len() * 9);
+    pk_indices.iter().try_for_each(|&col| {
+        let value = ScalarValue::try_from_array(batch.column(col), row)?;
+        encode_value(&mut key, &value)
+    })?;
+    Ok(key)
+}
+
+/// Encode the PK columns (at `pk_indices`) of every row of `batch` into a
+/// `Binary` column of order-preserving composite keys. This is the form a
+/// [`super::BTreeMemIndex`] indexes directly (its byte backend), so a composite
+/// PK reuses the same index as a single-column one.
+pub fn encode_pk_batch(batch: &RecordBatch, pk_indices: &[usize]) -> Result<BinaryArray> {
+    // Encode all rows first so the first failing row aborts before the array is built.
+    let keys = (0..batch.num_rows())
+        .map(|row| encode_pk_row(batch, pk_indices, row))
+        .collect::<Result<Vec<_>>>()?;
+    Ok(BinaryArray::from_iter_values(keys.iter()))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_array::{Int32Array, StringArray};
+    use arrow_schema::{DataType, Field, Schema};
+    use std::sync::Arc;
+
+    fn tuple(a: i32, b: &str) -> Vec<ScalarValue> {
+        vec![ScalarValue::Int32(Some(a)), ScalarValue::from(b)]
+    }
+
+    #[test]
+    fn encoding_is_order_preserving_and_injective() {
+        // Sorting tuples by their encoded bytes must reproduce tuple order, and
+        // distinct tuples must encode to distinct bytes.
+        let tuples = [
+            tuple(1, "a"),
+            tuple(1, "ab"),
+            tuple(1, "b"),
+            tuple(2, "a"),
+            tuple(-1, "z"),
+        ];
+        let mut encoded: Vec<(Vec<u8>, &Vec<ScalarValue>)> = tuples
+            .iter()
+            .map(|t| (encode_pk_tuple(t).unwrap(), t))
+            .collect();
+        encoded.sort_by(|x, y| x.0.cmp(&y.0));
+        let order: Vec<_> = encoded.iter().map(|(_, t)| (*t).clone()).collect();
+        // Expected order: -1 < 1 < 2; within id=1, "a" < "ab" < "b".
+        assert_eq!(
+            order,
+            vec![
+                tuple(-1, "z"),
+                tuple(1, "a"),
+                tuple(1, "ab"),
+                tuple(1, "b"),
+                tuple(2, "a"),
+            ]
+        );
+        // Injective: 5 distinct tuples must leave 5 keys after sort + dedup.
+        let mut keys: Vec<Vec<u8>> = tuples.iter().map(|t| encode_pk_tuple(t).unwrap()).collect();
+        keys.sort();
+        keys.dedup();
+        assert_eq!(keys.len(), 5);
+    }
+
+    #[test]
+    fn null_sorts_first_and_is_distinct() {
+        let null_a = vec![ScalarValue::Int32(None), ScalarValue::from("a")];
+        let one_a = tuple(1, "a");
+        assert!(encode_pk_tuple(&null_a).unwrap() < encode_pk_tuple(&one_a).unwrap());
+        assert_ne!(
+            encode_pk_tuple(&null_a).unwrap(),
+            encode_pk_tuple(&one_a).unwrap()
+        );
+    }
+
+    #[test]
+    fn prefix_safety_with_embedded_zero() {
+        // A value containing 0x00 must sort after (and never collide with) the
+        // empty value: the escape `00 FF` orders above the terminator `00 00`.
+        let with_zero = vec![ScalarValue::Binary(Some(vec![0x00]))];
+        let empty = vec![ScalarValue::Binary(Some(vec![]))];
+        assert!(encode_pk_tuple(&empty).unwrap() < encode_pk_tuple(&with_zero).unwrap());
+    }
+
+    #[test]
+    fn encode_pk_batch_matches_per_tuple_encoding() {
+        // Each row of the encoded `Binary` column must equal `encode_pk_tuple` of
+        // that row's PK values — so the column a BTreeMemIndex indexes is exactly
+        // what a probe builds.
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int32Array::from(vec![2, 1])),
+                Arc::new(StringArray::from(vec!["a", "b"])),
+            ],
+        )
+        .unwrap();
+        let encoded = encode_pk_batch(&batch, &[0, 1]).unwrap();
+        assert_eq!(encoded.value(0), encode_pk_tuple(&tuple(2, "a")).unwrap());
+        assert_eq!(encoded.value(1), encode_pk_tuple(&tuple(1, "b")).unwrap());
+        // Row 1, (1,"b"), must encode below row 0, (2,"a").
+        assert!(encoded.value(1) < encoded.value(0));
+    }
+}
diff --git a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs
index f4d4d797acc..054d9b1630e 100644
--- a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs
+++ b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs
@@ -615,6 +615,22 @@ impl BatchStore {
         (0..end).collect()
     }
 
+    /// Inclusive maximum visible *row* position at `max_visible_batch_position`,
+    /// or `None` when no rows are visible (nothing committed yet, or the visible
+    /// prefix holds zero rows).
+    ///
+    /// The visible batches are the committed prefix up to and including the
+    /// requested position, clamped to the last committed batch. Each batch
+    /// carries its cumulative `row_offset`, so the answer is the end of the last
+    /// visible batch minus one. Used to bound MVCC seeks against the PK index.
+    pub fn max_visible_row(&self, max_visible_batch_position: usize) -> Option<u64> {
+        // `checked_sub` is `None` for an empty store, so the clamp below cannot underflow.
+        let newest_committed = self.committed_len.load(Ordering::Acquire).checked_sub(1)?;
+        let last_visible = self.get(max_visible_batch_position.min(newest_committed))?;
+        // Exclusive end of the visible prefix, stepped back by one to make it inclusive.
+        (last_visible.row_offset + last_visible.num_rows as u64).checked_sub(1)
+    }
+
     /// Check if a specific batch is visible at a given visibility position.
     #[inline]
     pub fn is_batch_visible(
@@ -910,6 +926,37 @@ mod tests {
         assert!(!store.is_batch_visible(3, 10));
     }
 
+    #[test]
+    fn test_max_visible_row() {
+        // (1) Empty store: no rows are visible at any position.
+        let store = BatchStore::with_capacity(10);
+        assert_eq!(store.max_visible_row(0), None);
+        assert_eq!(store.max_visible_row(100), None);
+
+        // Three batches → rows [0,10) [10,30) [30,60); row_offsets 0, 10, 30.
+        store.append(create_test_batch(10)).unwrap(); // position 0
+        store.append(create_test_batch(20)).unwrap(); // position 1
+        store.append(create_test_batch(30)).unwrap(); // position 2
+
+        // (2) A position within range yields the inclusive end of that prefix.
+        assert_eq!(store.max_visible_row(0), Some(9)); // batch 0: 0..10
+        assert_eq!(store.max_visible_row(1), Some(29)); // batch 1: 10..30
+        assert_eq!(store.max_visible_row(2), Some(59)); // batch 2: 30..60
+
+        // (3) A position beyond the committed range clamps to the last batch,
+        // i.e. the inclusive max over all rows.
+        assert_eq!(store.max_visible_row(100), Some(59));
+
+        // (4) An empty leading batch contributes no rows: at its own position
+        // the exclusive end is 0, so the checked subtraction yields None, while
+        // a later non-empty batch is reported correctly.
+        let store = BatchStore::with_capacity(10);
+        store.append(create_test_batch(0)).unwrap(); // position 0: rows [0,0)
+        store.append(create_test_batch(5)).unwrap(); // position 1: rows [0,5)
+        assert_eq!(store.max_visible_row(0), None); // empty prefix → no rows
+        assert_eq!(store.max_visible_row(1), Some(4)); // through batch 1
+    }
+
     #[test]
     fn test_recommended_capacity() {
         // 64MB memtable, 64KB avg batch = 1024 batches * 1.2 = ~1228
diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs
index c4794d4c8f3..ebcc06cab44 100644
--- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs
+++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs
@@ -18,7 +18,7 @@ use lance_io::object_store::ObjectStore;
 use lance_table::format::IndexMetadata;
 use lance_table::io::commit::write_manifest_file_to_path;
 use lance_table::io::deletion::write_deletion_file;
-use log::info;
+use log::{info, warn};
 use object_store::ObjectStoreExt;
 use object_store::path::Path;
 use roaring::RoaringBitmap;
@@ -29,6 +29,7 @@ use super::super::index::MemIndexConfig;
 use super::super::memtable::MemTable;
 use crate::Dataset;
 use crate::dataset::mem_wal::manifest::ShardManifestStore;
+use crate::dataset::mem_wal::scanner::GenerationWarmer;
 use crate::dataset::mem_wal::scanner::exec::{compute_pk_hash, validate_pk_types};
 use crate::dataset::mem_wal::util::{flushed_memtable_path, generate_random_hash};
 
@@ -68,6 +69,9 @@ pub struct MemTableFlusher {
     base_uri: String,
     shard_id: Uuid,
     manifest_store: Arc<ShardManifestStore>,
+    /// When present, each new generation is warmed before it is committed, so
+    /// the first query sees zero cold reads. `None` => no warming.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
 }
 
 impl MemTableFlusher {
@@ -84,6 +88,26 @@ impl MemTableFlusher {
             base_uri: base_uri.into(),
             shard_id,
             manifest_store,
+            warmer: None,
+        }
+    }
+
+    /// Attach the warmer fired pre-commit for each new generation (`None` disables warming).
+    pub fn with_warmer(mut self, warmer: Option<Arc<dyn GenerationWarmer>>) -> Self {
+        self.warmer = warmer;
+        self
+    }
+
+    /// Warm a just-written generation before it is committed. Best-effort: a
+    /// failure is logged at `warn` level and the flush proceeds — warming is never
+    /// a commit gate. No-op without a warmer. `uri` must be the resolved reader
+    /// path (`path_to_uri(gen_path)`) so warmed entries key-match later queries.
+    async fn warm_generation(&self, uri: &str) {
+        let Some(warmer) = &self.warmer else {
+            return;
+        };
+        if let Err(e) = warmer.warm(uri).await {
+            warn!("pre-commit warm failed for generation {uri}; committing cold: {e}");
         }
     }
 
@@ -178,6 +202,16 @@ impl MemTableFlusher {
         self.write_bloom_filter(&bloom_path, memtable.bloom_filter())
             .await?;
 
+        // Write the standalone primary-key dedup sidecar. A primary key needs
+        // no secondary index, so this is required on the plain-flush path too —
+        // the LSM scanner opens it to dedup the generation. (`flush_with_indexes`
+        // writes it on the indexed path.) No-op when the memtable has no PK.
+        self.create_pk_index(&gen_path, memtable.indexes()).await?;
+
+        // Warm before commit (zero cold window); no-op without a warmer.
+        let warm_uri = self.path_to_uri(&gen_path);
+        self.warm_generation(&warm_uri).await;
+
         let new_manifest = self
             .update_manifest(
                 epoch,
@@ -449,6 +483,10 @@ impl MemTableFlusher {
             all_indexes.extend(fts_indexes);
         }
 
+        // Write the standalone primary-key dedup index (sidecar, not a manifest
+        // index — the block-list opens it directly by path).
+        self.create_pk_index(&gen_path, memtable.indexes()).await?;
+
         // Write a single manifest that records the fragments, the
         // within-generation deletion vector, and all indexes, overwriting the
         // data-only v1 manifest created by Dataset::write.
@@ -459,6 +497,10 @@ impl MemTableFlusher {
         self.write_bloom_filter(&bloom_path, memtable.bloom_filter())
             .await?;
 
+        // Warm before commit (zero cold window); no-op without a warmer.
+        let warm_uri = self.path_to_uri(&gen_path);
+        self.warm_generation(&warm_uri).await;
+
         let new_manifest = self
             .update_manifest(
                 epoch,
@@ -543,6 +585,49 @@ impl MemTableFlusher {
         Ok(created_indexes)
     }
 
+    /// Write the standalone primary-key dedup index for this generation.
+    ///
+    /// Unlike user indexes, this is a **sidecar**: it is not registered in the
+    /// manifest. The block-list opens it directly by path
+    /// ([`crate::dataset::mem_wal::util::pk_index_path`]) and probes it with
+    /// `Equals`. Single-column keys index the typed value; composite keys index
+    /// the order-preserving `Binary` tuple ([`super::super::index::encode_pk_tuple`]).
+    /// Row positions line up 1:1 with the forward-written data file, so they are
+    /// the flushed row ids directly. No-op without a primary-key index.
+    async fn create_pk_index(
+        &self,
+        gen_path: &Path,
+        mem_indexes: Option<&super::super::index::IndexStore>,
+    ) -> Result<()> {
+        use datafusion::physical_plan::SendableRecordBatchStream;
+        use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+        use lance_index::scalar::btree::train_btree_index;
+        use lance_index::scalar::lance_format::LanceIndexStore;
+
+        use crate::dataset::mem_wal::util::pk_index_path;
+
+        let Some(registry) = mem_indexes else {
+            return Ok(());
+        };
+        let pk_batches = registry.pk_training_batches(8192)?;
+        let Some(first_batch) = pk_batches.first() else {
+            return Ok(());
+        };
+        let pk_schema = first_batch.schema();
+        // The sidecar is written under the generation path through an uncached index store.
+        let sidecar_store = LanceIndexStore::new(
+            self.object_store.clone(),
+            pk_index_path(gen_path),
+            Arc::new(LanceCache::no_cache()),
+        );
+        let pk_stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new(
+            pk_schema,
+            futures::stream::iter(pk_batches.into_iter().map(Ok)),
+        ));
+        train_btree_index(pk_stream, &sidecar_store, 8192, None, None).await?;
+        Ok(())
+    }
+
     /// Create FTS (Full-Text Search) indexes from in-memory data (uncommitted).
     ///
     /// Writes the FTS index files and returns index metadata without committing.
@@ -965,21 +1050,30 @@ impl MemTableFlusher {
     }
 }
 
-/// Message to trigger flush of a frozen memtable to Lance storage.
-pub struct TriggerMemTableFlush {
-    /// The frozen memtable to flush.
-    pub memtable: Arc<MemTable>,
-    /// Optional channel to notify when flush completes.
-    pub done: Option<tokio::sync::oneshot::Sender<Result<FlushResult>>>,
+/// Message driving the background memtable-flush task: a flush request or a sweep tick.
+pub enum TriggerMemTableFlush {
+    /// Flush a frozen memtable to Lance storage.
+    Flush {
+        /// The frozen memtable to flush.
+        memtable: Arc<MemTable>,
+        /// Optional channel that receives the flush result when the flush completes.
+        done: Option<tokio::sync::oneshot::Sender<Result<FlushResult>>>,
+    },
+    /// Periodic tick: evict frozen memtables whose post-flush grace has elapsed.
+    SweepExpired,
 }
 
 impl std::fmt::Debug for TriggerMemTableFlush {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("TriggerMemTableFlush")
-            .field("memtable_gen", &self.memtable.generation())
-            .field("memtable_rows", &self.memtable.row_count())
-            .field("has_done", &self.done.is_some())
-            .finish()
+        match self {
+            Self::Flush { memtable, done } => f
+                .debug_struct("TriggerMemTableFlush::Flush")
+                .field("memtable_gen", &memtable.generation())
+                .field("memtable_rows", &memtable.row_count())
+                .field("has_done", &done.is_some())
+                .finish(),
+            Self::SweepExpired => f.write_str("TriggerMemTableFlush::SweepExpired"),
+        }
     }
 }
 
@@ -1139,6 +1233,79 @@ mod tests {
         assert_eq!(updated_manifest.flushed_generations.len(), 1);
     }
 
+    /// Test double for `GenerationWarmer`: records every `warm` call and, when
+    /// `fail` is set, reports a simulated I/O error.
+    #[derive(Debug)]
+    struct CountingWarmer {
+        calls: Arc<std::sync::atomic::AtomicUsize>,
+        fail: bool,
+    }
+
+    #[async_trait::async_trait]
+    impl GenerationWarmer for CountingWarmer {
+        async fn warm(&self, _path: &str) -> Result<()> {
+            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+            if !self.fail {
+                return Ok(());
+            }
+            Err(Error::io("simulated warm failure".to_string()))
+        }
+    }
+
+    /// Warming is a best-effort optimization, never a commit gate: a warmer that
+    /// errors pre-commit must still let the flush commit the generation. The
+    /// warm fires exactly once on the pre-commit path.
+    #[tokio::test]
+    async fn test_flusher_commits_when_warm_fails() {
+        let (store, base_path, base_uri, _temp_dir) = create_local_store().await;
+        let shard_id = Uuid::new_v4();
+        let manifest_store = Arc::new(ShardManifestStore::new(
+            store.clone(),
+            &base_path,
+            shard_id,
+            2,
+        ));
+        let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap();
+
+        let schema = create_test_schema();
+        let mut memtable = MemTable::new(schema.clone(), 1, vec![]).unwrap();
+        let frag_id = memtable
+            .insert(create_test_batch(&schema, 10))
+            .await
+            .unwrap();
+        memtable.mark_wal_flushed(&[frag_id], 1, &[0]);
+
+        let calls = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+        let warmer: Arc<dyn GenerationWarmer> = Arc::new(CountingWarmer {
+            calls: calls.clone(),
+            fail: true,
+        });
+
+        let flusher = MemTableFlusher::new(
+            store.clone(),
+            base_path,
+            base_uri,
+            shard_id,
+            manifest_store.clone(),
+        )
+        .with_warmer(Some(warmer));
+        // The flush must still succeed even though the warmer returns an error.
+        let result = flusher.flush(&memtable, epoch, 1).await.unwrap();
+
+        assert_eq!(result.generation.generation, 1);
+        assert_eq!(
+            calls.load(std::sync::atomic::Ordering::SeqCst),
+            1,
+            "pre-commit warm fires exactly once"
+        );
+        let updated = manifest_store.read_latest().await.unwrap().unwrap();
+        assert_eq!(
+            updated.flushed_generations.len(),
+            1,
+            "generation still committed after a failed warm"
+        );
+    }
+
     /// Flushing a generation with within-generation duplicate PKs writes a
     /// deletion vector so the flushed dataset exposes newest-per-PK on scan.
     #[tokio::test]
@@ -1227,6 +1394,202 @@ mod tests {
         assert_eq!(rows.get(&3), Some(&"c2".to_string()));
     }
 
+    /// Flushing a memtable with a primary-key index writes a standalone sidecar
+    /// BTree at `{gen}/_pk_index` that the block-list can reopen by path and
+    /// probe by value — including for a within-gen-superseded PK (existence,
+    /// not visibility).
+    #[tokio::test]
+    async fn flushed_pk_index_sidecar_is_probeable() {
+        use lance_core::cache::LanceCache;
+        use lance_index::metrics::NoOpMetricsCollector;
+        use lance_index::registry::IndexPluginRegistry;
+        use lance_index::scalar::lance_format::LanceIndexStore;
+        use lance_index::scalar::{SargableQuery, SearchResult};
+
+        use super::super::super::index::IndexStore;
+        use crate::dataset::mem_wal::util::pk_index_path;
+        use datafusion::common::ScalarValue;
+
+        let (store, base_path, base_uri, _temp_dir) = create_local_store().await;
+        let shard_id = Uuid::new_v4();
+        let manifest_store = Arc::new(ShardManifestStore::new(
+            store.clone(),
+            &base_path,
+            shard_id,
+            2,
+        ));
+        let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap();
+
+        // Primary-key index on `id`, no user indexes.
+        let schema = create_pk_schema();
+        let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap();
+        let mut registry = IndexStore::new();
+        registry.enable_pk_index(&[("id".to_string(), 0)]);
+        memtable.set_indexes(registry);
+
+        // id=1 updated in-gen (a -> a2); id=2 unique.
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 1])),
+                Arc::new(StringArray::from(vec!["a", "b", "a2"])),
+            ],
+        )
+        .unwrap();
+        let frag_id = memtable.insert(batch).await.unwrap();
+        memtable.mark_wal_flushed(&[frag_id], 1, &[0]);
+
+        let flusher = MemTableFlusher::new(
+            store.clone(),
+            base_path.clone(),
+            base_uri,
+            shard_id,
+            manifest_store.clone(),
+        );
+        let result = flusher
+            .flush_with_indexes(&memtable, epoch, &[], 1)
+            .await
+            .unwrap();
+
+        // Reopen the sidecar directly by path (the block-list's route).
+        let gen_path = base_path
+            .clone()
+            .join("_mem_wal")
+            .join(shard_id.to_string())
+            .join(result.generation.path.as_str());
+        let index_store = Arc::new(LanceIndexStore::new(
+            store.clone(),
+            pk_index_path(&gen_path),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let registry = IndexPluginRegistry::with_default_plugins();
+        let plugin = registry.get_plugin_by_name("BTree").unwrap();
+        let details =
+            prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap();
+        let index = plugin
+            .load_index(index_store, &details, None, &LanceCache::no_cache())
+            .await
+            .unwrap();
+
+        let contains = |id: i32| {
+            let index = index.clone();
+            async move {
+                let result = index
+                    .search(
+                        &SargableQuery::Equals(ScalarValue::Int32(Some(id))),
+                        &NoOpMetricsCollector,
+                    )
+                    .await
+                    .unwrap();
+                match result {
+                    SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => {
+                        !s.is_empty()
+                    }
+                }
+            }
+        };
+        // Both PKs present (id=1 even though its first version was superseded);
+        // an absent PK is not.
+        assert!(contains(1).await);
+        assert!(contains(2).await);
+        assert!(!contains(99).await);
+    }
+
+    /// Regression: production dispatches a PK-only flush (a primary key, no
+    /// secondary index) to `flush`, not `flush_with_indexes`. `flush` must still
+    /// write the PK dedup sidecar, otherwise cross-generation dedup fails with
+    /// `page_lookup.lance not found`.
+    #[tokio::test]
+    async fn plain_flush_writes_pk_sidecar() {
+        use lance_core::cache::LanceCache;
+        use lance_index::metrics::NoOpMetricsCollector;
+        use lance_index::registry::IndexPluginRegistry;
+        use lance_index::scalar::lance_format::LanceIndexStore;
+        use lance_index::scalar::{SargableQuery, SearchResult};
+
+        use super::super::super::index::IndexStore;
+        use crate::dataset::mem_wal::util::pk_index_path;
+        use datafusion::common::ScalarValue;
+
+        let (store, base_path, base_uri, _temp_dir) = create_local_store().await;
+        let shard_id = Uuid::new_v4();
+        let manifest_store = Arc::new(ShardManifestStore::new(
+            store.clone(),
+            &base_path,
+            shard_id,
+            2,
+        ));
+        let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap();
+
+        // Primary-key index on `id`, no user indexes.
+        let schema = create_pk_schema();
+        let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap();
+        let mut registry = IndexStore::new();
+        registry.enable_pk_index(&[("id".to_string(), 0)]);
+        memtable.set_indexes(registry);
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2])),
+                Arc::new(StringArray::from(vec!["a", "b"])),
+            ],
+        )
+        .unwrap();
+        let frag_id = memtable.insert(batch).await.unwrap();
+        memtable.mark_wal_flushed(&[frag_id], 1, &[0]);
+
+        let flusher = MemTableFlusher::new(
+            store.clone(),
+            base_path.clone(),
+            base_uri,
+            shard_id,
+            manifest_store.clone(),
+        );
+        // The plain-flush path — what the writer dispatches to with no indexes.
+        let result = flusher.flush(&memtable, epoch, 1).await.unwrap();
+
+        let gen_path = base_path
+            .clone()
+            .join("_mem_wal")
+            .join(shard_id.to_string())
+            .join(result.generation.path.as_str());
+        let index_store = Arc::new(LanceIndexStore::new(
+            store.clone(),
+            pk_index_path(&gen_path),
+            Arc::new(LanceCache::no_cache()),
+        ));
+        let registry = IndexPluginRegistry::with_default_plugins();
+        let plugin = registry.get_plugin_by_name("BTree").unwrap();
+        let details =
+            prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap();
+        let index = plugin
+            .load_index(index_store, &details, None, &LanceCache::no_cache())
+            .await
+            .unwrap();
+
+        let contains = |id: i32| {
+            let index = index.clone();
+            async move {
+                let result = index
+                    .search(
+                        &SargableQuery::Equals(ScalarValue::Int32(Some(id))),
+                        &NoOpMetricsCollector,
+                    )
+                    .await
+                    .unwrap();
+                match result {
+                    SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => {
+                        !s.is_empty()
+                    }
+                }
+            }
+        };
+        assert!(contains(1).await);
+        assert!(contains(2).await);
+        assert!(!contains(99).await);
+    }
+
     /// Covers `finalize_generation` writing both a deletion vector *and*
     /// indexes into the same manifest — the deletion-only and index-only
     /// paths are exercised by sibling tests.
diff --git a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs
index 2c5192e28a1..17fa9c76a65 100644
--- a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs
+++ b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs
@@ -366,6 +366,14 @@ impl MemTableScanner {
         self
     }
 
+    /// The `max_visible_batch_position` snapshot this scanner latched at
+    /// construction. A downstream recency filter must key on this same snapshot
+    /// (not a fresh read of the IndexStore watermark, which a concurrent append
+    /// could have advanced) so it stays consistent with the rows the search saw.
+    pub fn max_visible_batch_position(&self) -> usize {
+        self.max_visible_batch_position
+    }
+
     /// Include the _rowaddr column in output.
     ///
     /// Same value as _rowid but named for compatibility with LSM scanner.
diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs
index b1766f8525f..fe14bd82dd8 100644
--- a/rust/lance/src/dataset/mem_wal/scanner.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner.rs
@@ -43,12 +43,15 @@ mod point_lookup;
 mod projection;
 mod vector_search;
 
+pub use block_list::write_pk_sidecar;
 pub use builder::LsmScanner;
 pub use collector::{
     ActiveMemTableRef, InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector,
 };
-pub use data_source::{FlushedGeneration, LsmDataSource, LsmGeneration, ShardSnapshot};
-pub use flushed_cache::FlushedMemTableCache;
+pub use data_source::{
+    FlushedGeneration, FreshTierWatermark, LsmDataSource, LsmGeneration, ShardSnapshot,
+};
+pub use flushed_cache::{DatasetCache, FlushedMemTableCache, GenerationWarmer};
 pub use fts_search::{LsmFtsSearchPlanner, SCORE_COLUMN};
 pub use point_lookup::LsmPointLookupPlanner;
 pub use projection::DISTANCE_COLUMN;
diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs
index 684fde48da1..69d16930888 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs
@@ -3,37 +3,151 @@
 
 //! Per-source block-list construction for LSM vector search.
 //!
-//! A generation's membership is an `Arc<HashSet<u64>>` of PK hashes
-//! ([`compute_pk_hash`]), built once (immutable gens cached). Each source gets a
-//! `Vec<Arc<HashSet<u64>>>` of the newer generations' sets (`NEWER(G)`; base: all
-//! of them) — referenced, never merged. The KNN drops candidates whose PK is in
-//! any (see [`super::exec::PkHashFilterExec`]).
+//! A generation's membership is a [`GenMembership`]: in-memory generations
+//! (active / frozen) are probed by value against their maintained primary-key
+//! index (no per-query set), while flushed generations are probed against their
+//! standalone on-disk PK BTree (the sidecar written at flush, opened by path).
+//! Probing is batched — [`GenMembership::contains_keys`] tests a whole batch of
+//! keys per generation in one pass. Each source gets a `Vec<GenMembership>` of
+//! the newer generations (`NEWER(G)`; base: all of them); the KNN drops a
+//! candidate whose PK any of them contains (see
+//! [`super::exec::PkBlockFilterExec`]).
 //!
-//! Cross-generation only: within-gen dups share a hash and fall to the global
-//! dedup's `(generation, freshness)` tiebreaker.
+//! Cross-generation only: within-gen dups collapse via the global dedup's
+//! `(generation, freshness)` tiebreaker.
+
+use std::collections::HashMap;
+use std::sync::{Arc, LazyLock};
+
+use datafusion::common::ScalarValue;
+use lance_core::{Error, Result};
+
+use lance_index::metrics::NoOpMetricsCollector;
+use lance_index::registry::IndexPluginRegistry;
+use lance_index::scalar::btree::BTreeIndex;
+use lance_index::scalar::lance_format::LanceIndexStore;
+use lance_index::scalar::{
+    IndexStore as ScalarIndexStore, SargableQuery, ScalarIndex, SearchResult,
+};
+use uuid::Uuid;
 
-use std::collections::{HashMap, HashSet};
-use std::sync::Arc;
+use super::data_source::{FreshTierWatermark, LsmDataSource, LsmGeneration};
+use super::flushed_cache::{DatasetCache, open_flushed_dataset};
+use crate::dataset::mem_wal::index::encode_pk_tuple;
+use crate::dataset::mem_wal::util::PK_INDEX_DIR;
+use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+use crate::session::Session;
 
-use arrow_array::RecordBatch;
-use futures::TryStreamExt;
-use lance_core::Result;
+/// Default-plugin registry, used only to load the standalone PK BTree by its
+/// `BTreeIndexDetails` type. Built once.
+static PK_BTREE_REGISTRY: LazyLock<Arc<IndexPluginRegistry>> =
+    LazyLock::new(IndexPluginRegistry::with_default_plugins);
+
+/// One newer generation's PK membership, used to decide whether it shadows an
+/// older source's row.
+#[derive(Clone, Debug)]
+pub enum GenMembership {
+    /// Probe the in-memory memtable's primary-key index, bounded to its visible
+    /// prefix (so a not-yet-visible write can't shadow an older visible copy).
+    InMemory {
+        index_store: Arc<IndexStore>,
+        /// Inclusive visible row watermark; `None` when no rows are visible.
+        max_visible_row: Option<u64>,
+    },
+    /// Probe the flushed generation's standalone on-disk PK BTree.
+    OnDisk(Arc<dyn ScalarIndex>),
+}
 
-use uuid::Uuid;
+impl GenMembership {
+    /// Whether this generation visibly contains the primary `key` — the typed
+    /// value for a single-column PK, the encoded `Binary` tuple for a composite
+    /// one (built by [`on_disk_pk_key`]). The same key probes the in-memory
+    /// BTree and the flushed on-disk BTree, which now share a key space.
+    pub async fn contains(&self, key: &ScalarValue) -> Result<bool> {
+        match self {
+            Self::InMemory {
+                index_store,
+                max_visible_row,
+            } => Ok(max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))),
+            Self::OnDisk(index) => {
+                let result = index
+                    .search(&SargableQuery::Equals(key.clone()), &NoOpMetricsCollector)
+                    .await
+                    .map_err(|e| Error::io(e.to_string()))?;
+                Ok(!search_is_empty(&result))
+            }
+        }
+    }
 
-use super::data_source::{LsmDataSource, LsmGeneration};
-use super::exec::{compute_pk_hash, resolve_pk_indices};
-use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset};
-use crate::dataset::Dataset;
-use crate::dataset::mem_wal::write::BatchStore;
-use crate::session::Session;
+    /// Batched [`Self::contains`]: for each key in `keys`, whether this
+    /// generation visibly contains it, returned as a mask aligned to `keys`.
+    ///
+    /// One probe replaces N. The on-disk arm issues a single
+    /// [`BTreeIndex::contains_keys`] (no per-key `SearchResult` allocation); the
+    /// in-memory arm maps the sync, allocation-free PK lookup over the slice.
+    /// Keys are in the index's key space (see [`on_disk_pk_key`]).
+    pub async fn contains_keys(&self, keys: &[ScalarValue]) -> Result<Vec<bool>> {
+        match self {
+            Self::InMemory {
+                index_store,
+                max_visible_row,
+            } => Ok(keys
+                .iter()
+                .map(|key| max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max)))
+                .collect()),
+            Self::OnDisk(index) => {
+                // The flushed PK sidecar is always a BTree (built via
+                // `PK_BTREE_REGISTRY`); downcast to reach the batched probe.
+                let btree = index.as_any().downcast_ref::<BTreeIndex>().ok_or_else(|| {
+                    Error::io("flushed PK dedup index is not a BTree".to_string())
+                })?;
+                btree
+                    .contains_keys(keys, &NoOpMetricsCollector)
+                    .await
+                    .map_err(|e| Error::io(e.to_string()))
+            }
+        }
+    }
+
+    /// Whether this generation has no (visible) membership — used to skip adding
+    /// an empty blocked set. A flushed generation always has rows (flush rejects
+    /// an empty memtable), so it is never empty.
+    fn is_empty(&self) -> bool {
+        match self {
+            Self::InMemory {
+                index_store,
+                max_visible_row,
+            } => max_visible_row.is_none() || index_store.pk_is_empty(),
+            Self::OnDisk(_) => false,
+        }
+    }
+}
 
-/// Per-source blocked PK-hash sets, keyed by `(shard_id, generation)`. Each
-/// value is the membership sets of the generations newer than that source.
-pub type SourceBlockLists = HashMap<(Option<Uuid>, LsmGeneration), Vec<Arc<HashSet<u64>>>>;
+/// Whether a scalar search returned no rows (existence test for the block-list).
+fn search_is_empty(result: &SearchResult) -> bool {
+    match result {
+        SearchResult::Exact(set) | SearchResult::AtMost(set) | SearchResult::AtLeast(set) => {
+            set.is_empty()
+        }
+    }
+}
 
-/// A shard's generations paired with their PK-hash membership, before sorting.
-type ShardGenSets = HashMap<Uuid, Vec<(LsmGeneration, Arc<HashSet<u64>>)>>;
+/// The probe key for the on-disk PK BTree: a single-column PK indexes its typed
+/// value directly; a composite PK indexes the order-preserving encoded tuple as
+/// `Binary` (matching what flush wrote — see [`encode_pk_tuple`]).
+pub fn on_disk_pk_key(values: &[ScalarValue]) -> Result<ScalarValue> {
+    match values {
+        [single] => Ok(single.clone()),
+        _ => Ok(ScalarValue::Binary(Some(encode_pk_tuple(values)?))),
+    }
+}
+
+/// Per-source blocked memberships, keyed by `(shard_id, generation)`. Each value
+/// is the memberships of the generations newer than that source.
+pub type SourceBlockLists = HashMap<(Option<Uuid>, LsmGeneration), Vec<GenMembership>>;
+
+/// A shard's generations paired with their membership, before sorting.
+type ShardGenSets = HashMap<Uuid, Vec<(LsmGeneration, GenMembership)>>;
 
 /// Per-source `NEWER(G)`, keyed by `(shard_id, generation)`. Generations are
 /// per-shard, so a source is superseded only by strictly-newer generations of
@@ -42,59 +156,64 @@ type ShardGenSets = HashMap<Uuid, Vec<(LsmGeneration, Arc<HashSet<u64>>)>>;
 /// Only superseded sources get an entry; the newest of each shard never does.
 pub async fn compute_source_block_lists(
     sources: &[LsmDataSource],
-    pk_columns: &[String],
     session: Option<&Arc<Session>>,
-    flushed_cache: Option<&Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<&Arc<dyn DatasetCache>>,
 ) -> Result<SourceBlockLists> {
-    // Hash each non-base source's membership, grouped by shard (generations are
+    // Membership per non-base source, grouped by shard (generations are
     // per-shard, so supersession is within-shard only).
     let mut by_shard: ShardGenSets = HashMap::new();
     let mut has_base = false;
+    // Flushed PK-BTree opens are cold S3 reads; overlap them with
+    // `try_join_all`. Order is irrelevant — gens are sorted per-shard below.
+    let mut flushed_loads = Vec::new();
     for source in sources {
         match source {
             LsmDataSource::BaseTable { .. } => has_base = true,
             LsmDataSource::ActiveMemTable {
                 batch_store,
+                index_store,
                 shard_id,
                 generation,
                 ..
             } => {
-                let hashes = Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?);
+                let membership = in_memory_membership(batch_store, index_store);
                 by_shard
                     .entry(*shard_id)
                     .or_default()
-                    .push((*generation, hashes));
+                    .push((*generation, membership));
             }
             LsmDataSource::FlushedMemTable {
                 path,
                 shard_id,
                 generation,
                 ..
-            } => {
-                // Cached by immutable path so repeated searches skip the PK scan.
-                let hashes = flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?;
-                by_shard
-                    .entry(*shard_id)
-                    .or_default()
-                    .push((*generation, hashes));
-            }
+            } => flushed_loads.push(async move {
+                let index = open_pk_index(path, session, flushed_cache).await?;
+                Ok::<_, Error>((*shard_id, *generation, GenMembership::OnDisk(index)))
+            }),
         }
     }
+    for (shard_id, generation, membership) in futures::future::try_join_all(flushed_loads).await? {
+        by_shard
+            .entry(shard_id)
+            .or_default()
+            .push((generation, membership));
+    }
 
     let mut blocked: SourceBlockLists = HashMap::new();
     // Base (shardless, oldest) is superseded by every non-base generation.
-    let mut base_blocked: Vec<Arc<HashSet<u64>>> = Vec::new();
+    let mut base_blocked: Vec<GenMembership> = Vec::new();
     for (shard, mut gens) in by_shard {
         // Newest-first: a gen's blocked list is its own shard's newer gens.
         gens.sort_by_key(|(generation, _)| std::cmp::Reverse(*generation));
-        let mut newer: Vec<Arc<HashSet<u64>>> = Vec::new();
-        for (generation, hashes) in gens {
+        let mut newer: Vec<GenMembership> = Vec::new();
+        for (generation, membership) in gens {
             if !newer.is_empty() {
                 blocked.insert((Some(shard), generation), newer.clone());
             }
-            if !hashes.is_empty() {
-                base_blocked.push(hashes.clone());
-                newer.push(hashes);
+            if !membership.is_empty() {
+                base_blocked.push(membership.clone());
+                newer.push(membership);
             }
         }
     }
@@ -104,260 +223,355 @@ pub async fn compute_source_block_lists(
     Ok(blocked)
 }
 
-/// The fresh-tier block-list: one membership set per generation that shadows the
-/// base table — active + frozen memtables (hashed now) and flushed generations
-/// (from the cache). Same `Vec<Arc<HashSet<u64>>>` shape the vector-search filter
-/// consumes; a base/external reader can drop any row whose PK is in one of them.
-/// The base source, if present, is skipped (it is what gets shadowed).
+/// The fresh-tier block-list: one [`GenMembership`] per generation that shadows
+/// the base table — active + frozen memtables (probed against their index) and
+/// flushed generations (probed against their on-disk PK BTree). A base/external
+/// reader can test any PK against these (via [`GenMembership::contains`]) to
+/// decide whether the fresh tier shadows it. The base source, if present, is
+/// skipped (it is what gets shadowed).
+///
+/// When `watermarks` carries a watermark for a source's shard, membership is
+/// bounded to it (see [`FreshTierWatermark`]): higher generations are excluded,
+/// the active generation is bounded to its first `active_batch_count` batches,
+/// and lower generations (frozen and flushed) are immutable and included whole.
+/// A shard absent from `watermarks` (or `watermarks == None`) uses the live tier.
 pub async fn fresh_tier_block_list(
     sources: &[LsmDataSource],
-    pk_columns: &[String],
     session: Option<&Arc<Session>>,
-    flushed_cache: Option<&Arc<FlushedMemTableCache>>,
-) -> Result<Vec<Arc<HashSet<u64>>>> {
-    let mut sets = Vec::new();
+    flushed_cache: Option<&Arc<dyn DatasetCache>>,
+    watermarks: Option<&HashMap<Uuid, FreshTierWatermark>>,
+) -> Result<Vec<GenMembership>> {
+    // Membership per source, in source order (`None` = skipped). Flushed
+    // PK-BTree opens are cold S3 reads, so collect them tagged with their slot
+    // and overlap with `try_join_all` rather than opening one at a time.
+    let mut slots: Vec<Option<GenMembership>> = Vec::with_capacity(sources.len());
+    let mut flushed_loads = Vec::new();
     for source in sources {
-        let set = match source {
-            LsmDataSource::BaseTable { .. } => continue,
-            LsmDataSource::ActiveMemTable { batch_store, .. } => {
-                Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?)
+        match source {
+            LsmDataSource::BaseTable { .. } => slots.push(None),
+            LsmDataSource::ActiveMemTable {
+                batch_store,
+                index_store,
+                shard_id,
+                generation,
+                ..
+            } => {
+                let membership = match watermarks.and_then(|m| m.get(shard_id)) {
+                    None => Some(in_memory_membership(batch_store, index_store)),
+                    Some(watermark) => {
+                        let g = generation.as_u64();
+                        if g > watermark.active_generation {
+                            // Rolled in after the snapshot; the arm never saw it.
+                            None
+                        } else if g == watermark.active_generation {
+                            // Bound the active generation to the batches the arm saw.
+                            Some(bounded_in_memory_membership(
+                                batch_store,
+                                index_store,
+                                watermark.active_batch_count,
+                            ))
+                        } else {
+                            // Lower (frozen) generations are immutable — include all.
+                            Some(in_memory_membership(batch_store, index_store))
+                        }
+                    }
+                };
+                slots.push(membership);
             }
-            LsmDataSource::FlushedMemTable { path, .. } => {
-                flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?
+            LsmDataSource::FlushedMemTable {
+                path,
+                shard_id,
+                generation,
+                ..
+            } => {
+                // A generation at or above the active one was flushed after the
+                // snapshot; exclude it. Lower generations are immutable. The
+                // `==` case is the active generation flushed between the two
+                // reads: excluding the flushed copy loses nothing, since its
+                // rows are already captured by the in-memory arm above (bounded
+                // to `active_batch_count`).
+                let flushed_after_snapshot = watermarks
+                    .and_then(|m| m.get(shard_id))
+                    .is_some_and(|watermark| generation.as_u64() >= watermark.active_generation);
+                if flushed_after_snapshot {
+                    slots.push(None);
+                } else {
+                    let slot = slots.len();
+                    slots.push(None);
+                    flushed_loads.push(async move {
+                        let index = open_pk_index(path, session, flushed_cache).await?;
+                        Ok::<_, Error>((slot, GenMembership::OnDisk(index)))
+                    });
+                }
             }
-        };
-        if !set.is_empty() {
-            sets.push(set);
         }
     }
-    Ok(sets)
+    for (slot, membership) in futures::future::try_join_all(flushed_loads).await? {
+        slots[slot] = Some(membership);
+    }
+    Ok(slots
+        .into_iter()
+        .flatten()
+        .filter(|membership| !membership.is_empty())
+        .collect())
 }
 
-/// Hash the PK membership of an in-memory memtable (active or frozen) from its
-/// committed `BatchStore` rows.
-pub fn pk_hashes_from_batch_store(
-    store: &BatchStore,
-    pk_columns: &[String],
-) -> Result<HashSet<u64>> {
-    let mut batches: Vec<RecordBatch> = Vec::with_capacity(store.len());
-    for i in 0..store.len() {
-        if let Some(stored) = store.get(i) {
-            batches.push(stored.data.clone());
-        }
+/// Cross-source membership of an in-memory (active / frozen) memtable: a
+/// snapshot-bounded probe of its maintained primary-key index. A memtable
+/// without a primary-key index can't be probed, so it blocks nothing — the
+/// production vector-search path always enables the index.
+fn in_memory_membership(
+    batch_store: &Arc<BatchStore>,
+    index_store: &Arc<IndexStore>,
+) -> GenMembership {
+    let max_visible_row = batch_store.max_visible_row(index_store.max_visible_batch_position());
+    GenMembership::InMemory {
+        index_store: index_store.clone(),
+        max_visible_row,
     }
-    pk_hashes_from_batches(&batches, pk_columns)
 }
 
-/// Hash every row's primary key across `batches` into a membership set.
-fn pk_hashes_from_batches(batches: &[RecordBatch], pk_columns: &[String]) -> Result<HashSet<u64>> {
-    let mut pk_hashes = HashSet::new();
-    for batch in batches {
-        if batch.num_rows() == 0 {
-            continue;
-        }
-        let pk_indices = resolve_pk_indices(batch, pk_columns)
-            .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?;
-        for row_idx in 0..batch.num_rows() {
-            pk_hashes.insert(compute_pk_hash(batch, &pk_indices, row_idx));
-        }
+/// As-of variant of [`in_memory_membership`] for the active generation under a
+/// watermark: bounds visibility to the first `batch_count` batches — those a
+/// prior scan observed before the memtable grew. A later append lands at a
+/// higher row position and is excluded by the probe, so it can't shadow a base
+/// row whose replacement the scan never delivered. `batch_count == 0` leaves the
+/// membership empty.
+fn bounded_in_memory_membership(
+    batch_store: &Arc<BatchStore>,
+    index_store: &Arc<IndexStore>,
+    batch_count: u64,
+) -> GenMembership {
+    let max_visible_row = batch_count
+        .checked_sub(1)
+        .and_then(|last_batch| batch_store.max_visible_row(last_batch as usize));
+    GenMembership::InMemory {
+        index_store: index_store.clone(),
+        max_visible_row,
     }
-    Ok(pk_hashes)
 }
 
-/// Build (or fetch the cached) PK-hash membership for one flushed generation.
-/// Cached by immutable path (single-flight); the build scans the flushed
-/// dataset's PK columns.
-async fn flushed_pk_hashes(
+/// A stable cache UUID for a non-manifest index identified only by its path.
+///
+/// `DSIndexCache::for_index` keys by `&Uuid`, but the flushed PK sidecar has no
+/// manifest UUID — its identity is its immutable path. Derive a deterministic
+/// UUID from the path so the cache namespace is per-path and stable across
+/// probes (the `uuid` crate lacks the `v5` "name-based" feature here, so hash to
+/// a `u128` instead).
+fn path_cache_uuid(path: &str) -> Uuid {
+    use std::hash::{Hash, Hasher};
+    let mut lo = std::collections::hash_map::DefaultHasher::new();
+    path.hash(&mut lo);
+    let mut hi = std::collections::hash_map::DefaultHasher::new();
+    // Salt the high half so it is a second, independent hash rather than a copy of the low half.
+    "lance/flushed-pk-index".hash(&mut hi);
+    path.hash(&mut hi);
+    Uuid::from_u128(((hi.finish() as u128) << 64) | lo.finish() as u128)
+}
+
+/// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed
+/// generation. Reuses the flushed dataset's (session-configured) object store
+/// and **its index cache**, then loads the sidecar directly by path through the
+/// BTree plugin — it is not a manifest index. The opened index and its pages
+/// are cached in the session's index cache (keyed by the immutable flushed
+/// path), so repeated probes reuse them with no separate cache path and no
+/// upfront scan; concurrent first-opens may each load before the cache fills.
+async fn open_pk_index(
     path: &str,
-    pk_columns: &[String],
     session: Option<&Arc<Session>>,
-    flushed_cache: Option<&Arc<FlushedMemTableCache>>,
-) -> Result<Arc<HashSet<u64>>> {
-    match flushed_cache {
-        Some(cache) => {
-            let build_cache = cache.clone();
-            let build_path = path.to_string();
-            let build_session = session.cloned();
-            let build_pk = pk_columns.to_vec();
-            cache
-                .get_or_build_pk_hashes(
-                    path,
-                    // `Box::pin` keeps this build future off the caller's future
-                    // (avoids `clippy::large_futures`).
-                    Box::pin(async move {
-                        let dataset = open_flushed_dataset(
-                            &build_path,
-                            build_session.as_ref(),
-                            Some(&build_cache),
-                        )
-                        .await?;
-                        scan_pk_hashes(&dataset, &build_pk).await
-                    }),
-                )
-                .await
-        }
-        None => {
-            let dataset = open_flushed_dataset(path, session, None).await?;
-            Ok(Arc::new(scan_pk_hashes(&dataset, pk_columns).await?))
-        }
+    flushed_cache: Option<&Arc<dyn DatasetCache>>,
+) -> Result<Arc<dyn ScalarIndex>> {
+    let dataset = open_flushed_dataset(path, session, flushed_cache, None).await?;
+    // Namespace the session index cache by the (immutable) flushed path so this
+    // sidecar's pages live alongside every other index instead of a bespoke
+    // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse.
+    let index_cache = dataset.index_cache.for_index(&path_cache_uuid(path), None);
+    let index_dir = dataset.base.clone().join(PK_INDEX_DIR);
+    let store: Arc<dyn ScalarIndexStore> = Arc::new(LanceIndexStore::new(
+        dataset.object_store.clone(),
+        index_dir,
+        Arc::new(index_cache.clone()),
+    ));
+
+    let plugin = PK_BTREE_REGISTRY.get_plugin_by_name("BTree")?;
+    // Cache the opened index in the session cache (mirrors `open_scalar_index`).
+    if let Some(index) = plugin
+        .get_from_cache(store.clone(), None, &index_cache)
+        .await?
+    {
+        return Ok(index);
     }
+    let details = prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default())
+        .map_err(|e| Error::io(e.to_string()))?;
+    let index = plugin
+        .load_index(store, &details, None, &index_cache)
+        .await?;
+    plugin.put_in_cache(&index_cache, index.clone()).await?;
+    Ok(index)
 }
 
-/// Scan a dataset's PK columns and fold them into a membership set, one batch
-/// resident at a time (no full PK-column buffer).
-async fn scan_pk_hashes(dataset: &Dataset, pk_columns: &[String]) -> Result<HashSet<u64>> {
-    let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect();
-    let mut scanner = dataset.scan();
-    scanner.project(&pk_refs)?;
-    let mut stream = scanner.try_into_stream().await?;
-    let mut hashes = HashSet::new();
-    while let Some(batch) = stream.try_next().await? {
-        if batch.num_rows() == 0 {
-            continue;
-        }
-        let pk_indices = resolve_pk_indices(&batch, pk_columns)
-            .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?;
-        for row in 0..batch.num_rows() {
-            hashes.insert(compute_pk_hash(&batch, &pk_indices, row));
-        }
+/// Write a flushed generation's standalone PK sidecar at `{uri}/_pk_index` from
+/// `batches`, mirroring what flush does in production. `pk_columns` are the
+/// primary-key column names (field ids are synthesized by position — `insert`
+/// resolves columns by name). A no-op when no batch carries the PK columns.
+///
+/// Used by Rust scanner tests and by the Python test-support binding to stage
+/// faithful flushed generations (a flushed dataset alone, with no sidecar, is
+/// not a state production ever produces).
+pub async fn write_pk_sidecar(
+    uri: &str,
+    batches: &[arrow_array::RecordBatch],
+    pk_columns: &[&str],
+) -> Result<()> {
+    use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+    use lance_core::cache::LanceCache;
+    use lance_index::scalar::btree::train_btree_index;
+    use lance_io::object_store::ObjectStore;
+
+    use crate::dataset::mem_wal::util::pk_index_path;
+
+    let pk: Vec<(String, i32)> = pk_columns
+        .iter()
+        .enumerate()
+        .map(|(i, c)| (c.to_string(), i as i32))
+        .collect();
+    let mut index = IndexStore::new();
+    index.enable_pk_index(&pk);
+    let mut offset = 0u64;
+    for batch in batches {
+        index.insert(batch, offset)?;
+        offset += batch.num_rows() as u64;
+    }
+
+    let training = index.pk_training_batches(8192)?;
+    if training.is_empty() {
+        return Ok(());
     }
-    Ok(hashes)
+    let schema = training[0].schema();
+    let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
+    let store = LanceIndexStore::new(
+        object_store,
+        pk_index_path(&base_path),
+        Arc::new(LanceCache::no_cache()),
+    );
+    let stream = Box::pin(RecordBatchStreamAdapter::new(
+        schema,
+        futures::stream::iter(training.into_iter().map(Ok)),
+    ));
+    // `train_btree_index` now returns the written index files; the sidecar
+    // writer only needs success/failure.
+    train_btree_index(stream, &store, 8192, None, None).await?;
+    Ok(())
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow_array::Int32Array;
+    use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration};
+    use crate::dataset::mem_wal::write::IndexStore;
+    use arrow_array::{Int32Array, RecordBatch};
     use arrow_schema::{DataType, Field, Schema};
     use std::sync::Arc;
+    use uuid::Uuid;
 
     fn id_batch(ids: &[i32]) -> RecordBatch {
         let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
         RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap()
     }
 
-    /// Hash a single Int32 `id` PK the way the planner does, so a test can probe
-    /// a returned blocked set by value.
-    fn hash_id(id: i32) -> u64 {
-        let batch = id_batch(&[id]);
-        let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap();
-        compute_pk_hash(&batch, &pk_indices, 0)
-    }
-
-    /// Whether `id`'s PK hash is blocked by any of a source's newer-gen sets.
-    fn blocks(sets: &[Arc<HashSet<u64>>], id: i32) -> bool {
-        sets.iter().any(|s| s.contains(&hash_id(id)))
-    }
-
-    #[test]
-    fn pk_hashes_collapse_within_gen_duplicates() {
-        // Two rows share pk=1 (a within-gen duplicate); pk=2 is unique.
-        let hashes = pk_hashes_from_batches(&[id_batch(&[1, 2, 1])], &["id".to_string()]).unwrap();
-        assert_eq!(hashes.len(), 2); // distinct pks: 1, 2
+    /// An active/frozen memtable source holding one single-row batch per id in
+    /// `ids` (batch positions 0..n), indexed by PK, all committed and visible.
+    fn active_source(shard: Uuid, generation: u64, ids: &[i32]) -> LsmDataSource {
+        let store = BatchStore::with_capacity(16);
+        let mut index = IndexStore::new();
+        index.enable_pk_index(&[("id".to_string(), 0)]);
+        for &id in ids {
+            let b = id_batch(&[id]);
+            let (bp, off, _) = store.append(b.clone()).unwrap();
+            index.insert_with_batch_position(&b, off, Some(bp)).unwrap();
+        }
+        LsmDataSource::ActiveMemTable {
+            batch_store: Arc::new(store),
+            index_store: Arc::new(index),
+            schema: id_batch(&[1]).schema(),
+            shard_id: shard,
+            generation: LsmGeneration::memtable(generation),
+        }
     }
 
-    #[test]
-    fn empty_batches_yield_empty_membership() {
-        let hashes = pk_hashes_from_batches(&[id_batch(&[])], &["id".to_string()]).unwrap();
-        assert!(hashes.is_empty());
+    /// Whether `id`'s PK is blocked by any of a source's newer-gen memberships.
+    async fn blocks(memberships: &[GenMembership], id: i32) -> bool {
+        let key = on_disk_pk_key(&[ScalarValue::Int32(Some(id))]).unwrap();
+        for m in memberships {
+            if m.contains(&key).await.unwrap() {
+                return true;
+            }
+        }
+        false
     }
 
     #[test]
-    fn batch_store_membership_collapses_within_gen_dups() {
-        let store = BatchStore::with_capacity(8);
-        // Two single-row batches, both pk=1 (a within-gen update).
-        store.append(id_batch(&[1])).unwrap();
-        store.append(id_batch(&[1])).unwrap();
-        // A two-row batch: pk=2, pk=3.
-        store.append(id_batch(&[2, 3])).unwrap();
-
-        let hashes = pk_hashes_from_batch_store(&store, &["id".to_string()]).unwrap();
-        assert_eq!(hashes.len(), 3); // distinct pks: 1, 2, 3
+    fn on_disk_key_is_typed_for_single_and_binary_for_composite() {
+        // Single-column → the typed value; composite → encoded Binary.
+        let single = [ScalarValue::Int32(Some(7))];
+        assert_eq!(
+            on_disk_pk_key(&single).unwrap(),
+            ScalarValue::Int32(Some(7))
+        );
+        let composite = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")];
+        assert!(matches!(
+            on_disk_pk_key(&composite).unwrap(),
+            ScalarValue::Binary(Some(_))
+        ));
     }
 
     #[tokio::test]
-    async fn fresh_tier_block_list_one_set_per_in_memory_gen() {
-        use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration};
-        use crate::dataset::mem_wal::write::IndexStore;
-        use uuid::Uuid;
-
+    async fn fresh_tier_block_list_one_membership_per_in_memory_gen() {
         let shard = Uuid::new_v4();
-        let mk = |ids: &[i32], generation: u64| {
-            let store = BatchStore::with_capacity(8);
-            store.append(id_batch(ids)).unwrap();
-            LsmDataSource::ActiveMemTable {
-                batch_store: Arc::new(store),
-                index_store: Arc::new(IndexStore::new()),
-                schema: id_batch(&[1]).schema(),
-                shard_id: shard,
-                generation: LsmGeneration::memtable(generation),
-            }
-        };
         // Active gen 2: pk=1,2. Frozen gen 1: pk=3.
-        let sources = vec![mk(&[1, 2], 2), mk(&[3], 1)];
+        let sources = vec![
+            active_source(shard, 2, &[1, 2]),
+            active_source(shard, 1, &[3]),
+        ];
 
-        let sets = fresh_tier_block_list(&sources, &["id".to_string()], None, None)
+        let memberships = fresh_tier_block_list(&sources, None, None, None)
             .await
             .unwrap();
 
-        // One set per generation; together they cover pk=1,2,3 (not 4).
-        assert_eq!(sets.len(), 2);
+        // One membership per generation; together they cover pk=1,2,3 (not 4).
+        assert_eq!(memberships.len(), 2);
         for id in [1, 2, 3] {
-            assert!(blocks(&sets, id));
+            assert!(blocks(&memberships, id).await);
         }
-        assert!(!blocks(&sets, 4));
+        assert!(!blocks(&memberships, 4).await);
     }
 
     #[tokio::test]
     async fn block_lists_suppress_stale_across_in_memory_gens() {
-        use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration};
-        use crate::dataset::mem_wal::write::IndexStore;
-        use uuid::Uuid;
-
         let shard = Uuid::new_v4();
-        let mk = |batches: &[&[i32]], generation: u64| {
-            let store = BatchStore::with_capacity(8);
-            for ids in batches {
-                store.append(id_batch(ids)).unwrap();
-            }
-            LsmDataSource::ActiveMemTable {
-                batch_store: Arc::new(store),
-                index_store: Arc::new(IndexStore::new()),
-                schema: id_batch(&[1]).schema(),
-                shard_id: shard,
-                generation: LsmGeneration::memtable(generation),
-            }
-        };
-
-        // Frozen gen 1: stale pk=1.
-        // Active gen 2: pk=1 re-written, pk=2 new.
-        let sources = vec![mk(&[&[1]], 1), mk(&[&[1], &[2]], 2)];
+        // Frozen gen 1: stale pk=1. Active gen 2: pk=1 re-written, pk=2 new.
+        let sources = vec![
+            active_source(shard, 1, &[1]),
+            active_source(shard, 2, &[1, 2]),
+        ];
 
-        let blocked = Box::pin(compute_source_block_lists(
-            &sources,
-            &["id".to_string()],
-            None,
-            None,
-        ))
-        .await
-        .unwrap();
+        let blocked = Box::pin(compute_source_block_lists(&sources, None, None))
+            .await
+            .unwrap();
 
         let g1 = LsmGeneration::memtable(1);
         let g2 = LsmGeneration::memtable(2);
         // The newer active write supersedes the frozen copy: gen 1 is blocked on
         // pk=1, so its KNN drops pk=1.
-        assert!(blocks(&blocked[&(Some(shard), g1)], 1));
+        assert!(blocks(&blocked[&(Some(shard), g1)], 1).await);
         // The active (newest) generation is superseded by nothing — no entry.
         assert!(!blocked.contains_key(&(Some(shard), g2)));
     }
 
     #[tokio::test]
     async fn block_lists_suppress_stale_base_row() {
-        use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration};
-        use crate::dataset::mem_wal::write::IndexStore;
         use crate::dataset::{Dataset, WriteParams};
         use arrow_array::RecordBatchIterator;
-        use uuid::Uuid;
 
         // Base (gen 0): pk=1 (stale), pk=3 (live).
         let base_batch = id_batch(&[1, 3]);
@@ -372,89 +586,239 @@ mod tests {
         );
 
         // Active gen 1: pk=1 re-written, pk=2 new.
-        let store = BatchStore::with_capacity(8);
-        store.append(id_batch(&[1])).unwrap();
-        store.append(id_batch(&[2])).unwrap();
-
         let sources = vec![
             LsmDataSource::BaseTable { dataset: base },
-            LsmDataSource::ActiveMemTable {
-                batch_store: Arc::new(store),
-                index_store: Arc::new(IndexStore::new()),
-                schema,
-                shard_id: Uuid::new_v4(),
-                generation: LsmGeneration::memtable(1),
-            },
+            active_source(Uuid::new_v4(), 1, &[1, 2]),
         ];
 
-        let blocked = Box::pin(compute_source_block_lists(
-            &sources,
-            &["id".to_string()],
-            None,
-            None,
-        ))
-        .await
-        .unwrap();
+        let blocked = Box::pin(compute_source_block_lists(&sources, None, None))
+            .await
+            .unwrap();
 
         // Base is blocked by every newer gen: pk=1 (re-written in gen 1) is
-        // blocked, pk=3 (base-only) is not. End-to-end drop: vector_search specs.
+        // blocked, pk=3 (base-only) is not.
         let base_blocked = blocked
             .get(&(None, LsmGeneration::BASE_TABLE))
             .expect("base has a blocked set");
-        assert!(blocks(base_blocked, 1));
-        assert!(!blocks(base_blocked, 3));
+        assert!(blocks(base_blocked, 1).await);
+        assert!(!blocks(base_blocked, 3).await);
     }
 
     #[tokio::test]
     async fn block_lists_are_keyed_per_shard() {
         // Regression: generations are per-shard, so a source must only be blocked
-        // by newer generations of its OWN shard. A generation-only key would
-        // cross-block same-generation sources from different shards.
-        use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration};
-        use crate::dataset::mem_wal::write::IndexStore;
-        use uuid::Uuid;
-
-        let mk = |shard: Uuid, ids: &[i32], generation: u64| {
-            let store = BatchStore::with_capacity(8);
-            store.append(id_batch(ids)).unwrap();
-            LsmDataSource::ActiveMemTable {
-                batch_store: Arc::new(store),
-                index_store: Arc::new(IndexStore::new()),
-                schema: id_batch(&[1]).schema(),
-                shard_id: shard,
-                generation: LsmGeneration::memtable(generation),
-            }
-        };
-
-        // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write).
-        // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions).
+        // by newer generations of its OWN shard.
         let a = Uuid::new_v4();
         let b = Uuid::new_v4();
+        // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write).
+        // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions).
         let sources = vec![
-            mk(a, &[1], 1),
-            mk(a, &[1], 2),
-            mk(b, &[2], 1),
-            mk(b, &[2], 2),
+            active_source(a, 1, &[1]),
+            active_source(a, 2, &[1]),
+            active_source(b, 1, &[2]),
+            active_source(b, 2, &[2]),
         ];
 
-        let blocked = Box::pin(compute_source_block_lists(
-            &sources,
-            &["id".to_string()],
-            None,
-            None,
-        ))
-        .await
-        .unwrap();
+        let blocked = Box::pin(compute_source_block_lists(&sources, None, None))
+            .await
+            .unwrap();
 
         let g1 = LsmGeneration::memtable(1);
         let g2 = LsmGeneration::memtable(2);
         // Each shard's gen 1 is blocked by its OWN gen 2 only.
-        assert!(blocks(&blocked[&(Some(a), g1)], 1));
-        assert!(!blocks(&blocked[&(Some(a), g1)], 2));
-        assert!(blocks(&blocked[&(Some(b), g1)], 2));
-        assert!(!blocks(&blocked[&(Some(b), g1)], 1));
+        assert!(blocks(&blocked[&(Some(a), g1)], 1).await);
+        assert!(!blocks(&blocked[&(Some(a), g1)], 2).await);
+        assert!(blocks(&blocked[&(Some(b), g1)], 2).await);
+        assert!(!blocks(&blocked[&(Some(b), g1)], 1).await);
         // The newest generation of each shard is superseded by nothing.
         assert!(!blocked.contains_key(&(Some(a), g2)));
         assert!(!blocked.contains_key(&(Some(b), g2)));
     }
+
+    #[tokio::test]
+    async fn index_membership_is_snapshot_bounded() {
+        // The index-sourced membership only counts a PK whose version is visible
+        // at the source's watermark, so a newer generation's not-yet-visible
+        // write can't shadow an older generation's visible copy.
+        let shard = Uuid::new_v4();
+        let schema = id_batch(&[1]).schema();
+
+        // Older frozen gen 1: pk=1.
+        let g1 = active_source(shard, 1, &[1]);
+
+        // Newer active gen 2: pk=99 visible at position 0, then pk=1 written at
+        // position 1 but with the watermark left at batch 0 (so pk=1 is in the
+        // index yet not visible) — the concurrent-write race.
+        let g2_store = BatchStore::with_capacity(8);
+        let mut g2_index = IndexStore::new();
+        g2_index.enable_pk_index(&[("id".to_string(), 0)]);
+        let b0 = id_batch(&[99]);
+        let (bp0, off0, _) = g2_store.append(b0.clone()).unwrap();
+        g2_index
+            .insert_with_batch_position(&b0, off0, Some(bp0)) // advances watermark to 0
+            .unwrap();
+        let b1 = id_batch(&[1]);
+        let (_, off1, _) = g2_store.append(b1.clone()).unwrap();
+        g2_index
+            .insert_with_batch_position(&b1, off1, None) // index updated, watermark unchanged
+            .unwrap();
+        let g2 = LsmDataSource::ActiveMemTable {
+            batch_store: Arc::new(g2_store),
+            index_store: Arc::new(g2_index),
+            schema,
+            shard_id: shard,
+            generation: LsmGeneration::memtable(2),
+        };
+
+        let blocked = Box::pin(compute_source_block_lists(&[g1, g2], None, None))
+            .await
+            .unwrap();
+
+        let g1_block = &blocked[&(Some(shard), LsmGeneration::memtable(1))];
+        // pk=99 is visible in gen 2 → it is a member of gen 1's block list.
+        assert!(blocks(g1_block, 99).await);
+        // pk=1's only gen-2 copy is not yet visible → it must NOT shadow gen 1.
+        assert!(
+            !blocks(g1_block, 1).await,
+            "a not-yet-visible newer write must not shadow an older visible copy"
+        );
+    }
+
+    /// A fresh-tier watermark bounds the active generation to the first
+    /// `active_batch_count` batches — those the arm observed before the memtable
+    /// grew. A later append is invisible, so a base row is never dropped without
+    /// the arm having delivered its replacement.
+    #[tokio::test]
+    async fn fresh_tier_watermark_bounds_active_memtable_by_batch_count() {
+        use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark;
+        use std::collections::HashMap;
+
+        let shard = Uuid::new_v4();
+        // Three single-row batches: pk=1 at batch 0, pk=2 at batch 1, pk=3 at
+        // batch 2 (appended after the arm).
+        let sources = vec![active_source(shard, 1, &[1, 2, 3])];
+
+        // Watermark at 2 batches of gen 1: pk=1,2 are members; pk=3 (batch 2) is not.
+        let watermarks: HashMap<Uuid, FreshTierWatermark> = [(
+            shard,
+            FreshTierWatermark {
+                active_generation: 1,
+                active_batch_count: 2,
+            },
+        )]
+        .into_iter()
+        .collect();
+        let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks))
+            .await
+            .unwrap();
+        assert!(blocks(&sets, 1).await);
+        assert!(blocks(&sets, 2).await);
+        assert!(!blocks(&sets, 3).await);
+
+        // No watermark → live tier: all three are members.
+        let sets = fresh_tier_block_list(&sources, None, None, None)
+            .await
+            .unwrap();
+        for id in [1, 2, 3] {
+            assert!(blocks(&sets, id).await);
+        }
+    }
+
+    /// A generation above the active one rolled in after the snapshot and is
+    /// excluded whole; a lower one is immutable (frozen) and included whole
+    /// regardless of the active batch count.
+    #[tokio::test]
+    async fn fresh_tier_watermark_excludes_newer_gen_includes_lower_gen() {
+        use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark;
+        use std::collections::HashMap;
+
+        let shard = Uuid::new_v4();
+        // gen 3 newer (after snapshot), gen 2 == active (bounded to 1 batch),
+        // gen 1 lower/immutable (whole). Each id is its own batch.
+        let sources = vec![
+            active_source(shard, 3, &[100]),
+            active_source(shard, 2, &[20, 21]),
+            active_source(shard, 1, &[1, 2]),
+        ];
+
+        let watermarks: HashMap<Uuid, FreshTierWatermark> = [(
+            shard,
+            FreshTierWatermark {
+                active_generation: 2,
+                active_batch_count: 1,
+            },
+        )]
+        .into_iter()
+        .collect();
+        let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks))
+            .await
+            .unwrap();
+        assert!(blocks(&sets, 1).await); // gen 1, whole
+        assert!(blocks(&sets, 2).await); // gen 1, whole
+        assert!(blocks(&sets, 20).await); // gen 2, batch 0
+        assert!(!blocks(&sets, 21).await); // gen 2, batch 1 — past the watermark
+        assert!(!blocks(&sets, 100).await); // gen 3 — after the snapshot
+    }
+
+    /// A flushed generation at or above the active generation was produced by a
+    /// flush after the snapshot and is excluded; one strictly below it is
+    /// immutable and included.
+    #[tokio::test]
+    async fn fresh_tier_watermark_excludes_flushed_at_or_above_active() {
+        use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark;
+        use crate::dataset::{Dataset, WriteParams};
+        use arrow_array::RecordBatchIterator;
+        use std::collections::HashMap;
+
+        // A flushed generation 2 holding pk=5, staged as a flushed dataset with
+        // its standalone PK sidecar (what the on-disk membership probes).
+        let flushed_batch = id_batch(&[5]);
+        let schema = flushed_batch.schema();
+        let tmp = tempfile::tempdir().unwrap();
+        let path = format!("{}/gen2", tmp.path().to_str().unwrap());
+        let reader = RecordBatchIterator::new(vec![Ok(flushed_batch.clone())], schema.clone());
+        Dataset::write(reader, &path, Some(WriteParams::default()))
+            .await
+            .unwrap();
+        write_pk_sidecar(&path, &[flushed_batch], &["id"])
+            .await
+            .unwrap();
+
+        let shard = Uuid::new_v4();
+        let sources = vec![LsmDataSource::FlushedMemTable {
+            path,
+            shard_id: shard,
+            generation: LsmGeneration::memtable(2),
+        }];
+
+        // active_generation 2 (gen 2 flushed at/after the snapshot): excluded.
+        let at: HashMap<Uuid, FreshTierWatermark> = [(
+            shard,
+            FreshTierWatermark {
+                active_generation: 2,
+                active_batch_count: u64::MAX,
+            },
+        )]
+        .into_iter()
+        .collect();
+        let sets = fresh_tier_block_list(&sources, None, None, Some(&at))
+            .await
+            .unwrap();
+        assert!(!blocks(&sets, 5).await);
+
+        // active_generation 3 (gen 2 strictly below, immutable): included.
+        let above: HashMap<Uuid, FreshTierWatermark> = [(
+            shard,
+            FreshTierWatermark {
+                active_generation: 3,
+                active_batch_count: u64::MAX,
+            },
+        )]
+        .into_iter()
+        .collect();
+        let sets = fresh_tier_block_list(&sources, None, None, Some(&above))
+            .await
+            .unwrap();
+        assert!(blocks(&sets, 5).await);
+    }
 }
diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs
index ade4164d485..a006257493b 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs
@@ -20,8 +20,8 @@ use lance_core::{Error, Result, is_system_column};
 use uuid::Uuid;
 
 use super::collector::{InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector};
-use super::data_source::ShardSnapshot;
-use super::flushed_cache::FlushedMemTableCache;
+use super::data_source::{FreshTierWatermark, ShardSnapshot};
+use super::flushed_cache::{DatasetCache, GenerationWarmer};
 use super::planner::LsmScanPlanner;
 use super::point_lookup::LsmPointLookupPlanner;
 use crate::dataset::Dataset;
@@ -124,7 +124,12 @@ pub struct LsmScanner {
     session: Option<Arc<Session>>,
     /// Cache of opened flushed-generation datasets. When set, repeated
     /// queries against the same generation skip the manifest read entirely.
-    flushed_cache: Option<Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<Arc<dyn DatasetCache>>,
+    /// Optional warmer fired on first open of a flushed generation.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
+    /// Over-fetch multiple for block-listed sources in search plans
+    /// (see [`super::LsmFtsSearchPlanner::with_overfetch_factor`]).
+    overfetch_factor: Option<f64>,
 }
 
 impl LsmScanner {
@@ -160,6 +165,8 @@ impl LsmScanner {
             pk_columns,
             session,
             flushed_cache: None,
+            warmer: None,
+            overfetch_factor: None,
         }
     }
 
@@ -198,6 +205,8 @@ impl LsmScanner {
             pk_columns,
             session: None,
             flushed_cache: None,
+            warmer: None,
+            overfetch_factor: None,
         }
     }
 
@@ -246,13 +255,29 @@ impl LsmScanner {
     ///
     /// With a cache, repeated queries against the same generation become a
     /// pure `Arc::clone` with no manifest read or object-store I/O. The cache
-    /// is owned and sized by the caller (see [`FlushedMemTableCache`]); not
-    /// set by default, so behavior is unchanged unless opted in.
-    pub fn with_flushed_cache(mut self, cache: Arc<FlushedMemTableCache>) -> Self {
+    /// is owned and sized by the caller (any [`DatasetCache`] impl, e.g.
+    /// [`FlushedMemTableCache`](super::FlushedMemTableCache)); not set by
+    /// default, so behavior is unchanged unless opted in.
+    pub fn with_flushed_cache(mut self, cache: Arc<dyn DatasetCache>) -> Self {
         self.flushed_cache = Some(cache);
         self
     }
 
+    /// Inject the warmer fired on first open of a flushed generation. Not set by
+    /// default, so behavior is unchanged unless opted in.
+    pub fn with_warmer(mut self, warmer: Arc<dyn GenerationWarmer>) -> Self {
+        self.warmer = Some(warmer);
+        self
+    }
+
+    /// Set the over-fetch multiple block-listed sources use in search plans
+    /// so they still yield `k` live rows after cross-generation dedup.
+    /// Threaded into [`super::LsmFtsSearchPlanner`]; clamped to `>= 1.0`.
+    pub fn with_overfetch_factor(mut self, factor: f64) -> Self {
+        self.overfetch_factor = Some(factor);
+        self
+    }
+
     /// Project specific columns.
     ///
     /// If not called, all columns from the base schema are included.
@@ -354,6 +379,9 @@ impl LsmScanner {
             if let Some(cache) = &self.flushed_cache {
                 planner = planner.with_flushed_cache(cache.clone());
             }
+            if let Some(warmer) = &self.warmer {
+                planner = planner.with_warmer(warmer.clone());
+            }
             let plan = planner
                 .plan_point_lookup(&keys, self.projection.as_deref())
                 .await?;
@@ -370,6 +398,12 @@ impl LsmScanner {
         if let Some(cache) = &self.flushed_cache {
             planner = planner.with_flushed_cache(cache.clone());
         }
+        if let Some(warmer) = &self.warmer {
+            planner = planner.with_warmer(warmer.clone());
+        }
+        if let Some(factor) = self.overfetch_factor {
+            planner = planner.with_overfetch_factor(factor);
+        }
 
         planner
             .plan_scan(
@@ -405,6 +439,12 @@ impl LsmScanner {
         if let Some(cache) = &self.flushed_cache {
             planner = planner.with_flushed_cache(cache.clone());
         }
+        if let Some(warmer) = &self.warmer {
+            planner = planner.with_warmer(warmer.clone());
+        }
+        if let Some(factor) = self.overfetch_factor {
+            planner = planner.with_overfetch_factor(factor);
+        }
         planner
             .plan_search(column, query, k, self.projection.as_deref())
             .await
@@ -454,24 +494,65 @@ impl LsmScanner {
     /// the primary-key columns; the returned `Vec<bool>` is aligned with its
     /// rows. Hashing matches the scanner's internal dedup, so the caller never
     /// hashes PKs itself. Flushed membership comes from the injected
-    /// [`FlushedMemTableCache`] when one is set.
+    /// [`DatasetCache`] when one is set.
     pub async fn contains_pks(&self, pks: &RecordBatch) -> Result<Vec<bool>> {
+        self.contains_pks_at(pks, None).await
+    }
+
+    /// As-of variant of [`Self::contains_pks`]. Membership is evaluated against
+    /// a per-shard watermark on the fresh tier, supplied via `watermarks` (see
+    /// [`FreshTierWatermark`]), matching the tier a prior scan observed and
+    /// avoiding the two-snapshot skew that would drop a base row with no
+    /// delivered replacement. `None` evaluates against the live tier.
+    pub async fn contains_pks_at(
+        &self,
+        pks: &RecordBatch,
+        watermarks: Option<&HashMap<Uuid, FreshTierWatermark>>,
+    ) -> Result<Vec<bool>> {
         let sources = self.build_collector().collect()?;
-        let sets = super::block_list::fresh_tier_block_list(
+        let memberships = super::block_list::fresh_tier_block_list(
             &sources,
-            &self.pk_columns,
             self.session.as_ref(),
             self.flushed_cache.as_ref(),
+            watermarks,
         )
         .await?;
         let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns)
             .map_err(|e| Error::invalid_input(e.to_string()))?;
-        Ok((0..pks.num_rows())
+        // One key per row, in the index key space (typed value, or encoded
+        // `Binary` tuple for a composite PK).
+        let keys: Vec<ScalarValue> = (0..pks.num_rows())
             .map(|row| {
-                let hash = super::exec::compute_pk_hash(pks, &pk_indices, row);
-                sets.iter().any(|set| set.contains(&hash))
+                let values: Vec<ScalarValue> = pk_indices
+                    .iter()
+                    .map(|&col| ScalarValue::try_from_array(pks.column(col), row))
+                    .collect::<std::result::Result<_, _>>()
+                    .map_err(|e| Error::invalid_input(e.to_string()))?;
+                super::block_list::on_disk_pk_key(&values)
             })
-            .collect())
+            .collect::<Result<_>>()?;
+
+        // A row is contained if any generation contains its key. Probe each
+        // generation once (batched), narrowing to still-unfound rows.
+        let mut contained = vec![false; keys.len()];
+        let mut live: Vec<usize> = (0..keys.len()).collect();
+        for membership in &memberships {
+            if live.is_empty() {
+                break;
+            }
+            let live_keys: Vec<ScalarValue> = live.iter().map(|&i| keys[i].clone()).collect();
+            let mask = membership.contains_keys(&live_keys).await?;
+            let mut next_live = Vec::with_capacity(live.len());
+            for (pos, &row) in live.iter().enumerate() {
+                if mask[pos] {
+                    contained[row] = true;
+                } else {
+                    next_live.push(row);
+                }
+            }
+            live = next_live;
+        }
+        Ok(contained)
     }
 
     /// Build the data source collector.
@@ -572,35 +653,42 @@ mod tests {
         assert_eq!(memtable_ref.generation, 10);
     }
 
-    #[tokio::test]
-    async fn contains_pks_reports_fresh_tier_membership() {
-        use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
-        use arrow_array::Int32Array;
+    /// Single-column `id: Int32` schema used by the PK-membership tests.
+    fn pk_schema() -> SchemaRef {
         use arrow_schema::{DataType, Field, Schema};
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]))
+    }
 
-        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
-        let id_batch = |ids: &[i32]| {
-            RecordBatch::try_new(
-                schema.clone(),
-                vec![Arc::new(Int32Array::from(ids.to_vec()))],
-            )
-            .unwrap()
-        };
-        let mk = |ids: &[i32], generation: u64| {
-            let store = BatchStore::with_capacity(8);
-            store.append(id_batch(ids)).unwrap();
-            InMemoryMemTableRef {
-                batch_store: Arc::new(store),
-                index_store: Arc::new(IndexStore::new()),
-                schema: schema.clone(),
-                generation,
-            }
-        };
+    /// A `RecordBatch` of `id` values against [`pk_schema`].
+    fn id_pk_batch(ids: &[i32]) -> RecordBatch {
+        use arrow_array::Int32Array;
+        RecordBatch::try_new(pk_schema(), vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap()
+    }
+
+    /// An active/frozen memtable holding `ids` at `generation`, with a single
+    /// batch and a maintained primary-key index on `id`.
+    fn mk_pk_memtable(ids: &[i32], generation: u64) -> InMemoryMemTableRef {
+        use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+        let store = BatchStore::with_capacity(8);
+        let mut index = IndexStore::new();
+        index.enable_pk_index(&[("id".to_string(), 0)]);
+        let b = id_pk_batch(ids);
+        let (bp, off, _) = store.append(b.clone()).unwrap();
+        index.insert_with_batch_position(&b, off, Some(bp)).unwrap();
+        InMemoryMemTableRef {
+            batch_store: Arc::new(store),
+            index_store: Arc::new(index),
+            schema: pk_schema(),
+            generation,
+        }
+    }
 
+    #[tokio::test]
+    async fn contains_pks_reports_fresh_tier_membership() {
         // Fresh-tier only: active gen 2 (pk=1,2) + frozen gen 1 (pk=3).
         let shard = Uuid::new_v4();
         let scanner = LsmScanner::without_base_table(
-            schema.clone(),
+            pk_schema(),
             "memory://t",
             vec![],
             vec!["id".to_string()],
@@ -608,16 +696,68 @@ mod tests {
         .with_in_memory_memtables(
             shard,
             InMemoryMemTables {
-                active: mk(&[1, 2], 2),
-                frozen: vec![mk(&[3], 1)],
+                active: mk_pk_memtable(&[1, 2], 2),
+                frozen: vec![mk_pk_memtable(&[3], 1)],
             },
         );
 
         // pk=1 (active), pk=4 (absent), pk=3 (frozen).
-        let result = scanner.contains_pks(&id_batch(&[1, 4, 3])).await.unwrap();
+        let result = scanner
+            .contains_pks(&id_pk_batch(&[1, 4, 3]))
+            .await
+            .unwrap();
         assert_eq!(result, vec![true, false, true]);
     }
 
+    /// `contains_pks_at` probes each generation once over the still-unfound
+    /// rows, so a multi-PK batch spanning several generations resolves to the
+    /// right per-row mask — and a watermark bounds which generations count.
+    #[tokio::test]
+    async fn contains_pks_at_batched_probe_respects_watermark() {
+        use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark;
+
+        // active gen 2 (pk=1,2) + frozen gen 1 (pk=3,4).
+        let shard = Uuid::new_v4();
+        let scanner = LsmScanner::without_base_table(
+            pk_schema(),
+            "memory://t",
+            vec![],
+            vec!["id".to_string()],
+        )
+        .with_in_memory_memtables(
+            shard,
+            InMemoryMemTables {
+                active: mk_pk_memtable(&[1, 2], 2),
+                frozen: vec![mk_pk_memtable(&[3, 4], 1)],
+            },
+        );
+
+        // Duplicate and out-of-order keys exercise the live-row narrowing: each
+        // generation only re-probes the rows earlier generations didn't claim.
+        let probe = id_pk_batch(&[4, 1, 9, 3, 2, 1]);
+
+        // watermark=None → live tier: every PK present in either generation.
+        let live = scanner.contains_pks_at(&probe, None).await.unwrap();
+        assert_eq!(live, vec![true, true, false, true, true, true]);
+
+        // watermark at gen 1 → active gen 2 rolled in after the snapshot and is
+        // excluded; only the frozen gen 1 keys (3,4) remain members.
+        let watermarks: HashMap<Uuid, FreshTierWatermark> = [(
+            shard,
+            FreshTierWatermark {
+                active_generation: 1,
+                active_batch_count: u64::MAX,
+            },
+        )]
+        .into_iter()
+        .collect();
+        let bounded = scanner
+            .contains_pks_at(&probe, Some(&watermarks))
+            .await
+            .unwrap();
+        assert_eq!(bounded, vec![true, false, false, true, false, false]);
+    }
+
     /// One active memtable with a maintained BTree on `id`, all rows visible.
     fn mk_indexed_memtable(schema: &SchemaRef, ids: &[i32], names: &[&str]) -> InMemoryMemTableRef {
         use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
diff --git a/rust/lance/src/dataset/mem_wal/scanner/collector.rs b/rust/lance/src/dataset/mem_wal/scanner/collector.rs
index 2db4b4f277d..6645f159b12 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/collector.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/collector.rs
@@ -229,6 +229,19 @@ impl LsmDataSourceCollector {
             .collect()
     }
 
+    /// True when `generation` for `shard_id` is still pinned in memory as a
+    /// frozen memtable (only the `frozen` list is consulted). During the
+    /// post-flush grace window a generation is both committed to the manifest
+    /// (a flushed source) and held in memory (an in-memory source). Callers skip
+    /// its on-disk copy so it is not scanned twice; serving it from memory keeps
+    /// the per-batch boundaries the flushed dataset has lost, so as-of reads
+    /// stay snapshot-bounded. See `ShardWriterConfig::frozen_memtable_grace`.
+    fn flushed_gen_pinned_in_memory(&self, shard_id: &Uuid, generation: u64) -> bool {
+        self.in_memory_memtables
+            .get(shard_id)
+            .is_some_and(|mems| mems.frozen.iter().any(|f| f.generation == generation))
+    }
+
     /// Collect all data sources.
     ///
     /// Returns sources in a consistent order:
@@ -246,6 +259,9 @@ impl LsmDataSourceCollector {
 
         for snapshot in &self.shard_snapshots {
             for flushed in &snapshot.flushed_generations {
+                if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) {
+                    continue;
+                }
                 let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path);
                 sources.push(LsmDataSource::FlushedMemTable {
                     path,
@@ -284,6 +300,9 @@ impl LsmDataSourceCollector {
             }
 
             for flushed in &snapshot.flushed_generations {
+                if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) {
+                    continue;
+                }
                 let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path);
                 sources.push(LsmDataSource::FlushedMemTable {
                     path,
@@ -443,4 +462,53 @@ mod tests {
             3
         );
     }
+
+    /// During the post-flush grace window a generation is both committed to the
+    /// manifest (a flushed source) and still pinned in memory (a frozen
+    /// source). The collector must emit it once, from memory — so as-of reads
+    /// keep batch-resolved membership — and skip the on-disk copy. Flushed
+    /// generations NOT pinned in memory are still emitted from disk.
+    #[test]
+    fn test_collect_suppresses_flushed_gen_pinned_in_memory() {
+        let shard = Uuid::new_v4();
+        // Manifest lists gens 1 and 2 as flushed; gen 2 is still pinned in
+        // memory (just flushed, within grace), gen 1 has been swept.
+        let snapshot = ShardSnapshot {
+            shard_id: shard,
+            spec_id: 0,
+            current_generation: 3,
+            flushed_generations: vec![
+                FlushedGeneration {
+                    generation: 1,
+                    path: "gen_1".to_string(),
+                },
+                FlushedGeneration {
+                    generation: 2,
+                    path: "gen_2".to_string(),
+                },
+            ],
+        };
+        let mems = InMemoryMemTables {
+            active: memtable_ref(3),
+            frozen: vec![memtable_ref(2)],
+        };
+        let collector = LsmDataSourceCollector::without_base_table("/tmp/x", vec![snapshot])
+            .with_in_memory_memtables(shard, mems);
+
+        let sources = collector.collect().unwrap();
+        // gen 1: on-disk (not pinned). gen 2: in-memory only (pinned, disk
+        // copy suppressed). gen 3: active. No duplicate gen 2.
+        let flushed: Vec<u64> = sources
+            .iter()
+            .filter(|s| !s.is_active_memtable())
+            .map(|s| s.generation().as_u64())
+            .collect();
+        let in_memory: Vec<u64> = sources
+            .iter()
+            .filter(|s| s.is_active_memtable())
+            .map(|s| s.generation().as_u64())
+            .collect();
+        assert_eq!(flushed, vec![1], "only the unpinned flushed gen from disk");
+        assert_eq!(in_memory, vec![2, 3], "pinned gen 2 served from memory");
+    }
 }
diff --git a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs
index 1a6207f27e3..0d5f3fdc925 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs
@@ -11,6 +11,29 @@ use uuid::Uuid;
 use crate::dataset::Dataset;
 use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
 
+/// A watermark marking how far into one shard's fresh tier a prior scan
+/// observed, so membership can be evaluated as of that point (see
+/// [`super::builder::LsmScanner::contains_pks_at`]).
+///
+/// Only the active memtable grows between two reads (appended batches, and a new
+/// generation when it rolls); everything at a lower generation — frozen and
+/// flushed — is immutable and was fully observed. The watermark includes lower
+/// generations whole, the active generation up to `active_batch_count` batches,
+/// and excludes higher generations (which appeared after it). It uses only the
+/// batch count and generation — both always available, unlike per-batch WAL
+/// positions, which the write path does not track. The bound only excludes rows
+/// the scan did not observe, so a stale watermark under-counts (a tolerable
+/// stale read) rather than dropping a row with no replacement.
+#[derive(Debug, Clone, Copy)]
+pub struct FreshTierWatermark {
+    /// Active generation the scan observed. Higher generations are excluded;
+    /// lower ones are immutable and included whole.
+    pub active_generation: u64,
+    /// Active-memtable batch count at snapshot time. Within the active
+    /// generation, only batches at index `< active_batch_count` were observed.
+    pub active_batch_count: u64,
+}
+
 /// Generation number in LSM tree.
 ///
 /// The base table has generation 0. MemTables have positive integers
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec.rs b/rust/lance/src/dataset/mem_wal/scanner/exec.rs
index 88fd617dc0a..115cffccc81 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/exec.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/exec.rs
@@ -9,22 +9,22 @@
 //! - [`MemtableGenTagExec`]: Wraps a scan to add `_memtable_gen` column
 //! - [`BloomFilterGuardExec`]: Guards child execution with bloom filter check
 //! - [`CoalesceFirstExec`]: Returns first non-empty result with short-circuit
-//! - [`WithinSourceDedupExec`]: Deduplicates rows with the same PK from a single source
-//! - [`PkHashFilterExec`]: Drops rows whose PK hash was superseded by a newer generation (the cross-generation block-list)
+//! - [`PkBlockFilterExec`]: Drops rows whose PK was superseded by a newer generation (the cross-generation block-list)
+//! - [`NewestPkFilterExec`]: Drops active-memtable hits that aren't the newest visible version of their PK (the within-source recency filter)
 
 mod bloom_guard;
 mod coalesce_first;
 mod generation_tag;
+mod newest_pk_filter;
 mod pk;
-mod pk_hash_filter;
-mod within_source_dedup;
+mod pk_block_filter;
 
 pub use bloom_guard::{BloomFilterGuardExec, compute_pk_hash_from_scalars};
 pub use coalesce_first::CoalesceFirstExec;
 pub use generation_tag::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec};
+pub use newest_pk_filter::NewestPkFilterExec;
 pub use pk::{
     ROW_ADDRESS_COLUMN, compute_pk_hash, is_supported_pk_type, resolve_pk_indices,
     validate_pk_types,
 };
-pub use pk_hash_filter::PkHashFilterExec;
-pub use within_source_dedup::{DedupDirection, WithinSourceDedupExec};
+pub use pk_block_filter::PkBlockFilterExec;
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs
new file mode 100644
index 00000000000..e1495cb0bb1
--- /dev/null
+++ b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Drop predicate-crossing stale rows from an active-memtable index search.
+//!
+//! The active memtable's HNSW / inverted index are append-only, so an updated
+//! row's old entries stay live. When an update moves a row out of the query's
+//! match set, the fresh version isn't in the index result, so a result-set
+//! dedup (keep-newest among the returned rows) has nothing to suppress the
+//! stale version against — and it leaks.
+//!
+//! This node closes that hole with a predicate-independent recency check: for
+//! each hit it asks the memtable's maintained primary-key index
+//! ([`IndexStore::pk_is_newest`]) whether the hit's own row position is the
+//! newest version of its primary key visible at the query's `max_visible`
+//! watermark, and keeps the hit **iff so**. A stale hit (some
+//! newer version exists) is dropped even when that newer version never appears
+//! in the result. This is exactly the seek that point-lookup already does; the
+//! index search arms simply didn't do it.
+
+use std::any::Any;
+use std::fmt;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use arrow::compute::filter_record_batch;
+use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array};
+use arrow_schema::SchemaRef;
+use datafusion::common::ScalarValue;
+use datafusion::error::{DataFusionError, Result as DFResult};
+use datafusion::execution::TaskContext;
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    SendableRecordBatchStream,
+};
+use futures::{Stream, StreamExt};
+
+use super::pk::resolve_pk_indices;
+use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+
+/// Keeps only the index hits that are the newest visible version of their PK.
+///
+/// The input must expose all `pk_columns` and the `row_id_column` (`UInt64`,
+/// the BatchStore row position). The output schema is unchanged.
+pub struct NewestPkFilterExec {
+    input: Arc<dyn ExecutionPlan>,
+    pk_columns: Vec<String>,
+    row_id_column: String,
+    /// Holds the maintained primary-key index, queried per hit via
+    /// [`IndexStore::pk_is_newest`].
+    index_store: Arc<IndexStore>,
+    /// Resolves the `max_visible` row watermark from the visible batch prefix.
+    batch_store: Arc<BatchStore>,
+    /// The MVCC batch-position snapshot the index search latched. Captured once
+    /// at plan time and shared with the search so the recency check keys on the
+    /// same snapshot the hits came from.
+    max_visible_batch_position: usize,
+    properties: Arc<PlanProperties>,
+}
+
+impl fmt::Debug for NewestPkFilterExec {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // `BatchStore` / `IndexStore` aren't `Debug`; show only the knobs.
+        f.debug_struct("NewestPkFilterExec")
+            .field("pk_columns", &self.pk_columns)
+            .field("row_id_column", &self.row_id_column)
+            .field(
+                "max_visible_batch_position",
+                &self.max_visible_batch_position,
+            )
+            .finish()
+    }
+}
+
+impl NewestPkFilterExec {
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        pk_columns: Vec<String>,
+        row_id_column: impl Into<String>,
+        index_store: Arc<IndexStore>,
+        batch_store: Arc<BatchStore>,
+        max_visible_batch_position: usize,
+    ) -> Self {
+        // A filter preserves the input schema and partitioning.
+        let properties = Arc::new(PlanProperties::new(
+            EquivalenceProperties::new(input.schema()),
+            input.output_partitioning().clone(),
+            input.pipeline_behavior(),
+            input.boundedness(),
+        ));
+        Self {
+            input,
+            pk_columns,
+            row_id_column: row_id_column.into(),
+            index_store,
+            batch_store,
+            max_visible_batch_position,
+            properties,
+        }
+    }
+
+    /// The inclusive max visible row position for this snapshot, or `None` when
+    /// no rows are visible.
+    fn max_visible_row(&self) -> Option<u64> {
+        self.batch_store
+            .max_visible_row(self.max_visible_batch_position)
+    }
+}
+
+impl DisplayAs for NewestPkFilterExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default
+            | DisplayFormatType::Verbose
+            | DisplayFormatType::TreeRender => {
+                write!(
+                    f,
+                    "NewestPkFilterExec: pk=[{}], row_id={}, max_visible_batch={}",
+                    self.pk_columns.join(", "),
+                    self.row_id_column,
+                    self.max_visible_batch_position,
+                )
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for NewestPkFilterExec {
+    fn name(&self) -> &str {
+        "NewestPkFilterExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.input.schema()
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        if children.len() != 1 {
+            return Err(DataFusionError::Internal(
+                "NewestPkFilterExec requires exactly one child".to_string(),
+            ));
+        }
+        Ok(Arc::new(Self::new(
+            children[0].clone(),
+            self.pk_columns.clone(),
+            self.row_id_column.clone(),
+            self.index_store.clone(),
+            self.batch_store.clone(),
+            self.max_visible_batch_position,
+        )))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let input_stream = self.input.execute(partition, context)?;
+        Ok(Box::pin(NewestPkFilterStream {
+            input: input_stream,
+            pk_columns: self.pk_columns.clone(),
+            row_id_column: self.row_id_column.clone(),
+            index_store: self.index_store.clone(),
+            max_visible_row: self.max_visible_row(),
+            schema: self.schema(),
+        }))
+    }
+}
+
+struct NewestPkFilterStream {
+    input: SendableRecordBatchStream,
+    pk_columns: Vec<String>,
+    row_id_column: String,
+    index_store: Arc<IndexStore>,
+    /// Inclusive watermark snapshot; `None` when no rows are visible.
+    max_visible_row: Option<u64>,
+    schema: SchemaRef,
+}
+
+impl NewestPkFilterStream {
+    fn filter_batch(&self, batch: RecordBatch) -> DFResult<RecordBatch> {
+        // Pass through when there is no PK index, no visible row, or no input row:
+        // nothing to dedup against. NOTE(review): drop hits if nothing is visible?
+        if !self.index_store.has_pk_index() {
+            return Ok(batch);
+        }
+        let Some(max_visible_row) = self.max_visible_row else {
+            return Ok(batch);
+        };
+        if batch.num_rows() == 0 {
+            return Ok(batch);
+        }
+
+        let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?;
+        let row_ids = batch
+            .column_by_name(&self.row_id_column)
+            .ok_or_else(|| {
+                DataFusionError::Internal(format!(
+                    "Row-id column '{}' not found in NewestPkFilterExec input",
+                    self.row_id_column
+                ))
+            })?
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .ok_or_else(|| {
+                DataFusionError::Internal(format!(
+                    "Row-id column '{}' is not UInt64",
+                    self.row_id_column
+                ))
+            })?;
+
+        let mut keep = Vec::with_capacity(batch.num_rows());
+        for row in 0..batch.num_rows() {
+            // A null row position can't be ordered; keep it rather than guess
+            // (callers always project a real position here).
+            if row_ids.is_null(row) {
+                keep.push(true);
+                continue;
+            }
+            let position = row_ids.value(row);
+            let values: Vec<ScalarValue> = pk_indices
+                .iter()
+                .map(|&col| ScalarValue::try_from_array(batch.column(col), row))
+                .collect::<DFResult<_>>()?;
+            // Keep iff this hit is the newest visible version of its PK.
+            keep.push(
+                self.index_store
+                    .pk_is_newest(&values, position, max_visible_row),
+            );
+        }
+        filter_record_batch(&batch, &BooleanArray::from(keep))
+            .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+    }
+}
+
+impl Stream for NewestPkFilterStream {
+    type Item = DFResult<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        match self.input.poll_next_unpin(cx) {
+            Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.filter_batch(batch))),
+            other => other,
+        }
+    }
+}
+
+impl datafusion::physical_plan::RecordBatchStream for NewestPkFilterStream {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_array::Int32Array;
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion::prelude::SessionContext;
+    use datafusion_physical_plan::test::TestMemoryExec;
+    use futures::TryStreamExt;
+
+    /// Single-column `id` PK batch, one per append so a caller can control
+    /// row-level visibility via `max_visible_batch_position`.
+    fn id_batch(id: i32) -> RecordBatch {
+        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![id]))]).unwrap()
+    }
+
+    /// Index-search "hits": `(id, _rowid)` pairs the filter evaluates.
+    fn hits(rows: &[(i32, u64)]) -> RecordBatch {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new(lance_core::ROW_ID, DataType::UInt64, true),
+        ]));
+        let ids: Vec<i32> = rows.iter().map(|(id, _)| *id).collect();
+        let rowids: Vec<u64> = rows.iter().map(|(_, p)| *p).collect();
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int32Array::from(ids)),
+                Arc::new(UInt64Array::from(rowids)),
+            ],
+        )
+        .unwrap()
+    }
+
+    /// Build an active memtable whose PK index + BatchStore hold one row per
+    /// `id` in `appended` (positions 0..n), all committed.
+    fn active(appended: &[i32]) -> (Arc<IndexStore>, Arc<BatchStore>) {
+        let batch_store = Arc::new(BatchStore::with_capacity(16));
+        let mut index = IndexStore::new();
+        index.enable_pk_index(&[("id".to_string(), 0)]);
+        for &id in appended {
+            let b = id_batch(id);
+            let (bp, off, _) = batch_store.append(b.clone()).unwrap();
+            index.insert_with_batch_position(&b, off, Some(bp)).unwrap();
+        }
+        (Arc::new(index), batch_store)
+    }
+
+    async fn run(
+        index_store: Arc<IndexStore>,
+        batch_store: Arc<BatchStore>,
+        max_visible_batch_position: usize,
+        hits_batch: RecordBatch,
+    ) -> Vec<(i32, u64)> {
+        let input =
+            TestMemoryExec::try_new_exec(&[vec![hits_batch.clone()]], hits_batch.schema(), None)
+                .unwrap();
+        let exec = NewestPkFilterExec::new(
+            input,
+            vec!["id".to_string()],
+            lance_core::ROW_ID,
+            index_store,
+            batch_store,
+            max_visible_batch_position,
+        );
+        let ctx = SessionContext::new();
+        let out: Vec<RecordBatch> = exec
+            .execute(0, ctx.task_ctx())
+            .unwrap()
+            .try_collect()
+            .await
+            .unwrap();
+        let mut rows = Vec::new();
+        for b in &out {
+            let ids = b.column(0).as_any().downcast_ref::<Int32Array>().unwrap();
+            let pos = b.column(1).as_any().downcast_ref::<UInt64Array>().unwrap();
+            for i in 0..b.num_rows() {
+                rows.push((ids.value(i), pos.value(i)));
+            }
+        }
+        rows
+    }
+
+    #[tokio::test]
+    async fn keeps_only_the_newest_visible_position_per_pk() {
+        // id=1 written at positions 0 and 2 (an update), id=2 at position 1; all
+        // visible. A stale hit (id=1 @ 0) is dropped; the newest (id=1 @ 2) and
+        // the unrelated id=2 survive — even though all three were "returned" by
+        // the index search.
+        let (index, store) = active(&[1, 2, 1]);
+        let rows = run(index, store, 2, hits(&[(1, 0), (2, 1), (1, 2)])).await;
+        assert_eq!(rows, vec![(2, 1), (1, 2)]);
+    }
+
+    #[tokio::test]
+    async fn does_not_vanish_a_visible_row_under_a_newer_invisible_write() {
+        // The store/index hold id=1 at positions 0 and 2, but the query latched
+        // `max_visible_batch_position = 0` (only position 0 visible) — i.e. the
+        // update at position 2 was committed *after* this query's snapshot. The
+        // visible older row (id=1 @ 0) must be KEPT (its newest *visible* version
+        // is itself), not dropped because of the not-yet-visible position 2.
+        let (index, store) = active(&[1, 2, 1]);
+        let kept = run(index.clone(), store.clone(), 0, hits(&[(1, 0)])).await;
+        assert_eq!(kept, vec![(1, 0)], "visible row must not vanish");
+
+        // And the not-yet-visible position is itself dropped (outside snapshot).
+        let dropped = run(index, store, 0, hits(&[(1, 2)])).await;
+        assert!(
+            dropped.is_empty(),
+            "row beyond the snapshot must be dropped"
+        );
+    }
+
+    #[tokio::test]
+    async fn passes_through_when_no_pk_index() {
+        // A memtable without a primary-key index can't be deduped here, so the
+        // filter is a pass-through rather than dropping everything.
+        let batch_store = Arc::new(BatchStore::with_capacity(16));
+        batch_store.append(id_batch(1)).unwrap();
+        let index = Arc::new(IndexStore::new()); // no enable_pk_index
+        let rows = run(index, batch_store, 0, hits(&[(1, 0), (1, 9)])).await;
+        assert_eq!(rows, vec![(1, 0), (1, 9)]);
+    }
+}
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs
index 523dd30bf82..0707eb5e8dd 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs
@@ -4,7 +4,7 @@
 //! Shared primary-key helpers for the LSM scanner execution nodes.
 //!
 //! Centralizes PK column resolution and per-row hashing so that every
-//! consumer (e.g. [`super::WithinSourceDedupExec`], [`super::PkHashFilterExec`])
+//! consumer (e.g. [`super::PkBlockFilterExec`], [`super::NewestPkFilterExec`])
 //! resolves and hashes a primary key the same way. The row hash is kept
 //! consistent with the variants supported by [`super::compute_pk_hash_from_scalars`]
 //! so a single PK produces the same hash regardless of which exec consumes it.
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs
new file mode 100644
index 00000000000..c5b8f959d26
--- /dev/null
+++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Drop superseded rows from a per-source result by primary-key membership.
+//!
+//! Drops a row when any newer generation's membership ([`GenMembership`])
+//! contains its primary key — in-memory generations probe their PK index by
+//! value, flushed generations probe their on-disk PK BTree. Each generation is
+//! probed once per batch (see the perf note below). Used both as the KNN
+//! post-filter (vector search, with over-fetch) and the cross-generation scan
+//! filter (`k = 0`).
+//!
+//! Cross-generation only: within-gen duplicates collapse via the global dedup's
+//! `(generation, freshness)` tiebreaker.
+//!
+//! As the KNN post-filter it consumes an over-fetched KNN (the planner's `overfetch_factor`) and
+//! warns if a source sent >= k rows but < k survived (over-fetch too small; never when `k = 0`).
+//!
+//! Perf note: each generation is probed once per batch via
+//! [`GenMembership::contains_keys`] — a batched existence check over the
+//! batch's keys — not once per row. The on-disk arm issues a single
+//! `BTreeIndex::contains_keys` (one page pass, no per-key `SearchResult`
+//! allocation); the in-memory arm maps a sync PK lookup over the keys. Probes
+//! are not disk-bound in steady state: the opened index and its (small,
+//! memtable-sized) pages are held by the injected `FlushedMemTableCache` /
+//! `LanceCache`, so after the first touch every probe is memory-resident.
+//! Already-blocked rows are dropped from the key set before probing older
+//! generations, preserving the per-row short-circuit.
+
+use std::any::Any;
+use std::fmt;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use arrow::compute::filter_record_batch;
+use arrow_array::{BooleanArray, RecordBatch};
+use arrow_schema::SchemaRef;
+use datafusion::common::ScalarValue;
+use datafusion::error::{DataFusionError, Result as DFResult};
+use datafusion::execution::TaskContext;
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    SendableRecordBatchStream,
+};
+use futures::future::BoxFuture;
+use futures::{FutureExt, Stream, StreamExt};
+use tracing::warn;
+
+use super::super::block_list::{GenMembership, on_disk_pk_key};
+use super::pk::resolve_pk_indices;
+
+/// Filters out rows whose PK is contained in any newer generation's membership.
+#[derive(Debug)]
+pub struct PkBlockFilterExec {
+    input: Arc<dyn ExecutionPlan>,
+    pk_columns: Vec<String>,
+    /// Newer generations' membership; a row is blocked if any contains its PK.
+    blocked: Vec<GenMembership>,
+    /// Target neighbor count, used only to warn on a per-source under-fetch (`0` disables it).
+    k: usize,
+    properties: Arc<PlanProperties>,
+}
+
+impl PkBlockFilterExec {
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        pk_columns: Vec<String>,
+        blocked: Vec<GenMembership>,
+        k: usize,
+    ) -> Self {
+        // A filter preserves the input schema and partitioning; orderings are not carried over.
+        let properties = Arc::new(PlanProperties::new(
+            EquivalenceProperties::new(input.schema()),
+            input.output_partitioning().clone(),
+            input.pipeline_behavior(),
+            input.boundedness(),
+        ));
+        Self {
+            input,
+            pk_columns,
+            blocked,
+            k,
+            properties,
+        }
+    }
+}
+
+impl DisplayAs for PkBlockFilterExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default
+            | DisplayFormatType::Verbose
+            | DisplayFormatType::TreeRender => {
+                write!(
+                    f,
+                    "PkBlockFilterExec: pk_cols=[{}], gens={}",
+                    self.pk_columns.join(", "),
+                    self.blocked.len(),
+                )
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for PkBlockFilterExec {
+    fn name(&self) -> &str {
+        "PkBlockFilterExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.input.schema()
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        if children.len() != 1 {
+            return Err(DataFusionError::Internal(
+                "PkBlockFilterExec requires exactly one child".to_string(),
+            ));
+        }
+        Ok(Arc::new(Self::new(
+            children[0].clone(),
+            self.pk_columns.clone(),
+            self.blocked.clone(),
+            self.k,
+        )))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let input_stream = self.input.execute(partition, context)?;
+        Ok(Box::pin(PkBlockFilterStream {
+            input: input_stream,
+            config: Arc::new(FilterConfig {
+                pk_columns: self.pk_columns.clone(),
+                blocked: self.blocked.clone(),
+            }),
+            k: self.k,
+            schema: self.schema(),
+            pending: None,
+            input_seen: 0,
+            kept: 0,
+            warned: false,
+        }))
+    }
+}
+
+/// Immutable per-stream filter config. Shared into each batch's `'static` async
+/// future by a single `Arc` clone, rather than deep-cloning the PK columns and
+/// memberships per batch.
+struct FilterConfig {
+    pk_columns: Vec<String>,
+    blocked: Vec<GenMembership>,
+}
+
+struct PkBlockFilterStream {
+    input: SendableRecordBatchStream,
+    config: Arc<FilterConfig>,
+    k: usize,
+    schema: SchemaRef,
+    /// In-flight filter future for the current batch. The probe is async, so it is parked here
+    /// and re-polled by `poll_next` until it resolves; no new input is pulled while it is set.
+    pending: Option<BoxFuture<'static, DFResult<RecordBatch>>>,
+    input_seen: usize,
+    kept: usize,
+    warned: bool,
+}
+
+/// Keep only the rows whose PK is in no newer-gen membership; with no memberships or no rows the
+/// batch is returned untouched. Async because flushed generations probe their on-disk PK BTree.
+async fn filter_batch(batch: RecordBatch, config: Arc<FilterConfig>) -> DFResult<RecordBatch> {
+    let FilterConfig {
+        pk_columns,
+        blocked,
+    } = config.as_ref();
+    if blocked.is_empty() || batch.num_rows() == 0 {
+        return Ok(batch);
+    }
+    let pk_indices = resolve_pk_indices(&batch, pk_columns)?;
+    let to_df = |e: lance_core::Error| DataFusionError::Execution(e.to_string());
+
+    // One key per row, in the index key space.
+    let keys: Vec<ScalarValue> = (0..batch.num_rows())
+        .map(|row| {
+            let values: Vec<ScalarValue> = pk_indices
+                .iter()
+                .map(|&col| ScalarValue::try_from_array(batch.column(col), row))
+                .collect::<DFResult<_>>()?;
+            on_disk_pk_key(&values).map_err(to_df)
+        })
+        .collect::<DFResult<_>>()?;
+
+    // A row is dropped if any newer generation contains its key. Probe each
+    // generation once (batched) rather than once per row, narrowing to the
+    // still-live rows so an already-blocked row isn't re-probed against older
+    // generations.
+    let mut blocked_row = vec![false; keys.len()];
+    let mut live: Vec<usize> = (0..keys.len()).collect();
+    for membership in blocked {
+        if live.is_empty() {
+            break;
+        }
+        let live_keys: Vec<ScalarValue> = live.iter().map(|&i| keys[i].clone()).collect();
+        let mask = membership.contains_keys(&live_keys).await.map_err(to_df)?;
+        let mut next_live = Vec::with_capacity(live.len());
+        for (pos, &row) in live.iter().enumerate() {
+            if mask[pos] {
+                blocked_row[row] = true;
+            } else {
+                next_live.push(row);
+            }
+        }
+        live = next_live;
+    }
+
+    let keep = BooleanArray::from_iter(blocked_row.into_iter().map(|b| Some(!b)));
+    filter_record_batch(&batch, &keep).map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+}
+
+impl Stream for PkBlockFilterStream {
+    type Item = DFResult<RecordBatch>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        loop {
+            // Drive an in-flight filter to completion before pulling more input.
+            if let Some(fut) = this.pending.as_mut() {
+                return match fut.as_mut().poll(cx) {
+                    Poll::Ready(Ok(out)) => {
+                        this.pending = None;
+                        this.kept += out.num_rows();
+                        Poll::Ready(Some(Ok(out)))
+                    }
+                    Poll::Ready(Err(e)) => {
+                        this.pending = None;
+                        Poll::Ready(Some(Err(e)))
+                    }
+                    Poll::Pending => Poll::Pending,
+                };
+            }
+
+            match this.input.poll_next_unpin(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    this.input_seen += batch.num_rows();
+                    this.pending = Some(filter_batch(batch, this.config.clone()).boxed());
+                    // Loop to poll the just-created future.
+                }
+                Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))),
+                Poll::Ready(None) => {
+                    // >= k candidates in, < k out: over-fetch too small for the superseded rows.
+                    if !this.warned && this.input_seen >= this.k && this.kept < this.k {
+                        warn!(
+                            k = this.k,
+                            fetched = this.input_seen,
+                            kept = this.kept,
+                            "LSM vector search: < k live rows survived the PK post-filter; \
+                             raise the over-fetch factor or use a true KNN prefilter."
+                        );
+                        this.warned = true;
+                    }
+                    return Poll::Ready(None);
+                }
+                Poll::Pending => return Poll::Pending,
+            }
+        }
+    }
+}
+
+impl datafusion::physical_plan::RecordBatchStream for PkBlockFilterStream {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+    use arrow_array::Int32Array;
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion::prelude::SessionContext;
+    use datafusion_physical_plan::test::TestMemoryExec;
+    use futures::TryStreamExt;
+
+    fn int_batch(ids: &[i32]) -> RecordBatch {
+        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap()
+    }
+
+    /// An in-memory membership whose PK index holds `ids` (positions 0..n).
+    fn membership(ids: &[i32]) -> GenMembership {
+        let store = BatchStore::with_capacity(16);
+        let mut index = IndexStore::new();
+        index.enable_pk_index(&[("id".to_string(), 0)]);
+        for &id in ids {
+            let b = int_batch(&[id]);
+            let (bp, off, _) = store.append(b.clone()).unwrap();
+            index.insert_with_batch_position(&b, off, Some(bp)).unwrap();
+        }
+        let max_visible_row = store.max_visible_row(index.max_visible_batch_position());
+        GenMembership::InMemory {
+            index_store: Arc::new(index),
+            max_visible_row,
+        }
+    }
+
+    async fn run(exec: PkBlockFilterExec) -> Vec<i32> {
+        let ctx = SessionContext::new();
+        let out: Vec<RecordBatch> = exec
+            .execute(0, ctx.task_ctx())
+            .unwrap()
+            .try_collect()
+            .await
+            .unwrap();
+        out.iter()
+            .flat_map(|b| {
+                b.column_by_name("id")
+                    .unwrap()
+                    .as_any()
+                    .downcast_ref::<Int32Array>()
+                    .unwrap()
+                    .values()
+                    .to_vec()
+            })
+            .collect()
+    }
+
+    #[tokio::test]
+    async fn drops_rows_blocked_by_a_newer_generation() {
+        let b = int_batch(&[10, 20, 30]);
+        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
+        let exec =
+            PkBlockFilterExec::new(input, vec!["id".to_string()], vec![membership(&[20])], 1);
+        assert_eq!(run(exec).await, vec![10, 30]);
+    }
+
+    #[tokio::test]
+    async fn blocks_a_pk_present_in_any_generation() {
+        // Two newer-gen memberships: a row is dropped if either contains its PK.
+        let b = int_batch(&[10, 20, 30]);
+        let blocked = vec![membership(&[10]), membership(&[30])];
+        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
+        let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], blocked, 1);
+        assert_eq!(run(exec).await, vec![20]);
+    }
+
+    #[tokio::test]
+    async fn empty_blocked_keeps_all_rows() {
+        let b = int_batch(&[1, 2, 3]);
+        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
+        let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1);
+        assert_eq!(run(exec).await, vec![1, 2, 3]);
+    }
+}
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs
deleted file mode 100644
index ee473047d01..00000000000
--- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs
+++ /dev/null
@@ -1,350 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright The Lance Authors
-
-//! Drop superseded rows from a per-source KNN result by primary-key hash.
-//!
-//! Drops a row when its PK hash ([`super::compute_pk_hash`]) is in any `blocked`
-//! set — the newer generations' membership (`Arc<HashSet>`, shared, never merged;
-//! base table: all generations). Only the KNN output is hashed.
-//!
-//! Cross-generation only: within-gen duplicates share a hash, so the global
-//! dedup's `(generation, freshness)` tiebreaker collapses those instead.
-//!
-//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns
-//! when a source had >= k candidates but < k survived (over-fetch too small).
-
-use std::any::Any;
-use std::collections::HashSet;
-use std::fmt;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-use arrow::compute::filter_record_batch;
-use arrow_array::{BooleanArray, RecordBatch};
-use arrow_schema::SchemaRef;
-use datafusion::error::{DataFusionError, Result as DFResult};
-use datafusion::execution::TaskContext;
-use datafusion::physical_expr::EquivalenceProperties;
-use datafusion::physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
-    SendableRecordBatchStream,
-};
-use futures::{Stream, StreamExt};
-use tracing::warn;
-
-use super::pk::{compute_pk_hash, resolve_pk_indices};
-
-/// Filters out rows whose PK hash is in any set of `blocked`.
-#[derive(Debug)]
-pub struct PkHashFilterExec {
-    input: Arc<dyn ExecutionPlan>,
-    pk_columns: Vec<String>,
-    /// Newer generations' membership; a row is blocked if any set holds its hash.
-    blocked: Vec<Arc<HashSet<u64>>>,
-    /// Target neighbor count, used only to warn on a per-source under-fetch.
-    k: usize,
-    properties: Arc<PlanProperties>,
-}
-
-impl PkHashFilterExec {
-    pub fn new(
-        input: Arc<dyn ExecutionPlan>,
-        pk_columns: Vec<String>,
-        blocked: Vec<Arc<HashSet<u64>>>,
-        k: usize,
-    ) -> Self {
-        // A filter preserves the input schema and partitioning.
-        let properties = Arc::new(PlanProperties::new(
-            EquivalenceProperties::new(input.schema()),
-            input.output_partitioning().clone(),
-            input.pipeline_behavior(),
-            input.boundedness(),
-        ));
-        Self {
-            input,
-            pk_columns,
-            blocked,
-            k,
-            properties,
-        }
-    }
-}
-
-impl DisplayAs for PkHashFilterExec {
-    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
-        match t {
-            DisplayFormatType::Default
-            | DisplayFormatType::Verbose
-            | DisplayFormatType::TreeRender => {
-                let total: usize = self.blocked.iter().map(|s| s.len()).sum();
-                write!(
-                    f,
-                    "PkHashFilterExec: pk_cols=[{}], gens={}, blocked={}",
-                    self.pk_columns.join(", "),
-                    self.blocked.len(),
-                    total,
-                )
-            }
-        }
-    }
-}
-
-impl ExecutionPlan for PkHashFilterExec {
-    fn name(&self) -> &str {
-        "PkHashFilterExec"
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> SchemaRef {
-        self.input.schema()
-    }
-
-    fn properties(&self) -> &Arc<PlanProperties> {
-        &self.properties
-    }
-
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![&self.input]
-    }
-
-    fn with_new_children(
-        self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> DFResult<Arc<dyn ExecutionPlan>> {
-        if children.len() != 1 {
-            return Err(DataFusionError::Internal(
-                "PkHashFilterExec requires exactly one child".to_string(),
-            ));
-        }
-        Ok(Arc::new(Self::new(
-            children[0].clone(),
-            self.pk_columns.clone(),
-            self.blocked.clone(),
-            self.k,
-        )))
-    }
-
-    fn execute(
-        &self,
-        partition: usize,
-        context: Arc<TaskContext>,
-    ) -> DFResult<SendableRecordBatchStream> {
-        let input_stream = self.input.execute(partition, context)?;
-        Ok(Box::pin(PkHashFilterStream {
-            input: input_stream,
-            pk_columns: self.pk_columns.clone(),
-            blocked: self.blocked.clone(),
-            k: self.k,
-            schema: self.schema(),
-            input_seen: 0,
-            kept: 0,
-            warned: false,
-        }))
-    }
-}
-
-struct PkHashFilterStream {
-    input: SendableRecordBatchStream,
-    pk_columns: Vec<String>,
-    blocked: Vec<Arc<HashSet<u64>>>,
-    k: usize,
-    schema: SchemaRef,
-    input_seen: usize,
-    kept: usize,
-    warned: bool,
-}
-
-impl PkHashFilterStream {
-    fn filter_batch(&self, batch: RecordBatch) -> DFResult<RecordBatch> {
-        if self.blocked.is_empty() || batch.num_rows() == 0 {
-            return Ok(batch);
-        }
-        let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?;
-        let keep: BooleanArray = (0..batch.num_rows())
-            .map(|row| {
-                let hash = compute_pk_hash(&batch, &pk_indices, row);
-                !self.blocked.iter().any(|set| set.contains(&hash))
-            })
-            .collect();
-        filter_record_batch(&batch, &keep)
-            .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
-    }
-}
-
-impl Stream for PkHashFilterStream {
-    type Item = DFResult<RecordBatch>;
-
-    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        match self.input.poll_next_unpin(cx) {
-            Poll::Ready(Some(Ok(batch))) => {
-                self.input_seen += batch.num_rows();
-                match self.filter_batch(batch) {
-                    Ok(out) => {
-                        self.kept += out.num_rows();
-                        Poll::Ready(Some(Ok(out)))
-                    }
-                    Err(e) => Poll::Ready(Some(Err(e))),
-                }
-            }
-            Poll::Ready(None) => {
-                // >= k candidates in, < k out: the over-fetch missed superseded rows.
-                if !self.warned && self.input_seen >= self.k && self.kept < self.k {
-                    warn!(
-                        k = self.k,
-                        fetched = self.input_seen,
-                        kept = self.kept,
-                        "LSM vector search: < k live rows survived the PK-hash post-filter; \
-                         raise the over-fetch factor or use a true KNN prefilter."
-                    );
-                    self.warned = true;
-                }
-                Poll::Ready(None)
-            }
-            other => other,
-        }
-    }
-}
-
-impl datafusion::physical_plan::RecordBatchStream for PkHashFilterStream {
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use arrow_array::{Int32Array, StringArray};
-    use arrow_schema::{DataType, Field, Schema};
-    use datafusion::prelude::SessionContext;
-    use datafusion_physical_plan::test::TestMemoryExec;
-    use futures::TryStreamExt;
-
-    /// Hash a single-column Int32 PK value the way the exec does, so a test can
-    /// build blocked sets from values rather than hand-computed hashes.
-    fn hash_int_pk(id: i32) -> u64 {
-        let batch = int_batch(&[id]);
-        let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap();
-        compute_pk_hash(&batch, &pk_indices, 0)
-    }
-
-    fn int_batch(ids: &[i32]) -> RecordBatch {
-        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
-        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap()
-    }
-
-    fn blocked(ids: &[i32]) -> Vec<Arc<HashSet<u64>>> {
-        vec![Arc::new(ids.iter().map(|&id| hash_int_pk(id)).collect())]
-    }
-
-    async fn run(exec: PkHashFilterExec) -> Vec<i32> {
-        let ctx = SessionContext::new();
-        let out: Vec<RecordBatch> = exec
-            .execute(0, ctx.task_ctx())
-            .unwrap()
-            .try_collect()
-            .await
-            .unwrap();
-        out.iter()
-            .flat_map(|b| {
-                b.column_by_name("id")
-                    .unwrap()
-                    .as_any()
-                    .downcast_ref::<Int32Array>()
-                    .unwrap()
-                    .values()
-                    .to_vec()
-            })
-            .collect()
-    }
-
-    #[tokio::test]
-    async fn drops_rows_with_blocked_pk_hash() {
-        let b = int_batch(&[10, 20, 30]);
-        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
-        let exec = PkHashFilterExec::new(input, vec!["id".to_string()], blocked(&[20]), 1);
-        assert_eq!(run(exec).await, vec![10, 30]);
-    }
-
-    #[tokio::test]
-    async fn blocks_a_pk_present_in_any_generation_set() {
-        // Two newer-gen sets: a row is dropped if either contains its PK.
-        let b = int_batch(&[10, 20, 30]);
-        let sets = vec![
-            Arc::new(HashSet::from([hash_int_pk(10)])),
-            Arc::new(HashSet::from([hash_int_pk(30)])),
-        ];
-        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
-        let exec = PkHashFilterExec::new(input, vec!["id".to_string()], sets, 1);
-        assert_eq!(run(exec).await, vec![20]);
-    }
-
-    #[tokio::test]
-    async fn empty_blocked_keeps_all_rows() {
-        let b = int_batch(&[1, 2, 3]);
-        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
-        let exec = PkHashFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1);
-        assert_eq!(run(exec).await, vec![1, 2, 3]);
-    }
-
-    #[tokio::test]
-    async fn null_pk_is_hashed_consistently_and_blockable() {
-        // A null PK hashes deterministically (compute_pk_hash hashes is_null),
-        // so a superseded null-key row can be dropped like any other.
-        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)]));
-        let with_null = |ids: Vec<Option<i32>>| {
-            RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap()
-        };
-        let pk = vec!["id".to_string()];
-        let null_row = with_null(vec![None]);
-        let pk_indices = resolve_pk_indices(&null_row, &pk).unwrap();
-        let sets = vec![Arc::new(HashSet::from([compute_pk_hash(
-            &null_row,
-            &pk_indices,
-            0,
-        )]))];
-
-        // Rows: 10, NULL, 30 — only the NULL-key row is dropped.
-        let b = with_null(vec![Some(10), None, Some(30)]);
-        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
-        let exec = PkHashFilterExec::new(input, pk, sets, 1);
-        assert_eq!(run(exec).await, vec![10, 30]);
-    }
-
-    #[tokio::test]
-    async fn composite_pk_hash_matches_block_set() {
-        // Composite PK (id, name): block the (2, "b") tuple only.
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, false),
-        ]));
-        let mk = |ids: &[i32], names: &[&str]| {
-            RecordBatch::try_new(
-                schema.clone(),
-                vec![
-                    Arc::new(Int32Array::from(ids.to_vec())),
-                    Arc::new(StringArray::from(names.to_vec())),
-                ],
-            )
-            .unwrap()
-        };
-        let pk = vec!["id".to_string(), "name".to_string()];
-        let one_row = mk(&[2], &["b"]);
-        let pk_indices = resolve_pk_indices(&one_row, &pk).unwrap();
-        let sets = vec![Arc::new(HashSet::from([compute_pk_hash(
-            &one_row,
-            &pk_indices,
-            0,
-        )]))];
-
-        // (1,"a") and (2,"a") survive; only the exact (2,"b") tuple is dropped.
-        let b = mk(&[1, 2, 2], &["a", "a", "b"]);
-        let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap();
-        let exec = PkHashFilterExec::new(input, pk, sets, 1);
-        assert_eq!(run(exec).await, vec![1, 2]);
-    }
-}
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs
deleted file mode 100644
index be5dae6a668..00000000000
--- a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs
+++ /dev/null
@@ -1,432 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright The Lance Authors
-
-//! WithinSourceDedupExec - Deduplicates rows with the same primary key from a
-//! single LSM source, keeping the newest insert.
-//!
-//! In MemWAL/LSM mode the same primary key can be written multiple times into
-//! the same memtable. The active memtable stores rows in insert order (larger
-//! `_rowaddr` = newer), while flushed memtables are reverse-written so that
-//! within a flushed file the smallest `_rowid` is the newest insert (see
-//! `memtable/flush.rs:152` and `hnsw/storage.rs:307`). Point lookup uses this
-//! node to collapse such duplicates *within a single source* so that the
-//! downstream `CoalesceFirstExec` / `LIMIT` sees at most one row per primary
-//! key per source.
-
-use std::any::Any;
-use std::collections::HashMap;
-use std::fmt;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-use arrow_array::{Array, RecordBatch, UInt64Array};
-use arrow_schema::SchemaRef;
-use datafusion::error::Result as DFResult;
-use datafusion::execution::TaskContext;
-use datafusion::physical_expr::{EquivalenceProperties, Partitioning};
-use datafusion::physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
-    SendableRecordBatchStream,
-};
-use futures::{Stream, StreamExt, ready};
-
-use super::pk::{compute_pk_hash, resolve_pk_indices};
-
-/// Among rows that share a primary key, which row-address extreme identifies
-/// the newest insert to keep. The kept row is always the freshest; only the
-/// row address (`_rowaddr`/`_rowid`) used to find it differs by source.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum DedupDirection {
-    /// Keep the row with the largest row-address value (active memtable: larger
-    /// `_rowaddr` = inserted later).
-    KeepMaxRowAddr,
-    /// Keep the row with the smallest row-address value (flushed memtable under
-    /// reverse-write: smaller `_rowid` = inserted later).
-    KeepMinRowAddr,
-}
-
-/// Deduplicates rows from a single source by primary key, keeping the row
-/// whose `row_addr_column` value wins per [`DedupDirection`].
-///
-/// # Required columns
-///
-/// The input must expose:
-/// - All `pk_columns`
-/// - `row_addr_column` of `UInt64` type
-///
-/// The output schema is unchanged from the input. Callers that need to hide
-/// the row-address column from downstream consumers should compose this node
-/// with `project_to_canonical` or `null_columns`.
-///
-/// # Performance
-///
-/// Memory: `O(unique primary keys in input)`. For point lookup the input is
-/// already filtered to a single primary key so the map holds at most one
-/// entry.
-#[derive(Debug)]
-pub struct WithinSourceDedupExec {
-    input: Arc<dyn ExecutionPlan>,
-    pk_columns: Vec<String>,
-    row_addr_column: String,
-    direction: DedupDirection,
-    schema: SchemaRef,
-    properties: Arc<PlanProperties>,
-}
-
-impl WithinSourceDedupExec {
-    pub fn new(
-        input: Arc<dyn ExecutionPlan>,
-        pk_columns: Vec<String>,
-        row_addr_column: impl Into<String>,
-        direction: DedupDirection,
-    ) -> Self {
-        let schema = input.schema();
-        let properties = Arc::new(PlanProperties::new(
-            EquivalenceProperties::new(schema.clone()),
-            Partitioning::UnknownPartitioning(1),
-            input.pipeline_behavior(),
-            input.boundedness(),
-        ));
-        Self {
-            input,
-            pk_columns,
-            row_addr_column: row_addr_column.into(),
-            direction,
-            schema,
-            properties,
-        }
-    }
-
-    pub fn pk_columns(&self) -> &[String] {
-        &self.pk_columns
-    }
-
-    pub fn row_addr_column(&self) -> &str {
-        &self.row_addr_column
-    }
-
-    pub fn direction(&self) -> DedupDirection {
-        self.direction
-    }
-}
-
-impl DisplayAs for WithinSourceDedupExec {
-    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
-        match t {
-            DisplayFormatType::Default
-            | DisplayFormatType::Verbose
-            | DisplayFormatType::TreeRender => {
-                write!(
-                    f,
-                    "WithinSourceDedupExec: pk=[{}], row_addr={}, direction={:?}",
-                    self.pk_columns.join(", "),
-                    self.row_addr_column,
-                    self.direction,
-                )
-            }
-        }
-    }
-}
-
-impl ExecutionPlan for WithinSourceDedupExec {
-    fn name(&self) -> &str {
-        "WithinSourceDedupExec"
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
-    }
-
-    fn properties(&self) -> &Arc<PlanProperties> {
-        &self.properties
-    }
-
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![&self.input]
-    }
-
-    fn with_new_children(
-        self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> DFResult<Arc<dyn ExecutionPlan>> {
-        if children.len() != 1 {
-            return Err(datafusion::error::DataFusionError::Internal(
-                "WithinSourceDedupExec requires exactly one child".to_string(),
-            ));
-        }
-        Ok(Arc::new(Self::new(
-            children[0].clone(),
-            self.pk_columns.clone(),
-            self.row_addr_column.clone(),
-            self.direction,
-        )))
-    }
-
-    fn execute(
-        &self,
-        partition: usize,
-        context: Arc<TaskContext>,
-    ) -> DFResult<SendableRecordBatchStream> {
-        let input_stream = self.input.execute(partition, context)?;
-        Ok(Box::pin(WithinSourceDedupStream {
-            input: input_stream,
-            pk_columns: self.pk_columns.clone(),
-            row_addr_column: self.row_addr_column.clone(),
-            direction: self.direction,
-            schema: self.schema.clone(),
-            winners: HashMap::new(),
-            emitted: false,
-        }))
-    }
-}
-
-/// One winning row, materialized as a single-row `RecordBatch` so we don't
-/// have to keep the source batch alive after we've picked the winner.
-struct Winner {
-    batch: RecordBatch,
-    row_addr: u64,
-}
-
-struct WithinSourceDedupStream {
-    input: SendableRecordBatchStream,
-    pk_columns: Vec<String>,
-    row_addr_column: String,
-    direction: DedupDirection,
-    schema: SchemaRef,
-    winners: HashMap<u64, Winner>,
-    emitted: bool,
-}
-
-impl WithinSourceDedupStream {
-    fn consume_batch(&mut self, batch: RecordBatch) -> DFResult<()> {
-        if batch.num_rows() == 0 {
-            return Ok(());
-        }
-        let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?;
-        let row_addr_array = batch
-            .column_by_name(&self.row_addr_column)
-            .ok_or_else(|| {
-                datafusion::error::DataFusionError::Internal(format!(
-                    "Row-address column '{}' not found in batch",
-                    self.row_addr_column
-                ))
-            })?
-            .as_any()
-            .downcast_ref::<UInt64Array>()
-            .ok_or_else(|| {
-                datafusion::error::DataFusionError::Internal(format!(
-                    "Row-address column '{}' is not UInt64",
-                    self.row_addr_column
-                ))
-            })?;
-
-        for row_idx in 0..batch.num_rows() {
-            if row_addr_array.is_null(row_idx) {
-                // A NULL row address can't be ordered against a real one. Skip
-                // rather than guess — callers should always project a real
-                // row-address column for dedup-eligible sources.
-                continue;
-            }
-            let row_addr = row_addr_array.value(row_idx);
-            let pk_hash = compute_pk_hash(&batch, &pk_indices, row_idx);
-
-            let take_row = match self.winners.get(&pk_hash) {
-                None => true,
-                Some(existing) => match self.direction {
-                    DedupDirection::KeepMaxRowAddr => row_addr > existing.row_addr,
-                    DedupDirection::KeepMinRowAddr => row_addr < existing.row_addr,
-                },
-            };
-
-            if take_row {
-                let single = batch.slice(row_idx, 1);
-                self.winners.insert(
-                    pk_hash,
-                    Winner {
-                        batch: single,
-                        row_addr,
-                    },
-                );
-            }
-        }
-        Ok(())
-    }
-
-    fn finalize(&mut self) -> DFResult<RecordBatch> {
-        if self.winners.is_empty() {
-            return Ok(RecordBatch::new_empty(self.schema.clone()));
-        }
-        let batches: Vec<RecordBatch> = self.winners.drain().map(|(_, w)| w.batch).collect();
-        let batch_refs: Vec<&RecordBatch> = batches.iter().collect();
-        arrow_select::concat::concat_batches(&self.schema, batch_refs)
-            .map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None))
-    }
-}
-
-impl Stream for WithinSourceDedupStream {
-    type Item = DFResult<RecordBatch>;
-
-    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        loop {
-            if self.emitted {
-                return Poll::Ready(None);
-            }
-            match ready!(self.input.poll_next_unpin(cx)) {
-                Some(Ok(batch)) => {
-                    if let Err(e) = self.consume_batch(batch) {
-                        self.emitted = true;
-                        return Poll::Ready(Some(Err(e)));
-                    }
-                }
-                Some(Err(e)) => {
-                    self.emitted = true;
-                    return Poll::Ready(Some(Err(e)));
-                }
-                None => {
-                    self.emitted = true;
-                    return Poll::Ready(Some(self.finalize()));
-                }
-            }
-        }
-    }
-}
-
-impl datafusion::physical_plan::RecordBatchStream for WithinSourceDedupStream {
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use arrow_array::{Float32Array, Int32Array, StringArray};
-    use arrow_schema::{DataType, Field, Schema};
-    use datafusion::prelude::SessionContext;
-    use datafusion_physical_plan::test::TestMemoryExec;
-    use futures::TryStreamExt;
-
-    fn create_test_schema() -> SchemaRef {
-        Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-            Field::new("_distance", DataType::Float32, true),
-            Field::new("_row_addr", DataType::UInt64, true),
-        ]))
-    }
-
-    fn batch(ids: &[i32], names: &[&str], distances: &[f32], row_addr: &[u64]) -> RecordBatch {
-        let schema = create_test_schema();
-        RecordBatch::try_new(
-            schema,
-            vec![
-                Arc::new(Int32Array::from(ids.to_vec())),
-                Arc::new(StringArray::from(names.to_vec())),
-                Arc::new(Float32Array::from(distances.to_vec())),
-                Arc::new(UInt64Array::from(row_addr.to_vec())),
-            ],
-        )
-        .unwrap()
-    }
-
-    async fn run(batches: Vec<RecordBatch>, direction: DedupDirection) -> Vec<RecordBatch> {
-        let schema = create_test_schema();
-        let input = TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap();
-        let exec =
-            WithinSourceDedupExec::new(input, vec!["id".to_string()], "_row_addr", direction);
-        let ctx = SessionContext::new();
-        let stream = exec.execute(0, ctx.task_ctx()).unwrap();
-        stream.try_collect().await.unwrap()
-    }
-
-    fn extract(batches: &[RecordBatch]) -> Vec<(i32, String, u64)> {
-        let mut out = Vec::new();
-        for b in batches {
-            let ids = b.column(0).as_any().downcast_ref::<Int32Array>().unwrap();
-            let names = b.column(1).as_any().downcast_ref::<StringArray>().unwrap();
-            let addr = b.column(3).as_any().downcast_ref::<UInt64Array>().unwrap();
-            for i in 0..b.num_rows() {
-                out.push((ids.value(i), names.value(i).to_string(), addr.value(i)));
-            }
-        }
-        out.sort_by_key(|(id, _, _)| *id);
-        out
-    }
-
-    #[tokio::test]
-    async fn keep_max_picks_largest_row_addr() {
-        // Active-memtable case: same pk inserted twice; newer = larger _rowaddr.
-        let b1 = batch(
-            &[1, 1, 2],
-            &["old", "new", "two"],
-            &[0.1, 0.2, 0.3],
-            &[10, 99, 5],
-        );
-        let out = run(vec![b1], DedupDirection::KeepMaxRowAddr).await;
-        let rows = extract(&out);
-        assert_eq!(rows.len(), 2);
-        assert_eq!(rows[0], (1, "new".to_string(), 99));
-        assert_eq!(rows[1], (2, "two".to_string(), 5));
-    }
-
-    #[tokio::test]
-    async fn keep_min_picks_smallest_row_addr() {
-        // Flushed-memtable case under reverse-write: newer = smaller _rowid.
-        let b1 = batch(
-            &[1, 1, 2],
-            &["old", "new", "two"],
-            &[0.1, 0.2, 0.3],
-            &[99, 10, 5],
-        );
-        let out = run(vec![b1], DedupDirection::KeepMinRowAddr).await;
-        let rows = extract(&out);
-        assert_eq!(rows.len(), 2);
-        assert_eq!(rows[0], (1, "new".to_string(), 10));
-        assert_eq!(rows[1], (2, "two".to_string(), 5));
-    }
-
-    #[tokio::test]
-    async fn dedup_across_batches() {
-        let b1 = batch(&[1, 2], &["a", "b"], &[0.1, 0.2], &[1, 1]);
-        let b2 = batch(&[1, 3], &["a_new", "c"], &[0.5, 0.4], &[7, 1]);
-        let out = run(vec![b1, b2], DedupDirection::KeepMaxRowAddr).await;
-        let rows = extract(&out);
-        assert_eq!(rows.len(), 3);
-        assert_eq!(rows[0], (1, "a_new".to_string(), 7));
-        assert_eq!(rows[1], (2, "b".to_string(), 1));
-        assert_eq!(rows[2], (3, "c".to_string(), 1));
-    }
-
-    #[tokio::test]
-    async fn empty_input() {
-        let out = run(vec![], DedupDirection::KeepMaxRowAddr).await;
-        let total: usize = out.iter().map(|b| b.num_rows()).sum();
-        assert_eq!(total, 0);
-    }
-
-    #[tokio::test]
-    async fn null_row_addr_skipped() {
-        // Rows with NULL row address can't be ordered — they're dropped so they
-        // don't accidentally become winners against real values.
-        let schema = create_test_schema();
-        let b = RecordBatch::try_new(
-            schema.clone(),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(StringArray::from(vec!["nulladdr", "real"])),
-                Arc::new(Float32Array::from(vec![0.1, 0.2])),
-                Arc::new(UInt64Array::from(vec![None, Some(5)])),
-            ],
-        )
-        .unwrap();
-        let out = run(vec![b], DedupDirection::KeepMaxRowAddr).await;
-        let rows = extract(&out);
-        assert_eq!(rows.len(), 1);
-        assert_eq!(rows[0], (1, "real".to_string(), 5));
-    }
-}
diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs
index 39abf7e8c71..7a5280bedb8 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs
@@ -22,6 +22,7 @@
 use std::collections::HashSet;
 use std::sync::Arc;
 
+use async_trait::async_trait;
 use lance_core::{Error, Result};
 
 use crate::dataset::{Dataset, DatasetBuilder};
@@ -41,12 +42,10 @@ use crate::session::Session;
 pub struct FlushedMemTableCache {
     // `moka`'s async cache gives a bounded size plus single-flight
     // `try_get_with`, so concurrent first-queries on a just-flushed
-    // generation open the dataset exactly once.
+    // generation open the dataset exactly once. The opened dataset carries the
+    // session index cache, which also backs each generation's standalone PK
+    // dedup index (see `block_list::open_pk_index`) — no separate cache path.
     inner: moka::future::Cache<String, Arc<Dataset>>,
-    // Per-generation set of PK hashes for the vector-search block-list, keyed by
-    // the same immutable flushed path. Built lazily on the first query that needs
-    // it (single-flight) so repeated searches skip re-scanning the PK column.
-    pk_hashes: moka::future::Cache<String, Arc<HashSet<u64>>>,
 }
 
 impl FlushedMemTableCache {
@@ -63,10 +62,6 @@ impl FlushedMemTableCache {
                 // into at build time.
                 .support_invalidation_closures()
                 .build(),
-            pk_hashes: moka::future::Cache::builder()
-                .max_capacity(max_entries)
-                .support_invalidation_closures()
-                .build(),
         }
     }
 
@@ -96,21 +91,6 @@ impl FlushedMemTableCache {
             .map_err(|e: Arc<Error>| Error::cloned(e.to_string()))
     }
 
-    /// Get the cached set of PK hashes for `path`, building it (exactly once) on
-    /// a miss via `build`. The flushed path is immutable, so a cached set is
-    /// never stale; concurrent first-queries share one build via `moka`'s
-    /// single-flight `try_get_with`.
-    pub async fn get_or_build_pk_hashes(
-        &self,
-        path: &str,
-        build: impl std::future::Future<Output = Result<HashSet<u64>>>,
-    ) -> Result<Arc<HashSet<u64>>> {
-        self.pk_hashes
-            .try_get_with(path.to_string(), async move { build.await.map(Arc::new) })
-            .await
-            .map_err(|e: Arc<Error>| Error::cloned(e.to_string()))
-    }
-
     /// Drop cached entries whose path is not in `live_paths`.
     ///
     /// Called by the consumer after compaction retires generations. Purely a
@@ -125,10 +105,6 @@ impl FlushedMemTableCache {
         let _ = self
             .inner
             .invalidate_entries_if(move |path, _| !live.contains(path));
-        let live = live_paths.clone();
-        let _ = self
-            .pk_hashes
-            .invalidate_entries_if(move |path, _| !live.contains(path));
     }
 }
 
@@ -140,29 +116,92 @@ impl std::fmt::Debug for FlushedMemTableCache {
     }
 }
 
+/// Caching of opened flushed-generation datasets, keyed by immutable path. The
+/// opened dataset carries the session index cache, which also backs each
+/// generation's secondary indexes and its PK dedup sidecar (see
+/// `block_list::open_pk_index`) — so a single `get_or_open` is the
+/// whole caching surface. Implemented by [`FlushedMemTableCache`]; a
+/// [`GenerationWarmer`] composes one to warm through it, and a consumer may
+/// supply its own implementation.
+#[async_trait]
+pub trait DatasetCache: Send + Sync + std::fmt::Debug {
+    async fn get_or_open(&self, path: &str, session: Option<Arc<Session>>) -> Result<Arc<Dataset>>;
+
+    /// Drop cached entries whose path is not in `live_paths`. Async so an
+    /// implementation can evict retired generations' index objects (e.g.
+    /// `Session::invalidate_index_prefix`) without a later breaking signature
+    /// change; [`FlushedMemTableCache`]'s own eviction is synchronous.
+    async fn retain_paths(&self, live_paths: &HashSet<String>);
+}
+
+#[async_trait]
+impl DatasetCache for FlushedMemTableCache {
+    async fn get_or_open(&self, path: &str, session: Option<Arc<Session>>) -> Result<Arc<Dataset>> {
+        Self::get_or_open(self, path, session).await
+    }
+
+    async fn retain_paths(&self, live_paths: &HashSet<String>) {
+        Self::retain_paths(self, live_paths)
+    }
+}
+
+/// Proactively warms a flushed generation into the shared caches: open the
+/// dataset and pre-load its secondary indexes and PK dedup sidecar so the first
+/// query sees no cold reads. This is the hook the flush and read paths invoke:
+/// lance defines it; the consumer (e.g. the WAL pod) implements it. When no
+/// warmer is configured (`None`), generations warm lazily on first read.
+///
+/// Everything a warmer touches is keyed by the immutable generation `path`
+/// (opened dataset, its secondary indexes, its PK dedup sidecar), so `path` is
+/// the only input it needs.
+///
+/// `warm` is fired fire-and-forget from every read path that opens a generation
+/// (all four LSM planners) as well as pre-commit on flush, so the same path may
+/// be warmed concurrently and repeatedly. Implementations **must be idempotent
+/// and cheap when the path is already warm** (e.g. dedup in-flight and
+/// completed paths) — a redundant call must not re-do work or fail.
+#[async_trait]
+pub trait GenerationWarmer: Send + Sync + std::fmt::Debug {
+    async fn warm(&self, path: &str) -> Result<()>;
+}
+
 /// Open a flushed-generation dataset, shared by all three LSM open sites
 /// (scan, point lookup, vector search).
 ///
-/// - `cache` present: route through [`FlushedMemTableCache`] (single-flight,
-///   shared `Arc`, manifest read amortized across queries).
+/// - `cache` present: route through a [`DatasetCache`] (e.g.
+///   [`FlushedMemTableCache`]: single-flight, shared `Arc`, manifest read
+///   amortized across queries).
 /// - `cache` absent: cold open via [`DatasetBuilder`]. Passing `session`
 ///   still reuses the shared index / metadata caches; `None`/`None`
 ///   reproduces the original per-query cold-open behavior exactly.
+/// - `warmer` present: spawn a fire-and-forget warm task after every successful
+///   open, cache hit or miss (the warmer must dedup warm paths). `None` => no warming.
 pub async fn open_flushed_dataset(
     path: &str,
     session: Option<&Arc<Session>>,
-    cache: Option<&Arc<FlushedMemTableCache>>,
+    cache: Option<&Arc<dyn DatasetCache>>,
+    warmer: Option<&Arc<dyn GenerationWarmer>>,
 ) -> Result<Arc<Dataset>> {
-    match cache {
-        Some(cache) => cache.get_or_open(path, session.cloned()).await,
+    let dataset = match cache {
+        Some(cache) => cache.get_or_open(path, session.cloned()).await?,
         None => {
             let mut builder = DatasetBuilder::from_uri(path);
             if let Some(session) = session {
                 builder = builder.with_session(session.clone());
             }
-            Ok(Arc::new(builder.load().await?))
+            Arc::new(builder.load().await?)
         }
+    };
+    if let Some(warmer) = warmer {
+        let warmer = Arc::clone(warmer);
+        let path = path.to_string();
+        tokio::spawn(async move {
+            if let Err(e) = warmer.warm(&path).await {
+                tracing::debug!(generation = %path, error = %e, "warm-on-open failed");
+            }
+        });
     }
+    Ok(dataset)
 }
 
 #[cfg(test)]
@@ -250,34 +289,6 @@ mod tests {
         assert_eq!(cache.inner.entry_count(), 1, "exactly one entry cached");
     }
 
-    #[tokio::test]
-    async fn pk_hashes_cached_reuses_first_build() {
-        // The PK-hash set is keyed by the immutable flushed path: a hit returns
-        // the first-built set and never runs the second build closure.
-        let cache = FlushedMemTableCache::new(8);
-        let path = "memory://shard/gen_1";
-        let first = cache
-            .get_or_build_pk_hashes(path, async { Ok(HashSet::from([1u64, 2])) })
-            .await
-            .unwrap();
-        let second = cache
-            .get_or_build_pk_hashes(path, async {
-                // Different contents; must be ignored because the path is cached.
-                Ok(HashSet::from([9u64]))
-            })
-            .await
-            .unwrap();
-        assert!(
-            Arc::ptr_eq(&first, &second),
-            "a PK-hash cache hit must reuse the first-built set"
-        );
-        assert_eq!(
-            second.len(),
-            2,
-            "cached set keeps the first build's contents"
-        );
-    }
-
     #[tokio::test]
     async fn test_retain_paths_drops_unreferenced() {
         let temp_dir = tempfile::tempdir().unwrap();
@@ -310,8 +321,8 @@ mod tests {
         let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap());
         write_dataset(&uri, &[7, 8, 9]).await;
 
-        let a = open_flushed_dataset(&uri, None, None).await.unwrap();
-        let b = open_flushed_dataset(&uri, None, None).await.unwrap();
+        let a = open_flushed_dataset(&uri, None, None, None).await.unwrap();
+        let b = open_flushed_dataset(&uri, None, None, None).await.unwrap();
         assert!(
             !Arc::ptr_eq(&a, &b),
             "no-cache path must cold-open each call"
@@ -319,13 +330,57 @@ mod tests {
         assert_eq!(a.count_rows(None).await.unwrap(), 3);
 
         // With a cache, the second call is a shared clone.
-        let cache = Arc::new(FlushedMemTableCache::new(8));
-        let c = open_flushed_dataset(&uri, None, Some(&cache))
+        let cache: Arc<dyn DatasetCache> = Arc::new(FlushedMemTableCache::new(8));
+        let c = open_flushed_dataset(&uri, None, Some(&cache), None)
             .await
             .unwrap();
-        let d = open_flushed_dataset(&uri, None, Some(&cache))
+        let d = open_flushed_dataset(&uri, None, Some(&cache), None)
             .await
             .unwrap();
         assert!(Arc::ptr_eq(&c, &d), "cached path must reuse the Arc");
     }
+
+    /// A warmer that records calls and signals each one.
+    #[derive(Debug)]
+    struct NotifyingWarmer {
+        calls: Arc<AtomicUsize>,
+        notify: Arc<tokio::sync::Notify>,
+    }
+
+    #[async_trait]
+    impl GenerationWarmer for NotifyingWarmer {
+        async fn warm(&self, _path: &str) -> Result<()> {
+            self.calls.fetch_add(1, Ordering::SeqCst);
+            self.notify.notify_one();
+            Ok(())
+        }
+    }
+
+    #[tokio::test]
+    async fn test_open_flushed_dataset_fires_warm_on_open() {
+        // The warm-on-open backstop fires the warmer (fire-and-forget) when a
+        // generation is opened, so generations the flusher never warmed still
+        // get warmed lazily on first read.
+        let temp_dir = tempfile::tempdir().unwrap();
+        let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap());
+        write_dataset(&uri, &[1, 2, 3]).await;
+
+        let calls = Arc::new(AtomicUsize::new(0));
+        let notify = Arc::new(tokio::sync::Notify::new());
+        let warmer: Arc<dyn GenerationWarmer> = Arc::new(NotifyingWarmer {
+            calls: calls.clone(),
+            notify: notify.clone(),
+        });
+
+        let ds = open_flushed_dataset(&uri, None, None, Some(&warmer))
+            .await
+            .unwrap();
+        assert_eq!(ds.count_rows(None).await.unwrap(), 3);
+
+        // The warm is spawned fire-and-forget; wait (bounded) for it to run.
+        tokio::time::timeout(std::time::Duration::from_secs(5), notify.notified())
+            .await
+            .expect("warm-on-open must fire");
+        assert_eq!(calls.load(Ordering::SeqCst), 1, "warmer fired once on open");
+    }
 }
diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs
index e3ef44d8b1a..e7c8d205d5d 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs
@@ -44,7 +44,7 @@ use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion::physical_plan::union::UnionExec;
-use lance_core::{Error, ROW_ID, Result, is_system_column};
+use lance_core::{Error, Result, is_system_column};
 use lance_index::scalar::FullTextSearchQuery;
 use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery;
 use tracing::instrument;
@@ -52,8 +52,8 @@ use tracing::instrument;
 use super::block_list::compute_source_block_lists;
 use super::collector::LsmDataSourceCollector;
 use super::data_source::LsmDataSource;
-use super::exec::{DedupDirection, PkHashFilterExec, WithinSourceDedupExec};
-use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset};
+use super::exec::{NewestPkFilterExec, PkBlockFilterExec};
+use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset};
 use super::projection::project_to_canonical;
 use crate::dataset::mem_wal::memtable::scanner::MemTableScanner;
 use crate::session::Session;
@@ -76,7 +76,9 @@ pub struct LsmFtsSearchPlanner {
     /// Session threaded into flushed-generation opens (shared caches).
     session: Option<Arc<Session>>,
     /// Cache of opened flushed-generation datasets.
-    flushed_cache: Option<Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<Arc<dyn DatasetCache>>,
+    /// Optional warmer fired (fire-and-forget) each time a flushed generation is opened.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
     /// Over-fetch multiple for blocked sources (clamped to `>= 1.0`).
     overfetch_factor: f64,
 }
@@ -94,6 +96,7 @@ impl LsmFtsSearchPlanner {
             base_schema,
             session: None,
             flushed_cache: None,
+            warmer: None,
             overfetch_factor: DEFAULT_OVERFETCH_FACTOR,
         }
     }
@@ -114,11 +117,17 @@ impl LsmFtsSearchPlanner {
 
     /// Inject a cache of opened flushed-generation datasets, making repeated
     /// searches against the same generation a pure `Arc::clone`.
-    pub fn with_flushed_cache(mut self, cache: Arc<FlushedMemTableCache>) -> Self {
+    pub fn with_flushed_cache(mut self, cache: Arc<dyn DatasetCache>) -> Self {
         self.flushed_cache = Some(cache);
         self
     }
 
+    /// Inject the warmer fired on first open of a flushed generation.
+    pub fn with_warmer(mut self, warmer: Arc<dyn GenerationWarmer>) -> Self {
+        self.warmer = Some(warmer);
+        self
+    }
+
     /// Build the FTS execution plan (local scoring).
     ///
     /// # Arguments
@@ -154,51 +163,57 @@ impl LsmFtsSearchPlanner {
             return self.empty_plan(&target_schema);
         }
 
-        // Per-source PK-hash block sets for cross-generation dedup (NEWER(G)
-        // per shard; base = union of all gens). Query-type-agnostic — same
-        // call the vector planner makes. `Box::pin` keeps the future off
+        // Per-source PK block sets for cross-generation dedup (NEWER(G) per
+        // shard; base = union of all gens). Query-type-agnostic — same call the
+        // vector planner makes. `Box::pin` keeps the future off
         // `clippy::large_futures`.
         let block_lists = Box::pin(compute_source_block_lists(
             &sources,
-            &self.pk_columns,
             self.session.as_ref(),
             self.flushed_cache.as_ref(),
         ))
         .await?;
         let overfetch = self.overfetch_factor.max(1.0);
 
-        let mut per_source_plans: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(sources.len());
-        for source in &sources {
-            let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. });
-            let blocked = block_lists.get(&(source.shard_id(), source.generation()));
-            // Over-fetch a blocked source so the post-filter still yields k live
-            // rows. The active arm returns all matches (no builder limit), so its
-            // within-source dedup needs no over-fetch hint.
-            let fetch_k = if blocked.is_some() {
-                ((k as f64) * overfetch).ceil() as usize
-            } else {
-                k
-            };
-
-            let plan = self
-                .build_source_plan(source, column, &query, fetch_k, projection, is_active)
-                .await?;
+        // Stage the per-source over-fetch decisions, then build every source
+        // plan concurrently — the builds are independent and a sequential loop
+        // was the dominant serial planning cost with multiple generations.
+        let arm_inputs: Vec<_> = sources
+            .iter()
+            .map(|source| {
+                let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. });
+                let blocked = block_lists.get(&(source.shard_id(), source.generation()));
+                // Over-fetch a blocked source so the post-filter still yields k live
+                // rows. The active arm returns all matches (no builder limit), so its
+                // newest-PK recency filter needs no over-fetch hint.
+                let fetch_k = if blocked.is_some() {
+                    ((k as f64) * overfetch).ceil() as usize
+                } else {
+                    k
+                };
+                (source, is_active, blocked, fetch_k)
+            })
+            .collect();
+        let built =
+            futures::future::try_join_all(arm_inputs.iter().map(|(source, _, _, fetch_k)| {
+                Box::pin(self.build_source_plan(source, column, &query, *fetch_k, projection))
+            }))
+            .await?;
 
+        let mut per_source_plans: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(sources.len());
+        for ((_, is_active, blocked, _), plan) in arm_inputs.iter().zip(built) {
+            let is_active = *is_active;
+            let blocked = *blocked;
             // Dedup, mirroring LsmVectorSearchPlanner:
-            //  * active: collapse duplicate-PK appends to the newest insert
-            //    (larger _rowid = inserted later). The FTS index is append-only,
-            //    so an in-memtable update leaves both versions searchable.
+            //  * active: already wrapped in `NewestPkFilterExec` inside
+            //    `build_source_plan` (drops predicate-crossing stale hits, which a
+            //    result-set dedup can't catch).
             //  * flushed/base: drop rows superseded by a newer generation via the
             //    block-list (within-gen is handled by the flushed deletion vector).
             let deduped = if is_active {
-                Arc::new(WithinSourceDedupExec::new(
-                    plan,
-                    self.pk_columns.clone(),
-                    ROW_ID,
-                    DedupDirection::KeepMaxRowAddr,
-                )) as Arc<dyn ExecutionPlan>
+                plan
             } else if let Some(set) = blocked {
-                Arc::new(PkHashFilterExec::new(
+                Arc::new(PkBlockFilterExec::new(
                     plan,
                     self.pk_columns.clone(),
                     set.clone(),
@@ -219,8 +234,11 @@ impl LsmFtsSearchPlanner {
             per_source_plans.into_iter().next().unwrap()
         } else {
             #[allow(deprecated)]
-            let union: Arc<dyn ExecutionPlan> = Arc::new(UnionExec::new(per_source_plans));
-            union
+            // The downstream `SortPreservingMergeExec` already spawns one driver
+            // task per input partition (one per union arm) via `spawn_buffered`,
+            // so each arm's CPU work (posting decode, BM25) runs on its own task
+            // without an extra repartition.
+            Arc::new(UnionExec::new(per_source_plans))
         };
 
         let score_idx = merged.schema().index_of(SCORE_COLUMN).map_err(|_| {
@@ -263,7 +281,6 @@ impl LsmFtsSearchPlanner {
         query: &FullTextSearchQuery,
         k: usize,
         projection: Option<&[String]>,
-        emit_row_id: bool,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         match source {
             LsmDataSource::BaseTable { dataset } => {
@@ -278,9 +295,13 @@ impl LsmFtsSearchPlanner {
                 scanner.create_plan().await
             }
             LsmDataSource::FlushedMemTable { path, .. } => {
-                let dataset =
-                    open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref())
-                        .await?;
+                let dataset = open_flushed_dataset(
+                    path,
+                    self.session.as_ref(),
+                    self.flushed_cache.as_ref(),
+                    self.warmer.as_ref(),
+                )
+                .await?;
                 let mut scanner = dataset.scan();
                 let cols = self.fts_scanner_projection(projection);
                 scanner.project(&cols.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
@@ -301,11 +322,12 @@ impl LsmFtsSearchPlanner {
                     MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone());
                 let cols = self.fts_scanner_projection(projection);
                 scanner.project(&cols.iter().map(|s| s.as_str()).collect::<Vec<_>>());
-                // Emit `_rowid` (row position) so the planner can collapse
-                // duplicate-PK appends via WithinSourceDedupExec before the union.
-                if emit_row_id {
-                    scanner.with_row_id();
-                }
+                // Expose the row position so the recency filter can identify the
+                // newest visible version of each PK. The append-only inverted
+                // index keeps an updated row's old postings live, so a stale hit
+                // can match a query the fresh row no longer does; the filter
+                // drops it. `project_to_canonical` strips `_rowid` afterward.
+                scanner.with_row_id();
                 // `MemTableScanner::full_text_search` takes a raw match
                 // string; richer query shapes (phrase/boolean/fuzzy) can
                 // be plumbed through once the MemTable scanner accepts a
@@ -324,7 +346,19 @@ impl LsmFtsSearchPlanner {
                 // today; the per-partition Sort+fetch above bounds the
                 // emitted rows.
                 let _ = k;
-                scanner.create_plan().await
+                let plan = scanner.create_plan().await?;
+                // Drop predicate-crossing stale hits: keep a hit iff it is the
+                // newest visible version of its PK (collapses duplicate-PK
+                // appends too — supersedes the old WithinSourceDedupExec).
+                let filtered: Arc<dyn ExecutionPlan> = Arc::new(NewestPkFilterExec::new(
+                    plan,
+                    self.pk_columns.clone(),
+                    lance_core::ROW_ID,
+                    index_store.clone(),
+                    batch_store.clone(),
+                    scanner.max_visible_batch_position(),
+                ));
+                Ok(filtered)
             }
         }
     }
@@ -478,6 +512,7 @@ mod tests {
         // Active memtable with its own FTS index, containing a matching row.
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut indexes = IndexStore::new();
+        indexes.enable_pk_index(&[("id".to_string(), 0)]);
         indexes.add_fts("text_fts".to_string(), 1, "text".to_string());
         let active_batch = make_batch(
             &schema,
@@ -646,6 +681,7 @@ mod tests {
         let schema = fts_schema();
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut indexes = IndexStore::new();
+        indexes.enable_pk_index(&[("id".to_string(), 0)]);
         indexes.add_fts("text_fts".to_string(), 1, "text".to_string());
 
         // First append (positions 0,1): id=1 is the stale version of the PK.
@@ -725,4 +761,88 @@ mod tests {
             "dedup must keep the newest (max row-position) version"
         );
     }
+
+    #[tokio::test]
+    async fn active_stale_update_predicate_crossing_leaks() {
+        // A PK update that crosses out of the match set: pk=1 inserted as
+        // "alpha lance", then updated to "beta lance". The append-only inverted
+        // index keeps the old "alpha" posting live, so an "alpha" search still
+        // matches the STALE pk=1 row — and the fresh "beta lance" row isn't even
+        // a candidate, so a result-set dedup has nothing to suppress it against.
+        // `NewestPkFilterExec` drops it predicate-independently: pk=1's newest
+        // visible row is "beta lance", so the "alpha" hit is not the newest.
+        let schema = fts_schema();
+        let batch_store = Arc::new(BatchStore::with_capacity(16));
+        let mut indexes = IndexStore::new();
+        indexes.enable_pk_index(&[("id".to_string(), 0)]);
+        indexes.add_fts("text_fts".to_string(), 1, "text".to_string());
+
+        // Insert pk=1 ("alpha lance") and an unrelated live pk=2 ("alpha foo").
+        let b1 = make_batch(&schema, &[1, 2], &["alpha lance", "alpha foo"]);
+        let (bp1, off1, _) = batch_store.append(b1.clone()).unwrap();
+        indexes
+            .insert_with_batch_position(&b1, off1, Some(bp1))
+            .unwrap();
+
+        // Update pk=1 → "beta lance" (no longer matches "alpha").
+        let b2 = make_batch(&schema, &[1], &["beta lance"]);
+        let (bp2, off2, _) = batch_store.append(b2.clone()).unwrap();
+        indexes
+            .insert_with_batch_position(&b2, off2, Some(bp2))
+            .unwrap();
+        let indexes = Arc::new(indexes);
+
+        let tmp = tempfile::tempdir().unwrap();
+        let base_uri = format!("{}/base", tmp.path().to_str().unwrap());
+        let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![])
+            .with_in_memory_memtables(
+                uuid::Uuid::new_v4(),
+                InMemoryMemTables {
+                    active: InMemoryMemTableRef {
+                        batch_store,
+                        index_store: indexes,
+                        schema: schema.clone(),
+                        generation: 1,
+                    },
+                    frozen: vec![],
+                },
+            );
+
+        let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema);
+        let plan = planner
+            .plan_search(
+                "text",
+                FullTextSearchQuery::new("alpha".to_string()),
+                10,
+                None,
+            )
+            .await
+            .expect("planner should produce a plan");
+
+        let ctx = datafusion::prelude::SessionContext::new();
+        let stream = plan.execute(0, ctx.task_ctx()).unwrap();
+        let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
+
+        let mut ids: Vec<i32> = Vec::new();
+        for b in &batches {
+            let col = b
+                .column_by_name("id")
+                .unwrap()
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            for i in 0..b.num_rows() {
+                ids.push(col.value(i));
+            }
+        }
+
+        assert!(
+            !ids.contains(&1),
+            "stale pk=1 (now 'beta lance') leaked on an 'alpha' search; got ids={ids:?}"
+        );
+        assert!(
+            ids.contains(&2),
+            "live pk=2 ('alpha foo') must still match 'alpha'; got ids={ids:?}"
+        );
+    }
 }
diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs
index f3f15e2e680..f040428f342 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs
@@ -15,8 +15,8 @@ use tracing::instrument;
 
 use super::collector::LsmDataSourceCollector;
 use super::data_source::LsmDataSource;
-use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkHashFilterExec, ROW_ADDRESS_COLUMN};
-use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset};
+use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN};
+use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset};
 use super::projection::{
     build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical,
 };
@@ -33,7 +33,13 @@ pub struct LsmScanPlanner {
     /// Session threaded into flushed-generation opens (shared caches).
     session: Option<Arc<Session>>,
     /// Cache of opened flushed-generation datasets.
-    flushed_cache: Option<Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<Arc<dyn DatasetCache>>,
+    /// Optional warmer fired on first open of a flushed generation.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
+    /// Over-fetch multiple for the per-source limit pushdown: block-listed
+    /// sources scan `(offset + limit) * factor` rows so cross-gen dedup drops
+    /// still leave enough live rows. Defaults to `1.0`; clamped to `>= 1.0` at use.
+    overfetch_factor: f64,
 }
 
 impl LsmScanPlanner {
@@ -49,6 +55,8 @@ impl LsmScanPlanner {
             base_schema,
             session: None,
             flushed_cache: None,
+            warmer: None,
+            overfetch_factor: 1.0,
         }
     }
 
@@ -61,11 +69,24 @@ impl LsmScanPlanner {
 
     /// Inject a cache of opened flushed-generation datasets, making repeated
     /// queries against the same generation a pure `Arc::clone`.
-    pub fn with_flushed_cache(mut self, cache: Arc<FlushedMemTableCache>) -> Self {
+    pub fn with_flushed_cache(mut self, cache: Arc<dyn DatasetCache>) -> Self {
         self.flushed_cache = Some(cache);
         self
     }
 
+    /// Inject the warmer fired on first open of a flushed generation.
+    pub fn with_warmer(mut self, warmer: Arc<dyn GenerationWarmer>) -> Self {
+        self.warmer = Some(warmer);
+        self
+    }
+
+    /// Set the over-fetch multiple that block-listed sources apply to the
+    /// per-source limit pushdown. Values below `1.0` are clamped to `1.0` at use.
+    pub fn with_overfetch_factor(mut self, factor: f64) -> Self {
+        self.overfetch_factor = factor;
+        self
+    }
+
     /// Create scan plan with deduplication.
     ///
     /// # Arguments
@@ -82,7 +103,7 @@ impl LsmScanPlanner {
     /// Each source is independently newest-per-PK (active via the fused
     /// [`MemTableDedupScanExec`](super::super::memtable::scanner), flushed via
     /// its within-generation deletion vector) and a cross-generation block-list
-    /// ([`PkHashFilterExec`]) drops any PK superseded by a newer generation.
+    /// ([`PkBlockFilterExec`]) drops any PK superseded by a newer generation.
     /// Each PK therefore survives in exactly one source, so a plain
     /// `UnionExec` carries at most one row per PK — no cross-source dedup,
     /// sort, or merge needed. `_memtable_gen` / `_rowaddr` are output-only and
@@ -119,7 +140,6 @@ impl LsmScanPlanner {
         // `Box::pin` keeps the future off `clippy::large_futures`.
         let block_lists = Box::pin(super::block_list::compute_source_block_lists(
             &sources,
-            &self.pk_columns,
             self.session.as_ref(),
             self.flushed_cache.as_ref(),
         ))
@@ -130,23 +150,59 @@ impl LsmScanPlanner {
         // cross-gen block-list, not from output ordering.
         let sources: Vec<_> = sources.into_iter().rev().collect();
 
+        // Per-source limit pushdown: an unordered LIMIT needs only
+        // `offset + limit` live rows from EACH source to fill the global
+        // limit after dedup (any-N semantics), so cap every on-disk source
+        // instead of scanning whole generations and trimming above the
+        // union. Block-listed sources over-fetch by `overfetch_factor` so
+        // cross-gen dedup drops still leave `n_needed` live rows; the
+        // `PkBlockFilterExec` warns when that was not enough. The active memtable
+        // is in-memory and within-gen append duplicates are resolved by its
+        // own dedup, so it is never capped here.
+        let n_needed = limit.map(|l| l.saturating_add(offset.unwrap_or(0)));
+        let overfetch = self.overfetch_factor.max(1.0);
+
         let mut source_plans = Vec::new();
         for source in sources {
             let is_base = matches!(source, LsmDataSource::BaseTable { .. });
-            let scan = self.build_source_scan(&source, projection, filter).await?;
+            let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. });
+            let blocked = block_lists
+                .get(&(source.shard_id(), source.generation()))
+                .cloned();
+            let fetch = match (n_needed, is_active) {
+                (Some(n), false) => Some(if blocked.is_some() {
+                    ((n as f64) * overfetch).ceil() as usize
+                } else {
+                    n
+                }),
+                _ => None,
+            };
+            let scan = self
+                .build_source_scan(&source, projection, filter, fetch)
+                .await?;
 
             // Drop cross-generation stale rows (PKs superseded by a newer gen).
-            // `k = 0`: there is no top-k, so the under-fetch warning never fires.
-            let scan = match block_lists.get(&(source.shard_id(), source.generation())) {
-                Some(set) => Arc::new(PkHashFilterExec::new(
+            // With a limit, `k = n_needed` arms the under-fetch warning; with
+            // no limit `k = 0` keeps it silent.
+            let scan = match blocked {
+                Some(set) => Arc::new(PkBlockFilterExec::new(
                     scan,
                     self.pk_columns.clone(),
-                    set.clone(),
-                    0,
+                    set,
+                    n_needed.unwrap_or(0),
                 )) as Arc<dyn ExecutionPlan>,
                 None => scan,
             };
 
+            // Post-block-list cap: each source contributes at most `n_needed`
+            // live rows toward the global limit.
+            let scan: Arc<dyn ExecutionPlan> = match n_needed {
+                Some(n) if !is_active => Arc::new(
+                    datafusion::physical_plan::limit::LocalLimitExec::new(scan, n),
+                ),
+                _ => scan,
+            };
+
             // When `_rowaddr` is surfaced, NULL it for non-base arms: only base
             // values are meaningful (e.g. for `take_rows`); per-source addresses
             // collide with base IDs.
@@ -229,6 +285,7 @@ impl LsmScanPlanner {
         source: &LsmDataSource,
         projection: Option<&[String]>,
         filter: Option<&Expr>,
+        fetch: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         match source {
             LsmDataSource::BaseTable { dataset } => {
@@ -247,13 +304,22 @@ impl LsmScanPlanner {
                 if let Some(expr) = filter {
                     scanner.filter_expr(expr.clone());
                 }
+                // Per-source limit pushdown (post-filter rows): bounds the
+                // physical scan instead of trimming above the union.
+                if let Some(fetch) = fetch {
+                    scanner.limit(Some(fetch as i64), None)?;
+                }
 
                 scanner.create_plan().await
             }
             LsmDataSource::FlushedMemTable { path, .. } => {
-                let dataset =
-                    open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref())
-                        .await?;
+                let dataset = open_flushed_dataset(
+                    path,
+                    self.session.as_ref(),
+                    self.flushed_cache.as_ref(),
+                    self.warmer.as_ref(),
+                )
+                .await?;
                 let mut scanner = dataset.scan();
 
                 let cols =
@@ -264,6 +330,12 @@ impl LsmScanPlanner {
                 if let Some(expr) = filter {
                     scanner.filter_expr(expr.clone());
                 }
+                // Per-source limit pushdown: flushed generations are
+                // within-gen live (dedup-on-flush deletion vectors), so any
+                // `fetch` post-filter rows are valid contributions.
+                if let Some(fetch) = fetch {
+                    scanner.limit(Some(fetch as i64), None)?;
+                }
 
                 scanner.create_plan().await
             }
@@ -413,13 +485,36 @@ mod integration_tests {
         .unwrap()
     }
 
-    /// Create a dataset at the given URI with the provided batches.
+    /// Create a dataset at the given URI with the provided batches. When the
+    /// schema has an `id` column, also writes the standalone PK sidecar so the
+    /// block-list can probe a flushed generation; harmless for a base table (never probed).
     async fn create_dataset(uri: &str, batches: Vec<RecordBatch>) -> Dataset {
         let schema = batches[0].schema();
-        let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
-        Dataset::write(reader, uri, Some(WriteParams::default()))
+        let has_id = schema.column_with_name("id").is_some();
+        let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema);
+        let dataset = Dataset::write(reader, uri, Some(WriteParams::default()))
             .await
-            .unwrap()
+            .unwrap();
+        if has_id {
+            super::super::block_list::write_pk_sidecar(uri, &batches, &["id"])
+                .await
+                .unwrap();
+        }
+        dataset
+    }
+
+    /// Build an in-memory memtable's `(batch_store, index_store)` with the PK
+    /// index enabled and populated (mirrors production — the block-list needs
+    /// the PK index to dedup in-memory generations).
+    fn pk_indexed(batches: &[RecordBatch]) -> (Arc<BatchStore>, Arc<IndexStore>) {
+        let batch_store = Arc::new(BatchStore::with_capacity(100));
+        let mut index = IndexStore::new();
+        index.enable_pk_index(&[("id".to_string(), 0)]);
+        for b in batches {
+            let (bp, off, _) = batch_store.append(b.clone()).unwrap();
+            index.insert_with_batch_position(b, off, Some(bp)).unwrap();
+        }
+        (batch_store, Arc::new(index))
     }
 
     /// Setup a multi-level LSM structure with:
@@ -470,10 +565,8 @@ mod integration_tests {
             .with_flushed_generation(2, "gen_2".to_string());
 
         // Create active memtable
-        let batch_store = Arc::new(BatchStore::with_capacity(100));
-        let index_store = Arc::new(IndexStore::new());
-        let active_batch = create_test_batch(&schema, &[5, 6, 7], "active");
-        let _ = batch_store.append(active_batch);
+        let (batch_store, index_store) =
+            pk_indexed(&[create_test_batch(&schema, &[5, 6, 7], "active")]);
 
         let active_memtable = InMemoryMemTables {
             active: InMemoryMemTableRef {
@@ -515,18 +608,18 @@ mod integration_tests {
         // Verify the plan (gen DESC order: active -> gen2 -> gen1 -> base):
         // - plain UnionExec at top
         // - active arm: MemTableDedupScanExec (newest gen, not block-listed)
-        // - older arms: PkHashFilterExec (cross-gen block-list) -> LanceRead
+        // - older arms: PkBlockFilterExec (cross-gen block-list) -> LanceRead
         assert_plan_node_equals(
             plan,
             "ProjectionExec:...
   CoalescePartitionsExec
     UnionExec
     MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true
-    PkHashFilterExec: pk_cols=[id]...
+    PkBlockFilterExec: pk_cols=[id]...
       LanceRead:...gen_2...
-    PkHashFilterExec: pk_cols=[id]...
+    PkBlockFilterExec: pk_cols=[id]...
       LanceRead:...gen_1...
-    PkHashFilterExec: pk_cols=[id]...
+    PkBlockFilterExec: pk_cols=[id]...
       LanceRead:...base/data...refine_filter=--",
         )
         .await
@@ -549,9 +642,9 @@ mod integration_tests {
 
         // Verify the plan with `_memtable_gen` tags (gen DESC order):
         // - plain UnionExec at top
-        // - each arm: MemtableGenTagExec -> (PkHashFilterExec ->) data source
+        // - each arm: MemtableGenTagExec -> (PkBlockFilterExec ->) data source
         //   - gen3 (active): MemtableGenTagExec -> MemTableDedupScanExec
-        //   - gen2/gen1/base: MemtableGenTagExec -> PkHashFilterExec -> LanceRead
+        //   - gen2/gen1/base: MemtableGenTagExec -> PkBlockFilterExec -> LanceRead
         assert_plan_node_equals(
             plan,
             "ProjectionExec:...
@@ -560,13 +653,13 @@ mod integration_tests {
     MemtableGenTagExec: gen=gen3
       MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true
     MemtableGenTagExec: gen=gen2
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...gen_2...
     MemtableGenTagExec: gen=gen1
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...gen_1...
     MemtableGenTagExec: gen=base
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...base/data...refine_filter=--",
         )
         .await
@@ -647,14 +740,14 @@ mod integration_tests {
         }
 
         // base/gen1/gen2 all hold PKs superseded by a newer generation, so each
-        // is wrapped in a `PkHashFilterExec`; the newest (active) arm is not.
+        // is wrapped in a `PkBlockFilterExec`; the newest (active) arm is not.
         let plan = scanner.create_plan().await.unwrap();
         let plan_str = format!(
             "{}",
             datafusion::physical_plan::displayable(plan.as_ref()).indent(true)
         );
         assert!(
-            plan_str.contains("PkHashFilterExec"),
+            plan_str.contains("PkBlockFilterExec"),
             "filtered-read plan must apply the cross-gen block-list, got:\n{}",
             plan_str
         );
@@ -730,21 +823,21 @@ mod integration_tests {
             .with_flushed_generation(2, "gen_2".to_string());
 
         // Frozen gen3 (sealed, NOT in the manifest) and active gen4.
-        let frozen_store = Arc::new(BatchStore::with_capacity(100));
-        let _ = frozen_store.append(create_test_batch(&schema, &[6, 7], "frozen"));
+        let (frozen_store, frozen_index) =
+            pk_indexed(&[create_test_batch(&schema, &[6, 7], "frozen")]);
         let frozen = InMemoryMemTableRef {
             batch_store: frozen_store,
-            index_store: Arc::new(IndexStore::new()),
+            index_store: frozen_index,
             schema: schema.clone(),
             generation: 3,
         };
 
-        let active_store = Arc::new(BatchStore::with_capacity(100));
-        let _ = active_store.append(create_test_batch(&schema, &[7, 8], "active"));
+        let (active_store, active_index) =
+            pk_indexed(&[create_test_batch(&schema, &[7, 8], "active")]);
         let in_memory = InMemoryMemTables {
             active: InMemoryMemTableRef {
                 batch_store: active_store,
-                index_store: Arc::new(IndexStore::new()),
+                index_store: active_index,
                 schema: schema.clone(),
                 generation: 4,
             },
@@ -969,12 +1062,12 @@ mod integration_tests {
     ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr]
       MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true
     ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr]
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...gen_2...
     ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr]
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...gen_1...
-    PkHashFilterExec: pk_cols=[id]...
+    PkBlockFilterExec: pk_cols=[id]...
       LanceRead:...base/data...refine_filter=--",
         )
         .await
@@ -1037,14 +1130,14 @@ mod integration_tests {
         MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true
     MemtableGenTagExec: gen=gen2
       ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr]
-        PkHashFilterExec: pk_cols=[id]...
+        PkBlockFilterExec: pk_cols=[id]...
           LanceRead:...gen_2...
     MemtableGenTagExec: gen=gen1
       ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr]
-        PkHashFilterExec: pk_cols=[id]...
+        PkBlockFilterExec: pk_cols=[id]...
           LanceRead:...gen_1...
     MemtableGenTagExec: gen=base
-      PkHashFilterExec: pk_cols=[id]...
+      PkBlockFilterExec: pk_cols=[id]...
         LanceRead:...base/data...refine_filter=--",
         )
         .await
@@ -1113,6 +1206,8 @@ mod integration_tests {
         let mut index_store = IndexStore::new();
         // Add BTree index on id column (field_id=0)
         index_store.add_btree("id_idx".to_string(), 0, "id".to_string());
+        // Reuse it as the PK index so the block-list can dedup this generation.
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
 
         let active_batch = create_test_batch(&schema, &[5, 6, 7], "active");
         let _ = batch_store.append(active_batch.clone());
@@ -1177,7 +1272,7 @@ mod integration_tests {
         // 1. Verify overall structure
         assert!(plan_str.contains("UnionExec"), "Should have UnionExec");
         assert!(
-            plan_str.contains("PkHashFilterExec"),
+            plan_str.contains("PkBlockFilterExec"),
             "older generations should be block-list filtered"
         );
         assert!(
@@ -1365,7 +1460,6 @@ mod integration_tests {
 
         // Active memtable: id=10 inserted ("keep") then updated to NULL within
         // the same generation; id=20 ("active_20") is a control that matches.
-        let batch_store = Arc::new(BatchStore::with_capacity(16));
         let active_batch = RecordBatch::try_new(
             schema.clone(),
             vec![
@@ -1378,12 +1472,12 @@ mod integration_tests {
             ],
         )
         .unwrap();
-        batch_store.append(active_batch).unwrap();
+        let (batch_store, index_store) = pk_indexed(&[active_batch]);
 
         let in_memory = InMemoryMemTables {
             active: InMemoryMemTableRef {
                 batch_store,
-                index_store: Arc::new(IndexStore::new()),
+                index_store,
                 schema: schema.clone(),
                 generation: 1,
             },
diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs
index d1353e72dcc..2da4b5cd9a6 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs
@@ -9,11 +9,14 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use arrow_array::{Array, RecordBatch};
-use arrow_schema::SchemaRef;
+use arrow_schema::{SchemaRef, SortOptions};
 use datafusion::common::ScalarValue;
 use datafusion::execution::TaskContext;
+use datafusion::physical_expr::expressions::Column;
+use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr};
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_plan::limit::GlobalLimitExec;
+use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::prelude::{Expr, SessionContext};
 use futures::TryStreamExt;
@@ -27,11 +30,8 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore;
 
 use super::collector::LsmDataSourceCollector;
 use super::data_source::LsmDataSource;
-use super::exec::{
-    BloomFilterGuardExec, CoalesceFirstExec, DedupDirection, WithinSourceDedupExec,
-    compute_pk_hash_from_scalars,
-};
-use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset};
+use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars};
+use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset};
 use super::projection::{
     build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical,
     wants_row_address, wants_row_id,
@@ -87,7 +87,9 @@ pub struct LsmPointLookupPlanner {
     /// Session threaded into flushed-generation opens (shared caches).
     session: Option<Arc<Session>>,
     /// Cache of opened flushed-generation datasets.
-    flushed_cache: Option<Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<Arc<dyn DatasetCache>>,
+    /// Optional warmer fired on first open of a flushed generation.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
     /// Precomputed canonical output schema for the no-projection case, so the
     /// hot `lookup(.., None)` path clones an `Arc` instead of rebuilding the
     /// schema on every call.
@@ -120,6 +122,7 @@ impl LsmPointLookupPlanner {
             bloom_filters: std::collections::HashMap::new(),
             session: None,
             flushed_cache: None,
+            warmer: None,
             none_target,
             task_ctx: SessionContext::new().task_ctx(),
         }
@@ -137,11 +140,17 @@ impl LsmPointLookupPlanner {
     /// front during scan setup via
     /// [`DatasetMemWalExt::prewarm_mem_wal`](crate::dataset::mem_wal::DatasetMemWalExt::prewarm_mem_wal)
     /// so the first gen-key lookup does not pay the dataset open.
-    pub fn with_flushed_cache(mut self, cache: Arc<FlushedMemTableCache>) -> Self {
+    pub fn with_flushed_cache(mut self, cache: Arc<dyn DatasetCache>) -> Self {
         self.flushed_cache = Some(cache);
         self
     }
 
+    /// Inject the warmer fired on first open of a flushed generation.
+    pub fn with_warmer(mut self, warmer: Arc<dyn GenerationWarmer>) -> Self {
+        self.warmer = Some(warmer);
+        self
+    }
+
     /// Add a bloom filter for a generation.
     ///
     /// Bloom filters are optional but improve performance by skipping
@@ -546,9 +555,13 @@ impl LsmPointLookupPlanner {
                 scanner.create_plan().await?
             }
             LsmDataSource::FlushedMemTable { path, .. } => {
-                let dataset =
-                    open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref())
-                        .await?;
+                let dataset = open_flushed_dataset(
+                    path,
+                    self.session.as_ref(),
+                    self.flushed_cache.as_ref(),
+                    self.warmer.as_ref(),
+                )
+                .await?;
                 let mut scanner = dataset.scan();
                 scanner.project(&cols.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
                 scanner.filter_expr(filter.clone());
@@ -573,19 +586,29 @@ impl LsmPointLookupPlanner {
                 // multiple rows sharing the target primary key.
                 scanner.with_row_id();
                 let raw = scanner.create_plan().await?;
-                // Within the active memtable, larger `_rowid` = newer
-                // insert. After dedup there is exactly one row per PK.
-                let deduped: Arc<dyn ExecutionPlan> = Arc::new(WithinSourceDedupExec::new(
-                    raw,
-                    self.pk_columns.clone(),
-                    lance_core::ROW_ID,
-                    DedupDirection::KeepMaxRowAddr,
-                ));
+                // The filter already restricts to the exact PK value, so the
+                // scan yields that key's insert history. Within the active
+                // memtable larger `_rowid` = newer insert, so sorting `_rowid`
+                // DESC and keeping the first row picks the newest version — one
+                // row per (value-exact) PK.
+                let rowid_idx = raw.schema().index_of(lance_core::ROW_ID)?;
+                let ordering = LexOrdering::new(vec![PhysicalSortExpr {
+                    expr: Arc::new(Column::new(lance_core::ROW_ID, rowid_idx)),
+                    options: SortOptions {
+                        descending: true,
+                        nulls_first: false,
+                    },
+                }])
+                .ok_or_else(|| {
+                    lance_core::Error::internal("point-lookup: failed to build _rowid ordering")
+                })?;
+                let newest: Arc<dyn ExecutionPlan> =
+                    Arc::new(SortExec::new(ordering, raw).with_fetch(Some(1)));
                 // Per-source `_rowid` would collide with the base table's;
                 // NULL it before canonicalization (the value is internal to
                 // this arm). project_to_canonical drops it entirely when
                 // the user didn't request `_rowid` in the projection.
-                null_columns(deduped, &[lance_core::ROW_ID])?
+                null_columns(newest, &[lance_core::ROW_ID])?
             }
         };
         project_to_canonical(scan, &target)
@@ -642,10 +665,6 @@ fn probe_position(
     pk_column: &str,
     pk_value: &ScalarValue,
 ) -> Result<ProbePos> {
-    let Some(btree) = index_store.get_btree_by_column(pk_column) else {
-        return Ok(ProbePos::NoIndex);
-    };
-
     // Visible batches are the committed prefix [0, last_visible_idx]; each
     // `StoredBatch` carries its cumulative `row_offset`, so visibility and the
     // position→batch mapping are O(1)/O(log) with no per-probe allocation.
@@ -661,22 +680,37 @@ fn probe_position(
     if visible_end == 0 {
         return Ok(ProbePos::Miss);
     }
+    let max_visible_row = visible_end - 1;
 
-    // Newest visible position of the key — a single seek-and-stop on the
-    // ordered skiplist (largest key ≤ (value, max_visible_row)). No range
-    // collect, no allocation.
-    let Some(pos) = btree.get_newest_visible(pk_value, visible_end - 1) else {
+    // A single-column primary key with a PK index has a value-keyed BTree (reused
+    // or auto-created — see `IndexStore::enable_pk_index`): collision-free, so one
+    // seek yields the answer with no re-check. The BTree is absent only when the
+    // table has no PK index, in which case the caller falls back to the plan path.
+    let Some(btree) = index_store.get_btree_by_column(pk_column) else {
+        return Ok(ProbePos::NoIndex);
+    };
+    let Some(pos) = btree.get_newest_visible(pk_value, max_visible_row) else {
         return Ok(ProbePos::Miss);
     };
+    let (batch_idx, row) = resolve_position(batch_store, last_visible_idx, pos)?;
+    Ok(ProbePos::Found { batch_idx, row })
+}
 
-    // Binary-search the owning batch by `row_offset` (appended in order).
+/// Map a global row `position` to its `(batch_idx, row_in_batch)` by binary
+/// searching the visible batch prefix on cumulative `row_offset` (batches are
+/// appended in order).
+fn resolve_position(
+    batch_store: &BatchStore,
+    last_visible_idx: usize,
+    position: u64,
+) -> Result<(usize, usize)> {
     let (mut lo, mut hi) = (0usize, last_visible_idx);
     while lo < hi {
         let mid = lo + (hi - lo).div_ceil(2);
         let off = batch_store.get(mid).map(|b| b.row_offset).ok_or_else(|| {
             lance_core::Error::internal("point-lookup: batch index out of range during search")
         })?;
-        if off <= pos {
+        if off <= position {
             lo = mid;
         } else {
             hi = mid - 1;
@@ -685,10 +719,7 @@ fn probe_position(
     let stored = batch_store
         .get(lo)
         .ok_or_else(|| lance_core::Error::internal("point-lookup: resolved batch missing"))?;
-    Ok(ProbePos::Found {
-        batch_idx: lo,
-        row: (pos - stored.row_offset) as usize,
-    })
+    Ok((lo, (position - stored.row_offset) as usize))
 }
 
 /// Gather `rows` from `batch_store`'s batch `batch_idx` into the `target`
@@ -1097,8 +1128,8 @@ mod tests {
         // Regression: same primary key inserted twice into one active
         // memtable must return the *newest* row. The bug was that
         // `FilterExec → LIMIT 1` over an insert-ordered scan returned the
-        // first (oldest) match. `WithinSourceDedupExec` collapses by PK,
-        // keeping the row with the largest `_rowid` (insert order).
+        // first (oldest) match. The plan-path active arm now sorts `_rowid`
+        // DESC and keeps the first row (largest `_rowid` = newest insert).
         use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables};
         use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
         use futures::TryStreamExt;
@@ -1118,17 +1149,17 @@ mod tests {
         let b_old = create_test_batch(&schema, &[1], "old");
         let b_new = create_test_batch(&schema, &[1], "new");
         let b_other = create_test_batch(&schema, &[2], "two");
-        let (_, _, bp_old) = batch_store.append(b_old.clone()).unwrap();
+        let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap();
         index_store
-            .insert_with_batch_position(&b_old, 0, Some(bp_old))
+            .insert_with_batch_position(&b_old, off_old, Some(bp_old))
             .unwrap();
-        let (_, _, bp_new) = batch_store.append(b_new.clone()).unwrap();
+        let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap();
         index_store
-            .insert_with_batch_position(&b_new, 1, Some(bp_new))
+            .insert_with_batch_position(&b_new, off_new, Some(bp_new))
             .unwrap();
-        let (_, _, bp_other) = batch_store.append(b_other.clone()).unwrap();
+        let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap();
         index_store
-            .insert_with_batch_position(&b_other, 2, Some(bp_other))
+            .insert_with_batch_position(&b_other, off_other, Some(bp_other))
             .unwrap();
         let index_store = Arc::new(index_store);
 
@@ -1168,6 +1199,88 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_point_lookup_probes_auto_created_pk_btree() {
+        // No user `add_btree` on the PK column — only `enable_pk_index`, which
+        // auto-creates a BTree on the primary key (the production default). The
+        // fast probe must resolve the newest visible version through that
+        // collision-free BTree rather than falling back to the plan path.
+        use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables};
+        use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+
+        let schema = create_pk_schema();
+        let temp_dir = tempfile::tempdir().unwrap();
+        let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap());
+
+        let batch_store = Arc::new(BatchStore::with_capacity(16));
+        let mut index_store = IndexStore::new();
+        // No `add_btree` — `enable_pk_index` auto-creates the PK BTree.
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
+
+        // pk=1 written twice (the newer second), plus an unrelated pk=2.
+        let b_old = create_test_batch(&schema, &[1], "old");
+        let b_new = create_test_batch(&schema, &[1], "new");
+        let b_other = create_test_batch(&schema, &[2], "two");
+        let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap();
+        index_store
+            .insert_with_batch_position(&b_old, off_old, Some(bp_old))
+            .unwrap();
+        let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap();
+        index_store
+            .insert_with_batch_position(&b_new, off_new, Some(bp_new))
+            .unwrap();
+        let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap();
+        index_store
+            .insert_with_batch_position(&b_other, off_other, Some(bp_other))
+            .unwrap();
+        let index_store = Arc::new(index_store);
+
+        let shard_id = Uuid::new_v4();
+        let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![])
+            .with_in_memory_memtables(
+                shard_id,
+                InMemoryMemTables {
+                    active: InMemoryMemTableRef {
+                        batch_store,
+                        index_store,
+                        schema: schema.clone(),
+                        generation: 1,
+                    },
+                    frozen: vec![],
+                },
+            );
+        let planner = LsmPointLookupPlanner::new(collector, vec!["id".to_string()], schema);
+
+        // `lookup` takes the fast probe path (single-column PK, no system cols).
+        let hit = planner
+            .lookup(&[ScalarValue::Int32(Some(1))], None)
+            .await
+            .unwrap()
+            .expect("pk=1 must be found via the PK-position index probe");
+        assert_eq!(hit.num_rows(), 1);
+        let name = hit
+            .column_by_name("name")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!(
+            name.value(0),
+            "new_1",
+            "probe must return the newest version"
+        );
+
+        // An absent key resolves to None (no on-disk sources to consult).
+        assert!(
+            planner
+                .lookup(&[ScalarValue::Int32(Some(999))], None)
+                .await
+                .unwrap()
+                .is_none(),
+            "absent key must miss"
+        );
+    }
+
     #[tokio::test]
     async fn test_point_lookup_flushed_memtable_returns_newest_duplicate() {
         // Regression / invariant pin: when a flushed memtable contains two
diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs
index b6b1f952b25..7f849f3d8bf 100644
--- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs
+++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs
@@ -27,8 +27,7 @@ use crate::io::exec::TakeExec;
 
 use super::collector::LsmDataSourceCollector;
 use super::data_source::LsmDataSource;
-use super::exec::{DedupDirection, WithinSourceDedupExec};
-use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset};
+use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset};
 use super::projection::{
     DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns,
     project_to_canonical, wants_row_id,
@@ -38,10 +37,12 @@ use crate::session::Session;
 /// Plans vector search queries over LSM data.
 ///
 /// Each source is independently newest-per-PK before the union — the active
-/// memtable via an over-fetched KNN + within-source dedup, flushed generations
-/// via their within-generation deletion vector — and the cross-generation
-/// block-list ([`super::exec::PkHashFilterExec`]) drops any PK superseded by a
-/// newer generation. So each PK reaches the union from exactly one source and a
+/// memtable via an over-fetched KNN + a newest-per-PK recency filter
+/// ([`super::exec::NewestPkFilterExec`], which drops a hit that isn't the newest
+/// visible version of its PK), flushed generations via their within-generation
+/// deletion vector — and the cross-generation block-list
+/// ([`super::exec::PkBlockFilterExec`]) drops any PK superseded by a newer
+/// generation. So each PK reaches the union from exactly one source and a
 /// distance-ordered merge yields the global top-k; no cross-source dedup is
 /// needed.
 ///
@@ -54,15 +55,15 @@ use crate::session::Session;
 ///       UnionExec
 ///         ProjectionExec (canonical output schema)
 ///           SortExec(_distance, fetch=k)
-///             WithinSourceDedupExec: KeepMaxRowAddr           (active)
+///             NewestPkFilterExec: newest-per-PK recency        (active)
 ///               KNNExec: active memtable, fetch=ceil(k*overfetch)
 ///         ProjectionExec (canonical output schema)
 ///           ProjectionExec (null_columns _rowid)
-///             PkHashFilterExec: block-list                   (flushed)
+///             PkBlockFilterExec: block-list                   (flushed)
 ///               KNNExec: flushed gen N, fetch=ceil(k*overfetch) (fast_search)
 ///         … one per flushed gen …
 ///         ProjectionExec (canonical output schema)
-///           PkHashFilterExec: block-list                     (base)
+///           PkBlockFilterExec: block-list                     (base)
 ///             KNNExec: base table, k (fast_search)[.refine()?]
 /// ```
 ///
@@ -92,7 +93,9 @@ pub struct LsmVectorSearchPlanner {
     /// Session threaded into flushed-generation opens (shared caches).
     session: Option<Arc<Session>>,
     /// Cache of opened flushed-generation datasets.
-    flushed_cache: Option<Arc<FlushedMemTableCache>>,
+    flushed_cache: Option<Arc<dyn DatasetCache>>,
+    /// Optional warmer fired on first open of a flushed generation.
+    warmer: Option<Arc<dyn GenerationWarmer>>,
 }
 
 impl LsmVectorSearchPlanner {
@@ -121,6 +124,7 @@ impl LsmVectorSearchPlanner {
             dataset: None,
             session: None,
             flushed_cache: None,
+            warmer: None,
         }
     }
 
@@ -133,11 +137,17 @@ impl LsmVectorSearchPlanner {
 
     /// Inject a cache of opened flushed-generation datasets, making repeated
     /// searches against the same generation a pure `Arc::clone`.
-    pub fn with_flushed_cache(mut self, cache: Arc<FlushedMemTableCache>) -> Self {
+    pub fn with_flushed_cache(mut self, cache: Arc<dyn DatasetCache>) -> Self {
         self.flushed_cache = Some(cache);
         self
     }
 
+    /// Inject the warmer fired on first open of a flushed generation.
+    pub fn with_warmer(mut self, warmer: Arc<dyn GenerationWarmer>) -> Self {
+        self.warmer = Some(warmer);
+        self
+    }
+
     /// Set the base dataset for post-rerank take.
     ///
     /// After global PK dedup and sort, a `TakeExec` against this dataset
@@ -168,7 +178,7 @@ impl LsmVectorSearchPlanner {
     ///   the rows that filtering drops:
     ///
     ///   - `factor < 1.0` (e.g. `0.0`): **stale filtering off.** The per-source
-    ///     block-list / [`super::exec::PkHashFilterExec`] is not built or applied,
+    ///     block-list / [`super::exec::PkBlockFilterExec`] is not built or applied,
     ///     so rows superseded by a newer generation can surface. The global PK
     ///     dedup still runs, so it still suppresses stale copies in the cases
     ///     where both the stale and the fresh row reach it.
@@ -210,11 +220,10 @@ impl LsmVectorSearchPlanner {
         // live candidates after the post-filter.
         let overfetch_factor = overfetch_factor.max(1.0);
 
-        // Per-source PK-hash block sets (`NEWER(G)`; base = union of all gens).
+        // Per-source PK block sets (`NEWER(G)`; base = union of all gens).
         // `Box::pin` keeps the future off `clippy::large_futures`.
         let block_lists = Box::pin(super::block_list::compute_source_block_lists(
             &sources,
-            &self.pk_columns,
             self.session.as_ref(),
             self.flushed_cache.as_ref(),
         ))
@@ -233,49 +242,83 @@ impl LsmVectorSearchPlanner {
         // `block_lists` is non-empty exactly when a newer generation exists.
         let refine_base = refine_base_table || !block_lists.is_empty();
 
+        // Stage per-source over-fetch decisions, then build every KNN plan
+        // concurrently — the builds are independent and a sequential loop was
+        // the dominant serial planning cost at multiple generations.
+        let arm_inputs: Vec<_> = sources
+            .iter()
+            .map(|source| {
+                let generation = source.generation();
+                let is_base = matches!(source, LsmDataSource::BaseTable { .. });
+                let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. });
+                // Over-fetch when the post-source filter can drop candidates: a
+                // blocked source loses superseded rows; the active source's
+                // newest-per-PK recency filter drops stale-version HNSW hits. Block
+                // lookup is per shard — generations are per-shard.
+                let blocked = block_lists.get(&(source.shard_id(), generation));
+                let fetch_k = if blocked.is_some() || is_active {
+                    ((k as f64) * overfetch_factor).ceil() as usize
+                } else {
+                    k
+                };
+                (source, is_base, is_active, blocked, fetch_k)
+            })
+            .collect();
+        let built = futures::future::try_join_all(arm_inputs.iter().map(
+            |(source, is_base, _, _, fetch_k)| {
+                Box::pin(self.build_knn_plan(
+                    source,
+                    query_vector,
+                    *fetch_k,
+                    nprobes,
+                    projection,
+                    *is_base && refine_base,
+                ))
+            },
+        ))
+        .await?;
+
         let mut knn_plans = Vec::new();
-        for source in &sources {
-            let generation = source.generation();
-            let is_base = matches!(source, LsmDataSource::BaseTable { .. });
-            let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. });
-            // Over-fetch when the post-source filter can drop candidates: a
-            // blocked source loses superseded rows; the active source's
-            // within-source dedup collapses duplicate-PK HNSW nodes. Block
-            // lookup is per shard — generations are per-shard.
-            let blocked = block_lists.get(&(source.shard_id(), generation));
-            let fetch_k = if blocked.is_some() || is_active {
-                ((k as f64) * overfetch_factor).ceil() as usize
-            } else {
-                k
-            };
-            let knn = Box::pin(self.build_knn_plan(
-                source,
-                query_vector,
-                fetch_k,
-                nprobes,
-                projection,
-                is_base && refine_base,
-            ))
-            .await?;
+        // `build_knn_plan` returns each active arm's max-visible snapshot
+        // alongside its plan; the active arm's NewestPkFilterExec needs both it
+        // and `source` (for the batch/index stores), so neither is discarded.
+        for ((source, is_base, is_active, blocked, _), (knn, active_max_visible)) in
+            arm_inputs.iter().zip(built)
+        {
+            let is_base = *is_base;
+            let is_active = *is_active;
+            let blocked = *blocked;
             // Make each source independently newest-per-PK before the union:
             //  * active: the append-only HNSW returns one node per inserted
-            //    version, so collapse duplicate PKs to the newest insert
-            //    (KeepMaxRowAddr on `_rowid`) and re-sort by distance. This
-            //    stays probabilistic — a fresh version evicted from the
-            //    over-fetched top-k still leaks.
+            //    version *and* leaves stale versions of updated PKs live. The
+            //    recency filter keeps only the hit that is the newest visible
+            //    version of its PK (per the maintained MVCC PK-position index),
+            //    closing the predicate-crossing stale read, then re-sort by
+            //    distance.
             //  * flushed/base: drop cross-gen superseded rows via the
             //    block-list (within-gen is handled by the flushed DV).
             let knn = if is_active {
-                let deduped: Arc<dyn ExecutionPlan> = Arc::new(WithinSourceDedupExec::new(
-                    knn,
-                    self.pk_columns.clone(),
-                    lance_core::ROW_ID,
-                    DedupDirection::KeepMaxRowAddr,
-                ));
-                sort_by_distance(deduped, k)?
+                let (batch_store, index_store) = match source {
+                    LsmDataSource::ActiveMemTable {
+                        batch_store,
+                        index_store,
+                        ..
+                    } => (batch_store.clone(), index_store.clone()),
+                    _ => unreachable!("is_active implies ActiveMemTable"),
+                };
+                let filtered: Arc<dyn ExecutionPlan> =
+                    Arc::new(super::exec::NewestPkFilterExec::new(
+                        knn,
+                        self.pk_columns.clone(),
+                        lance_core::ROW_ID,
+                        index_store,
+                        batch_store,
+                        active_max_visible.expect("active arm returns its max_visible snapshot"),
+                    ));
+                sort_by_distance(filtered, k)?
             } else {
                 match blocked {
-                    Some(set) => Arc::new(super::exec::PkHashFilterExec::new(
+                    Some(set) => Arc::new(super::exec::PkBlockFilterExec::new(
                         knn,
                         self.pk_columns.clone(),
                         set.clone(),
@@ -301,6 +344,10 @@ impl LsmVectorSearchPlanner {
         // No cross-source dedup needed (see struct doc): SortExec(per partition)
         // + SortPreservingMerge does the p-way distance-ordered top-k merge.
         #[allow(deprecated)]
+        // The downstream `SortPreservingMergeExec` already spawns one driver
+        // task per input partition (one per union arm) via `spawn_buffered`, so
+        // each arm's CPU work (HNSW search, distance refine) runs on its own
+        // task without an extra repartition.
         let merged: Arc<dyn ExecutionPlan> = Arc::new(UnionExec::new(knn_plans));
 
         let distance_idx = merged.schema().index_of(DISTANCE_COLUMN).map_err(|_| {
@@ -364,11 +411,15 @@ impl LsmVectorSearchPlanner {
             merged_sorted
         };
 
-        // Under-fetch is warned per-source inside `PkHashFilterExec`.
+        // Under-fetch is warned per-source inside `PkBlockFilterExec`.
         Ok(result)
     }
 
     /// Build KNN plan for a single data source.
+    ///
+    /// Returns the plan and, for the active memtable, the `max_visible_batch_position`
+    /// snapshot its scanner latched — threaded into the recency filter so it keys
+    /// on the same snapshot the search saw (`None` for base / flushed sources).
     async fn build_knn_plan(
         &self,
         source: &LsmDataSource,
@@ -377,7 +428,7 @@ impl LsmVectorSearchPlanner {
         nprobes: usize,
         projection: Option<&[String]>,
         refine: bool,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
+    ) -> Result<(Arc<dyn ExecutionPlan>, Option<usize>)> {
         match source {
             LsmDataSource::BaseTable { dataset } => {
                 let mut scanner = dataset.scan();
@@ -402,12 +453,16 @@ impl LsmVectorSearchPlanner {
                 if refine {
                     scanner.refine(1);
                 }
-                scanner.create_plan().await
+                Ok((scanner.create_plan().await?, None))
             }
             LsmDataSource::FlushedMemTable { path, .. } => {
-                let dataset =
-                    open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref())
-                        .await?;
+                let dataset = open_flushed_dataset(
+                    path,
+                    self.session.as_ref(),
+                    self.flushed_cache.as_ref(),
+                    self.warmer.as_ref(),
+                )
+                .await?;
                 let mut scanner = dataset.scan();
                 let cols =
                     build_scanner_projection(projection, &self.base_schema, &self.pk_columns);
@@ -418,7 +473,7 @@ impl LsmVectorSearchPlanner {
                 scanner.nprobes(nprobes);
                 scanner.distance_metric(self.distance_type);
                 scanner.fast_search();
-                scanner.create_plan().await
+                Ok((scanner.create_plan().await?, None))
             }
             LsmDataSource::ActiveMemTable {
                 batch_store,
@@ -436,8 +491,8 @@ impl LsmVectorSearchPlanner {
                     build_scanner_projection(projection, &self.base_schema, &self.pk_columns);
                 scanner.project(&cols.iter().map(|s| s.as_str()).collect::<Vec<_>>());
                 // Expose `_rowid` (BatchStore row offset, monotonic with
-                // insert order) so [`WithinSourceDedupExec`] can collapse
-                // duplicate-PK rows to the newest insert. The value is
+                // insert order) so `NewestPkFilterExec` can compare each hit's
+                // position against the PK-position index. The value is
                 // per-source and NULL'd before reaching the canonical merge.
                 // (VectorIndexExec only plumbs `with_row_id`, not
                 // `with_row_address`, but the two yield identical values
@@ -447,7 +502,9 @@ impl LsmVectorSearchPlanner {
                 scanner.nearest(&self.vector_column, query_arr, k);
                 scanner.nprobes(nprobes);
                 scanner.distance_metric(self.distance_type);
-                scanner.create_plan().await
+                let plan = scanner.create_plan().await?;
+                // Capture the scanner's own latched snapshot for the recency filter.
+                Ok((plan, Some(scanner.max_visible_batch_position())))
             }
         }
     }
@@ -567,10 +624,19 @@ mod tests {
 
     async fn create_dataset(uri: &str, batches: Vec<RecordBatch>) -> Dataset {
         let schema = batches[0].schema();
-        let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
-        Dataset::write(reader, uri, Some(WriteParams::default()))
+        let has_id = schema.column_with_name("id").is_some();
+        let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema);
+        let dataset = Dataset::write(reader, uri, Some(WriteParams::default()))
             .await
-            .unwrap()
+            .unwrap();
+        // Also write the standalone PK sidecar (on `id`) so a flushed-generation
+        // source can be probed by the block-list (harmless for a base table).
+        if has_id {
+            crate::dataset::mem_wal::scanner::block_list::write_pk_sidecar(uri, &batches, &["id"])
+                .await
+                .unwrap();
+        }
+        dataset
     }
 
     #[tokio::test]
@@ -641,6 +707,7 @@ mod tests {
         // Active memtable with HNSW index over the "vector" column.
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -759,6 +826,7 @@ mod tests {
 
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -838,6 +906,7 @@ mod tests {
 
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -951,6 +1020,7 @@ mod tests {
 
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1007,8 +1077,7 @@ mod tests {
             plan_str
         );
         assert!(
-            plan_str.contains("WithinSourceDedupExec")
-                && plan_str.contains("SortPreservingMergeExec"),
+            plan_str.contains("NewestPkFilterExec") && plan_str.contains("SortPreservingMergeExec"),
             "expected per-arm dedup + distance merge, got:\n{}",
             plan_str
         );
@@ -1091,6 +1160,7 @@ mod tests {
         // "right" vector close to the query, plus an unrelated pk=2.
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1210,6 +1280,7 @@ mod tests {
         // Active memtable: id=3 with HNSW index.
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1436,9 +1507,9 @@ mod tests {
     #[tokio::test]
     async fn test_vector_search_dedup_within_active_memtable() {
         // Regression: same PK inserted twice into one active memtable with
-        // *different* vectors. HNSW indexes each as a distinct node, so
-        // without WithinSourceDedupExec a KNN can return both candidates
-        // for the same PK and pollute top-k. The newer insert must win.
+        // *different* vectors. HNSW indexes each as a distinct node, so without
+        // the recency filter a KNN can return both candidates for the same PK
+        // and pollute top-k. The newer insert must win.
         use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables};
         use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
         use datafusion::prelude::SessionContext;
@@ -1450,6 +1521,7 @@ mod tests {
 
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1513,14 +1585,14 @@ mod tests {
             .await
             .unwrap();
 
-        // The active arm collapses duplicate-PK HNSW nodes itself via
-        // WithinSourceDedupExec — there is no cross-source dedup fallback.
+        // The active arm collapses duplicate-PK HNSW nodes itself via the
+        // recency filter — there is no cross-source dedup fallback.
         let plan_str = format!(
             "{}",
             datafusion::physical_plan::displayable(plan.as_ref()).indent(true)
         );
         assert!(
-            plan_str.contains("WithinSourceDedupExec"),
+            plan_str.contains("NewestPkFilterExec"),
             "active vector arm must self-dedup, got:\n{}",
             plan_str
         );
@@ -1549,10 +1621,120 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_vector_search_active_stale_update_out_of_neighborhood() {
+        // BUG REPRODUCTION (vector case: a PK update that moves out of the neighborhood).
+        //
+        // Within a *single* active memtable, pk=1 is first inserted ON the query
+        // (distance ~0), then updated to a FAR vector. The append-only HNSW keeps
+        // both nodes live. A result-set dedup only collapses duplicate PKs that
+        // are BOTH present in the over-fetched candidate set.
+        //
+        // Here the fresh (far) pk=1 is evicted from the candidate set — there are
+        // enough nearer filler rows that it ranks below the fetch cutoff — so the
+        // dedup never sees it and the STALE near pk=1 leaks as the nearest hit.
+        // This is the predicate-crossing hole: the row that *would* suppress the
+        // stale version isn't in the result set, so result-set dedup can't help.
+        //
+        // Desired (NewestPkFilterExec) behaviour: pk=1's newest row-position is
+        // the far one, computed predicate-independently over the whole memtable,
+        // so the stale near node is dropped and pk=1 must NOT surface at ~0.
+        use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables};
+        use crate::dataset::mem_wal::write::{BatchStore, IndexStore};
+        use datafusion::prelude::SessionContext;
+        use futures::TryStreamExt;
+
+        let schema = create_vector_schema();
+        let temp_dir = tempfile::tempdir().unwrap();
+        let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap());
+
+        let batch_store = Arc::new(BatchStore::with_capacity(16));
+        let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
+        index_store.add_hnsw(
+            "vector_hnsw".to_string(),
+            1,
+            "vector".to_string(),
+            lance_linalg::distance::DistanceType::L2,
+            64,
+            8,
+        );
+
+        // First append: stale pk=1 ON the query, plus five filler rows strictly
+        // farther than pk=1 but far nearer than the eventual fresh pk=1.
+        let q = [0.1, 0.2, 0.3, 0.4];
+        let stale_then_fillers = batch_rows(
+            &schema,
+            &[
+                (1, q),
+                (10, [0.11, 0.21, 0.31, 0.41]),
+                (11, [0.13, 0.23, 0.33, 0.43]),
+                (12, [0.15, 0.25, 0.35, 0.45]),
+                (13, [0.17, 0.27, 0.37, 0.47]),
+                (14, [0.19, 0.29, 0.39, 0.49]),
+            ],
+        );
+        let (bp0, off0, _) = batch_store.append(stale_then_fillers.clone()).unwrap();
+        index_store
+            .insert_with_batch_position(&stale_then_fillers, off0, Some(bp0))
+            .unwrap();
+
+        // Second append: the UPDATE — pk=1 moved far from the query. This is the
+        // newest version (largest row position) but it sits well outside top-k.
+        let fresh_pk1 = batch_rows(&schema, &[(1, [9.0, 9.0, 9.0, 9.0])]);
+        let (bp1, off1, _) = batch_store.append(fresh_pk1.clone()).unwrap();
+        index_store
+            .insert_with_batch_position(&fresh_pk1, off1, Some(bp1))
+            .unwrap();
+        let index_store = Arc::new(index_store);
+
+        let shard_id = uuid::Uuid::new_v4();
+        let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![])
+            .with_in_memory_memtables(
+                shard_id,
+                InMemoryMemTables {
+                    active: InMemoryMemTableRef {
+                        batch_store,
+                        index_store,
+                        schema: schema.clone(),
+                        generation: 1,
+                    },
+                    frozen: vec![],
+                },
+            );
+
+        let planner = LsmVectorSearchPlanner::new(
+            collector,
+            vec!["id".to_string()],
+            schema,
+            "vector".to_string(),
+            lance_linalg::distance::DistanceType::L2,
+        );
+
+        // k=3, no over-fetch: the candidate set is {pk1@near, two nearest
+        // fillers}; fresh pk1@far ranks 7th and never enters the candidates.
+        let query = create_query_vector();
+        let plan = planner
+            .plan_search(&query, 3, 1, None, false, 1.0)
+            .await
+            .unwrap();
+        let ctx = SessionContext::new();
+        let stream = plan.execute(0, ctx.task_ctx()).unwrap();
+        let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
+        let rows = collect_id_dist(&batches);
+
+        assert!(
+            !rows.iter().any(|&(id, d)| id == 1 && d.abs() < 1e-3),
+            "stale near pk=1 leaked: its live vector is far from the query, so it \
+             must not appear at distance ~0. results={:?}",
+            rows
+        );
+    }
+
     #[tokio::test]
     async fn test_vector_search_stale_read_when_fresh_falls_out_of_top_k() {
         // Regression for the cross-generation stale-read gap that the
-        // PkHashFilterExec block-list closes.
+        // PkBlockFilterExec block-list closes.
         //
         // Scenario:
         //   * Base (gen 0): stale pk=1 sitting on the query (distance ~0).
@@ -1587,6 +1769,7 @@ mod tests {
         // active arm surfaces pk=2 and drops fresh pk=1.
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1783,6 +1966,7 @@ mod tests {
         // Active (gen 1): pk 1,2,3 re-inserted with a far vector (the fresh value).
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -1987,6 +2171,7 @@ mod tests {
         // Active: (1,1) re-inserted far (fresh) + an unrelated nearby (2,2).
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id1".to_string(), 0), ("id2".to_string(), 1)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
@@ -2091,6 +2276,7 @@ mod tests {
 
         let batch_store = Arc::new(BatchStore::with_capacity(16));
         let mut index_store = IndexStore::new();
+        index_store.enable_pk_index(&[("id".to_string(), 0)]);
         index_store.add_hnsw(
             "vector_hnsw".to_string(),
             1,
diff --git a/rust/lance/src/dataset/mem_wal/util.rs b/rust/lance/src/dataset/mem_wal/util.rs
index d1413b84b2a..3f5090f6b40 100644
--- a/rust/lance/src/dataset/mem_wal/util.rs
+++ b/rust/lance/src/dataset/mem_wal/util.rs
@@ -169,6 +169,16 @@ pub fn flushed_memtable_path(
     shard_base_path(base_path, shard_id).join(format!("{}_gen_{}", random_hash, generation))
 }
 
+/// Subdirectory of a flushed generation holding its standalone primary-key
+/// dedup index (a sidecar BTree, not registered in the manifest). Both the
+/// flush writer and the block-list probe join this onto the generation path.
+pub const PK_INDEX_DIR: &str = "_pk_index";
+
+/// Path to a flushed generation's standalone primary-key dedup index.
+pub fn pk_index_path(gen_path: &Path) -> Path {
+    gen_path.clone().join(PK_INDEX_DIR)
+}
+
 /// Generate an 8-character random hex string for flushed MemTable directories.
 pub fn generate_random_hash() -> String {
     let bytes: [u8; 4] = rand::random();
diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs
index 441da920b57..491bb68aec5 100644
--- a/rust/lance/src/dataset/mem_wal/write.rs
+++ b/rust/lance/src/dataset/mem_wal/write.rs
@@ -47,6 +47,7 @@ pub use super::util::{WatchableOnceCell, WatchableOnceCellReader};
 pub use super::wal::{WalEntry, WalEntryData, WalFlushResult, WalFlusher};
 
 use super::memtable::flush::TriggerMemTableFlush;
+use super::scanner::GenerationWarmer;
 use super::wal::{
     TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, empty_flush_result,
 };
@@ -177,6 +178,21 @@ pub struct ShardWriterConfig {
     /// Default: 60 seconds
     pub stats_log_interval: Option<Duration>,
 
+    /// How long a frozen memtable lingers in memory after its flush commits,
+    /// before it is evicted and served only from the on-disk flushed dataset.
+    ///
+    /// `Duration::ZERO` (the default) disables retention: evict on commit, no
+    /// sweep ticker. Correct for single-shot queries, which can't observe a
+    /// generation evicted mid-read.
+    ///
+    /// A non-zero value is required only for queries split across reads (e.g.
+    /// fresh tier and base table read separately, then deduped): the flushed
+    /// dataset loses the per-batch boundaries that bound as-of membership
+    /// (see [`crate::dataset::mem_wal::scanner::FreshTierWatermark`]), so a
+    /// generation evicted between a query's reads can serve a stale row. Set it
+    /// above the worst-case multi-part query latency, with margin.
+    pub frozen_memtable_grace: Duration,
+
     /// Whether to maintain an in-memory MemTable on top of the WAL.
     ///
     /// When `true` (default), the writer maintains an in-memory `MemTable`,
@@ -216,6 +232,11 @@ pub struct ShardWriterConfig {
     ///
     /// Default: empty.
     pub hnsw_params: HashMap<String, HnswBuildParams>,
+
+    /// Optional warmer fired pre-commit for each new generation (zero cold reads
+    /// on first query). Wired to the flusher; supplied by the consumer (e.g. the
+    /// WAL pod). Default: `None`.
+    pub warmer: Option<Arc<dyn GenerationWarmer>>,
 }
 
 impl Default for ShardWriterConfig {
@@ -236,8 +257,10 @@ impl Default for ShardWriterConfig {
             async_index_buffer_rows: 10_000,
             async_index_interval: Duration::from_secs(1),
             stats_log_interval: Some(Duration::from_secs(60)), // 1 minute
+            frozen_memtable_grace: Duration::ZERO,
             enable_memtable: true,
             hnsw_params: HashMap::new(),
+            warmer: None,
         }
     }
 }
@@ -335,6 +358,13 @@ impl ShardWriterConfig {
         self
     }
 
+    /// Set how long a flushed memtable lingers in memory before eviction. Zero
+    /// evicts on commit; non-zero should exceed multi-part query latency.
+    pub fn with_frozen_memtable_grace(mut self, grace: Duration) -> Self {
+        self.frozen_memtable_grace = grace;
+        self
+    }
+
     /// Toggle the in-memory MemTable layer. See `enable_memtable` for the
     /// full WAL-only-mode contract. Defaults to `true`.
     pub fn with_enable_memtable(mut self, enable: bool) -> Self {
@@ -708,6 +738,15 @@ pub struct WriteResult {
     pub batch_positions: std::ops::Range<usize>,
 }
 
+/// A sealed memtable kept queryable in memory. `flushed_at_ms` is `None` while
+/// the generation is still awaiting (or retrying) its flush, and `Some(t)` once
+/// the flush commits — after which it lingers for `frozen_memtable_grace` so
+/// in-flight as-of reads keep batch-resolved membership, then is swept.
+struct FrozenMemTable {
+    memtable: Arc<MemTable>,
+    flushed_at_ms: Option<u64>,
+}
+
 /// ShardWriter state shared across tasks.
 struct WriterState {
     memtable: MemTable,
@@ -716,12 +755,13 @@ struct WriterState {
     frozen_memtable_bytes: usize,
     /// Flush watchers for frozen memtables (for backpressure).
     frozen_flush_watchers: VecDeque<(usize, DurabilityWatcher)>,
-    /// Sealed-but-undrained memtables, kept queryable so a concurrent reader
-    /// sees no hole between `freeze_memtable` and the flush task's manifest
-    /// commit. Pushed in `freeze_memtable`; removed by generation in
-    /// `flush_memtable` on commit success only (retained on failure until a
-    /// later flush or WAL replay on reopen).
-    frozen_memtables: VecDeque<Arc<MemTable>>,
+    /// Sealed memtables, kept queryable so a concurrent reader sees no hole
+    /// between `freeze_memtable` and the flush task's manifest commit, and for
+    /// `frozen_memtable_grace` beyond it so as-of reads stay batch-resolved.
+    /// Pushed in `freeze_memtable`; stamped `flushed_at_ms` by `flush_memtable`
+    /// on commit success only (retained un-stamped on failure until a later
+    /// flush or WAL replay on reopen); swept after the grace by `SweepExpired`.
+    frozen_memtables: VecDeque<FrozenMemTable>,
     /// Flag to prevent duplicate memtable flush requests.
     flush_requested: bool,
     /// Counter for WAL flush threshold crossings.
@@ -846,6 +886,16 @@ async fn replay_memtable_from_wal(
     Ok(position)
 }
 
+/// Pair each primary-key column name with its field id (both derived from the
+/// schema's primary key, in the same order) for [`IndexStore::enable_pk_index`].
+fn pk_index_columns(pk_columns: &[String], pk_field_ids: &[i32]) -> Vec<(String, i32)> {
+    pk_columns
+        .iter()
+        .cloned()
+        .zip(pk_field_ids.iter().copied())
+        .collect()
+}
+
 /// Shared state for writer operations.
 struct SharedWriterState {
     state: Arc<RwLock<WriterState>>,
@@ -855,6 +905,9 @@ struct SharedWriterState {
     config: ShardWriterConfig,
     schema: Arc<ArrowSchema>,
     pk_field_ids: Vec<i32>,
+    /// Primary-key column names, used to (re)enable the PK-position index on
+    /// each fresh active memtable created at freeze.
+    pk_columns: Vec<String>,
     max_memtable_batches: usize,
     max_memtable_rows: usize,
     index_configs: Vec<MemIndexConfig>,
@@ -870,6 +923,7 @@ impl SharedWriterState {
         config: ShardWriterConfig,
         schema: Arc<ArrowSchema>,
         pk_field_ids: Vec<i32>,
+        pk_columns: Vec<String>,
         max_memtable_batches: usize,
         max_memtable_rows: usize,
         index_configs: Vec<MemIndexConfig>,
@@ -882,6 +936,7 @@ impl SharedWriterState {
             config,
             schema,
             pk_field_ids,
+            pk_columns,
             max_memtable_batches,
             max_memtable_rows,
             index_configs,
@@ -907,13 +962,17 @@ impl SharedWriterState {
             self.max_memtable_batches,
         )?;
 
-        if !self.index_configs.is_empty() {
-            let indexes = Arc::new(IndexStore::from_configs(
+        // Build an IndexStore when there are user indexes *or* a primary key:
+        // the PK dedup index (and its flushed on-disk sidecar) is required for
+        // cross-generation dedup even when no secondary index is configured.
+        if !self.index_configs.is_empty() || !self.pk_columns.is_empty() {
+            let mut indexes = IndexStore::from_configs(
                 &self.index_configs,
                 self.max_memtable_rows,
                 self.max_memtable_batches,
-            )?);
-            new_memtable.set_indexes_arc(indexes);
+            )?;
+            indexes.enable_pk_index(&pk_index_columns(&self.pk_columns, &self.pk_field_ids));
+            new_memtable.set_indexes_arc(Arc::new(indexes));
         }
 
         let mut old_memtable = std::mem::replace(&mut state.memtable, new_memtable);
@@ -949,10 +1008,13 @@ impl SharedWriterState {
 
         let frozen_memtable = Arc::new(old_memtable);
 
-        // Keep this generation queryable until its manifest commit lands
-        // (dropped in `flush_memtable`, success only). Arc refcount, not a
-        // copy — the flush task holds it alive for the whole drain anyway.
-        state.frozen_memtables.push_back(frozen_memtable.clone());
+        // Keep this generation queryable past its manifest commit (swept after
+        // the grace by `SweepExpired`). Arc refcount, not a copy — the flush
+        // task holds it alive for the whole drain anyway.
+        state.frozen_memtables.push_back(FrozenMemTable {
+            memtable: frozen_memtable.clone(),
+            flushed_at_ms: None,
+        });
 
         debug!(
             "Frozen memtable generation {}, pending_count = {}",
@@ -960,7 +1022,7 @@ impl SharedWriterState {
             state.frozen_flush_watchers.len()
         );
 
-        let _ = self.memtable_flush_tx.send(TriggerMemTableFlush {
+        let _ = self.memtable_flush_tx.send(TriggerMemTableFlush::Flush {
             memtable: frozen_memtable,
             done: None,
         });
@@ -1287,11 +1349,9 @@ impl ShardWriter {
     ) -> Result<WriterMode> {
         // Create MemTable with primary key field IDs from schema
         let lance_schema = Schema::try_from(schema.as_ref())?;
-        let pk_field_ids: Vec<i32> = lance_schema
-            .unenforced_primary_key()
-            .iter()
-            .map(|f| f.id)
-            .collect();
+        let pk_fields = lance_schema.unenforced_primary_key();
+        let pk_field_ids: Vec<i32> = pk_fields.iter().map(|f| f.id).collect();
+        let pk_columns: Vec<String> = pk_fields.iter().map(|f| f.name.clone()).collect();
         let mut memtable = MemTable::with_capacity(
             schema.clone(),
             manifest.current_generation,
@@ -1300,14 +1360,18 @@ impl ShardWriter {
             config.max_memtable_batches,
         )?;
 
-        // Create indexes if configured and set them on the MemTable.
-        if !index_configs.is_empty() {
-            let indexes = Arc::new(IndexStore::from_configs(
+        // Create indexes if configured and set them on the MemTable. The
+        // PK-position index is enabled before any WAL replay below so replayed
+        // rows are recorded in it. A primary key alone (no secondary index)
+        // still needs the PK index so flush writes its on-disk dedup sidecar.
+        if !index_configs.is_empty() || !pk_columns.is_empty() {
+            let mut indexes = IndexStore::from_configs(
                 index_configs,
                 config.max_memtable_rows,
                 config.max_memtable_batches,
-            )?);
-            memtable.set_indexes_arc(indexes);
+            )?;
+            indexes.enable_pk_index(&pk_index_columns(&pk_columns, &pk_field_ids));
+            memtable.set_indexes_arc(Arc::new(indexes));
         }
 
         // Replay any WAL entries written after the last successfully-flushed
@@ -1357,13 +1421,10 @@ impl ShardWriter {
 
         let (memtable_flush_tx, memtable_flush_rx) = mpsc::unbounded_channel();
 
-        let flusher = Arc::new(MemTableFlusher::new(
-            object_store,
-            base_path,
-            base_uri,
-            shard_id,
-            manifest_store,
-        ));
+        let flusher = Arc::new(
+            MemTableFlusher::new(object_store, base_path, base_uri, shard_id, manifest_store)
+                .with_warmer(config.warmer.clone()),
+        );
 
         let backpressure = BackpressureController::new(config.clone());
 
@@ -1378,8 +1439,14 @@ impl ShardWriter {
 
         // Background MemTable flush handler — frozen memtable to Lance file.
         // It rebuilds the same secondary indexes on each flushed generation.
-        let memtable_handler =
-            MemTableFlushHandler::new(state.clone(), flusher, epoch, index_configs.to_vec(), stats);
+        let memtable_handler = MemTableFlushHandler::new(
+            state.clone(),
+            flusher,
+            epoch,
+            index_configs.to_vec(),
+            stats,
+            config.frozen_memtable_grace,
+        );
         task_executor.add_handler(
             "memtable_flusher".to_string(),
             Box::new(memtable_handler),
@@ -1395,6 +1462,7 @@ impl ShardWriter {
             config.clone(),
             schema.clone(),
             pk_field_ids,
+            pk_columns,
             config.max_memtable_batches,
             config.max_memtable_rows,
             index_configs.to_vec(),
@@ -1789,7 +1857,7 @@ impl ShardWriter {
             frozen: state
                 .frozen_memtables
                 .iter()
-                .map(|m| in_memory_ref(m))
+                .map(|m| in_memory_ref(&m.memtable))
                 .collect(),
         })
     }
@@ -2182,6 +2250,9 @@ struct MemTableFlushHandler {
     /// at all.
     index_configs: Vec<MemIndexConfig>,
     stats: SharedWriteStats,
+    /// How long a frozen memtable lingers in memory after its flush commits
+    /// before `SweepExpired` evicts it. See `ShardWriterConfig::frozen_memtable_grace`.
+    grace: Duration,
 }
 
 impl MemTableFlushHandler {
@@ -2191,6 +2262,7 @@ impl MemTableFlushHandler {
         epoch: u64,
         index_configs: Vec<MemIndexConfig>,
         stats: SharedWriteStats,
+        grace: Duration,
     ) -> Self {
         Self {
             state,
@@ -2198,22 +2270,51 @@ impl MemTableFlushHandler {
             epoch,
             index_configs,
             stats,
+            grace,
         }
     }
+
+    /// Evict frozen memtables whose post-flush grace has elapsed; un-stamped
+    /// (unflushed) entries stay. A grace beyond `u64` millis saturates.
+    async fn sweep_expired_frozen(&self) {
+        let now = now_millis();
+        let grace_ms = u64::try_from(self.grace.as_millis()).unwrap_or(u64::MAX);
+        let mut state = self.state.write().await;
+        state
+            .frozen_memtables
+            .retain(|frozen| match frozen.flushed_at_ms {
+                Some(flushed_at) => now.saturating_sub(flushed_at) < grace_ms,
+                None => true,
+            });
+    }
 }
 
 #[async_trait]
 impl MessageHandler<TriggerMemTableFlush> for MemTableFlushHandler {
-    async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> {
-        let TriggerMemTableFlush { memtable, done } = message;
+    fn tickers(&mut self) -> Vec<(Duration, MessageFactory<TriggerMemTableFlush>)> {
+        // Zero grace evicts on commit, so no sweeper is needed.
+        if self.grace.is_zero() {
+            return vec![];
+        }
+        // Sweep every grace/3 (floored at 100ms), so eviction lags the grace by
+        // at most one tick: ~grace * 4/3 after commit unless the floor applies.
+        let tick = (self.grace / 3).max(Duration::from_millis(100));
+        vec![(tick, Box::new(|| TriggerMemTableFlush::SweepExpired))]
+    }
 
-        let result = self.flush_memtable(memtable).await;
-        if let Some(tx) = done {
-            // Send result through the channel - caller is waiting for it
-            let _ = tx.send(result);
-        } else {
-            // No done channel, propagate errors
-            result?;
+    async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> {
+        match message {
+            TriggerMemTableFlush::Flush { memtable, done } => {
+                let result = self.flush_memtable(memtable).await;
+                if let Some(tx) = done {
+                    // Send result through the channel - caller is waiting for it
+                    let _ = tx.send(result);
+                } else {
+                    // No done channel, propagate errors
+                    result?;
+                }
+            }
+            TriggerMemTableFlush::SweepExpired => self.sweep_expired_frozen().await,
         }
         Ok(())
     }
@@ -2307,15 +2408,26 @@ impl MemTableFlushHandler {
                 state.frozen_memtable_bytes =
                     state.frozen_memtable_bytes.saturating_sub(memtable_size);
             }
-            // Drop the queryable handle ONLY on commit success. On failure
-            // keep it: rows must stay in the read union until a later flush
-            // or WAL replay, else a transient flush error reopens the hole.
-            // Keyed by generation, so non-FIFO completion is fine.
+            // Retire the frozen handle on commit success, keyed by generation
+            // (non-FIFO completion is fine). Zero grace evicts here; otherwise
+            // stamp the grace clock so it lingers for multi-part as-of reads
+            // until `SweepExpired`. On failure leave it un-stamped: rows stay in
+            // the read union until a later flush or WAL replay, else a transient
+            // error reopens the hole.
             if flush_result.is_ok() {
                 let flushed_generation = memtable.generation();
-                state
-                    .frozen_memtables
-                    .retain(|m| m.generation() != flushed_generation);
+                if self.grace.is_zero() {
+                    state
+                        .frozen_memtables
+                        .retain(|frozen| frozen.memtable.generation() != flushed_generation);
+                } else {
+                    let now = now_millis();
+                    for frozen in state.frozen_memtables.iter_mut() {
+                        if frozen.memtable.generation() == flushed_generation {
+                            frozen.flushed_at_ms = Some(now);
+                        }
+                    }
+                }
             }
         }
 
@@ -4198,10 +4310,12 @@ mod tests {
         writer.close().await.unwrap();
     }
 
-    /// On a successful flush commit the sealed generation is dropped from
-    /// the queryable set (no leak), and its rows land in the manifest.
+    /// On a successful flush commit the sealed generation's rows land in the
+    /// manifest immediately, but the in-memory handle is NOT dropped — it
+    /// lingers for `frozen_memtable_grace` (so in-flight as-of reads keep
+    /// batch-resolved membership), then is swept by the `SweepExpired` ticker.
     #[tokio::test]
-    async fn test_frozen_dropped_after_successful_flush() {
+    async fn test_frozen_retained_during_grace_then_swept() {
         let (store, base_path, base_uri, _temp_dir) = create_local_store().await;
         let schema = create_test_schema();
         let config = ShardWriterConfig {
@@ -4213,6 +4327,8 @@ mod tests {
             max_wal_flush_interval: None,
             max_memtable_size: 64 * 1024 * 1024,
             manifest_scan_batch_size: 2,
+            // Short grace so the sweep is observable without a slow test.
+            frozen_memtable_grace: Duration::from_secs(1),
             ..Default::default()
         };
         let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![])
@@ -4227,13 +4343,66 @@ mod tests {
         writer.force_seal_active().await.unwrap();
         writer.wait_for_flush_drain().await.unwrap();
 
+        // Recorded in the manifest at commit time.
+        let manifest = writer.manifest().await.unwrap().expect("manifest exists");
+        assert!(
+            manifest
+                .flushed_generations
+                .iter()
+                .any(|g| g.generation == initial_gen),
+            "flushed generation must be recorded in the manifest"
+        );
+
+        // Still queryable in memory immediately after commit (within grace).
+        let refs = writer.in_memory_memtable_refs().await.unwrap();
+        assert_eq!(refs.active.generation, initial_gen + 1);
+        assert!(
+            refs.frozen.iter().any(|f| f.generation == initial_gen),
+            "flushed generation must stay queryable during the grace window"
+        );
+
+        // After the grace elapses (plus a sweep tick) the handle is evicted.
+        tokio::time::sleep(Duration::from_millis(1_500)).await;
         let refs = writer.in_memory_memtable_refs().await.unwrap();
         assert!(
             refs.frozen.is_empty(),
-            "frozen handle must be dropped once the flush commit lands"
+            "frozen handle must be swept once the grace elapses"
         );
-        assert_eq!(refs.active.generation, initial_gen + 1);
 
+        writer.close().await.unwrap();
+    }
+
+    /// With zero grace (the default) a frozen handle is evicted synchronously on
+    /// flush commit — no sweep tick, no lingering window.
+    #[tokio::test]
+    async fn test_frozen_evicted_immediately_with_zero_grace() {
+        let (store, base_path, base_uri, _temp_dir) = create_local_store().await;
+        let schema = create_test_schema();
+        let config = ShardWriterConfig {
+            shard_id: Uuid::new_v4(),
+            shard_spec_id: 0,
+            durable_write: false,
+            sync_indexed_write: false,
+            max_wal_buffer_size: 64 * 1024 * 1024,
+            max_wal_flush_interval: None,
+            max_memtable_size: 64 * 1024 * 1024,
+            manifest_scan_batch_size: 2,
+            frozen_memtable_grace: Duration::ZERO,
+            ..Default::default()
+        };
+        let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![])
+            .await
+            .unwrap();
+
+        let initial_gen = writer.memtable_stats().await.unwrap().generation;
+        writer
+            .put(vec![create_test_batch(&schema, 0, 10)])
+            .await
+            .unwrap();
+        writer.force_seal_active().await.unwrap();
+        writer.wait_for_flush_drain().await.unwrap();
+
+        // Rows are durably in the manifest...
         let manifest = writer.manifest().await.unwrap().expect("manifest exists");
         assert!(
             manifest
@@ -4243,6 +4412,13 @@ mod tests {
             "flushed generation must be recorded in the manifest"
         );
 
+        // ...and the in-memory handle is already gone, no sweep tick needed.
+        let refs = writer.in_memory_memtable_refs().await.unwrap();
+        assert!(
+            refs.frozen.is_empty(),
+            "frozen handle must be evicted on commit when grace is zero"
+        );
+
         writer.close().await.unwrap();
     }
 
diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs
index d591e42cc73..87dda8e7e57 100644
--- a/rust/lance/src/dataset/optimize.rs
+++ b/rust/lance/src/dataset/optimize.rs
@@ -191,6 +191,13 @@ pub struct CompactionOptions {
     /// specified then the default (see
     /// [`crate::dataset::Scanner::batch_size`]) will be used.
     pub batch_size: Option<usize>,
+    /// The number of bytes to allow to queue up in the I/O buffer when scanning
+    /// the input fragments.  If not specified then the default (see
+    /// [`crate::dataset::Scanner::io_buffer_size`]) will be used.
+    ///
+    /// Increasing this can avoid a deadlock that occurs when a single batch of
+    /// data is larger than the I/O buffer size.
+    pub io_buffer_size: Option<u64>,
     /// Whether to defer remapping indices during compaction. If true, indices will
     /// not be remapped during this compaction operation. Instead, the fragment reuse index
     /// is updated and will be used to perform remapping later.
@@ -237,6 +244,7 @@ impl Default for CompactionOptions {
             num_threads: None,
             max_bytes_per_file: None,
             batch_size: None,
+            io_buffer_size: None,
             defer_index_remap: false,
             compaction_mode: None,
             enable_binary_copy: false,
@@ -264,6 +272,7 @@ impl CompactionOptions {
     /// - `lance.compaction.materialize_deletions_threshold`
     /// - `lance.compaction.defer_index_remap`
     /// - `lance.compaction.batch_size`
+    /// - `lance.compaction.io_buffer_size`
     /// - `lance.compaction.compaction_mode`
     /// - `lance.compaction.binary_copy_read_batch_bytes`
     /// - `lance.compaction.max_source_fragments`
@@ -347,6 +356,14 @@ impl CompactionOptions {
                         ))
                     })?);
                 }
+                "io_buffer_size" => {
+                    self.io_buffer_size = Some(value.parse().map_err(|_| {
+                        Error::invalid_input(format!(
+                            "Invalid value for {}: '{}' (expected a non-negative integer)",
+                            key, value
+                        ))
+                    })?);
+                }
                 "compaction_mode" => {
                     self.compaction_mode = Some(CompactionMode::try_from(value.as_str())?);
                 }
@@ -1194,6 +1211,8 @@ async fn transform_blob_v2_batch(
 ///   and preserve insertion order.
 /// - `batch_size`: Optional batch size; if provided, set it on the scanner to control
 ///   read batching.
+/// - `io_buffer_size`: Optional I/O buffer size in bytes; if provided, set it on the
+///   scanner to control how much data is queued during reads.
 /// - `with_frags`: Whether to scan only the specified old fragments and force
 ///   in-order reading.
 /// - `capture_row_ids`: When index remapping is needed, include and capture the
@@ -1209,6 +1228,7 @@ async fn prepare_reader(
     dataset: &Dataset,
     fragments: &[Fragment],
     batch_size: Option<usize>,
+    io_buffer_size: Option<u64>,
     with_frags: bool,
     capture_row_ids: bool,
 ) -> Result<(
@@ -1234,6 +1254,9 @@ async fn prepare_reader(
     if let Some(bs) = batch_size {
         scanner.batch_size(bs);
     }
+    if let Some(io_buffer_size) = io_buffer_size {
+        scanner.io_buffer_size(io_buffer_size);
+    }
     if with_frags {
         scanner
             .with_fragments(fragments.to_vec())
@@ -1515,6 +1538,7 @@ async fn rewrite_files(
             dataset.as_ref(),
             &fragments,
             options.batch_size,
+            options.io_buffer_size,
             true,
             needs_remapping,
         )
@@ -2636,6 +2660,57 @@ mod tests {
         assert_eq!(scanned_data, data);
     }
 
+    #[rstest]
+    #[tokio::test]
+    async fn test_compact_with_io_buffer_size(
+        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
+        data_storage_version: LanceFileVersion,
+    ) {
+        // Compaction should succeed and produce correct results when an
+        // explicit io_buffer_size is provided via CompactionOptions.
+        let test_dir = TempStrDir::default();
+        let test_uri = &test_dir;
+
+        let data = sample_data();
+
+        // Create a table with 2 small fragments so there is something to compact.
+        let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema());
+        let write_params = WriteParams {
+            max_rows_per_file: 5_000,
+            max_rows_per_group: 1_000,
+            data_storage_version: Some(data_storage_version),
+            ..Default::default()
+        };
+        let mut dataset = Dataset::write(reader, test_uri, Some(write_params))
+            .await
+            .unwrap();
+        assert_eq!(dataset.get_fragments().len(), 2);
+
+        let options = CompactionOptions {
+            // A generous buffer so the read does not deadlock on large batches.
+            io_buffer_size: Some(256 * 1024 * 1024),
+            ..Default::default()
+        };
+        let plan = plan_compaction(&dataset, &options).await.unwrap();
+        assert_eq!(plan.tasks().len(), 1);
+
+        let metrics = compact_files(&mut dataset, options, None).await.unwrap();
+        assert_eq!(metrics.fragments_removed, 2);
+        assert_eq!(metrics.fragments_added, 1);
+
+        // All rows are preserved after compaction.
+        let scanner = dataset.scan();
+        let batches = scanner
+            .try_into_stream()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let scanned_data = concat_batches(&batches[0].schema(), &batches).unwrap();
+        assert_eq!(scanned_data.num_rows(), data.num_rows());
+    }
+
     #[rstest]
     #[tokio::test]
     async fn test_compact_deletions(
@@ -4232,6 +4307,133 @@ mod tests {
         assert_eq!(scanner.count_rows().await.unwrap(), count3);
     }
 
+    /// Deferred compaction that materializes deletions must not corrupt an
+    /// inverted (FTS) index read through the fragment-reuse index. The index's
+    /// posting lists reference doc_ids positionally; if the load-time remap
+    /// dropped the deleted rows it would renumber the doc_ids and desync the
+    /// posting lists (out-of-bounds `num_tokens`, wrong/stale row ids). The
+    /// tombstone-preserve-positions load path must keep results correct in the
+    /// FRI window and after the physical remap + trim.
+    #[tokio::test]
+    async fn test_read_inverted_index_with_defer_index_remap_and_deletions() {
+        // Enough surviving docs for several compressed posting-list blocks
+        // (BLOCK_SIZE = 128), split across several fragments so compaction has
+        // real work — but no larger.
+        const ROWS: i32 = 1200;
+        const DELETED: i32 = 400;
+
+        // Every row contains "lance", so the term matches all live rows; `id`
+        // tells us exactly which rows survive.
+        let ids = Int32Array::from_iter_values(0..ROWS);
+        let docs = LargeStringArray::from_iter_values((0..ROWS).map(|_| "lance apple orange"));
+        let batch = RecordBatch::try_new(
+            Schema::new(vec![
+                Field::new("id", DataType::Int32, false),
+                Field::new("doc", DataType::LargeUtf8, false),
+            ])
+            .into(),
+            vec![Arc::new(ids) as ArrayRef, Arc::new(docs) as ArrayRef],
+        )
+        .unwrap();
+        let schema_ref = batch.schema();
+        let stream = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema_ref);
+        let mut dataset = Dataset::write(
+            stream,
+            "memory://test/table",
+            Some(WriteParams {
+                max_rows_per_file: 200, // 6 fragments
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        dataset
+            .create_index(
+                &["doc"],
+                IndexType::Inverted,
+                Some("doc_idx".into()),
+                &InvertedIndexParams::default(),
+                false,
+            )
+            .await
+            .unwrap();
+
+        // Delete a prefix, then deferred-compact so the deletions are
+        // materialized into the fragment-reuse index the index is read through.
+        dataset.delete(&format!("id < {DELETED}")).await.unwrap();
+        compact_files(
+            &mut dataset,
+            CompactionOptions {
+                target_rows_per_fragment: 2_000,
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+        assert!(
+            dataset
+                .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+                .await
+                .unwrap()
+                .is_some(),
+            "deferred compaction must leave a fragment-reuse index"
+        );
+
+        // FTS "lance" → sorted surviving ids. Projecting `id` forces a take, so
+        // a stale row address would error or return a wrong/dead row.
+        async fn search_ids(dataset: &Dataset) -> Vec<i32> {
+            let mut scanner = dataset.scan();
+            scanner
+                .full_text_search(FullTextSearchQuery::new("lance".to_owned()))
+                .unwrap();
+            scanner.project::<&str>(&["id"]).unwrap();
+            let batches = scanner
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            let mut ids: Vec<i32> = batches
+                .iter()
+                .flat_map(|b| {
+                    b.column_by_name("id")
+                        .unwrap()
+                        .as_any()
+                        .downcast_ref::<Int32Array>()
+                        .unwrap()
+                        .values()
+                        .to_vec()
+                })
+                .collect();
+            ids.sort_unstable();
+            ids
+        }
+
+        let expected = (DELETED..ROWS).collect::<Vec<_>>();
+
+        // FRI window: index read through the reuse index.
+        let during = search_ids(&dataset).await;
+        assert_eq!(
+            during, expected,
+            "FRI-window FTS must return exactly the surviving rows (no resurrection, no loss, no stale rows)"
+        );
+
+        // Physical remap + trim: must still be correct.
+        remapping::remap_column_index(&mut dataset, &["doc"], Some("doc_idx".into()))
+            .await
+            .unwrap();
+        cleanup_frag_reuse_index(&mut dataset).await.unwrap();
+        let after = search_ids(&dataset).await;
+        assert_eq!(
+            after, expected,
+            "FTS must stay correct after physical remap + fragment-reuse trim"
+        );
+    }
+
     #[tokio::test]
     async fn test_read_ngram_index_with_defer_index_remap() {
         // Generate random words using lance-datagen
@@ -4615,6 +4817,668 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_read_ivf_rq_index_v3_with_defer_index_remap() {
+        use arrow_array::cast::AsArray;
+        use lance_index::vector::bq::RQBuildParams;
+
+        let mut dataset = lance_datagen::gen_batch()
+            .col(
+                "vec",
+                lance_datagen::array::rand_vec::<Float32Type>(Dimension::from(128)),
+            )
+            .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+
+        let stored: Vec<Vec<f32>> = {
+            let mut scanner = dataset.scan();
+            scanner.project(&["vec"]).unwrap();
+            let batches = scanner
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            let mut out = Vec::new();
+            for batch in &batches {
+                let vecs = batch["vec"].as_fixed_size_list();
+                for i in 0..batch.num_rows() {
+                    let values = vecs.value(i);
+                    let values = values.as_primitive::<Float32Type>();
+                    out.push(values.values().to_vec());
+                }
+            }
+            out
+        };
+
+        let index_name = Some("vec_idx".into());
+        dataset
+            .create_index(
+                &["vec"],
+                IndexType::Vector,
+                index_name.clone(),
+                &VectorIndexParams {
+                    metric_type: DistanceType::L2,
+                    stages: vec![
+                        StageParams::Ivf(IvfBuildParams {
+                            max_iters: 2,
+                            num_partitions: Some(2),
+                            sample_rate: 2,
+                            ..Default::default()
+                        }),
+                        StageParams::RQ(RQBuildParams::new(1)),
+                    ],
+                    version: crate::index::vector::IndexFileVersion::V3,
+                    skip_transpose: false,
+                    runtime_hints: Default::default(),
+                },
+                false,
+            )
+            .await
+            .unwrap();
+        let indices = dataset.load_indices().await.unwrap();
+        let original_index = indices.iter().find(|idx| idx.name == "vec_idx").unwrap();
+
+        let options = CompactionOptions {
+            target_rows_per_fragment: 2_000,
+            defer_index_remap: true,
+            ..Default::default()
+        };
+        let metrics = compact_files(&mut dataset, options, None).await.unwrap();
+        assert!(metrics.fragments_removed > 0);
+        assert!(metrics.fragments_added > 0);
+
+        let Some(current_index) = dataset.load_index_by_name("vec_idx").await.unwrap() else {
+            panic!("vec index must be available");
+        };
+        assert_eq!(current_index.uuid, original_index.uuid);
+
+        let frag_reuse_present = dataset
+            .load_indices()
+            .await
+            .unwrap()
+            .iter()
+            .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME);
+        assert!(
+            frag_reuse_present,
+            "defer_index_remap must record a {} index",
+            FRAG_REUSE_INDEX_NAME
+        );
+
+        let sample_step = (stored.len() / 8).max(1);
+        let mut checked = 0;
+        for query in stored.iter().step_by(sample_step) {
+            let query_vec = PrimitiveArray::<Float32Type>::from_iter_values(query.iter().copied());
+            let mut scanner = dataset.scan();
+            scanner.nearest("vec", &query_vec, 5).unwrap();
+            scanner.project(&["vec"]).unwrap().with_row_id();
+            let batches = scanner
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            assert!(!batches.is_empty(), "query returned no batches");
+            let top = &batches[0];
+            assert!(top.num_rows() > 0, "query returned empty top batch");
+            let top_vec = top["vec"].as_fixed_size_list().value(0);
+            let top_vec = top_vec.as_primitive::<Float32Type>();
+            assert_eq!(
+                top_vec.values(),
+                query.as_slice(),
+                "top-1 self-recall returned a different vector than the query"
+            );
+            checked += 1;
+        }
+        assert!(checked > 0, "expected to check at least one stored vector");
+    }
+
+    /// Top-k `id`s for a KNN query against the `vec` column.
+    async fn vector_knn_ids(dataset: &Dataset, query: &[f32], k: usize) -> Vec<i32> {
+        use arrow_array::cast::AsArray;
+        use arrow_array::types::{Float32Type, Int32Type};
+        let qa = PrimitiveArray::<Float32Type>::from_iter_values(query.iter().copied());
+        let mut scanner = dataset.scan();
+        scanner.nearest("vec", &qa, k).unwrap();
+        scanner.project(&["id"]).unwrap();
+        let batches = scanner
+            .try_into_stream()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let mut ids = Vec::new();
+        for b in &batches {
+            ids.extend(b["id"].as_primitive::<Int32Type>().values().iter().copied());
+        }
+        ids
+    }
+
+    /// Build an `id` + `vec` dataset, create the given IVF vector index,
+    /// optionally delete rows, then run deferred compaction (which materializes
+    /// the deletions into the fragment-reuse index) and assert that KNN over
+    /// surviving vectors during the FRI window (a) never returns a deleted row
+    /// and (b) stays consistent with the pre-compaction answer.
+    ///
+    /// The deletion path is the interesting one: materialized deletions drop
+    /// rows from the quantization storage at load time, which shifts storage
+    /// positions. Flat storage (FLAT/PQ/SQ/RQ) is scanned linearly so this is
+    /// fine, but the HNSW graph addresses storage positionally and is not
+    /// frag-reuse aware, so a desync would surface here as recall collapse or a
+    /// resurrected/again-deleted row.
+    async fn check_vector_defer_compaction(
+        params: VectorIndexParams,
+        delete_predicate: Option<&str>,
+        k: usize,
+        min_overlap: usize,
+    ) {
+        use arrow_array::cast::AsArray;
+        use arrow_array::types::{Float32Type, Int32Type};
+        use lance_datagen::Dimension;
+
+        const DIM: u32 = 32;
+        let mut dataset = lance_datagen::gen_batch()
+            .col("id", lance_datagen::array::step::<Int32Type>())
+            .col(
+                "vec",
+                lance_datagen::array::rand_vec::<Float32Type>(Dimension::from(DIM)),
+            )
+            .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+
+        dataset
+            .create_index(
+                &["vec"],
+                IndexType::Vector,
+                Some("vec_idx".into()),
+                &params,
+                false,
+            )
+            .await
+            .unwrap();
+        let original_uuid = dataset
+            .load_index_by_name("vec_idx")
+            .await
+            .unwrap()
+            .unwrap()
+            .uuid;
+
+        if let Some(pred) = delete_predicate {
+            dataset.delete(pred).await.unwrap();
+        }
+
+        // Collect surviving (id, vec) pairs and the set of surviving ids.
+        let mut survivors: Vec<(i32, Vec<f32>)> = Vec::new();
+        {
+            let mut scanner = dataset.scan();
+            scanner.project(&["id", "vec"]).unwrap();
+            let batches = scanner
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            for batch in &batches {
+                let ids = batch["id"].as_primitive::<Int32Type>();
+                let vecs = batch["vec"].as_fixed_size_list();
+                for i in 0..batch.num_rows() {
+                    let v = vecs.value(i);
+                    let v = v.as_primitive::<Float32Type>().values().to_vec();
+                    survivors.push((ids.value(i), v));
+                }
+            }
+        }
+        assert!(!survivors.is_empty());
+        let surviving_ids: std::collections::HashSet<i32> =
+            survivors.iter().map(|(id, _)| *id).collect();
+
+        // Sample queries from survivors and capture the pre-compaction answer.
+        let step = (survivors.len() / 16).max(1);
+        let queries: Vec<(i32, Vec<f32>)> = survivors.iter().step_by(step).cloned().collect();
+        let mut baseline: Vec<Vec<i32>> = Vec::new();
+        for (_, q) in &queries {
+            baseline.push(vector_knn_ids(&dataset, q, k).await);
+        }
+
+        // Deferred compaction materializes the deletions into the frag-reuse index.
+        let metrics = compact_files(
+            &mut dataset,
+            CompactionOptions {
+                target_rows_per_fragment: 2_000,
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+        assert!(metrics.fragments_removed > 0);
+        assert!(
+            dataset
+                .load_indices()
+                .await
+                .unwrap()
+                .iter()
+                .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME),
+            "deferred compaction must record a frag-reuse index"
+        );
+        assert_eq!(
+            dataset
+                .load_index_by_name("vec_idx")
+                .await
+                .unwrap()
+                .unwrap()
+                .uuid,
+            original_uuid,
+            "index must not be physically remapped yet (FRI window)"
+        );
+
+        // During the FRI window: no deleted rows, and stable vs the baseline.
+        for (i, (_, q)) in queries.iter().enumerate() {
+            let after = vector_knn_ids(&dataset, q, k).await;
+            for id in &after {
+                assert!(
+                    surviving_ids.contains(id),
+                    "KNN returned id {id} that is not a surviving row (query #{i})"
+                );
+            }
+            let overlap = after.iter().filter(|id| baseline[i].contains(id)).count();
+            assert!(
+                overlap >= min_overlap,
+                "KNN top-{k} diverged after deferred compaction: overlap {overlap} < {min_overlap} (query #{i})"
+            );
+        }
+    }
+
+    fn small_ivf() -> lance_index::vector::ivf::IvfBuildParams {
+        lance_index::vector::ivf::IvfBuildParams {
+            max_iters: 2,
+            num_partitions: Some(2),
+            sample_rate: 2,
+            ..Default::default()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_ivf_flat_defer_compaction_with_deletions() {
+        let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf());
+        // Flat storage is scanned linearly; dropping deleted rows is exact.
+        check_vector_defer_compaction(params, Some("id < 1500"), 10, 10).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_hnsw_sq_defer_compaction_merge_only() {
+        use lance_index::vector::{hnsw::builder::HnswBuildParams, sq::builder::SQBuildParams};
+        let params = VectorIndexParams::with_ivf_hnsw_sq_params(
+            DistanceType::L2,
+            small_ivf(),
+            HnswBuildParams::default(),
+            SQBuildParams::default(),
+        );
+        // No deletions: storage positions are stable, so the graph stays aligned.
+        check_vector_defer_compaction(params, None, 10, 9).await;
+    }
+
+    // NOTE: IVF_HNSW_* under materialized deletions is a known gap (lance#3993,
+    // HNSW auto-remap not implemented) — the HNSW graph isn't realigned after the
+    // frag-reuse drop. Deferred remap is gated off for HNSW tables, so there is
+    // no lance-level reproducer here; the gate is tested in the data plane.
+    // Merge-only HNSW is covered (see the *_remap_and_trim tests).
+
+    #[tokio::test]
+    async fn test_ivf_pq_defer_compaction_with_deletions() {
+        use lance_index::vector::pq::PQBuildParams;
+        let params = VectorIndexParams::with_ivf_pq_params(
+            DistanceType::L2,
+            small_ivf(),
+            PQBuildParams {
+                max_iters: 2,
+                num_sub_vectors: 2,
+                ..Default::default()
+            },
+        );
+        check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_sq_defer_compaction_with_deletions() {
+        use lance_index::vector::sq::builder::SQBuildParams;
+        let params = VectorIndexParams::with_ivf_sq_params(
+            DistanceType::L2,
+            small_ivf(),
+            SQBuildParams::default(),
+        );
+        check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_rq_defer_compaction_with_deletions() {
+        use lance_index::vector::bq::RQBuildParams;
+        let params = VectorIndexParams::with_ivf_rq_params(
+            DistanceType::L2,
+            small_ivf(),
+            RQBuildParams::new(1),
+        );
+        check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await;
+    }
+
+    /// Merge-only deferred compaction, then a PHYSICAL remap + FRI trim. Asserts
+    /// the index is rebuilt, the fragment-reuse index trims to zero versions,
+    /// and KNN stays consistent with the pre-compaction answer through both the
+    /// FRI window and the physical remap. (HNSW rebuilds its graph on physical
+    /// remap, so the overlap is recall-tolerant.)
+    async fn check_vector_remap_and_trim(
+        params: VectorIndexParams,
+        k: usize,
+        window_overlap: usize,
+        post_remap_overlap: Option<usize>,
+    ) {
+        use arrow_array::cast::AsArray;
+        use arrow_array::types::{Float32Type, Int32Type};
+        use lance_datagen::Dimension;
+
+        const DIM: u32 = 32;
+        let mut dataset = lance_datagen::gen_batch()
+            .col("id", lance_datagen::array::step::<Int32Type>())
+            .col(
+                "vec",
+                lance_datagen::array::rand_vec::<Float32Type>(Dimension::from(DIM)),
+            )
+            .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+        dataset
+            .create_index(
+                &["vec"],
+                IndexType::Vector,
+                Some("vec_idx".into()),
+                &params,
+                false,
+            )
+            .await
+            .unwrap();
+        let original_uuid = dataset
+            .load_index_by_name("vec_idx")
+            .await
+            .unwrap()
+            .unwrap()
+            .uuid;
+
+        // Sample queries from stored vectors + capture the pre-compaction answer.
+        let mut rows: Vec<Vec<f32>> = Vec::new();
+        {
+            let mut scanner = dataset.scan();
+            scanner.project(&["vec"]).unwrap();
+            let batches = scanner
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            for batch in &batches {
+                let vecs = batch["vec"].as_fixed_size_list();
+                for i in 0..batch.num_rows() {
+                    let v = vecs.value(i);
+                    rows.push(v.as_primitive::<Float32Type>().values().to_vec());
+                }
+            }
+        }
+        let step = (rows.len() / 16).max(1);
+        let queries: Vec<Vec<f32>> = rows.iter().step_by(step).cloned().collect();
+        let mut baseline: Vec<Vec<i32>> = Vec::new();
+        for q in &queries {
+            baseline.push(vector_knn_ids(&dataset, q, k).await);
+        }
+
+        // Merge-only deferred compaction.
+        let metrics = compact_files(
+            &mut dataset,
+            CompactionOptions {
+                target_rows_per_fragment: 2_000,
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+        assert!(metrics.fragments_removed > 0);
+        assert_eq!(
+            dataset
+                .load_index_by_name("vec_idx")
+                .await
+                .unwrap()
+                .unwrap()
+                .uuid,
+            original_uuid,
+            "index must not be physically remapped yet (FRI window)"
+        );
+        for (i, q) in queries.iter().enumerate() {
+            let window = vector_knn_ids(&dataset, q, k).await;
+            let overlap = window.iter().filter(|id| baseline[i].contains(id)).count();
+            assert!(
+                overlap >= window_overlap,
+                "FRI-window KNN diverged: overlap {overlap} < {window_overlap} (query #{i})"
+            );
+        }
+
+        // Physical remap + trim the fragment-reuse index.
+        remapping::remap_column_index(&mut dataset, &["vec"], Some("vec_idx".into()))
+            .await
+            .unwrap();
+        cleanup_frag_reuse_index(&mut dataset).await.unwrap();
+
+        let remapped_uuid = dataset
+            .load_index_by_name("vec_idx")
+            .await
+            .unwrap()
+            .unwrap()
+            .uuid;
+        assert_ne!(
+            remapped_uuid, original_uuid,
+            "index should have been physically remapped"
+        );
+        if let Some(meta) = dataset
+            .load_index_by_name(FRAG_REUSE_INDEX_NAME)
+            .await
+            .unwrap()
+        {
+            let versions = load_frag_reuse_index_details(&dataset, &meta)
+                .await
+                .unwrap()
+                .versions
+                .len();
+            assert_eq!(versions, 0, "frag-reuse index must trim to zero versions");
+        }
+
+        for (i, q) in queries.iter().enumerate() {
+            let after = vector_knn_ids(&dataset, q, k).await;
+            // No stale/desynced addresses (a bad address fails the take above).
+            assert!(
+                !after.is_empty(),
+                "post-remap KNN returned no rows (query #{i})"
+            );
+            // Physical remap rebuilds the HNSW graph, so recall is only compared
+            // for the exact (non-HNSW) types.
+            if let Some(min_overlap) = post_remap_overlap {
+                let overlap = after.iter().filter(|id| baseline[i].contains(id)).count();
+                assert!(
+                    overlap >= min_overlap,
+                    "post-remap KNN diverged: overlap {overlap} < {min_overlap} (query #{i})"
+                );
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_ivf_flat_remap_and_trim() {
+        let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf());
+        check_vector_remap_and_trim(params, 10, 8, Some(8)).await;
+    }
+
+    // Regression: PQ storage used to remap its codes through the frag-reuse
+    // index but keep the pre-remap `row_ids` field, so search returned stale
+    // (compacted-away) addresses and the take failed with "fragment ... does
+    // not exist" — even merge-only, and only observable when the query fetches
+    // row content (the existing `test_read_ivf_pq_index_v3_with_defer_index_remap`
+    // projects no columns, so it never takes and missed this).
+    #[tokio::test]
+    async fn test_ivf_pq_remap_and_trim() {
+        use lance_index::vector::pq::PQBuildParams;
+        let params = VectorIndexParams::with_ivf_pq_params(
+            DistanceType::L2,
+            small_ivf(),
+            PQBuildParams {
+                max_iters: 2,
+                num_sub_vectors: 2,
+                ..Default::default()
+            },
+        );
+        check_vector_remap_and_trim(params, 10, 8, Some(8)).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_sq_remap_and_trim() {
+        use lance_index::vector::sq::builder::SQBuildParams;
+        let params = VectorIndexParams::with_ivf_sq_params(
+            DistanceType::L2,
+            small_ivf(),
+            SQBuildParams::default(),
+        );
+        check_vector_remap_and_trim(params, 10, 8, Some(8)).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_rq_remap_and_trim() {
+        use lance_index::vector::bq::RQBuildParams;
+        let params = VectorIndexParams::with_ivf_rq_params(
+            DistanceType::L2,
+            small_ivf(),
+            RQBuildParams::new(1),
+        );
+        check_vector_remap_and_trim(params, 10, 8, Some(8)).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_hnsw_sq_remap_and_trim() {
+        use lance_index::vector::{hnsw::builder::HnswBuildParams, sq::builder::SQBuildParams};
+        let params = VectorIndexParams::with_ivf_hnsw_sq_params(
+            DistanceType::L2,
+            small_ivf(),
+            HnswBuildParams::default(),
+            SQBuildParams::default(),
+        );
+        // Physical remap rebuilds the HNSW graph, so use a recall-tolerant overlap.
+        check_vector_remap_and_trim(params, 10, 7, None).await;
+    }
+
+    #[tokio::test]
+    async fn test_ivf_hnsw_pq_remap_and_trim() {
+        use lance_index::vector::{hnsw::builder::HnswBuildParams, pq::PQBuildParams};
+        let params = VectorIndexParams::with_ivf_hnsw_pq_params(
+            DistanceType::L2,
+            small_ivf(),
+            HnswBuildParams::default(),
+            PQBuildParams {
+                max_iters: 2,
+                num_sub_vectors: 2,
+                ..Default::default()
+            },
+        );
+        check_vector_remap_and_trim(params, 10, 7, None).await;
+    }
+
+    // Scalar index correctness across deferred compaction WITH materialized
+    // deletions. The existing test_read_*_index_with_defer_index_remap tests are
+    // merge-only and project no columns (count-only), so they never take and
+    // never exercise the deletion drop path. These add an `id` column, delete a
+    // prefix, defer-compact, then run the indexed query *projecting id* (a take)
+    // and assert no deleted row is returned. Bitmap/BTree have no positional
+    // internal structure so the drop path is exact; the Inverted (FTS) index
+    // does (see the NOTE below), and currently desyncs under deletions.
+
+    #[tokio::test]
+    async fn test_bitmap_index_defer_compaction_with_deletions() {
+        use arrow_array::cast::AsArray;
+        use arrow_array::types::Int32Type;
+        let mut dataset = lance_datagen::gen_batch()
+            .col("id", lance_datagen::array::step::<Int32Type>())
+            .col(
+                "category",
+                lance_datagen::array::cycle::<Int32Type>(vec![1, 2, 3]),
+            )
+            .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000))
+            .await
+            .unwrap();
+        dataset
+            .create_index(
+                &["category"],
+                IndexType::Bitmap,
+                Some("category_idx".into()),
+                &ScalarIndexParams::default(),
+                false,
+            )
+            .await
+            .unwrap();
+        dataset.delete("id < 1500").await.unwrap();
+        let metrics = compact_files(
+            &mut dataset,
+            CompactionOptions {
+                target_rows_per_fragment: 2_000,
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+        assert!(metrics.fragments_removed > 0);
+        assert!(
+            dataset
+                .load_indices()
+                .await
+                .unwrap()
+                .iter()
+                .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME),
+            "deferred compaction must record a frag-reuse index"
+        );
+
+        let mut scanner = dataset.scan();
+        scanner.filter("category = 3").unwrap();
+        scanner.project(&["id"]).unwrap();
+        let batches = scanner
+            .try_into_stream()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let mut returned = 0;
+        for b in &batches {
+            for id in b["id"].as_primitive::<Int32Type>().values() {
+                assert!(
+                    *id >= 1500,
+                    "bitmap returned deleted id {id} in the FRI window"
+                );
+                returned += 1;
+            }
+        }
+        assert!(returned > 0, "expected surviving category=3 rows");
+    }
+
+    // NOTE: Inverted/FTS under materialized deletions is broken (BM25 scores
+    // via positional num_tokens[doc_id]; the frag-reuse drop shifts doc_id
+    // positions -> out-of-bounds). It is gated off from deferred remap in the
+    // data plane until fixed, so there is no lance-level reproducer here.
+    // Merge-only FTS is covered by test_read_inverted_index_with_defer_index_remap.
+
     #[tokio::test]
     async fn test_default_compaction_planner() {
         let test_dir = TempStrDir::default();
@@ -4683,6 +5547,10 @@ mod tests {
                 "lance.compaction.batch_size".to_string(),
                 "4096".to_string(),
             ),
+            (
+                "lance.compaction.io_buffer_size".to_string(),
+                "1073741824".to_string(),
+            ),
             (
                 "lance.compaction.compaction_mode".to_string(),
                 "try_binary_copy".to_string(),
@@ -4701,6 +5569,7 @@ mod tests {
         assert!((opts.materialize_deletions_threshold - 0.25).abs() < f32::EPSILON);
         assert!(opts.defer_index_remap);
         assert_eq!(opts.batch_size, Some(4096));
+        assert_eq!(opts.io_buffer_size, Some(1_073_741_824));
         assert_eq!(opts.compaction_mode, Some(CompactionMode::TryBinaryCopy));
         assert_eq!(opts.binary_copy_read_batch_bytes, Some(8_388_608));
     }
diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs
index dab62bf6166..266ac977a69 100644
--- a/rust/lance/src/dataset/optimize/remapping.rs
+++ b/rust/lance/src/dataset/optimize/remapping.rs
@@ -220,25 +220,37 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> {
         return Ok(());
     }
 
-    // Sequentially apply the row addr maps from oldest to latest
-    let mut curr_index_id = *index_id;
-    for (i, row_id_map) in frag_reuse_index.row_id_maps.iter().enumerate() {
-        let version = &frag_reuse_index.details.versions[i];
-        // load on-disk index metadata before auto-remap
-        let curr_index_meta = read_manifest_indexes(
-            &dataset.object_store,
-            &dataset.manifest_location,
-            &dataset.manifest,
-        )
-        .await?
-        .into_iter()
-        .find(|idx| idx.uuid == curr_index_id)
-        .unwrap();
-
-        let maybe_index_bitmap = curr_index_meta.fragment_bitmap.clone();
-        let (should_remap, bitmap_after_remap) = match maybe_index_bitmap {
-            Some(mut index_frag_bitmap) => {
-                let mut should_remap = false;
+    // Read the index's on-disk metadata once. Its stored row addresses are at
+    // this baseline; we compose all reuse versions into a single remap so the
+    // index file is rebuilt and committed exactly once, rather than once per
+    // version (the reuse index can accumulate many versions before remap runs).
+    let curr_index_meta = read_manifest_indexes(
+        &dataset.object_store,
+        &dataset.manifest_location,
+        &dataset.manifest,
+    )
+    .await?
+    .into_iter()
+    .find(|idx| idx.uuid == *index_id)
+    .ok_or_else(|| {
+        Error::index(format!(
+            "index {index_id} not found in manifest; it may have been concurrently dropped"
+        ))
+    })?;
+
+    // Compose the coverage (fragment bitmap) remap across every reuse version in
+    // one pass. Chaining is automatic: a version inserts its new fragments,
+    // which a later version then sees as its old fragments. `data_predates_version`
+    // is evaluated against the fixed baseline (there are no intermediate
+    // commits), and the new-fragment branch handles a bitmap that was already
+    // coverage-remapped + persisted before the data was remapped (e.g. while
+    // remapping a *sibling* index).
+    let baseline_version = curr_index_meta.dataset_version;
+    let (should_remap, bitmap_after_remap) = match curr_index_meta.fragment_bitmap.clone() {
+        Some(mut index_frag_bitmap) => {
+            let mut should_remap = false;
+            for version in frag_reuse_index.details.versions.iter() {
+                let data_predates_version = baseline_version < version.dataset_version;
                 for group in version.groups.iter() {
                     let mut old_frag_in_index = 0;
                     for old_frag in group.old_frags.iter() {
@@ -258,67 +270,97 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> {
                                 group.old_frags
                             )));
                         }
-                        index_frag_bitmap
-                            .extend(group.new_frags.clone().into_iter().map(|f| f.id as u32));
+                        index_frag_bitmap.extend(group.new_frags.iter().map(|f| f.id as u32));
+                        should_remap = true;
+                    } else if data_predates_version
+                        && group
+                            .new_frags
+                            .iter()
+                            .any(|new_frag| index_frag_bitmap.contains(new_frag.id as u32))
+                    {
+                        // The bitmap was already coverage-remapped onto this
+                        // group's new fragments and persisted before the data was
+                        // remapped, so the old fragments are gone from the bitmap
+                        // but the index data still needs remapping.
                         should_remap = true;
                     }
                 }
-                (should_remap, Some(index_frag_bitmap))
             }
-            // if there is no fragment bitmap for the index,
-            // we attempt remapping but will not update the fragment bitmap.
-            None => (true, None),
-        };
-
-        if should_remap {
-            let remap_result = index::remap_index(dataset, &curr_index_id, row_id_map).await?;
-
-            let new_index_meta = match remap_result {
-                RemapResult::Drop => continue,
-                RemapResult::Keep(new_id) => IndexMetadata {
-                    uuid: new_id,
-                    name: curr_index_meta.name.clone(),
-                    fields: curr_index_meta.fields.clone(),
-                    dataset_version: dataset.manifest.version,
-                    fragment_bitmap: bitmap_after_remap,
-                    index_details: curr_index_meta.index_details.clone(),
-                    index_version: curr_index_meta.index_version,
-                    created_at: curr_index_meta.created_at,
-                    base_id: None,
-                    files: curr_index_meta.files.clone(),
-                },
-                RemapResult::Remapped(remapped_index) => IndexMetadata {
-                    uuid: remapped_index.new_id,
-                    name: curr_index_meta.name.clone(),
-                    fields: curr_index_meta.fields.clone(),
-                    dataset_version: dataset.manifest.version,
-                    fragment_bitmap: bitmap_after_remap,
-                    index_details: Some(Arc::new(remapped_index.index_details)),
-                    index_version: remapped_index.index_version as i32,
-                    created_at: curr_index_meta.created_at,
-                    base_id: None,
-                    files: remapped_index.files,
-                },
-            };
-
-            let new_id = new_index_meta.uuid;
+            (should_remap, Some(index_frag_bitmap))
+        }
+        // if there is no fragment bitmap for the index,
+        // we attempt remapping but will not update the fragment bitmap.
+        None => (true, None),
+    };
 
-            let transaction = Transaction::new(
-                dataset.manifest.version,
-                Operation::CreateIndex {
-                    new_indices: vec![new_index_meta],
-                    removed_indices: vec![curr_index_meta.clone()],
-                },
-                None,
-            );
+    if !should_remap {
+        return Ok(());
+    }
 
-            dataset
-                .apply_commit(transaction, &Default::default(), &Default::default())
-                .await?;
+    // Compose the row-address remap across all versions. `remap_row_id` already
+    // chains every version (and passes through addresses a version does not
+    // touch), so mapping the union of all versions' keys yields a single
+    // baseline -> final address map applied in one rebuild.
+    //
+    // Map every old address; do NOT filter by the current `fragment_bitmap`. In
+    // the sibling-coverage-remap case the bitmap was already advanced onto the
+    // new fragments while the index data still holds old addresses, so filtering
+    // by it would drop exactly the keys this index needs and leave its data
+    // stale (an empty map makes `index::remap_index` return `Keep`). The map is
+    // bounded by the rows the reuse index touched; addresses this index does not
+    // store are simply never looked up.
+    let composed_row_id_map: HashMap<u64, Option<u64>> = frag_reuse_index
+        .row_id_maps
+        .iter()
+        .flat_map(|row_id_map| row_id_map.keys().copied())
+        .map(|old_addr| (old_addr, frag_reuse_index.remap_row_id(old_addr)))
+        .collect();
+
+    let remap_result = index::remap_index(dataset, index_id, &composed_row_id_map).await?;
+
+    let new_index_meta = match remap_result {
+        // The composed remap emptied the index (every row deleted). Matching the
+        // prior per-version behavior, leave the existing index untouched and
+        // commit nothing -- there is no remap to apply.
+        RemapResult::Drop => return Ok(()),
+        RemapResult::Keep(new_id) => IndexMetadata {
+            uuid: new_id,
+            name: curr_index_meta.name.clone(),
+            fields: curr_index_meta.fields.clone(),
+            dataset_version: dataset.manifest.version,
+            fragment_bitmap: bitmap_after_remap,
+            index_details: curr_index_meta.index_details.clone(),
+            index_version: curr_index_meta.index_version,
+            created_at: curr_index_meta.created_at,
+            base_id: None,
+            files: curr_index_meta.files.clone(),
+        },
+        RemapResult::Remapped(remapped_index) => IndexMetadata {
+            uuid: remapped_index.new_id,
+            name: curr_index_meta.name.clone(),
+            fields: curr_index_meta.fields.clone(),
+            dataset_version: dataset.manifest.version,
+            fragment_bitmap: bitmap_after_remap,
+            index_details: Some(Arc::new(remapped_index.index_details)),
+            index_version: remapped_index.index_version as i32,
+            created_at: curr_index_meta.created_at,
+            base_id: None,
+            files: remapped_index.files,
+        },
+    };
 
-            curr_index_id = new_id;
-        }
-    }
+    let transaction = Transaction::new(
+        dataset.manifest.version,
+        Operation::CreateIndex {
+            new_indices: vec![new_index_meta],
+            removed_indices: vec![curr_index_meta],
+        },
+        None,
+    );
+
+    dataset
+        .apply_commit(transaction, &Default::default(), &Default::default())
+        .await?;
 
     Ok(())
 }
diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs
index 6b19150c17b..09cd7023e74 100644
--- a/rust/lance/src/dataset/scanner.rs
+++ b/rust/lance/src/dataset/scanner.rs
@@ -3591,33 +3591,35 @@ impl Scanner {
             .clone();
 
         let mut columns = vec![column];
-        if let Some(expr) = filter_plan.full_expr.as_ref() {
-            let filter_columns = Planner::column_names_in_expr(expr);
-            columns.extend(filter_columns);
+        if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
+            columns.extend(Planner::column_names_in_expr(refine_expr));
         }
-        let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap());
-        let mut scan_node = self.scan_fragments(
-            true,
-            false,
-            false,
-            false,
-            false,
-            flat_fts_scan_schema,
-            Arc::new(fragments),
-            None,
-            false,
-        );
+        let scan_projection = self
+            .dataset
+            .empty_projection()
+            .with_row_id()
+            .union_columns(&columns, OnMissing::Error)?;
 
-        if let Some(expr) = filter_plan.full_expr.as_ref() {
-            // If there is a prefilter we need to manually apply it to the new data
-            scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?);
+        let PlannedFilteredScan { mut plan, .. } = self
+            .filtered_read(
+                filter_plan,
+                scan_projection,
+                /*make_deletions_null=*/ false,
+                Some(Arc::new(fragments)),
+                None,
+                /*is_prefilter=*/ true,
+            )
+            .await?;
+
+        if let Some(refine_expr) = filter_plan.refine_expr.as_ref() {
+            plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?);
         }
 
         let flat_match_plan = Arc::new(FlatMatchQueryExec::new(
             self.dataset.clone(),
             query.clone(),
             params.clone(),
-            scan_node,
+            plan,
         ));
         Ok(flat_match_plan)
     }
@@ -8412,6 +8414,198 @@ mod test {
         .unwrap();
     }
 
+    #[tokio::test]
+    async fn test_ngram_regex_index_scan() {
+        use arrow::array::AsArray;
+
+        // A small, fixed corpus written across multiple fragments so the ngram
+        // index spans fragment boundaries.
+        let values = [
+            "rhino",       // 0
+            "rhinos nose", // 1
+            "cat",         // 2
+            "dog",         // 3
+            "cat dog",     // 4
+            "elephant",    // 5
+            "catalog",     // 6
+            "scatter",     // 7
+            "rhino horn",  // 8
+            "mouse",       // 9
+            "category",    // 10
+            "dogma",       // 11
+        ];
+        let array = StringArray::from_iter_values(values);
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "s",
+            DataType::Utf8,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        let write_params = WriteParams {
+            max_rows_per_file: 4, // 12 rows -> 3 fragments
+            ..Default::default()
+        };
+        let mut dataset = Dataset::write(reader, "memory://test_ngram_regex", Some(write_params))
+            .await
+            .unwrap();
+        dataset
+            .create_index(
+                &["s"],
+                IndexType::NGram,
+                None,
+                &ScalarIndexParams::default(),
+                true,
+            )
+            .await
+            .unwrap();
+        assert!(
+            dataset.get_fragments().len() > 1,
+            "expected a multi-fragment dataset"
+        );
+
+        // Scan with `filter` and return the matched `s` values, sorted.
+        async fn matched(dataset: &Dataset, filter: &str) -> Vec<String> {
+            let mut scan = dataset.scan();
+            scan.filter(filter).unwrap();
+            let batches = scan
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            let mut out = Vec::new();
+            for batch in batches {
+                let col = batch.column_by_name("s").unwrap().as_string::<i32>();
+                out.extend(col.iter().flatten().map(|s| s.to_string()));
+            }
+            out.sort();
+            out
+        }
+
+        // `regexp_like`: a plain literal substring.
+        assert_eq!(
+            matched(&dataset, "regexp_like(s, 'rhino')").await,
+            ["rhino", "rhino horn", "rhinos nose"]
+        );
+        // `regexp_match` (coerced to `IsNotNull(regexp_match(...))`) accelerates too.
+        assert_eq!(
+            matched(&dataset, "regexp_match(s, 'rhino')").await,
+            ["rhino", "rhino horn", "rhinos nose"]
+        );
+        // Anchored: recheck must drop trigram false positives -- the `cat`
+        // trigram also occurs in cat dog / catalog / scatter / category.
+        assert_eq!(matched(&dataset, "regexp_like(s, 'cat$')").await, ["cat"]);
+        // AND across `.*`: row 8 ("rhino horn") shares the rhino trigrams but
+        // lacks the nose trigrams, so only "rhinos nose" survives.
+        assert_eq!(
+            matched(&dataset, "regexp_like(s, 'rhino.*nose')").await,
+            ["rhinos nose"]
+        );
+        // Alternation.
+        assert_eq!(
+            matched(&dataset, "regexp_like(s, '(catalog|elephant)')").await,
+            ["catalog", "elephant"]
+        );
+        // A non-accelerable pattern (no trigram derivable) still returns correct
+        // results via a full recheck.
+        assert_eq!(matched(&dataset, "regexp_like(s, 'o.m')").await, ["dogma"]);
+        // A case-insensitive flag is not accelerated (the index normalization
+        // disagrees with Unicode case folding) but must still return correct
+        // results via a full recheck -- here matching despite the upper-case
+        // pattern. This exercises the three-argument `regexp_like` flags path.
+        assert_eq!(
+            matched(&dataset, "regexp_like(s, 'RHINO', 'i')").await,
+            ["rhino", "rhino horn", "rhinos nose"]
+        );
+
+        // Infix LIKE is accelerated through the same machinery (a plain-literal
+        // `regexp_like` is rewritten to LIKE before it reaches the index).
+        assert_eq!(
+            matched(&dataset, "s LIKE '%rhino%'").await,
+            ["rhino", "rhino horn", "rhinos nose"]
+        );
+        // Prefix LIKE: recheck drops "scatter", which contains the `cat` trigram
+        // but does not start with "cat".
+        assert_eq!(
+            matched(&dataset, "s LIKE 'cat%'").await,
+            ["cat", "cat dog", "catalog", "category"]
+        );
+
+        // The ngram index is actually engaged for every accelerated form.
+        for filter in [
+            "regexp_like(s, 'rhino')",
+            "regexp_match(s, 'rhino')",
+            "s LIKE '%rhino%'",
+        ] {
+            let mut scan = dataset.scan();
+            scan.filter(filter).unwrap();
+            let plan = scan.create_plan().await.unwrap();
+            let plan_str = format!(
+                "{}",
+                datafusion::physical_plan::displayable(plan.as_ref()).indent(true)
+            );
+            assert!(
+                plan_str.contains("ScalarIndexQuery") && plan_str.contains("NGram"),
+                "expected ngram index usage for `{filter}`, got plan:\n{plan_str}"
+            );
+        }
+    }
+
+    #[tokio::test]
+    async fn test_ngram_regex_non_accelerable_recheck() {
+        // `a.b` yields no trigram, so the index returns "recheck everything".
+        // This must still produce ALL correct matches across fragments, not an
+        // empty set (a regression test for the AtLeast recheck path, which a
+        // single-match case would not catch).
+        let unit = ["acb", "dog", "axb", "cat", "qqq", "rhino"];
+        let values: Vec<&str> = unit.iter().copied().cycle().take(60).collect();
+        let array = StringArray::from_iter_values(values);
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "text",
+            DataType::Utf8,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        let write_params = WriteParams {
+            max_rows_per_file: 20, // 60 rows -> 3 fragments
+            ..Default::default()
+        };
+        let mut dataset =
+            Dataset::write(reader, "memory://test_ngram_regex_ne", Some(write_params))
+                .await
+                .unwrap();
+        dataset
+            .create_index(
+                &["text"],
+                IndexType::NGram,
+                None,
+                &ScalarIndexParams::default(),
+                true,
+            )
+            .await
+            .unwrap();
+
+        async fn count(dataset: &Dataset, filter: &str) -> usize {
+            let mut scan = dataset.scan();
+            scan.filter(filter).unwrap();
+            let batches = scan
+                .try_into_stream()
+                .await
+                .unwrap()
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            batches.iter().map(|b| b.num_rows()).sum()
+        }
+
+        // "acb" and "axb" each appear 10 times in the 60 rows -> 20 matches.
+        assert_eq!(count(&dataset, "regexp_match(text, 'a.b')").await, 20);
+        assert_eq!(count(&dataset, "regexp_like(text, 'a.b')").await, 20);
+    }
+
     #[tokio::test]
     async fn test_like_prefix_with_btree_index() {
         // Create dataset with string data that has various prefixes
@@ -8843,6 +9037,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
         );
     }
 
+    /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column.
+    /// Its 30 rows cycle through the dictionary values "a", "b", "c" in order,
+    /// so each value appears exactly 10 times.
+    async fn dictionary_string_dataset() -> Dataset {
+        use arrow_array::{Int16Array, Int16DictionaryArray};
+
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "etld",
+            DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)),
+            false,
+        )]));
+
+        let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+        let indices = Int16Array::from((0..30).map(|i| i % 3).collect::<Vec<_>>());
+        let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap();
+
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        Dataset::write(reader, "memory://test_dict_filter", None)
+            .await
+            .unwrap()
+    }
+
+    /// Regression test for filtering a dictionary-encoded string column via the
+    /// SQL string path (`Scanner::filter`). This used to fail to plan with
+    /// "could not convert to literal of type 'Dictionary(Int16, Utf8)'".
+    #[tokio::test]
+    async fn test_filter_on_dictionary_string_column() {
+        let dataset = dictionary_string_dataset().await;
+
+        // Equality predicate.
+        let count = dataset
+            .scan()
+            .filter("etld = 'a'")
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(count, 10);
+
+        // IN-list predicate.
+        let count = dataset
+            .scan()
+            .filter("etld IN ('a', 'b')")
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(count, 20);
+    }
+
+    /// An `IN`/`=` predicate on a dictionary column with a scalar index should be
+    /// pushed down to the index rather than falling back to a full scan.
+    #[tokio::test]
+    async fn test_dictionary_string_column_uses_scalar_index() {
+        use lance_index::scalar::BuiltinIndexType;
+
+        let mut dataset = dictionary_string_dataset().await;
+        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap);
+        dataset
+            .create_index(&["etld"], IndexType::Scalar, None, &params, true)
+            .await
+            .unwrap();
+
+        let mut scanner = dataset.scan();
+        scanner.filter("etld IN ('a', 'b')").unwrap();
+        let plan = scanner.create_plan().await.unwrap();
+        let plan_str = format!("{:?}", plan);
+        assert!(
+            plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"),
+            "IN on a dictionary column should use the scalar index, but got: {}",
+            plan_str
+        );
+
+        let count = dataset
+            .scan()
+            .filter("etld IN ('a', 'b')")
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(count, 20);
+    }
+
     #[tokio::test]
     async fn test_like_prefix_with_segmented_zone_map() {
         use lance_index::scalar::BuiltinIndexType;
@@ -10191,7 +10472,12 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
         .await?;
 
         log::info!("Test case: Full text search with unindexed rows");
-        let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
+        // The flat-FTS path now reads through `FilteredReadExec`, matching the
+        // brute-force KNN path. With no prefilter the scan still produces no
+        // pushdown, but the operator differs by storage version: legacy emits
+        // a `LanceScan`, v2 emits a `LanceRead` with empty filters.
+        let expected = if data_storage_version == LanceFileVersion::Legacy {
+            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
   Take: columns="_rowid, _score, (s)"
     CoalesceBatchesExec: target_batch_size=8192
       SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
@@ -10199,7 +10485,18 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
           UnionExec
             MatchQuery: column=s, query=hello
             FlatMatchQuery: column=s, query=hello
-              LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#;
+              LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"#
+        } else {
+            r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
+  Take: columns="_rowid, _score, (s)"
+    CoalesceBatchesExec: target_batch_size=8192
+      SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false]
+        CoalescePartitionsExec
+          UnionExec
+            MatchQuery: column=s, query=hello
+            FlatMatchQuery: column=s, query=hello
+              LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=--, refine_filter=--"#
+        };
         dataset.append_new_data().await?;
         assert_plan_equals(
             &dataset.dataset,
@@ -10232,6 +10529,10 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
         .await?;
 
         log::info!("Test case: Full text search with unindexed rows and prefilter");
+        // After routing flat FTS through `FilteredReadExec`, the BTree on `i`
+        // pushes into the unindexed-fragment scan too — no more `FilterExec` on
+        // top of an unfiltered `LanceScan`. Legacy uses the `MaterializeIndex`
+        // shape, v2 uses `LanceRead` with `full_filter` set.
         let expected = if data_storage_version == LanceFileVersion::Legacy {
             r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
   Take: columns="_rowid, _score, (s)"
@@ -10247,8 +10548,14 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
                     FilterExec: i@0 > 10
                       LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None
             FlatMatchQuery: column=s, query=hello
-              FilterExec: i@1 > 10
-                LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
+              CoalescePartitionsExec
+                UnionExec
+                  Take: columns="_rowid, (s)"
+                    CoalesceBatchesExec: target_batch_size=8192
+                      MaterializeIndex: query=[i > 10]@i_idx(BTree)
+                  ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s]
+                    FilterExec: i@0 > 10
+                      LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"#
         } else {
             r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid]
   Take: columns="_rowid, _score, (s)"
@@ -10260,8 +10567,8 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
               LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
                 ScalarIndexQuery: query=[i > 10]@i_idx(BTree)
             FlatMatchQuery: column=s, query=hello
-              FilterExec: i@1 > 10
-                LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"#
+              LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=--
+                ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"#
         };
         assert_plan_equals(
             &dataset.dataset,
diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs
index f5d792979df..5ef35a33ab7 100644
--- a/rust/lance/src/dataset/schema_evolution.rs
+++ b/rust/lance/src/dataset/schema_evolution.rs
@@ -1,13 +1,18 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::{collections::HashSet, sync::Arc};
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+};
 
 use super::fragment::FileFragment;
 use super::{
     Dataset,
     transaction::{Operation, Transaction},
+    write::cleanup_data_fragments,
 };
+use crate::index::DatasetIndexExt;
 use crate::{Error, Result, io::exec::Planner};
 use arrow::compute::CastOptions;
 use arrow::compute::can_cast_types;
@@ -239,7 +244,7 @@ pub(super) async fn add_columns_to_fragments(
     read_columns: Option<Vec<String>>,
     fragments: &[FileFragment],
     batch_size: Option<u32>,
-) -> Result<(Vec<Fragment>, Schema)> {
+) -> Result<(Vec<Fragment>, Schema, Vec<Fragment>)> {
     // Check names early (before calling add_columns_impl) to avoid extra work if
     // the names are wrong.
     let version = dataset.manifest.data_storage_format.lance_file_version()?;
@@ -261,10 +266,10 @@ pub(super) async fn add_columns_to_fragments(
     }
     let transforms = optimizer.optimize(dataset, transforms)?;
 
-    let (output_schema, fragments) = match transforms {
+    let (output_schema, new_fragments, fragments_to_cleanup) = match transforms {
         NewColumnTransform::BatchUDF(udf) => {
             check_names(udf.output_schema.as_ref())?;
-            let fragments = add_columns_impl(
+            let result = add_columns_impl(
                 fragments,
                 read_columns,
                 udf.mapper,
@@ -273,7 +278,11 @@ pub(super) async fn add_columns_to_fragments(
                 None,
             )
             .await?;
-            Result::Ok((udf.output_schema, fragments))
+            Result::Ok((
+                udf.output_schema,
+                result.fragments,
+                result.fragments_to_cleanup,
+            ))
         }
         NewColumnTransform::SqlExpressions(expressions) => {
             // We just transform the SQL expression into a UDF backed by DataFusion
@@ -336,22 +345,22 @@ pub(super) async fn add_columns_to_fragments(
             let mapper = Box::new(mapper);
 
             let read_columns = Some(read_schema.field_names().into_iter().cloned().collect());
-            let fragments =
+            let result =
                 add_columns_impl(fragments, read_columns, mapper, batch_size, None, None).await?;
-            Ok((output_schema, fragments))
+            Ok((output_schema, result.fragments, result.fragments_to_cleanup))
         }
         NewColumnTransform::Stream(stream) => {
             let output_schema = stream.schema();
             check_names(output_schema.as_ref())?;
             let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?;
-            Ok((output_schema, fragments))
+            Ok((output_schema, fragments.clone(), fragments))
         }
         NewColumnTransform::Reader(reader) => {
             let output_schema = reader.schema();
             check_names(output_schema.as_ref())?;
             let stream = reader.into_stream();
             let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?;
-            Ok((output_schema, fragments))
+            Ok((output_schema, fragments.clone(), fragments))
         }
         NewColumnTransform::AllNulls(output_schema) => {
             check_names(output_schema.as_ref())?;
@@ -379,14 +388,20 @@ pub(super) async fn add_columns_to_fragments(
                 ));
             }
 
-            Ok((output_schema, fragments))
+            Ok((output_schema, fragments, Vec::new()))
         }
     }?;
 
-    let mut schema = dataset.schema().merge(output_schema.as_ref())?;
+    let mut schema = match dataset.schema().merge(output_schema.as_ref()) {
+        Ok(schema) => schema,
+        Err(e) => {
+            cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await;
+            return Err(e);
+        }
+    };
     schema.set_field_id(Some(dataset.manifest.max_field_id()));
 
-    Ok((fragments, schema))
+    Ok((new_fragments, schema, fragments_to_cleanup))
 }
 
 pub(super) async fn add_columns(
@@ -395,7 +410,7 @@ pub(super) async fn add_columns(
     read_columns: Option<Vec<String>>,
     batch_size: Option<u32>,
 ) -> Result<()> {
-    let (fragments, schema) = add_columns_to_fragments(
+    let (fragments, schema, fragments_to_cleanup) = add_columns_to_fragments(
         dataset,
         transforms,
         read_columns,
@@ -406,11 +421,75 @@ pub(super) async fn add_columns(
 
     let operation = Operation::Merge { fragments, schema };
     let transaction = Transaction::new(dataset.manifest.version, operation, None);
-    dataset
+    match dataset
         .apply_commit(transaction, &Default::default(), &Default::default())
-        .await?;
+        .await
+    {
+        Ok(()) => Ok(()),
+        Err(e) => {
+            cleanup_new_column_data_files(&dataset.get_fragments(), &fragments_to_cleanup).await;
+            Err(e)
+        }
+    }
+}
 
-    Ok(())
+/// Hands to `cleanup_data_fragments` every file in `new_fragments` that the same-id entry of
+/// `fragments` does not already list; does nothing when `fragments` is empty.
+async fn cleanup_new_column_data_files(fragments: &[FileFragment], new_fragments: &[Fragment]) {
+    let Some(first_fragment) = fragments.first() else {
+        return;
+    };
+    // add_columns rewrites fragment metadata in place, so cleanup must delete
+    // only files created by the current attempt and must not touch pre-existing
+    // files that still belong to the fragment. Paths are borrowed, not cloned.
+    let original_files_by_fragment = fragments
+        .iter()
+        .map(|fragment| {
+            let files = fragment
+                .metadata
+                .files
+                .iter()
+                .map(|file| (file.base_id, file.path.as_str()))
+                .collect::<HashSet<_>>();
+            (fragment.id() as u64, files)
+        })
+        .collect::<HashMap<_, _>>();
+
+    let fragments_to_cleanup = new_fragments
+        .iter()
+        .filter_map(|fragment| {
+            let original_files = original_files_by_fragment.get(&fragment.id)?;
+            let files = fragment
+                .files
+                .iter()
+                .filter(|file| !original_files.contains(&(file.base_id, file.path.as_str())))
+                .cloned()
+                .collect::<Vec<_>>();
+            if files.is_empty() {
+                None
+            } else {
+                let mut fragment = fragment.clone();
+                fragment.files = files;
+                Some(fragment)
+            }
+        })
+        .collect::<Vec<_>>();
+
+    cleanup_data_fragments(
+        &first_fragment.dataset().object_store,
+        &first_fragment.dataset().base,
+        &fragments_to_cleanup,
+    )
+    .await;
+}
+
+/// Output of `add_columns_impl`: fragments to commit plus those needing cleanup on failure.
+struct AddColumnFragments {
+    /// Fragments produced by the operation, returned to the caller for the final merge commit.
+    fragments: Vec<Fragment>,
+    /// Uncommitted fragments whose newly written data files must be removed if
+    /// the operation fails before the merge commit completes.
+    fragments_to_cleanup: Vec<Fragment>,
+}
 
 #[allow(clippy::type_complexity)]
@@ -421,63 +500,96 @@ async fn add_columns_impl(
     batch_size: Option<u32>,
     result_cache: Option<Arc<dyn UDFCheckpointStore>>,
     schemas: Option<(Schema, Schema)>,
-) -> Result<Vec<Fragment>> {
+) -> Result<AddColumnFragments> {
     let read_columns_ref = read_columns.as_deref();
     let mapper_ref = mapper.as_ref();
-    let fragments = futures::stream::iter(fragments)
-        .then(|fragment| {
-            let cache_ref = result_cache.clone();
-            let schemas_ref = &schemas;
-            async move {
-                if let Some(cache) = &cache_ref {
-                    let fragment_id = fragment.id() as u32;
-                    let fragment = cache.get_fragment(fragment_id)?;
-                    if let Some(fragment) = fragment {
-                        return Ok(fragment);
-                    }
+
+    let mut new_fragments = Vec::with_capacity(fragments.len());
+    let mut fragments_to_cleanup = Vec::with_capacity(fragments.len());
+
+    for fragment in fragments {
+        if let Some(cache) = &result_cache {
+            let fragment_id = fragment.id() as u32;
+            let fragment = match cache.get_fragment(fragment_id) {
+                Ok(fragment) => fragment,
+                Err(e) => {
+                    cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await;
+                    return Err(e);
                 }
+            };
+            if let Some(fragment) = fragment {
+                new_fragments.push(fragment);
+                continue;
+            }
+        }
 
-                let mut updater = fragment
-                    .updater(read_columns_ref, schemas_ref.clone(), batch_size)
-                    .await?;
-
-                let mut batch_index = 0;
-                // TODO: the structure of the updater prevents batch-level parallelism here,
-                //       but there is no reason why we couldn't do this in parallel.
-                while let Some(batch) = updater.next().await? {
-                    let batch_info = BatchInfo {
-                        fragment_id: fragment.id() as u32,
-                        batch_index,
-                    };
+        let mut updater = match fragment
+            .updater(read_columns_ref, schemas.clone(), batch_size)
+            .await
+        {
+            Ok(updater) => updater,
+            Err(e) => {
+                cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await;
+                return Err(e);
+            }
+        };
+        let fragment_result = async {
+            let mut batch_index = 0;
+            // TODO: the structure of the updater prevents batch-level parallelism here,
+            //       but there is no reason why we couldn't do this in parallel.
+            while let Some(batch) = updater.next().await? {
+                let batch_info = BatchInfo {
+                    fragment_id: fragment.id() as u32,
+                    batch_index,
+                };
 
-                    let new_batch = if let Some(cache) = &cache_ref {
-                        if let Some(batch) = cache.get_batch(&batch_info)? {
-                            batch
-                        } else {
-                            let new_batch = mapper_ref(batch)?;
-                            cache.insert_batch(batch_info, new_batch.clone())?;
-                            new_batch
-                        }
+                let new_batch = if let Some(cache) = &result_cache {
+                    if let Some(batch) = cache.get_batch(&batch_info)? {
+                        batch
                     } else {
-                        mapper_ref(batch)?
-                    };
+                        let new_batch = mapper_ref(batch)?;
+                        cache.insert_batch(batch_info, new_batch.clone())?;
+                        new_batch
+                    }
+                } else {
+                    mapper_ref(batch)?
+                };
 
-                    updater.update(new_batch).await?;
-                    batch_index += 1;
-                }
+                updater.update(new_batch).await?;
+                batch_index += 1;
+            }
 
-                let fragment = updater.finish().await?;
+            let new_fragment = updater.finish().await?;
+            fragments_to_cleanup.push(new_fragment.clone());
 
-                if let Some(cache) = &cache_ref {
-                    cache.insert_fragment(fragment.clone())?;
-                }
+            if let Some(cache) = &result_cache {
+                // Once the checkpoint store owns this fragment, retries may load
+                // it back instead of rewriting it. Removing it from the cleanup
+                // set avoids deleting data that has already been checkpointed.
+                cache.insert_fragment(new_fragment.clone())?;
+                fragments_to_cleanup.pop();
+            }
 
-                Ok::<_, Error>(fragment)
+            Ok::<_, Error>(new_fragment)
+        }
+        .await;
+
+        match fragment_result {
+            Ok(new_fragment) => {
+                new_fragments.push(new_fragment);
             }
-        })
-        .try_collect::<Vec<_>>()
-        .await?;
-    Ok(fragments)
+            Err(e) => {
+                updater.cleanup_unfinished_writer().await;
+                cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await;
+                return Err(e);
+            }
+        }
+    }
+
+    Ok(AddColumnFragments {
+        fragments: new_fragments,
+        fragments_to_cleanup,
+    })
 }
 
 async fn add_columns_from_stream(
@@ -489,49 +601,80 @@ async fn add_columns_from_stream(
     let mut new_fragments = Vec::with_capacity(fragments.len());
     let mut last_seen_batch: Option<RecordBatch> = None;
     for fragment in fragments {
-        let mut updater = fragment
+        let mut updater = match fragment
             .updater::<String>(Some(&[]), schemas.clone(), batch_size)
-            .await?;
-        while let Some(batch) = updater.next().await? {
-            debug_assert_eq!(batch.num_columns(), 1);
-            let mut rows_remaining = batch.num_rows();
+            .await
+        {
+            Ok(updater) => updater,
+            Err(e) => {
+                cleanup_new_column_data_files(fragments, &new_fragments).await;
+                return Err(e);
+            }
+        };
+        let result: Result<Fragment> = async {
+            while let Some(batch) = updater.next().await? {
+                debug_assert_eq!(batch.num_columns(), 1);
+                let mut rows_remaining = batch.num_rows();
+
+                // The updater yields an empty batch when every row in a read batch
+                // has been deleted (e.g. a whole batch falls within the deletion
+                // vector). There is nothing to pull from the stream in that case, so
+                // feed an empty batch back to keep the updater in sync and continue.
+                if rows_remaining == 0 {
+                    updater
+                        .update(RecordBatch::new_empty(stream.schema()))
+                        .await?;
+                    continue;
+                }
 
-            let mut batches = Vec::new();
+                let mut batches = Vec::new();
 
-            while rows_remaining > 0 {
-                let next_batch = if let Some(last_seen_batch) = last_seen_batch {
-                    last_seen_batch
-                } else {
-                    stream.next().await.ok_or_else(|| {
-                        Error::invalid_input(
-                            "Stream ended before producing values for all rows in dataset",
-                        )
-                    })??
-                };
-                let num_rows = next_batch.num_rows();
-                if num_rows > rows_remaining {
-                    let new_batch = next_batch.slice(0, rows_remaining);
-                    batches.push(new_batch);
-                    last_seen_batch =
-                        Some(next_batch.slice(rows_remaining, num_rows - rows_remaining));
-                    rows_remaining = 0;
-                } else {
-                    batches.push(next_batch);
-                    rows_remaining -= num_rows;
-                    last_seen_batch = None;
+                while rows_remaining > 0 {
+                    let next_batch = if let Some(last_seen) = last_seen_batch.take() {
+                        last_seen
+                    } else {
+                        stream.next().await.ok_or_else(|| {
+                            Error::invalid_input(
+                                "Stream ended before producing values for all rows in dataset",
+                            )
+                        })??
+                    };
+                    let num_rows = next_batch.num_rows();
+                    if num_rows > rows_remaining {
+                        let new_batch = next_batch.slice(0, rows_remaining);
+                        batches.push(new_batch);
+                        last_seen_batch =
+                            Some(next_batch.slice(rows_remaining, num_rows - rows_remaining));
+                        rows_remaining = 0;
+                    } else {
+                        batches.push(next_batch);
+                        rows_remaining -= num_rows;
+                        last_seen_batch = None;
+                    }
                 }
-            }
 
-            let new_batch =
-                arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?;
+                let new_batch =
+                    arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?;
 
-            updater.update(new_batch).await?;
+                updater.update(new_batch).await?;
+            }
+            updater.finish().await
+        }
+        .await;
+
+        match result {
+            Ok(new_fragment) => new_fragments.push(new_fragment),
+            Err(e) => {
+                updater.cleanup_unfinished_writer().await;
+                cleanup_new_column_data_files(fragments, &new_fragments).await;
+                return Err(e);
+            }
         }
-        new_fragments.push(updater.finish().await?);
     }
 
     // Ensure the stream is fully consumed
     if last_seen_batch.is_some() || stream.next().await.is_some() {
+        cleanup_new_column_data_files(fragments, &new_fragments).await;
         return Err(Error::invalid_input_source(
             "Stream produced more values than expected for dataset".into(),
         ));
@@ -605,6 +748,41 @@ pub(super) async fn alter_columns(
 
     new_schema.validate()?;
 
+    // If any column being cast has an attached index, fail fast. Cast operations
+    // rewrite the underlying column data and silently invalidate any index on the
+    // affected column(s). The current behavior is to drop such indices without
+    // warning, which has caused production incidents where vector search silently
+    // regressed to brute-force scan. We require users to explicitly drop the
+    // index before altering the column type, so the action is never silent.
+    if !cast_fields.is_empty() {
+        let indices = dataset.load_indices().await?;
+        let affected: Vec<&lance_table::format::IndexMetadata> = indices
+            .iter()
+            .filter(|idx| {
+                cast_fields
+                    .iter()
+                    .any(|(old, _)| idx.fields.contains(&old.id))
+            })
+            .collect();
+        if !affected.is_empty() {
+            let affected_cols: Vec<String> = cast_fields
+                .iter()
+                .filter(|(old, _)| affected.iter().any(|i| i.fields.contains(&old.id)))
+                .map(|(old, _)| old.name.clone())
+                .collect();
+            let affected_idx_names: Vec<String> = affected.iter().map(|i| i.name.clone()).collect();
+            return Err(Error::invalid_input(format!(
+                "Cannot cast column(s) [{}] to a new type: they have {} index(es) \
+                 attached: [{}]. Cast rewrites column data and invalidates any index \
+                 on the affected column(s). Drop the index(es) with drop_index() \
+                 before altering, then recreate them after the cast completes.",
+                affected_cols.join(", "),
+                affected.len(),
+                affected_idx_names.join(", "),
+            )));
+        }
+    }
+
     // If we aren't casting a column, we don't need to touch the fragments.
     let transaction = if cast_fields.is_empty() {
         Transaction::new(
@@ -653,7 +831,7 @@ pub(super) async fn alter_columns(
         };
         let mapper = Box::new(mapper);
 
-        let fragments = add_columns_impl(
+        let result = add_columns_impl(
             &dataset.get_fragments(),
             Some(read_columns),
             mapper,
@@ -666,7 +844,8 @@ pub(super) async fn alter_columns(
         // Some data files may no longer contain any columns in the dataset (e.g. if every
         // remaining column has been altered into a different data file) and so we remove them
         let schema_field_ids = new_schema.field_ids().into_iter().collect::<Vec<_>>();
-        let fragments = fragments
+        let fragments = result
+            .fragments
             .into_iter()
             .map(|mut frag| {
                 frag.files.retain(|f| {
@@ -734,56 +913,751 @@ pub(super) async fn drop_columns(dataset: &mut Dataset, columns: &[&str]) -> Res
         .apply_commit(transaction, &Default::default(), &Default::default())
         .await?;
 
-    Ok(())
-}
+    Ok(())
+}
+
+/// Returns a copy of `source` without the fields named in `other`; a matched field survives
+/// only as the result of `field.exclude(..)`, and only if `version` supports sub-column removal.
+pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result<Schema> {
+    let other: Schema = other.try_into().map_err(|_| {
+        Error::schema("The other schema is not compatible with this schema".to_string())
+    })?;
+    let fields = source
+        .fields
+        .iter()
+        .filter_map(|field| match other.field(&field.name) {
+            None => Some(field.clone()),
+            Some(other_field) if version.support_remove_sub_column(field) => {
+                field.exclude(other_field)
+            }
+            Some(_) => None,
+        })
+        .collect::<Vec<_>>();
+    Ok(Schema {
+        fields,
+        metadata: source.metadata.clone(),
+    })
+}
+
+#[cfg(test)]
+mod test {
+    use std::{collections::HashMap, fs, num::NonZero, path::Path as StdPath, sync::Mutex};
+
+    use crate::dataset::WriteParams;
+    use arrow_array::{
+        ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray,
+    };
+
+    use super::*;
+    use arrow_schema::Fields as ArrowFields;
+    use lance_core::utils::tempfile::TempStrDir;
+    use lance_file::version::LanceFileVersion;
+    use lance_table::format::{BasePath, DataFile};
+    use rstest::rstest;
+
+    // Compiles only when `T: Send`; used to validate that returned futures are Send.
+    fn require_send<T: Send>(t: T) -> T {
+        t
+    }
+
+    // Sorted `dir`-relative paths of all files under `dir` (recursive), skipping dot-files.
+    fn file_paths_in(dir: impl AsRef<StdPath>) -> Vec<String> {
+        fn collect_files(
+            base_dir: &StdPath,
+            dir: &StdPath,
+            files: &mut Vec<String>,
+        ) -> std::io::Result<()> {
+            if !dir.exists() {
+                return Ok(());
+            }
+            for entry in std::fs::read_dir(dir)? {
+                let path = entry?.path();
+                if path.is_dir() {
+                    collect_files(base_dir, &path, files)?;
+                } else if path.is_file()
+                    && path
+                        .file_name()
+                        .and_then(|name| name.to_str())
+                        .is_some_and(|file_name| !file_name.starts_with('.'))
+                {
+                    files.push(
+                        path.strip_prefix(base_dir)
+                            .unwrap()
+                            .to_string_lossy()
+                            .to_string(),
+                    );
+                }
+            }
+            Ok(())
+        }
+        let base_dir = dir.as_ref();
+        let mut files = Vec::new();
+        collect_files(base_dir, base_dir, &mut files).unwrap();
+        files.sort();
+        files
+    }
+
+    fn data_file_paths_in(base_dir: &str) -> Vec<String> {
+        file_paths_in(StdPath::new(base_dir).join("data")) // every file under `<base_dir>/data`
+    }
+
+    /// Adds SQL-expression columns one by one to a Legacy-format dataset and checks the result.
+    #[tokio::test]
+    async fn test_append_columns_exprs() -> Result<()> {
+        let num_rows = 5;
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let test_dir = TempStrDir::default();
+        let test_uri = &test_dir;
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::Legacy),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        dataset.validate().await?;
+
+        // Adding a column whose name already exists (`id`) must be rejected as invalid input.
+        let fut = dataset.add_columns(
+            NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]),
+            None,
+            None,
+        );
+        // Routing the future through `require_send` also checks at compile time that it is Send.
+        let res = require_send(fut).await;
+        assert!(matches!(res, Err(Error::InvalidInput { .. })));
+
+        // Can add a column that does not reference any existing column.
+        dataset
+            .add_columns(
+                NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]),
+                None,
+                None,
+            )
+            .await?;
+
+        // Can add a column derived from an existing one.
+        dataset
+            .add_columns(
+                NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]),
+                None,
+                None,
+            )
+            .await?;
+
+        // Can derive a column from existing ones across multiple data files.
+        dataset
+            .add_columns(
+                NewColumnTransform::SqlExpressions(vec![(
+                    "triple_id".into(),
+                    "id + double_id".into(),
+                )]),
+                None,
+                None,
+            )
+            .await?;
+
+        // The dataset still validates and all four columns can be read back.
+        dataset.validate().await?;
+
+        let data = dataset.scan().try_into_batch().await?;
+        let expected_schema = ArrowSchema::new(vec![
+            ArrowField::new("id", DataType::Int32, false),
+            ArrowField::new("value", DataType::Float64, true),
+            ArrowField::new("double_id", DataType::Int32, false),
+            ArrowField::new("triple_id", DataType::Int32, false),
+        ]);
+        assert_eq!(data.schema().as_ref(), &expected_schema);
+        assert_eq!(data.num_rows(), num_rows);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_add_columns_with_fully_deleted_batch() -> Result<()> {
+        // Regression test: when an entire read batch has been deleted, the
+        // updater yields a 0-row batch. The inner loop then never runs and
+        // `batches` stays empty, so `concat_batches(&batches[0]..)` used to
+        // panic with "index out of bounds: the len is 0 but the index is 0".
+        //
+        // A single fragment holds 105 rows; deleting the trailing 5 rows means
+        // that, when read with batch_size=50, the third batch [100..105) is
+        // fully filtered out and produces an empty batch.
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "i",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..105))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+
+        let test_dir = TempStrDir::default();
+        let test_uri = &test_dir;
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 200, // keep all rows in a single fragment
+                ..Default::default()
+            }),
+        )
+        .await?;
+
+        // Delete the entire trailing batch [100..105).
+        dataset.delete("i >= 100").await?;
+        assert_eq!(dataset.count_rows(None).await?, 100);
+
+        let new_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "j",
+            DataType::Int32,
+            false,
+        )]));
+        let new_batch = RecordBatch::try_new(
+            new_schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..100))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(new_batch)], new_schema.clone());
+
+        // Read with batch_size=50 so the deleted trailing rows form an entirely empty batch.
+        dataset
+            .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, Some(50))
+            .await?;
+
+        let data = dataset.scan().try_into_batch().await?;
+        assert_eq!(data.num_rows(), 100);
+        assert_eq!(
+            data.column_by_name("j").unwrap().as_ref(),
+            &Int32Array::from_iter_values(0..100)
+        );
+
+        Ok(())
+    }
+
+    #[rstest]
+    #[tokio::test]
+    async fn test_add_columns_cleans_up_blob_v2_data_on_stream_error(
+        #[values(
+            ("inline", b"inline".to_vec()),
+            ("packed", vec![1u8; 128 * 1024]),
+            ("dedicated", vec![2u8; 5 * 1024 * 1024]),
+            ("external", b"external".to_vec())
+        )]
+        blob_case: (&str, Vec<u8>),
+    ) -> Result<()> {
+        let (blob_kind, payload) = blob_case;
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..1))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let external_dir = tempfile::tempdir()?;
+        let external_path = external_dir.path().join("blob.bin");
+        fs::write(&external_path, &payload)?;
+        let external_baseline_files = file_paths_in(external_dir.path());
+        let external_baseline_payload = fs::read(&external_path)?;
+
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                initial_bases: Some(vec![BasePath::new(
+                    1,
+                    external_dir.path().to_string_lossy().to_string(),
+                    Some("external".to_string()),
+                    false,
+                )]),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        let baseline_files = data_file_paths_in(test_uri);
+
+        let mut blob_builder = crate::BlobArrayBuilder::new(2);
+        if blob_kind == "external" {
+            blob_builder.push_uri(external_path.to_string_lossy())?;
+        } else {
+            blob_builder.push_bytes(payload)?;
+        }
+        blob_builder.push_bytes(b"extra")?;
+        let blob_array = blob_builder.finish()?;
+        let blob_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)]));
+        let blob_batch = RecordBatch::try_new(blob_schema.clone(), vec![blob_array])?;
+        let reader = RecordBatchIterator::new(vec![Ok(blob_batch)], blob_schema);
+
+        let err = dataset
+            .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, None)
+            .await
+            .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("Stream produced more values than expected for dataset")
+        );
+
+        assert_eq!(
+            data_file_paths_in(test_uri),
+            baseline_files,
+            "add_columns should clean up new data files and blob v2 sidecars on failure"
+        );
+        assert_eq!(
+            file_paths_in(external_dir.path()),
+            external_baseline_files,
+            "cleanup must not delete external files"
+        );
+        assert_eq!(
+            fs::read(&external_path)?,
+            external_baseline_payload,
+            "cleanup must not modify external files"
+        );
+        dataset.validate().await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_cleanup_preserves_checkpointed_fragment_files() -> Result<()> {
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..2))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 1,
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        let original_fragments = dataset.get_fragments();
+        assert_eq!(original_fragments.len(), 2);
+
+        let data_dir = StdPath::new(test_uri).join("data");
+        let cached_file = data_dir.join("checkpointed.lance");
+        let cached_blob_dir = data_dir.join("checkpointed");
+        fs::write(&cached_file, b"checkpointed data")?;
+        fs::create_dir_all(&cached_blob_dir)?;
+        fs::write(
+            cached_blob_dir.join("00000000000000000000000000000001.blob"),
+            b"blob",
+        )?;
+
+        let mut checkpointed_fragment = original_fragments[0].metadata().clone();
+        checkpointed_fragment.files.push(DataFile::new(
+            "checkpointed.lance",
+            vec![dataset.manifest.max_field_id() + 1],
+            vec![0],
+            2,
+            2,
+            NonZero::new(17),
+            None,
+        ));
+
+        #[derive(Default)]
+        struct CheckpointedFragmentStore {
+            fragment: Mutex<Option<Fragment>>,
+        }
+
+        impl UDFCheckpointStore for CheckpointedFragmentStore {
+            fn get_batch(&self, _info: &BatchInfo) -> Result<Option<RecordBatch>> {
+                Ok(None)
+            }
+
+            fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> {
+                Ok(())
+            }
+
+            fn get_fragment(&self, fragment_id: u32) -> Result<Option<Fragment>> {
+                if fragment_id == 0 {
+                    Ok(self.fragment.lock().unwrap().clone())
+                } else {
+                    Ok(None)
+                }
+            }
+
+            fn insert_fragment(&self, _fragment: Fragment) -> Result<()> {
+                Ok(())
+            }
+        }
+
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(|_| Err(Error::invalid_input("injected UDF failure"))),
+            output_schema: Arc::new(ArrowSchema::new(vec![ArrowField::new(
+                "checkpointed",
+                DataType::Int32,
+                true,
+            )])),
+            result_checkpoint: Some(Arc::new(CheckpointedFragmentStore {
+                fragment: Mutex::new(Some(checkpointed_fragment)),
+            })),
+        });
+
+        let err = dataset
+            .add_columns(transforms, None, None)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("injected UDF failure"));
+
+        assert!(
+            cached_file.exists(),
+            "cleanup must not delete fragment files restored from a checkpoint"
+        );
+        assert!(
+            cached_blob_dir.exists(),
+            "cleanup must not delete blob sidecars restored from a checkpoint"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_add_columns_cleans_current_blob_v2_writer_on_udf_error() -> Result<()> {
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..2))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        let baseline_files = data_file_paths_in(test_uri);
+
+        let call_count = Arc::new(Mutex::new(0usize));
+        let mapper_call_count = call_count.clone();
+        let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)]));
+        let mapper = move |batch: &RecordBatch| {
+            let mut call_count = mapper_call_count.lock().unwrap();
+            *call_count += 1;
+            if *call_count == 2 {
+                return Err(Error::invalid_input("injected UDF failure"));
+            }
+
+            let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows());
+            for _ in 0..batch.num_rows() {
+                blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?;
+            }
+            Ok(RecordBatch::try_new(
+                Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])),
+                vec![blob_builder.finish()?],
+            )?)
+        };
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(mapper),
+            output_schema,
+            result_checkpoint: None,
+        });
+
+        let err = dataset
+            .add_columns(transforms, None, Some(1))
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("injected UDF failure"));
+        assert_eq!(
+            data_file_paths_in(test_uri),
+            baseline_files,
+            "add_columns should clean files written by the current unfinished writer"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_on_checkpoint_lookup_error()
+    -> Result<()> {
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..2))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 1,
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await?;
+
+        struct FailingLookupStore {
+            inserted: Arc<Mutex<Option<Fragment>>>,
+        }
+
+        impl UDFCheckpointStore for FailingLookupStore {
+            fn get_batch(&self, _info: &BatchInfo) -> Result<Option<RecordBatch>> {
+                Ok(None)
+            }
+
+            fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> {
+                Ok(())
+            }
+
+            fn get_fragment(&self, fragment_id: u32) -> Result<Option<Fragment>> {
+                if fragment_id == 1 {
+                    Err(Error::invalid_input("injected checkpoint lookup failure"))
+                } else {
+                    Ok(None)
+                }
+            }
+
+            fn insert_fragment(&self, fragment: Fragment) -> Result<()> {
+                *self.inserted.lock().unwrap() = Some(fragment);
+                Ok(())
+            }
+        }
+
+        let inserted = Arc::new(Mutex::new(None));
+        let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)]));
+        let mapper = move |batch: &RecordBatch| {
+            let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows());
+            for _ in 0..batch.num_rows() {
+                blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?;
+            }
+            Ok(RecordBatch::try_new(
+                Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])),
+                vec![blob_builder.finish()?],
+            )?)
+        };
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(mapper),
+            output_schema,
+            result_checkpoint: Some(Arc::new(FailingLookupStore {
+                inserted: inserted.clone(),
+            })),
+        });
+
+        let err = dataset
+            .add_columns(transforms, None, None)
+            .await
+            .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("injected checkpoint lookup failure")
+        );
+        let inserted = inserted.lock().unwrap().clone().unwrap();
+        let new_file = inserted
+            .files
+            .iter()
+            .find(|file| {
+                file.fields
+                    .iter()
+                    .any(|field| *field > dataset.manifest.max_field_id())
+            })
+            .expect("checkpoint should record the newly written data file");
+        let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path);
+        let new_blob_dir = StdPath::new(test_uri)
+            .join("data")
+            .join(StdPath::new(&new_file.path).file_stem().unwrap());
+        assert!(
+            new_file_path.exists(),
+            "cleanup must not delete data files after checkpoint takes ownership"
+        );
+        assert!(
+            new_blob_dir.exists(),
+            "cleanup must not delete blob sidecars after checkpoint takes ownership"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_add_columns_cleans_finished_blob_v2_writer_on_checkpoint_insert_error()
+    -> Result<()> {
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..1))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        let baseline_files = data_file_paths_in(test_uri);
+
+        struct FailingInsertStore;
 
-/// Exclude the fields from `other` Schema, and returns a new Schema.
-pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result<Schema> {
-    let other: Schema = other.try_into().map_err(|_| {
-        Error::schema("The other schema is not compatible with this schema".to_string())
-    })?;
-    let mut fields = vec![];
-    for field in source.fields.iter() {
-        if let Some(other_field) = other.field(&field.name) {
-            if version.support_remove_sub_column(field)
-                && let Some(f) = field.exclude(other_field)
-            {
-                fields.push(f)
+        impl UDFCheckpointStore for FailingInsertStore {
+            fn get_batch(&self, _info: &BatchInfo) -> Result<Option<RecordBatch>> {
+                Ok(None)
+            }
+
+            fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> {
+                Ok(())
+            }
+
+            fn get_fragment(&self, _fragment_id: u32) -> Result<Option<Fragment>> {
+                Ok(None)
+            }
+
+            fn insert_fragment(&self, _fragment: Fragment) -> Result<()> {
+                Err(Error::invalid_input("injected checkpoint insert failure"))
             }
-        } else {
-            fields.push(field.clone());
         }
+
+        let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)]));
+        let mapper = move |batch: &RecordBatch| {
+            let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows());
+            for _ in 0..batch.num_rows() {
+                blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?;
+            }
+            Ok(RecordBatch::try_new(
+                Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])),
+                vec![blob_builder.finish()?],
+            )?)
+        };
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(mapper),
+            output_schema,
+            result_checkpoint: Some(Arc::new(FailingInsertStore)),
+        });
+
+        let err = dataset
+            .add_columns(transforms, None, None)
+            .await
+            .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("injected checkpoint insert failure")
+        );
+        assert_eq!(
+            data_file_paths_in(test_uri),
+            baseline_files,
+            "add_columns should clean finished writer files when checkpoint insert fails"
+        );
+
+        Ok(())
     }
-    Ok(Schema {
-        fields,
-        metadata: source.metadata.clone(),
-    })
-}
 
-#[cfg(test)]
-mod test {
-    use std::collections::HashMap;
-    use std::sync::Mutex;
+    #[tokio::test]
+    async fn test_add_columns_cleans_blob_v2_files_on_declared_schema_merge_error() -> Result<()> {
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from_iter_values(0..1))],
+        )?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
 
-    use crate::dataset::WriteParams;
-    use arrow_array::{
-        ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray,
-    };
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            }),
+        )
+        .await?;
+        let baseline_files = data_file_paths_in(test_uri);
 
-    use super::*;
-    use arrow_schema::Fields as ArrowFields;
-    use lance_core::utils::tempfile::TempStrDir;
-    use lance_file::version::LanceFileVersion;
-    use rstest::rstest;
+        let mapper = move |batch: &RecordBatch| {
+            let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows());
+            for _ in 0..batch.num_rows() {
+                blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?;
+            }
+            Ok(RecordBatch::try_new(
+                Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])),
+                vec![blob_builder.finish()?],
+            )?)
+        };
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(mapper),
+            output_schema: Arc::new(ArrowSchema::new(vec![
+                ArrowField::new("declared", DataType::Int32, true),
+                ArrowField::new("declared", DataType::Int32, true),
+            ])),
+            result_checkpoint: None,
+        });
 
-    // Used to validate that futures returned are Send.
-    fn require_send<T: Send>(t: T) -> T {
-        t
+        let err = dataset
+            .add_columns(transforms, None, None)
+            .await
+            .unwrap_err();
+        assert!(matches!(err, Error::Schema { .. }));
+        assert_eq!(
+            data_file_paths_in(test_uri),
+            baseline_files,
+            "add_columns should clean files written before declared schema merge fails"
+        );
+
+        Ok(())
     }
 
     #[tokio::test]
-    async fn test_append_columns_exprs() -> Result<()> {
-        let num_rows = 5;
+    async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_after_later_failure()
+    -> Result<()> {
         let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
             "id",
             DataType::Int32,
@@ -791,75 +1665,101 @@ mod test {
         )]));
         let batch = RecordBatch::try_new(
             schema.clone(),
-            vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))],
+            vec![Arc::new(Int32Array::from_iter_values(0..2))],
         )?;
-        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
 
         let test_dir = TempStrDir::default();
-        let test_uri = &test_dir;
+        let test_uri = test_dir.as_str();
         let mut dataset = Dataset::write(
             reader,
             test_uri,
             Some(WriteParams {
-                data_storage_version: Some(LanceFileVersion::Legacy),
+                max_rows_per_file: 1,
+                data_storage_version: Some(LanceFileVersion::V2_2),
                 ..Default::default()
             }),
         )
         .await?;
-        dataset.validate().await?;
 
-        // Adding a duplicate column name will break
-        let fut = dataset.add_columns(
-            NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]),
-            None,
-            None,
-        );
-        // (Quick validation that the future is Send)
-        let res = require_send(fut).await;
-        assert!(matches!(res, Err(Error::InvalidInput { .. })));
+        struct InsertThenFailStore {
+            inserted: Arc<Mutex<Option<Fragment>>>,
+        }
 
-        // Can add a column that is independent of any existing ones
-        dataset
-            .add_columns(
-                NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]),
-                None,
-                None,
-            )
-            .await?;
+        impl UDFCheckpointStore for InsertThenFailStore {
+            fn get_batch(&self, info: &BatchInfo) -> Result<Option<RecordBatch>> {
+                if info.fragment_id == 1 {
+                    Err(Error::invalid_input("injected later checkpoint failure"))
+                } else {
+                    Ok(None)
+                }
+            }
 
-        // Can add a column derived from an existing one.
-        dataset
-            .add_columns(
-                NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]),
-                None,
-                None,
-            )
-            .await?;
+            fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> {
+                Ok(())
+            }
 
-        // Can derive a column from existing ones across multiple data files.
-        dataset
-            .add_columns(
-                NewColumnTransform::SqlExpressions(vec![(
-                    "triple_id".into(),
-                    "id + double_id".into(),
-                )]),
-                None,
-                None,
-            )
-            .await?;
+            fn get_fragment(&self, _fragment_id: u32) -> Result<Option<Fragment>> {
+                Ok(None)
+            }
 
-        // These can be read back, the dataset is valid
-        dataset.validate().await?;
+            fn insert_fragment(&self, fragment: Fragment) -> Result<()> {
+                *self.inserted.lock().unwrap() = Some(fragment);
+                Ok(())
+            }
+        }
 
-        let data = dataset.scan().try_into_batch().await?;
-        let expected_schema = ArrowSchema::new(vec![
-            ArrowField::new("id", DataType::Int32, false),
-            ArrowField::new("value", DataType::Float64, true),
-            ArrowField::new("double_id", DataType::Int32, false),
-            ArrowField::new("triple_id", DataType::Int32, false),
-        ]);
-        assert_eq!(data.schema().as_ref(), &expected_schema);
-        assert_eq!(data.num_rows(), num_rows);
+        let inserted = Arc::new(Mutex::new(None));
+        let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)]));
+        let mapper = move |batch: &RecordBatch| {
+            let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows());
+            for _ in 0..batch.num_rows() {
+                blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?;
+            }
+            Ok(RecordBatch::try_new(
+                Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])),
+                vec![blob_builder.finish()?],
+            )?)
+        };
+        let transforms = NewColumnTransform::BatchUDF(BatchUDF {
+            mapper: Box::new(mapper),
+            output_schema,
+            result_checkpoint: Some(Arc::new(InsertThenFailStore {
+                inserted: inserted.clone(),
+            })),
+        });
+
+        let err = dataset
+            .add_columns(transforms, None, None)
+            .await
+            .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("injected later checkpoint failure")
+        );
+
+        let inserted = inserted.lock().unwrap().clone().unwrap();
+        let new_file = inserted
+            .files
+            .iter()
+            .find(|file| {
+                file.fields
+                    .iter()
+                    .any(|field| *field > dataset.manifest.max_field_id())
+            })
+            .expect("checkpoint should record the newly written data file");
+        let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path);
+        let new_blob_dir = StdPath::new(test_uri)
+            .join("data")
+            .join(StdPath::new(&new_file.path).file_stem().unwrap());
+        assert!(
+            new_file_path.exists(),
+            "cleanup must not delete data files after checkpoint takes ownership"
+        );
+        assert!(
+            new_blob_dir.exists(),
+            "cleanup must not delete blob sidecars after checkpoint takes ownership"
+        );
 
         Ok(())
     }
@@ -1784,7 +2684,6 @@ mod test {
     ) -> Result<()> {
         // Create a table with 2 scalar columns, 1 vector column
 
-        use crate::index::DatasetIndexExt;
         use arrow::datatypes::{Int32Type, Int64Type};
         use arrow_array::{Float16Array, Float32Array, Int64Array, ListArray};
         use half::f16;
@@ -1885,7 +2784,10 @@ mod test {
             assert_eq!(f.files.len(), 2);
         });
 
-        // Cast scalar column with index, should not keep index (TODO: keep it)
+        // Cast scalar column with index. The index must be dropped first: cast
+        // now fails fast when an index is attached (see
+        // test_alter_columns_cast_fails_with_attached_index for that path).
+        dataset.drop_index("i_idx").await?;
         dataset
             .alter_columns(&[ColumnAlteration::new("i".into()).cast_to(DataType::Int64)])
             .await?;
@@ -1906,7 +2808,8 @@ mod test {
         ]);
         assert_eq!(&ArrowSchema::from(dataset.schema()), &expected_schema);
 
-        // We currently lose the index when casting a column
+        // The scalar index on `i` is gone (we dropped it); the vector index on
+        // `vec` is still present.
         let indices = dataset.load_indices().await?;
         assert_eq!(indices.len(), 1);
 
@@ -1915,7 +2818,8 @@ mod test {
             assert_eq!(f.files.len(), 3);
         });
 
-        // Cast vector column, should not keep index (TODO: keep it)
+        // Cast vector column. Drop its index first (same reason as above).
+        dataset.drop_index("vec_idx").await?;
         dataset
             .alter_columns(&[
                 ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList(
@@ -1983,6 +2887,120 @@ mod test {
         Ok(())
     }
 
+    /// Cast on a column with an attached index must fail fast rather than
+    /// silently dropping the index. This guards against the historical behavior
+    /// where cast would rewrite column data and the index would vanish without
+    /// any error or warning, causing vector search to silently regress to a
+    /// brute-force scan.
+    #[rstest]
+    #[tokio::test]
+    async fn test_alter_columns_cast_fails_with_attached_index(
+        #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)]
+        data_storage_version: LanceFileVersion,
+    ) -> Result<()> {
+        use lance_arrow::FixedSizeListArrayExt;
+        use lance_index::IndexType;
+        use lance_linalg::distance::MetricType;
+        use lance_testing::datagen::generate_random_array;
+
+        use crate::index::vector::VectorIndexParams;
+
+        // Build a small dataset with a single 64-dim float32 vector column (indexed below).
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "vec",
+            DataType::FixedSizeList(
+                Arc::new(ArrowField::new("item", DataType::Float32, true)),
+                64,
+            ),
+            false,
+        )]));
+        let nrows = 256; // number of vectors written
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(
+                <arrow_array::FixedSizeListArray as FixedSizeListArrayExt>::try_new_from_values(
+                    generate_random_array(64 * nrows as usize),
+                    64,
+                )
+                .unwrap(),
+            )],
+        )?;
+
+        let test_dir = TempStrDir::default();
+        let mut dataset = Dataset::write(
+            RecordBatchIterator::new(vec![Ok(batch)], schema.clone()),
+            &test_dir,
+            Some(WriteParams {
+                data_storage_version: Some(data_storage_version),
+                ..Default::default()
+            }),
+        )
+        .await?;
+
+        // Build an IVF_PQ index on the vector column.
+        let params = VectorIndexParams::ivf_pq(4, 8, 8, MetricType::L2, 50);
+        dataset
+            .create_index(&["vec"], IndexType::Vector, None, &params, false)
+            .await?;
+
+        let indices_before = dataset.load_indices().await?;
+        assert_eq!(indices_before.len(), 1, "precondition: index exists");
+        let index_name = indices_before[0].name.clone(); // used in the checks below
+
+        // Attempting to cast the indexed column must fail with a clear message
+        // that names the offending index(es).
+        let result = dataset
+            .alter_columns(&[
+                ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList(
+                    Arc::new(ArrowField::new("item", DataType::Float16, true)),
+                    64,
+                )),
+            ])
+            .await;
+        let err = result.expect_err("cast on indexed column should fail");
+        let msg = err.to_string();
+        assert!(
+            msg.contains("vec") && msg.contains(&index_name),
+            "error should mention column and index name, got: {msg}"
+        );
+        assert!(
+            msg.contains("drop_index"),
+            "error should suggest the remediation, got: {msg}"
+        );
+
+        // The dataset must be unchanged: schema is still float32, index still present.
+        assert_eq!(
+            dataset.schema().field("vec").unwrap().data_type(),
+            DataType::FixedSizeList(
+                Arc::new(ArrowField::new("item", DataType::Float32, true)),
+                64,
+            ),
+        );
+        let indices_after = dataset.load_indices().await?;
+        assert_eq!(indices_after.len(), 1, "index should still exist");
+        assert_eq!(indices_after[0].name, index_name);
+
+        // Sanity check: after dropping the index, the same cast should succeed.
+        dataset.drop_index(&index_name).await?;
+        dataset
+            .alter_columns(&[
+                ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList(
+                    Arc::new(ArrowField::new("item", DataType::Float16, true)),
+                    64,
+                )),
+            ])
+            .await?;
+        assert_eq!(
+            dataset.schema().field("vec").unwrap().data_type(),
+            DataType::FixedSizeList(
+                Arc::new(ArrowField::new("item", DataType::Float16, true)),
+                64,
+            ),
+        );
+
+        Ok(())
+    }
+
     #[rstest]
     #[tokio::test]
     async fn test_drop_columns(
diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs
index beb6e2b99fd..267296c984b 100644
--- a/rust/lance/src/dataset/tests/dataset_index.rs
+++ b/rust/lance/src/dataset/tests/dataset_index.rs
@@ -1137,6 +1137,78 @@ async fn test_fts_without_index() {
     assert_eq!(results.num_rows(), 1);
 }
 
+#[tokio::test]
+async fn test_fts_without_index_uses_scalar_index_for_prefilter() {
+    // Verify that flat FTS (no inverted index on text) routes its prefilter
+    // through `FilteredReadExec` so a scalar index on the filter column is
+    // actually used. Six rows, two distinct ids: the `id = 1` prefilter keeps
+    // three rows, of which two contain "alpha" and form the expected result.
+    let text = StringArray::from(vec![
+        "alpha bravo",
+        "charlie delta",
+        "alpha echo",
+        "foxtrot",
+        "alpha golf",
+        "hotel india",
+    ]);
+    let ids = Int32Array::from(vec![1, 1, 1, 2, 2, 2]); // rows 0-2 -> id 1, rows 3-5 -> id 2
+    let batch = RecordBatch::try_new(
+        arrow_schema::Schema::new(vec![
+            Field::new("text", text.data_type().to_owned(), false),
+            Field::new("id", ids.data_type().to_owned(), false),
+        ])
+        .into(),
+        vec![Arc::new(text) as ArrayRef, Arc::new(ids) as ArrayRef],
+    )
+    .unwrap();
+    let schema = batch.schema();
+    let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+    let test_uri = TempStrDir::default();
+    let mut dataset = Dataset::write(batches, &test_uri, None).await.unwrap();
+
+    // Scalar index on `id` only — no FTS index on `text`.
+    dataset
+        .create_index(
+            &["id"],
+            IndexType::BTree,
+            None,
+            &ScalarIndexParams::default(),
+            true,
+        )
+        .await
+        .unwrap();
+
+    let mut scan = dataset.scan();
+    scan.prefilter(true)
+        .full_text_search(
+            FullTextSearchQuery::new("alpha".to_owned())
+                .with_columns(&["text".to_string()])
+                .unwrap(),
+        )
+        .unwrap()
+        .filter("id = 1")
+        .unwrap();
+
+    let plan = scan.analyze_plan().await.unwrap();
+    // The flat-FTS path now reads via `FilteredReadExec` (prints as `LanceRead`)
+    // with the prefilter plumbed into it, so the scalar index on `id` is used.
+    assert_contains!(&plan, "FlatMatchQuery");
+    assert_contains!(&plan, "LanceRead");
+    assert_contains!(&plan, "full_filter=id = Int32(1)");
+    // The legacy plan ran a `LanceScan` wrapped in a manual `LanceFilterExec`;
+    // make sure we did not regress to that shape.
+    assert_not_contains!(&plan, "LanceScan:");
+
+    let results = scan.try_into_batch().await.unwrap();
+    // Only rows with id=1 AND text matching "alpha": rows 0 ("alpha bravo")
+    // and 2 ("alpha echo"). Row 4 ("alpha golf") has id=2 and must be excluded.
+    assert_eq!(
+        results.num_rows(),
+        2,
+        "expected the two id=1 rows that match `alpha`, got plan:\n{plan}"
+    );
+}
+
 #[tokio::test]
 async fn test_fts_rank() {
     let params = InvertedIndexParams::default();
@@ -2078,11 +2150,7 @@ mod fts_serializing_backend {
         ) -> Option<CacheEntry> {
             let guard = self.serialized.lock().await;
             if let Some((bytes, stored_codec, _)) = guard.get(key) {
-                return Some(
-                    stored_codec
-                        .deserialize(&bytes.clone())
-                        .expect("deserialization should succeed"),
-                );
+                return stored_codec.deserialize(&bytes.clone()).hit();
             }
             drop(guard);
             self.passthrough.get(key, codec).await
diff --git a/rust/lance/src/dataset/tests/dataset_versioning.rs b/rust/lance/src/dataset/tests/dataset_versioning.rs
index a0bc7816a32..c04dd0f3183 100644
--- a/rust/lance/src/dataset/tests/dataset_versioning.rs
+++ b/rust/lance/src/dataset/tests/dataset_versioning.rs
@@ -211,6 +211,77 @@ async fn test_version_id_fast_path() {
     assert_eq!(historical.latest_version_id().await.unwrap(), 2);
 }
 
+#[rstest]
+#[tokio::test]
+async fn test_stale_checks_cover_fast_successor_and_latest_version(
+    #[values(false, true)] enable_v2_manifest_paths: bool,
+) {
+    let expected_scheme = if enable_v2_manifest_paths {
+        ManifestNamingScheme::V2
+    } else {
+        ManifestNamingScheme::V1
+    };
+    let test_uri = TempStrDir::default();
+    let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+        "i",
+        DataType::UInt32,
+        false,
+    )]));
+
+    let data = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(UInt32Array::from_iter_values(0..5))],
+    )
+    .unwrap();
+    let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema.clone());
+
+    let original = Dataset::write(
+        reader,
+        &test_uri,
+        Some(WriteParams {
+            enable_v2_manifest_paths,
+            ..Default::default()
+        }),
+    )
+    .await
+    .unwrap();
+    assert_eq!(original.manifest_location().naming_scheme, expected_scheme);
+    assert!(!original.is_stale().await.unwrap()); // freshly written: no newer version yet
+    assert!(!original.has_successor_version().await.unwrap());
+
+    let data = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(UInt32Array::from_iter_values(5..10))],
+    )
+    .unwrap();
+    let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema);
+    let updated = Dataset::write(
+        reader,
+        &test_uri,
+        Some(WriteParams {
+            mode: WriteMode::Append,
+            enable_v2_manifest_paths,
+            ..Default::default()
+        }),
+    )
+    .await
+    .unwrap();
+
+    assert!(original.is_stale().await.unwrap()); // the append above committed a newer version
+    assert!(original.has_successor_version().await.unwrap());
+    assert_eq!(updated.manifest_location().naming_scheme, expected_scheme);
+    assert!(!updated.is_stale().await.unwrap());
+    assert!(!updated.has_successor_version().await.unwrap());
+
+    let historical = updated.checkout_version(1).await.unwrap(); // older version, must be stale
+    assert_eq!(
+        historical.manifest_location().naming_scheme,
+        expected_scheme
+    );
+    assert!(historical.is_stale().await.unwrap());
+    assert!(historical.has_successor_version().await.unwrap());
+}
+
 #[rstest]
 #[tokio::test]
 async fn test_restore(
diff --git a/rust/lance/src/dataset/updater.rs b/rust/lance/src/dataset/updater.rs
index b9bc34f8706..90ef8df914b 100644
--- a/rust/lance/src/dataset/updater.rs
+++ b/rust/lance/src/dataset/updater.rs
@@ -6,13 +6,13 @@ use futures::StreamExt;
 use lance_core::datatypes::{OnMissing, OnTypeMismatch};
 use lance_core::utils::deletion::DeletionVector;
 use lance_core::{Error, Result, datatypes::Schema};
-use lance_table::format::Fragment;
+use lance_table::format::{DataFile, Fragment};
 use lance_table::utils::stream::ReadBatchFutStream;
 
 use super::Dataset;
 use super::fragment::FragmentReader;
 use super::scanner::get_default_batch_size;
-use super::write::{GenericWriter, open_writer};
+use super::write::{GenericWriter, cleanup_data_fragments, open_update_writer};
 use crate::dataset::FileFragment;
 use crate::dataset::utils::SchemaAdapter;
 
@@ -146,13 +146,7 @@ impl Updater {
             .data_storage_format
             .lance_file_version()?;
 
-        open_writer(
-            &self.fragment.dataset().object_store,
-            &schema,
-            &self.fragment.dataset().base,
-            data_storage_version,
-        )
-        .await
+        open_update_writer(self.dataset(), &schema, data_storage_version).await
     }
 
     /// Update one batch.
@@ -221,6 +215,34 @@ impl Updater {
         Ok(self.fragment.metadata().clone())
     }
 
+    /// Clean up any data file and blob sidecars created by the current unfinished writer.
+    pub(super) async fn cleanup_unfinished_writer(&mut self) {
+        let Some(writer) = self.writer.take() else {
+            return; // no writer is open, nothing to clean up
+        };
+        let (path, base_id) = writer.data_file_path();
+        let path = path.to_string(); // owned copy so the writer can be dropped next
+        drop(writer);
+
+        if path.is_empty() {
+            return; // the writer reported no data file path
+        }
+
+        let mut fragment = Fragment::new(self.fragment.id() as u64);
+        // cleanup_data_fragments only needs path/base_id to remove the unfinished
+        // data file and any blob sidecars. Build a minimal synthetic fragment so
+        // we can reuse the shared cleanup path without fabricating full metadata.
+        fragment
+            .files
+            .push(DataFile::new(path, vec![], vec![], 0, 0, None, base_id));
+        cleanup_data_fragments(
+            &self.dataset().object_store,
+            &self.dataset().base,
+            &[fragment],
+        )
+        .await;
+    }
+
     /// Get the final schema of the fragment after the update.
     ///
     /// This may be None if the schema is not known. This can happen if it was
diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs
index 1e73618fc6b..ff0a119158c 100644
--- a/rust/lance/src/dataset/write.rs
+++ b/rust/lance/src/dataset/write.rs
@@ -6,7 +6,10 @@ use chrono::TimeDelta;
 use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use futures::{Stream, StreamExt, TryStreamExt};
-use lance_arrow::BLOB_META_KEY;
+use lance_arrow::{
+    ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+    BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME,
+};
 use lance_core::datatypes::{
     NullabilityComparison, OnMissing, OnTypeMismatch, SchemaCompareOptions,
 };
@@ -35,7 +38,9 @@ use tracing::{info, instrument};
 
 use crate::Dataset;
 use crate::dataset::blob::{
-    BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, preprocess_blob_batches,
+    BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver,
+    blob_dedicated_threshold_from_metadata, blob_inline_threshold_from_metadata,
+    preprocess_blob_batches,
 };
 use crate::session::Session;
 
@@ -170,6 +175,77 @@ fn validate_external_blob_write_params(params: &WriteParams) -> Result<()> {
     Ok(())
 }
 
+fn validate_blob_threshold_metadata_for_append(
+    input_schema: &Schema,
+    dataset_schema: &Schema,
+) -> Result<()> {
+    for input_field in &input_schema.fields { // NOTE(review): top-level only; nested fields?
+        let Some(dataset_field) = dataset_schema.field(&input_field.name) else {
+            continue; // field absent from the dataset schema: nothing to compare
+        };
+        let input_is_blob_v2 = input_field
+            .metadata
+            .get(ARROW_EXT_NAME_KEY)
+            .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME);
+        let dataset_is_blob_v2 = dataset_field
+            .metadata
+            .get(ARROW_EXT_NAME_KEY)
+            .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME);
+        if !input_is_blob_v2 && !dataset_is_blob_v2 {
+            continue; // neither side is a blob v2 field
+        }
+
+        let has_inline_threshold = input_field
+            .metadata
+            .contains_key(BLOB_INLINE_SIZE_THRESHOLD_META_KEY);
+        let has_dedicated_threshold = input_field
+            .metadata
+            .contains_key(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY);
+        if !has_inline_threshold && !has_dedicated_threshold {
+            continue; // input sets no threshold metadata
+        }
+
+        if has_inline_threshold {
+            let input_inline_threshold =
+                blob_inline_threshold_from_metadata(&input_field.metadata, &input_field.name)?;
+            let dataset_inline_threshold =
+                blob_inline_threshold_from_metadata(&dataset_field.metadata, &dataset_field.name)?;
+            if input_inline_threshold != dataset_inline_threshold {
+                return Err(Error::invalid_input(format!(
+                    "Cannot append data with blob threshold metadata {}={} for field '{}'; \
+                     the dataset schema has effective value {}. Blob thresholds for existing \
+                     columns are stored in the dataset schema.",
+                    BLOB_INLINE_SIZE_THRESHOLD_META_KEY,
+                    input_inline_threshold,
+                    input_field.name,
+                    dataset_inline_threshold,
+                )));
+            }
+        }
+        if has_dedicated_threshold {
+            let input_dedicated_threshold =
+                blob_dedicated_threshold_from_metadata(&input_field.metadata, &input_field.name)?;
+            let dataset_dedicated_threshold = blob_dedicated_threshold_from_metadata(
+                &dataset_field.metadata,
+                &dataset_field.name,
+            )?;
+            if input_dedicated_threshold != dataset_dedicated_threshold {
+                return Err(Error::invalid_input(format!(
+                    "Cannot append data with blob threshold metadata {}={} for field '{}'; \
+                     the dataset schema has effective value {}. Blob thresholds for existing \
+                     columns are stored in the dataset schema.",
+                    BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY,
+                    input_dedicated_threshold,
+                    input_field.name,
+                    dataset_dedicated_threshold,
+                )));
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Auto cleanup parameters
 #[derive(Debug, Clone)]
 pub struct AutoCleanupParams {
@@ -507,7 +583,7 @@ pub async fn do_write_fragments(
     };
 
     let external_base_resolver = if storage_version >= LanceFileVersion::V2_2
-        && schema.fields.iter().any(|field| field.is_blob_v2())
+        && schema.fields_pre_order().any(|field| field.is_blob_v2())
     {
         Some(Arc::new(
             build_external_base_resolver(dataset, &params).await?,
@@ -953,6 +1029,7 @@ pub async fn write_fragments_internal(
                         ..Default::default()
                     },
                 )?;
+                validate_blob_threshold_metadata_for_append(&converted_schema, dataset.schema())?;
                 let write_schema = dataset.schema().project_by_schema(
                     &converted_schema,
                     OnMissing::Error,
@@ -984,7 +1061,8 @@ pub async fn write_fragments_internal(
         (converted_schema, params.storage_version_or_default())
     };
 
-    if storage_version < LanceFileVersion::V2_2 && schema.fields.iter().any(|f| f.is_blob_v2()) {
+    if storage_version < LanceFileVersion::V2_2 && schema.fields_pre_order().any(|f| f.is_blob_v2())
+    {
         return Err(Error::invalid_input(format!(
             "Blob v2 requires file version >= 2.2 (got {:?})",
             storage_version
@@ -992,13 +1070,10 @@ pub async fn write_fragments_internal(
     }
 
     if storage_version >= LanceFileVersion::V2_2
-        && schema
-            .fields
-            .iter()
-            .any(|f| f.metadata.contains_key(BLOB_META_KEY))
+        && let Some(blob_field_path) = legacy_blob_field_path(&schema)
     {
         return Err(Error::invalid_input(format!(
-            "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)."
+            "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. Found legacy blob field: {blob_field_path}. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)."
         )));
     }
 
@@ -1017,10 +1092,23 @@ pub async fn write_fragments_internal(
     Ok((fragments, schema))
 }
 
+/// Path of the first field carrying `BLOB_META_KEY` metadata, if any.
+fn legacy_blob_field_path(schema: &Schema) -> Option<String> {
+    // Pre-order walk; bail out with `None` when no field has the legacy key.
+    let legacy_field = schema
+        .fields_pre_order()
+        .find(|candidate| candidate.metadata.contains_key(BLOB_META_KEY))?;
+    // Prefer the schema-resolved path; fall back to the bare field name.
+    let resolved = schema.field_path(legacy_field.id);
+    Some(resolved.unwrap_or_else(|_| legacy_field.name.clone()))
+}
+
 #[async_trait::async_trait]
 pub trait GenericWriter: Send {
     /// Write the given batches to the file
     async fn write(&mut self, batches: &[RecordBatch]) -> Result<()>;
+    /// Get the file path and base ID for the data file being written.
+    fn data_file_path(&self) -> (&str, Option<u32>);
     /// Get the current position in the file
     ///
     /// We use this to know when the file is too large and we need to start
@@ -1047,6 +1135,9 @@ where
     async fn write(&mut self, batches: &[RecordBatch]) -> Result<()> {
         self.writer.write(batches).await
     }
+    fn data_file_path(&self) -> (&str, Option<u32>) {
+        (&self.path, self.base_id) // borrowed path plus this writer's base id
+    }
     async fn tell(&mut self) -> Result<u64> {
         Ok(self.writer.tell().await? as u64)
     }
@@ -1087,6 +1178,9 @@ impl GenericWriter for V2WriterAdapter {
         }
         Ok(())
     }
+    fn data_file_path(&self) -> (&str, Option<u32>) {
+        (&self.path, self.base_id) // borrowed path plus this writer's base id
+    }
     async fn tell(&mut self) -> Result<u64> {
         Ok(self.writer.tell().await?)
     }
@@ -1140,6 +1234,39 @@ pub async fn open_writer(
     .await
 }
 
+pub(super) async fn open_update_writer(
+    dataset: &Dataset,
+    schema: &Schema,
+    storage_version: LanceFileVersion,
+) -> Result<Box<dyn GenericWriter>> {
+    // add_columns / alter_columns reuse the normal writer stack, but they do not
+    // flow through WriteParams. Rebuild the external base resolver here so blob
+    // v2 reference columns can resolve dataset-registered external URIs.
+    let external_base_resolver = if storage_version >= LanceFileVersion::V2_2
+        && schema.fields_pre_order().any(|f| f.is_blob_v2())
+    {
+        Some(Arc::new(
+            build_external_base_resolver(Some(dataset), &WriteParams::default()).await?,
+        ))
+    } else {
+        None // file version below 2.2 or no blob v2 field anywhere in the schema
+    };
+
+    open_writer_with_options(
+        &dataset.object_store,
+        schema,
+        &dataset.base,
+        storage_version,
+        WriterOptions {
+            add_data_dir: true,
+            external_base_resolver,
+            source_store_registry: dataset.session.store_registry(),
+            ..Default::default()
+        },
+    )
+    .await
+}
+
 #[derive(Default)]
 struct WriterOptions {
     add_data_dir: bool,
@@ -1216,7 +1343,7 @@ async fn open_writer_with_options(
                 source_store_registry,
                 source_store_params,
                 blob_pack_file_size_threshold,
-            ))
+            )?)
         } else {
             None
         };
diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs
index 20209ed7f30..bfd702c9c3b 100644
--- a/rust/lance/src/dataset/write/insert.rs
+++ b/rust/lance/src/dataset/write/insert.rs
@@ -442,7 +442,7 @@ struct WriteContext<'a> {
 mod test {
     use std::collections::HashMap;
 
-    use arrow_array::{BinaryArray, Int32Array, RecordBatchReader, StructArray};
+    use arrow_array::{ArrayRef, BinaryArray, Int32Array, RecordBatchReader, StructArray};
     use arrow_schema::{ArrowError, DataType, Field, Schema};
     use lance_arrow::BLOB_META_KEY;
 
@@ -559,6 +559,41 @@ mod test {
         }
     }
 
+    #[tokio::test]
+    async fn create_v2_2_dataset_rejects_nested_legacy_blob_schema() {
+        let image_field = Field::new("image_bytes", DataType::Binary, true).with_metadata(
+            HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]),
+        ); // legacy blob marker sits on the nested leaf, not on the top-level struct
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "summary_image_nested",
+            DataType::Struct(vec![image_field.clone()].into()),
+            true,
+        )]));
+        let image_values: ArrayRef = Arc::new(BinaryArray::from(vec![Some(b"abc".as_slice())]));
+        let nested_values = StructArray::from(vec![(Arc::new(image_field), image_values)]);
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(nested_values)]).unwrap();
+
+        let dataset = InsertBuilder::new("memory://forced-nested-blob-v2")
+            .with_params(&WriteParams {
+                mode: WriteMode::Create,
+                data_storage_version: Some(LanceFileVersion::V2_2),
+                ..Default::default()
+            })
+            .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone()))
+            .await;
+
+        let err = dataset.unwrap_err(); // the write is expected to be rejected
+        match err {
+            Error::InvalidInput { source, .. } => {
+                let message = source.to_string();
+                assert!(message.contains("Legacy blob columns"));
+                assert!(message.contains("summary_image_nested.image_bytes")); // nested path
+                assert!(message.contains("lance.blob.v2"));
+            }
+            other => panic!("unexpected error: {other:?}"),
+        }
+    }
+
     mod external_error {
         use super::*;
         use std::fmt;
diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs
index 1f3414db4f8..b14421c963f 100644
--- a/rust/lance/src/dataset/write/merge_insert.rs
+++ b/rust/lance/src/dataset/write/merge_insert.rs
@@ -2224,18 +2224,13 @@ impl Merger {
         &self.output_schema
     }
 
-    // Retrieves a bitmap of rows where at least one of the columns in the range
-    // col_offset..coll_offset+num_cols is not null.
-    //
-    fn not_all_null(
-        batch: &RecordBatch,
-        col_offset: usize,
-        num_cols: usize,
-    ) -> Result<BooleanArray> {
+    // Retrieves a bitmap of rows where at least one of the given columns is
+    // not null.
+    fn not_all_null(batch: &RecordBatch, cols: &[usize]) -> Result<BooleanArray> {
         // For our purposes we know there is always at least 1 on key
-        debug_assert_ne!(num_cols, 0);
-        let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(col_offset))?;
-        for idx in col_offset + 1..col_offset + num_cols {
+        debug_assert!(!cols.is_empty());
+        let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(cols[0]))?;
+        for &idx in &cols[1..] {
             let is_valid = arrow::compute::is_not_null(batch.column(idx))?;
             at_least_one_valid = arrow::compute::or(&at_least_one_valid, &is_valid)?;
         }
@@ -2263,8 +2258,37 @@ impl Merger {
         right_offset: usize,
         num_keys: usize,
     ) -> Result<(BooleanArray, BooleanArray, BooleanArray)> {
-        let in_left = Self::not_all_null(combined_batch, 0, num_keys)?;
-        let in_right = Self::not_all_null(combined_batch, right_offset, num_keys)?;
+        // The outer join distinguishes its three cases by which side's join
+        // keys were NULL-padded: a present row always has non-null keys, while
+        // the absent side is filled with NULLs. We therefore test the *key*
+        // columns, located by name. They are NOT necessarily the first
+        // `num_keys` columns — a partial-schema source can place a payload
+        // column (e.g. an all-null vector) at position 0, and checking
+        // positions [0, num_keys) there misreads an all-null leading payload
+        // column as an absent join side, silently dropping every matched row
+        // (https://github.com/lancedb/lancedb/issues/3515). The target half
+        // carries the same columns in the same order, offset by `right_offset`.
+        let source_key_cols = self
+            .params
+            .on
+            .iter()
+            .map(|key| {
+                combined_batch.schema().index_of(key).map_err(|_| {
+                    Error::internal(format!(
+                        "merge insert key column '{}' not found in joined batch",
+                        key
+                    ))
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+        debug_assert_eq!(source_key_cols.len(), num_keys);
+        let target_key_cols = source_key_cols
+            .iter()
+            .map(|c| c + right_offset)
+            .collect::<Vec<_>>();
+
+        let in_left = Self::not_all_null(combined_batch, &source_key_cols)?;
+        let in_right = Self::not_all_null(combined_batch, &target_key_cols)?;
         let in_both = arrow::compute::and(&in_left, &in_right)?;
         let left_only = arrow::compute::and(&in_left, &arrow::compute::not(&in_right)?)?;
         let right_only = arrow::compute::and(&arrow::compute::not(&in_left)?, &in_right)?;
@@ -3517,6 +3541,116 @@ mod tests {
         }
     }
 
+    /// Regression test for https://github.com/lancedb/lancedb/issues/3515:
+    /// a partial-schema `merge_insert` with a scalar index on the join key,
+    /// where every fragment is covered by the index (no unindexed data),
+    /// used to silently update 0 rows instead of the expected matches.
+    #[rstest::rstest]
+    #[tokio::test]
+    async fn test_repro_3515_partial_schema_fully_indexed(
+        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1, LanceFileVersion::V2_2)]
+        version: LanceFileVersion,
+    ) {
+        const N: usize = 1000;
+        const UPD: usize = 128;
+        let vec_field = Field::new(
+            "vector",
+            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4),
+            true,
+        );
+        let full_schema = Arc::new(Schema::new(vec![
+            vec_field.clone(),
+            Field::new("path", DataType::Utf8, false),
+            Field::new("status", DataType::Utf8, true),
+            Field::new("file_size", DataType::Int64, true),
+        ]));
+
+        // 1000 rows: vector all-null, path "/img/{i}.jpg", status "pending".
+        let paths = StringArray::from((0..N).map(|i| format!("/img/{i}.jpg")).collect::<Vec<_>>());
+        let statuses = StringArray::from(vec!["pending"; N]);
+        let file_sizes = Int64Array::from((0..N as i64).map(|i| 1000 + i).collect::<Vec<_>>());
+        let null_vectors = arrow_array::new_null_array(vec_field.data_type(), N);
+        let batch = RecordBatch::try_new(
+            full_schema.clone(),
+            vec![
+                null_vectors, // leading column is entirely NULL in the target
+                Arc::new(paths),
+                Arc::new(statuses),
+                Arc::new(file_sizes),
+            ],
+        )
+        .unwrap();
+
+        let mut ds = Dataset::write(
+            RecordBatchIterator::new([Ok(batch)], full_schema.clone()),
+            "memory://",
+            Some(WriteParams {
+                data_storage_version: Some(version),
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        // Scalar index on the merge key, covering every fragment.
+        ds.create_index(
+            &["path"],
+            IndexType::Scalar,
+            None,
+            &ScalarIndexParams::default(),
+            false,
+        )
+        .await
+        .unwrap();
+        let ds = Arc::new(ds);
+
+        // Partial-schema source (no `file_size`, `vector` first): update the first 128 rows.
+        let upd_schema = Arc::new(Schema::new(vec![
+            vec_field,
+            Field::new("path", DataType::Utf8, false),
+            Field::new("status", DataType::Utf8, true),
+        ]));
+        let upd_paths = StringArray::from(
+            (0..UPD)
+                .map(|i| format!("/img/{i}.jpg"))
+                .collect::<Vec<_>>(),
+        );
+        let upd_vectors =
+            FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.1f32; 4 * UPD]), 4)
+                .unwrap();
+        let upd_statuses = StringArray::from(vec!["indexed"; UPD]);
+        let updates = RecordBatch::try_new(
+            upd_schema.clone(),
+            vec![
+                Arc::new(upd_vectors),
+                Arc::new(upd_paths),
+                Arc::new(upd_statuses),
+            ],
+        )
+        .unwrap();
+
+        let (ds, stats) = MergeInsertBuilder::try_new(ds.clone(), vec!["path".to_string()])
+            .unwrap()
+            .when_matched(WhenMatched::UpdateAll)
+            .when_not_matched(WhenNotMatched::DoNothing)
+            .try_build()
+            .unwrap()
+            .execute_reader(RecordBatchIterator::new([Ok(updates)], upd_schema))
+            .await
+            .unwrap();
+
+        assert_eq!(
+            stats.num_updated_rows, UPD as u64,
+            "expected {UPD} updated rows on {version:?}, got {}",
+            stats.num_updated_rows
+        );
+        let n_indexed = ds
+            .count_rows(Some("status = 'indexed'".to_string()))
+            .await
+            .unwrap();
+        assert_eq!(n_indexed, UPD, "expected {UPD} rows flipped to 'indexed'");
+    }
+
     #[tokio::test]
     async fn test_indexed_merge_insert() {
         let test_dir = TempStrDir::default();
diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs
index 8984d507408..1a3a3aa54ec 100644
--- a/rust/lance/src/index.rs
+++ b/rust/lance/src/index.rs
@@ -29,9 +29,7 @@ use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndex};
 use lance_index::optimize::OptimizeOptions;
 use lance_index::pb::index::Implementation;
 pub use lance_index::progress::{IndexBuildProgress, NoopIndexBuildProgress};
-use lance_index::scalar::expression::{
-    IndexInformationProvider, MultiQueryParser, ScalarQueryParser,
-};
+use lance_index::scalar::expression::{IndexInformationProvider, MultiQueryParser};
 use lance_index::scalar::inverted::{InvertedIndex, InvertedIndexPlugin};
 use lance_index::scalar::lance_format::LanceIndexStore;
 use lance_index::scalar::registry::{TrainingCriteria, TrainingOrdering};
@@ -57,7 +55,7 @@ use lance_io::utils::{
     read_version,
 };
 use lance_table::format::{Fragment, SelfDescribingFileReader};
-use lance_table::format::{IndexMetadata, list_index_files_with_sizes};
+use lance_table::format::{IndexFile, IndexMetadata, list_index_files_with_sizes};
 use lance_table::io::manifest::read_manifest_indexes;
 use roaring::RoaringBitmap;
 use scalar::index_matches_criteria;
@@ -166,7 +164,8 @@ pub(crate) async fn build_index_metadata_from_segments(
     let mut new_indices = Vec::with_capacity(segments.len());
     for segment in segments {
         let (uuid, fragment_bitmap, index_details, index_version) = segment.into_parts();
-        if index_details.type_url.ends_with("InvertedIndexDetails") {
+        let is_inverted_index = index_details.type_url.ends_with("InvertedIndexDetails");
+        if is_inverted_index {
             let metadata = IndexMetadata {
                 uuid,
                 name: index_name.to_string(),
@@ -183,7 +182,10 @@ pub(crate) async fn build_index_metadata_from_segments(
                 .await?;
         }
         let index_dir = dataset.indices_dir().clone().join(uuid.to_string());
-        let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?;
+        let mut files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?;
+        if is_inverted_index {
+            retain_committed_inverted_files(&mut files);
+        }
         new_indices.push(IndexMetadata {
             uuid,
             name: index_name.to_string(),
@@ -201,6 +203,10 @@ pub(crate) async fn build_index_metadata_from_segments(
     Ok(new_indices)
 }
 
+fn retain_committed_inverted_files(files: &mut Vec<IndexFile>) {
+    files.retain(|file| !file.path.starts_with("staging/"));
+}
+
 fn validate_segment_index_details(index_name: &str, segments: &[IndexMetadata]) -> Result<()> {
     let mut type_url = None::<&str>;
     for segment in segments {
@@ -652,10 +658,10 @@ pub struct ScalarIndexInfo {
 }
 
 impl IndexInformationProvider for ScalarIndexInfo {
-    fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> {
+    fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> {
         self.indexed_columns
             .get(col)
-            .map(|(ty, parser)| (ty, parser.as_ref() as &dyn ScalarQueryParser))
+            .map(|(ty, parser)| (ty, parser.as_ref()))
     }
 
     fn fragment_bitmap(&self, column: &str, index_name: &str) -> Option<RoaringBitmap> {
@@ -1891,9 +1897,15 @@ impl DatasetIndexInternalExt for Dataset {
         if let Some(entry) = self.index_cache.get_with_key(&state_key).await {
             log::debug!("Found IvfIndexState in cache uuid: {}", uuid);
             let partition_cache = self.index_cache.with_key_prefix(&state_key.key());
+            let frag_reuse_index = self.open_frag_reuse_index(metrics).await?;
             return entry
                 .0
-                .reconstruct(object_store, self.metadata_cache.as_ref(), partition_cache)
+                .reconstruct(
+                    object_store,
+                    self.metadata_cache.as_ref(),
+                    partition_cache,
+                    frag_reuse_index,
+                )
                 .await;
         }
 
@@ -2158,6 +2170,15 @@ impl DatasetIndexInternalExt for Dataset {
         };
         let (index, ivf_entry) = result?;
         metrics.record_index_load();
+        // Attribute the one-time index-open I/O (file footers, IVF centroids,
+        // quantization metadata) to this query's metrics. This runs only on a
+        // real open; cache hits return earlier, so a warm query reports zero
+        // index-open I/O.
+        if let Some(io_stats) = metrics.io_stats()
+            && let Some(open_stats) = index.open_io_stats()
+        {
+            io_stats.add_scan_stats(&open_stats);
+        }
         if let Some(ivf_entry) = ivf_entry {
             let state_key = IvfIndexStateCacheKey::new(uuid, frag_reuse_uuid.as_ref());
             self.index_cache
diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs
index 388f3170251..037e1086d57 100644
--- a/rust/lance/src/index/append.rs
+++ b/rust/lance/src/index/append.rs
@@ -94,9 +94,9 @@ pub async fn build_old_data_filter(
     }
 }
 
-/// Split the stored fragment coverage of `segments` into fragments still live
-/// in `dataset` (`effective`) and fragments that compaction or deletion has
-/// already retired (`deleted`).
+/// Split the stored fragment coverage of `segments` into fragments still live in
+/// `dataset` (`effective`) and fragments that compaction or deletion has already
+/// retired (`deleted`).
 pub fn split_segment_coverage<'a>(
     dataset: &Dataset,
     segments: impl IntoIterator<Item = &'a IndexMetadata>,
@@ -114,44 +114,32 @@ pub fn split_segment_coverage<'a>(
     (effective, deleted)
 }
 
-/// Build one [`OldIndexDataFilter`] per segment, each derived from that
-/// segment's *own* effective (still-live) and retired fragment coverage.
+/// Return the union of every segment's still-live fragment coverage, together with
+/// one [`OldIndexDataFilter`] per segment, each derived from that segment's *own*
+/// effective (still-live) and retired coverage. Errors if a segment lacks a bitmap.
 pub async fn build_per_segment_filters(
     dataset: &Dataset,
     segments: &[&IndexMetadata],
-) -> Result<Vec<Option<OldIndexDataFilter>>> {
+) -> Result<(RoaringBitmap, Vec<Option<OldIndexDataFilter>>)> {
+    let mut effective_union = RoaringBitmap::new();
     let mut filters = Vec::with_capacity(segments.len());
     for segment in segments {
+        if segment.fragment_bitmap.is_none() {
+            return Err(Error::invalid_input(format!(
+                "CreateIndex: segment {} is missing fragment coverage",
+                segment.uuid
+            )));
+        }
         let effective = segment
             .effective_fragment_bitmap(&dataset.fragment_bitmap)
             .unwrap_or_default();
         let deleted = segment
             .deleted_fragment_bitmap(&dataset.fragment_bitmap)
             .unwrap_or_default();
+        effective_union |= &effective;
         filters.push(build_old_data_filter(dataset, &effective, &deleted).await?);
     }
-    Ok(filters)
-}
-
-/// Validate that every segment carries fragment coverage, then return the
-/// combined still-live coverage (for the merged segment's fragment bitmap)
-/// together with one [`OldIndexDataFilter`] per segment.
-pub async fn effective_coverage_and_filters(
-    dataset: &Dataset,
-    segments: &[IndexMetadata],
-) -> Result<(RoaringBitmap, Vec<Option<OldIndexDataFilter>>)> {
-    for segment in segments {
-        if segment.fragment_bitmap.is_none() {
-            return Err(Error::invalid_input(format!(
-                "CreateIndex: segment {} is missing fragment coverage",
-                segment.uuid
-            )));
-        }
-    }
-    let (effective, _deleted) = split_segment_coverage(dataset, segments);
-    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
-    let filters = build_per_segment_filters(dataset, &segment_refs).await?;
-    Ok((effective, filters))
+    Ok((effective_union, filters))
 }
 
 async fn load_unindexed_training_data(
@@ -292,11 +280,11 @@ async fn merge_scalar_indices<'a>(
             load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed)
                 .await?;
         let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?;
-        let old_data_filters =
-            build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?;
 
         match index_type {
             IndexType::BTree => {
+                let (_, old_data_filters) =
+                    build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?;
                 crate::index::scalar::btree::open_and_merge_segments(
                     dataset.as_ref(),
                     field_path,
@@ -315,6 +303,8 @@ async fn merge_scalar_indices<'a>(
                         .update(new_data_stream, &new_store, None)
                         .await?
                 } else {
+                    let (_, old_data_filters) =
+                        build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?;
                     crate::index::scalar::bitmap::open_and_merge_segments(
                         dataset.as_ref(),
                         field_path,
@@ -327,10 +317,6 @@ async fn merge_scalar_indices<'a>(
                 }
             }
             _ => {
-                // Non-segmented scalar types only reach this branch with a single
-                // selected segment, so the union filter equals that segment's
-                // filter. Built lazily here so the segmented BTree/Bitmap paths
-                // above don't pay an extra row-id-sequence load they never use.
                 let old_data_filter = build_old_data_filter(
                     dataset.as_ref(),
                     &effective_old_frags,
@@ -840,7 +826,7 @@ mod tests {
     use rstest::rstest;
 
     use crate::dataset::builder::DatasetBuilder;
-    use crate::dataset::optimize::compact_files;
+    use crate::dataset::optimize::{CompactionOptions, compact_files};
     use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams};
     use crate::index::vector::VectorIndexParams;
     use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount};
@@ -2014,99 +2000,6 @@ mod tests {
         );
     }
 
-    #[tokio::test]
-    async fn test_optimize_btree_no_duplicate_row_addr() {
-        let test_dir = TempStrDir::default();
-        let test_uri = test_dir.as_str();
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("payload", DataType::Int32, false),
-        ]));
-        let batch = RecordBatch::try_new(
-            schema.clone(),
-            vec![
-                Arc::new(Int32Array::from(vec![1])),
-                Arc::new(Int32Array::from(vec![10])),
-            ],
-        )
-        .unwrap();
-        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
-        let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap();
-
-        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree);
-        dataset
-            .create_index(
-                &["id"],
-                IndexType::BTree,
-                Some("id_idx".into()),
-                &params,
-                true,
-            )
-            .await
-            .unwrap();
-
-        // Reordered source columns (payload, id) force the partial-schema
-        // RewriteColumns path instead of a row rewrite.
-        let source_schema = Arc::new(Schema::new(vec![
-            Field::new("payload", DataType::Int32, false),
-            Field::new("id", DataType::Int32, false),
-        ]));
-        let source_batch = RecordBatch::try_new(
-            source_schema.clone(),
-            vec![
-                Arc::new(Int32Array::from(vec![100])),
-                Arc::new(Int32Array::from(vec![1])),
-            ],
-        )
-        .unwrap();
-        let merge_job =
-            MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()])
-                .unwrap()
-                .when_matched(WhenMatched::UpdateAll)
-                .try_build()
-                .unwrap();
-        let source_reader = Box::new(RecordBatchIterator::new(
-            [Ok(source_batch)],
-            source_schema.clone(),
-        ));
-        merge_job
-            .execute(reader_to_stream(source_reader))
-            .await
-            .unwrap();
-
-        // Build a delta BTree segment over the now-unindexed fragment.
-        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
-        dataset
-            .optimize_indices(&OptimizeOptions::append())
-            .await
-            .unwrap();
-        assert_eq!(
-            dataset.load_indices_by_name("id_idx").await.unwrap().len(),
-            2,
-            "append must create a delta segment over the rewritten fragment"
-        );
-
-        // Force the old segment + delta segment to merge.
-        dataset
-            .optimize_indices(&OptimizeOptions::merge(2))
-            .await
-            .unwrap();
-
-        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
-        let rows = dataset
-            .scan()
-            .filter("id = 1")
-            .unwrap()
-            .project(&["id"])
-            .unwrap()
-            .try_into_batch()
-            .await
-            .unwrap()
-            .num_rows();
-        assert_eq!(rows, 1, "id = 1 must return exactly one row after merge");
-    }
-
     #[tokio::test]
     async fn test_optimize_bitmap_no_stale_postings() {
         async fn query_count(dataset: &Dataset, value: &str) -> usize {
@@ -2225,6 +2118,107 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_optimize_bitmap_merge_remaps_deferred_compaction() {
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+
+        let schema = Arc::new(Schema::new(vec![Field::new("cat", DataType::Int32, false)]));
+        let make = |range: std::ops::Range<i32>| {
+            RecordBatch::try_new(
+                schema.clone(),
+                vec![Arc::new(Int32Array::from_iter_values(range))],
+            )
+            .unwrap()
+        };
+
+        // Two fragments: [0, 50) and [50, 100).
+        let reader =
+            RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone());
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 50,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+        assert_eq!(dataset.get_fragments().len(), 2);
+
+        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap);
+        dataset
+            .create_index(
+                &["cat"],
+                IndexType::Bitmap,
+                Some("cat_idx".into()),
+                &params,
+                true,
+            )
+            .await
+            .unwrap();
+
+        // Deferred-remap compaction fuses the two fragments into one and leaves a
+        // pending FragReuseIndex; the bitmap segment is not eagerly remapped, so
+        // its on-disk postings still reference the pre-compaction fragments.
+        compact_files(
+            &mut dataset,
+            CompactionOptions {
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+
+        // Append a third fragment, left unindexed.
+        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        dataset
+            .append(
+                RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()),
+                None,
+            )
+            .await
+            .unwrap();
+
+        // Merge the old segment (remap still deferred) with the unindexed third fragment.
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(2))
+            .await
+            .unwrap();
+
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        // A value from the compacted fragments must still be found via the index;
+        // a missing remap would point the posting at a retired fragment address.
+        let hit = dataset
+            .scan()
+            .filter("cat = 25")
+            .unwrap()
+            .project(&["cat"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(
+            hit, 1,
+            "compacted-then-merged row must remain queryable via the bitmap index"
+        );
+        let total = dataset
+            .scan()
+            .filter("cat >= 0")
+            .unwrap()
+            .project(&["cat"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(total, 150, "no rows may be lost across compaction + merge");
+    }
+
     #[tokio::test]
     async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() {
         async fn query_id_count(dataset: &Dataset, id: &str) -> usize {
@@ -2359,4 +2353,205 @@ mod tests {
         assert_eq!(after_default[0].uuid, original_uuid);
         assert_eq!(dataset.manifest.version, original_version);
     }
+
+    #[rstest]
+    #[case::address_row_ids(false)]
+    #[case::stable_row_ids(true)]
+    #[tokio::test]
+    async fn test_optimize_btree_no_duplicate_row_addr(#[case] use_stable_row_ids: bool) {
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("payload", DataType::Int32, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(Int32Array::from(vec![10])),
+            ],
+        )
+        .unwrap();
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let write_params = WriteParams {
+            enable_stable_row_ids: use_stable_row_ids,
+            ..Default::default()
+        };
+        let mut dataset = Dataset::write(reader, test_uri, Some(write_params))
+            .await
+            .unwrap();
+
+        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree);
+        dataset
+            .create_index(
+                &["id"],
+                IndexType::BTree,
+                Some("id_idx".into()),
+                &params,
+                true,
+            )
+            .await
+            .unwrap();
+
+        // Reordered source columns (payload, id) force the partial-schema
+        // RewriteColumns path instead of a full row rewrite.
+        let source_schema = Arc::new(Schema::new(vec![
+            Field::new("payload", DataType::Int32, false),
+            Field::new("id", DataType::Int32, false),
+        ]));
+        let source_batch = RecordBatch::try_new(
+            source_schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![100])),
+                Arc::new(Int32Array::from(vec![1])),
+            ],
+        )
+        .unwrap();
+        let merge_job =
+            MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()])
+                .unwrap()
+                .when_matched(WhenMatched::UpdateAll)
+                .try_build()
+                .unwrap();
+        let source_reader = Box::new(RecordBatchIterator::new(
+            [Ok(source_batch)],
+            source_schema.clone(),
+        ));
+        merge_job
+            .execute(reader_to_stream(source_reader))
+            .await
+            .unwrap();
+
+        // Build a delta BTree segment over the now-unindexed fragment.
+        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        dataset
+            .optimize_indices(&OptimizeOptions::append())
+            .await
+            .unwrap();
+        assert_eq!(
+            dataset.load_indices_by_name("id_idx").await.unwrap().len(),
+            2,
+            "append must create a delta segment over the rewritten fragment"
+        );
+
+        // Force the old segment + delta segment to merge.
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(2))
+            .await
+            .unwrap();
+
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        let rows = dataset
+            .scan()
+            .filter("id = 1")
+            .unwrap()
+            .project(&["id"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(rows, 1, "id = 1 must return exactly one row after merge");
+    }
+
+    #[tokio::test]
+    async fn test_optimize_btree_merge_remaps_deferred_compaction() {
+        let test_dir = TempStrDir::default();
+        let test_uri = test_dir.as_str();
+
+        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let make = |range: std::ops::Range<i32>| {
+            RecordBatch::try_new(
+                schema.clone(),
+                vec![Arc::new(Int32Array::from_iter_values(range))],
+            )
+            .unwrap()
+        };
+
+        // Two fragments: [0, 50) and [50, 100).
+        let reader =
+            RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone());
+        let mut dataset = Dataset::write(
+            reader,
+            test_uri,
+            Some(WriteParams {
+                max_rows_per_file: 50,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+        assert_eq!(dataset.get_fragments().len(), 2);
+
+        let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree);
+        dataset
+            .create_index(
+                &["id"],
+                IndexType::BTree,
+                Some("id_idx".into()),
+                &params,
+                true,
+            )
+            .await
+            .unwrap();
+
+        // Deferred-remap compaction fuses the two fragments into one and leaves a
+        // pending FragReuseIndex; the index segment is not eagerly remapped.
+        compact_files(
+            &mut dataset,
+            CompactionOptions {
+                defer_index_remap: true,
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap();
+
+        // Append a third fragment, left unindexed.
+        let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        dataset
+            .append(
+                RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()),
+                None,
+            )
+            .await
+            .unwrap();
+
+        // Merge the old segment (remap still deferred) with the unindexed third fragment.
+        dataset
+            .optimize_indices(&OptimizeOptions::merge(2))
+            .await
+            .unwrap();
+
+        let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap();
+        // A value from the compacted fragments must still be found via the index.
+        let hit = dataset
+            .scan()
+            .filter("id = 25")
+            .unwrap()
+            .project(&["id"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(
+            hit, 1,
+            "compacted-then-merged row must remain queryable via the index"
+        );
+        let total = dataset
+            .scan()
+            .filter("id >= 0")
+            .unwrap()
+            .project(&["id"])
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap()
+            .num_rows();
+        assert_eq!(total, 150, "no rows may be lost across compaction + merge");
+    }
 }
diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs
index 37b4df81404..507c2d23114 100644
--- a/rust/lance/src/index/create.rs
+++ b/rust/lance/src/index/create.rs
@@ -1998,6 +1998,23 @@ mod tests {
         let segments = input_segments.clone();
         assert_eq!(segments.len(), input_segments.len());
 
+        crate::index::scalar::inverted::finalize_segment_files_if_needed(
+            &dataset,
+            &input_segments[0],
+        )
+        .await
+        .unwrap();
+        let stale_staging_path = dataset
+            .indices_dir()
+            .join(input_segments[0].uuid.to_string())
+            .join("staging")
+            .join("orphan.lance");
+        dataset
+            .object_store
+            .put(&stale_staging_path, b"stale")
+            .await
+            .unwrap();
+
         dataset
             .commit_existing_index_segments("text_idx", "text", segments)
             .await
@@ -2021,6 +2038,19 @@ mod tests {
 
         let indices = dataset.load_indices_by_name("text_idx").await.unwrap();
         assert_eq!(indices.len(), input_segments.len());
+        let finalized_segment = indices
+            .iter()
+            .find(|index| index.uuid == input_segments[0].uuid)
+            .expect("finalized segment should be committed");
+        assert!(
+            finalized_segment
+                .files
+                .as_ref()
+                .expect("committed segment should track files")
+                .iter()
+                .all(|file| !file.path.starts_with("staging/")),
+            "stale staging files must not be committed in IndexMetadata.files"
+        );
     }
 
     #[tokio::test]
diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs
index 794ce399108..585e4e2ce72 100644
--- a/rust/lance/src/index/scalar.rs
+++ b/rust/lance/src/index/scalar.rs
@@ -12,6 +12,8 @@ pub(crate) mod zonemap;
 
 pub use inverted::{load_segment_details, load_segments};
 
+pub use crate::index::scalar_logical::{LogicalScalarIndex, load_named_scalar_segments};
+
 use std::sync::{Arc, LazyLock};
 
 use uuid::Uuid;
diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs
index d5bbdcf2961..a947de1cbe3 100644
--- a/rust/lance/src/index/scalar/bitmap.rs
+++ b/rust/lance/src/index/scalar/bitmap.rs
@@ -54,13 +54,10 @@ pub(in crate::index) async fn merge_segments(
     })?;
     let field_path = dataset.schema().field_path(field_id)?;
 
-    // Intersect each segment's stored coverage with the dataset's current
-    // fragments so we don't claim coverage on row addresses that compaction or
-    // pruning has already retired.
+    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
     let (fragment_bitmap, old_data_filters) =
-        crate::index::append::effective_coverage_and_filters(dataset, &segments).await?;
+        crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?;
 
-    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
     let source_indices = open_bitmap_segments(dataset, &field_path, &segment_refs).await?;
 
     let new_uuid = Uuid::new_v4();
diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs
index 268048da4dd..4339b8c183b 100644
--- a/rust/lance/src/index/scalar/btree.rs
+++ b/rust/lance/src/index/scalar/btree.rs
@@ -117,18 +117,15 @@ pub(crate) async fn merge_segments(
     })?;
     let field_path = dataset.schema().field_path(field_id)?;
 
-    // Intersect each segment's stored bitmap with the dataset's current
-    // fragments so we don't claim coverage on IDs that compaction or pruning
-    // has already retired.
+    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
     let (fragment_bitmap, old_data_filters) =
-        crate::index::append::effective_coverage_and_filters(dataset, &segments).await?;
+        crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?;
 
     let output_uuid = Uuid::new_v4();
     let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?;
     // Pure segment consolidation: no dataset scan, so `new_data` is an empty
     // stream and the merge is driven entirely by the source page data.
     let empty_new_data = empty_btree_update_stream(dataset, field_id)?;
-    let segment_refs: Vec<&IndexMetadata> = segments.iter().collect();
     let created_index = open_and_merge_segments(
         dataset,
         &field_path,
diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs
index 75465cc817c..f3a7b637202 100644
--- a/rust/lance/src/index/scalar_logical.rs
+++ b/rust/lance/src/index/scalar_logical.rs
@@ -31,7 +31,17 @@ pub struct LogicalScalarIndex {
 }
 
 impl LogicalScalarIndex {
-    fn try_new(name: String, column: String, segments: Vec<Arc<dyn ScalarIndex>>) -> Result<Self> {
+    /// Merge several already-opened segments of one scalar index into a single
+    /// searchable [`ScalarIndex`].
+    ///
+    /// Used internally by `open_named_scalar_index`, and exposed so a
+    /// distributed query engine can open an explicit subset of a scalar
+    /// index's segments and present them as one index.
+    pub fn try_new(
+        name: String,
+        column: String,
+        segments: Vec<Arc<dyn ScalarIndex>>,
+    ) -> Result<Self> {
         let Some(first) = segments.first() else {
             return Err(Error::invalid_input(format!(
                 "LogicalScalarIndex '{}' on column '{}' must contain at least one segment",
@@ -210,7 +220,14 @@ fn index_intersects_dataset(index: &IndexMetadata, dataset: &Dataset) -> bool {
         .is_some_and(|index_bitmap| index_bitmap.intersection_len(&dataset.fragment_bitmap) > 0)
 }
 
-async fn load_named_scalar_segments(
+/// List the committed, dataset-intersecting segments of a named scalar index.
+///
+/// Returns one [`IndexMetadata`] per usable segment. The result length is the
+/// segment count: `1` means a single (non-segmented) index, `> 1` means the
+/// index is split across multiple segments that a distributed engine may route
+/// to different executors. All returned segments are validated to share the
+/// same underlying index type.
+pub async fn load_named_scalar_segments(
     dataset: &Dataset,
     column: &str,
     index_name: &str,
diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs
index 0eb66ea2ede..af48bc94c41 100644
--- a/rust/lance/src/index/vector.rs
+++ b/rust/lance/src/index/vector.rs
@@ -9,6 +9,7 @@ use std::{any::Any, collections::HashMap};
 
 pub mod builder;
 pub(crate) mod details;
+pub mod hamming;
 pub mod ivf;
 pub mod pq;
 pub mod utils;
diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs
index 579449cc087..1e4fec8c762 100644
--- a/rust/lance/src/index/vector/builder.rs
+++ b/rust/lance/src/index/vector/builder.rs
@@ -1045,7 +1045,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
                 continue;
             }
 
-            let part_storage = existing_index.load_partition_storage(part_id).await?;
+            let part_storage = existing_index.load_partition_storage(part_id, None).await?;
             let mut part_batches = part_storage.to_batches()?.collect::<Vec<_>>();
             // for PQ, the PQ codes are transposed, so we need to transpose them back
             match Q::quantization_type() {
diff --git a/rust/lance/src/index/vector/hamming.rs b/rust/lance/src/index/vector/hamming.rs
new file mode 100644
index 00000000000..ba6ea98c42d
--- /dev/null
+++ b/rust/lance/src/index/vector/hamming.rs
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Hamming distance clustering for IVF_FLAT indices.
+//!
+//! This module provides functionality to perform pairwise hamming distance
+//! computation and clustering on specific partitions of IVF_FLAT indices.
+
+use std::time::Instant;
+
+use arrow_array::RecordBatchReader;
+use arrow_array::cast::AsArray;
+use arrow_array::types::UInt64Type;
+use arrow_schema::DataType;
+use lance_core::{Error, Result};
+use lance_index::metrics::NoOpMetricsCollector;
+use lance_index::vector::VectorIndex;
+use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex};
+use lance_index::vector::flat::storage::FLAT_COLUMN;
+use lance_index::vector::storage::VectorStore;
+use lance_linalg::distance::{
+    ClusteringResult, cluster_pairwise_result, extract_hashes_from_fixed_list,
+    pairwise_hamming_distance_parallel,
+};
+use rand::rng;
+use rand::seq::index::sample;
+
+use crate::dataset::Dataset;
+use crate::index::{DatasetIndexExt, DatasetIndexInternalExt};
+
+use super::ivf::v2::IVFIndex;
+
+/// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index.
+///
+/// This function loads a specific partition from an IVF_FLAT index on a hash column,
+/// computes pairwise hamming distances between all hashes in the partition,
+/// filters by threshold, and clusters the results using union-find.
+///
+/// # Arguments
+///
+/// * `dataset` - The Lance dataset
+/// * `index_name` - Name of the IVF_FLAT index on the hash column
+/// * `partition_id` - The partition ID; must be less than the index's partition count
+/// * `hamming_threshold` - Maximum hamming distance to consider as similar
+///
+/// # Returns
+///
+/// A `RecordBatchReader` (empty when the partition holds no rows) yielding batches with columns:
+/// - `representative`: UInt64 - The representative row ID for each cluster
+/// - `duplicates`: `List<UInt64>` - List of duplicate row IDs in each cluster
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The index doesn't exist, has no fields, or is not an IVF_FLAT index for binary data
+/// - The indexed column has wrong type (must be `FixedSizeList<UInt8, 8>`)
+/// - The partition ID is out of range
+pub async fn hamming_clustering_for_ivf_partition(
+    dataset: &Dataset,
+    index_name: &str,
+    partition_id: usize,
+    hamming_threshold: u32,
+) -> Result<Box<dyn RecordBatchReader + Send>> {
+    // Load index metadata and look the index up by name (its type is checked further down)
+    let indices = dataset.load_indices().await?;
+    let index_meta = indices
+        .iter()
+        .find(|idx| idx.name == index_name)
+        .ok_or_else(|| {
+            Error::invalid_input(format!("Index '{}' not found on dataset", index_name))
+        })?;
+
+    // Resolve the indexed column from the first field id in the index metadata
+    let schema = dataset.schema();
+    let field_id = index_meta
+        .fields
+        .first()
+        .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?;
+    let field = schema.field_by_id(*field_id).ok_or_else(|| {
+        Error::invalid_input(format!(
+            "Field with id {} not found in schema for index '{}'",
+            field_id, index_name
+        ))
+    })?;
+    let column = &field.name;
+
+    // Check column is FixedSizeList<UInt8, 8>
+    let data_type = field.data_type();
+    match data_type {
+        DataType::FixedSizeList(inner, 8) => {
+            if *inner.data_type() != DataType::UInt8 {
+                return Err(Error::invalid_input(format!(
+                    "Column '{}' must be FixedSizeList<UInt8, 8>, got FixedSizeList<{:?}, 8>",
+                    column,
+                    inner.data_type()
+                )));
+            }
+        }
+        _ => {
+            return Err(Error::invalid_input(format!(
+                "Column '{}' must be FixedSizeList<UInt8, 8>, got {:?}",
+                column, data_type
+            )));
+        }
+    }
+
+    // Open the vector index
+    let index = dataset
+        .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector)
+        .await?;
+
+    // Try to downcast to IVFIndex<FlatIndex, FlatBinQuantizer> (IVF_FLAT for binary data)
+    let ivf_index = index
+        .as_any()
+        .downcast_ref::<IVFIndex<FlatIndex, FlatBinQuantizer>>()
+        .ok_or_else(|| {
+            Error::invalid_input(format!(
+                "Index '{}' is not an IVF_FLAT index for binary data",
+                index_name
+            ))
+        })?;
+
+    // Check partition ID is valid
+    let num_partitions = ivf_index.ivf_model().num_partitions();
+    if partition_id >= num_partitions {
+        return Err(Error::invalid_input(format!(
+            "Partition ID {} is out of range (0..{})",
+            partition_id, num_partitions
+        )));
+    }
+
+    // Load the partition storage
+    let storage = ivf_index.load_partition_storage(partition_id, None).await?;
+
+    // Collect the partition's row IDs; an empty partition yields an empty result
+    let row_id_slice: Vec<u64> = storage.row_ids().copied().collect();
+
+    if row_id_slice.is_empty() {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return Ok(empty.into_reader(None));
+    }
+
+    // Get vectors from the storage batches; no batches also yields an empty result
+    let batches: Vec<_> = storage.to_batches()?.collect();
+    if batches.is_empty() {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return Ok(empty.into_reader(None));
+    }
+
+    // Extract the hashes from FLAT_COLUMN of every batch, concatenated in batch order
+    let mut all_hashes = Vec::new();
+    for batch in &batches {
+        let vectors = batch
+            .column_by_name(FLAT_COLUMN)
+            .ok_or_else(|| {
+                Error::invalid_input(format!("Column '{}' not found in storage", FLAT_COLUMN))
+            })?
+            .as_fixed_size_list();
+        let hashes = extract_hashes_from_fixed_list(vectors)?;
+        all_hashes.extend(hashes);
+    }
+
+    // Pairwise distances within the threshold; assumes hashes and row IDs align - TODO confirm
+    let pairwise_result = pairwise_hamming_distance_parallel(
+        &all_hashes,
+        Some(&row_id_slice),
+        Some(hamming_threshold),
+    );
+
+    // Cluster the results
+    let clustering = cluster_pairwise_result(&pairwise_result);
+
+    Ok(clustering.into_reader(None))
+}
+
+/// List the ID and size of every partition of a binary IVF_FLAT index, looked up by name.
+pub async fn get_ivf_partition_info(
+    dataset: &Dataset,
+    index_name: &str,
+) -> Result<Vec<PartitionInfo>> {
+    let indices = dataset.load_indices().await?;
+    let index_meta = indices
+        .iter()
+        .find(|idx| idx.name == index_name)
+        .ok_or_else(|| {
+            Error::invalid_input(format!("Index '{}' not found on dataset", index_name))
+        })?;
+
+    // Resolve the indexed column from the first field id in the index metadata
+    let schema = dataset.schema();
+    let field_id = index_meta
+        .fields
+        .first()
+        .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?;
+    let field = schema.field_by_id(*field_id).ok_or_else(|| {
+        Error::invalid_input(format!(
+            "Field with id {} not found in schema for index '{}'",
+            field_id, index_name
+        ))
+    })?;
+    let column = &field.name;
+
+    let index = dataset
+        .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector)
+        .await?;
+
+    let ivf_index = index
+        .as_any()
+        .downcast_ref::<IVFIndex<FlatIndex, FlatBinQuantizer>>()
+        .ok_or_else(|| {
+            Error::invalid_input(format!(
+                "Index '{}' is not an IVF_FLAT index for binary data",
+                index_name
+            ))
+        })?;
+
+    let num_partitions = ivf_index.ivf_model().num_partitions();
+    let mut partition_infos = Vec::with_capacity(num_partitions);
+
+    for i in 0..num_partitions {
+        partition_infos.push(PartitionInfo {
+            partition_id: i,
+            size: ivf_index.ivf_model().partition_size(i),
+        });
+    }
+
+    Ok(partition_infos)
+}
+
+/// ID and size of a single IVF partition, as produced by [`get_ivf_partition_info`].
+#[derive(Debug, Clone)]
+pub struct PartitionInfo {
+    pub partition_id: usize,
+    pub size: usize,
+}
+
+/// Perform pairwise hamming distance clustering on sampled rows from a dataset.
+///
+/// This function samples N rows randomly from the dataset, extracts hashes,
+/// computes pairwise hamming distances, and clusters the results.
+/// It's useful for benchmarking and testing without requiring an IVF index.
+///
+/// # Arguments
+///
+/// * `dataset` - The Lance dataset
+/// * `column` - Name of the hash column (must be `FixedSizeList<UInt8, 8>`)
+/// * `sample_size` - Number of rows to sample (if None or >= total rows, uses all rows)
+/// * `hamming_threshold` - Maximum hamming distance to consider as similar
+///
+/// # Returns
+///
+/// A `RecordBatchReader` (empty when fewer than two hashes are read) with columns:
+/// - `representative`: UInt64 - The representative row ID for each cluster
+/// - `duplicates`: `List<UInt64>` - List of duplicate row IDs in each cluster
+pub async fn hamming_clustering_for_sample(
+    dataset: &Dataset,
+    column: &str,
+    sample_size: Option<usize>,
+    hamming_threshold: u32,
+) -> Result<Box<dyn RecordBatchReader + Send>> {
+    // Validate column exists and has correct type
+    let schema = dataset.schema();
+    let field = schema.field(column).ok_or_else(|| {
+        Error::invalid_input(format!("Column '{}' not found in dataset schema", column))
+    })?;
+
+    // Check column is FixedSizeList<UInt8, 8>
+    let data_type = field.data_type();
+    match data_type {
+        DataType::FixedSizeList(inner, 8) => {
+            if *inner.data_type() != DataType::UInt8 {
+                return Err(Error::invalid_input(format!(
+                    "Column '{}' must be FixedSizeList<UInt8, 8>, got FixedSizeList<{:?}, 8>",
+                    column,
+                    inner.data_type()
+                )));
+            }
+        }
+        _ => {
+            return Err(Error::invalid_input(format!(
+                "Column '{}' must be FixedSizeList<UInt8, 8>, got {:?}",
+                column, data_type
+            )));
+        }
+    }
+
+    // Total rows = sum of per-fragment `physical_rows`; fragments without it are skipped
+    let total_rows: usize = dataset
+        .get_fragments()
+        .iter()
+        .filter_map(|f| f.metadata().physical_rows)
+        .sum();
+
+    let use_sampling = sample_size.is_some_and(|s| s < total_rows);
+    let effective_sample = sample_size.unwrap_or(total_rows).min(total_rows);
+
+    // Read hashes and row IDs, either for a random sample or for the whole dataset
+    let (hashes, row_ids) = if use_sampling {
+        // take() is positional; NOTE(review): bound uses physical rows - confirm with deletions
+        let indices: Vec<u64> = sample(&mut rng(), total_rows, effective_sample)
+            .iter()
+            .map(|i| i as u64)
+            .collect();
+
+        let batch = dataset
+            .take(
+                &indices,
+                crate::dataset::ProjectionRequest::from_columns(
+                    [column, "_rowid"],
+                    dataset.schema(),
+                ),
+            )
+            .await?;
+
+        let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| {
+            Error::invalid_input("_rowid column not found in take result".to_string())
+        })?;
+        let row_ids = rowid_col.as_primitive::<UInt64Type>();
+        let row_id_vec: Vec<u64> = row_ids.values().to_vec();
+
+        let hash_col = batch.column_by_name(column).ok_or_else(|| {
+            Error::invalid_input(format!("Column '{}' not found in result", column))
+        })?;
+        let hashes_arr = hash_col.as_fixed_size_list();
+        let hashes = extract_hashes_from_fixed_list(hashes_arr)?;
+
+        (hashes, row_id_vec)
+    } else {
+        // Full scan of the column with row IDs, materialized as a single batch
+        let batch = dataset
+            .scan()
+            .project(&[column])?
+            .with_row_id()
+            .try_into_batch()
+            .await?;
+
+        let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| {
+            Error::invalid_input("_rowid column not found in scan result".to_string())
+        })?;
+        let row_ids = rowid_col.as_primitive::<UInt64Type>();
+        let row_id_vec: Vec<u64> = row_ids.values().to_vec();
+
+        let hash_col = batch.column_by_name(column).ok_or_else(|| {
+            Error::invalid_input(format!("Column '{}' not found in result", column))
+        })?;
+        let hashes_arr = hash_col.as_fixed_size_list();
+        let hashes = extract_hashes_from_fixed_list(hashes_arr)?;
+
+        (hashes, row_id_vec)
+    };
+
+    if hashes.len() < 2 {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return Ok(empty.into_reader(None));
+    }
+
+    // Compute pairwise hamming distances
+    let pairwise =
+        pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(hamming_threshold));
+
+    // Cluster edges
+    let clustering = cluster_pairwise_result(&pairwise);
+
+    Ok(clustering.into_reader(None))
+}
+
+/// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment.
+///
+/// This function reads a contiguous range of rows from a specific fragment,
+/// extracts hashes, computes pairwise hamming distances, and clusters the results.
+/// Unlike sampling, this reads sequential rows which is useful for distributed
+/// processing where each worker handles a specific range of a fragment.
+///
+/// # Arguments
+///
+/// * `dataset` - The Lance dataset
+/// * `column` - Name of the hash column (must be `FixedSizeList<UInt8, 8>`)
+/// * `fragment_id` - The fragment ID to read from
+/// * `start_row` - The starting row offset within the fragment
+/// * `num_rows` - Maximum number of rows to read from `start_row` (clamped to the fragment end)
+/// * `hamming_threshold` - Maximum hamming distance to consider as similar
+///
+/// # Returns
+///
+/// A `RecordBatchReader` (empty when fewer than two hashes are read) with columns:
+/// - `representative`: UInt64 - The representative row ID for each cluster
+/// - `duplicates`: `List<UInt64>` - List of duplicate row IDs in each cluster
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The fragment doesn't exist or has no `physical_rows` metadata
+/// - The column has wrong type (must be `FixedSizeList<UInt8, 8>`)
+/// - `start_row` is at or beyond the fragment's physical row count
+pub async fn hamming_clustering_for_range(
+    dataset: &Dataset,
+    column: &str,
+    fragment_id: usize,
+    start_row: usize,
+    num_rows: usize,
+    hamming_threshold: u32,
+) -> Result<Box<dyn RecordBatchReader + Send>> {
+    // Validate column exists and has correct type
+    let schema = dataset.schema();
+    let field = schema.field(column).ok_or_else(|| {
+        Error::invalid_input(format!("Column '{}' not found in dataset schema", column))
+    })?;
+
+    // Check column is FixedSizeList<UInt8, 8>
+    let data_type = field.data_type();
+    match data_type {
+        DataType::FixedSizeList(inner, 8) => {
+            if *inner.data_type() != DataType::UInt8 {
+                return Err(Error::invalid_input(format!(
+                    "Column '{}' must be FixedSizeList<UInt8, 8>, got FixedSizeList<{:?}, 8>",
+                    column,
+                    inner.data_type()
+                )));
+            }
+        }
+        _ => {
+            return Err(Error::invalid_input(format!(
+                "Column '{}' must be FixedSizeList<UInt8, 8>, got {:?}",
+                column, data_type
+            )));
+        }
+    }
+
+    // Get the fragment
+    let fragment = dataset.get_fragment(fragment_id).ok_or_else(|| {
+        Error::invalid_input(format!("Fragment with ID {} not found", fragment_id))
+    })?;
+
+    // Get fragment metadata for physical row count
+    let fragment_meta = fragment.metadata().clone();
+    let physical_rows = fragment_meta
+        .physical_rows
+        .ok_or_else(|| Error::invalid_input("Fragment has no physical_rows metadata"))?;
+
+    // Only `start_row` is validated; an overlong `num_rows` is clamped just below
+    if start_row >= physical_rows {
+        return Err(Error::invalid_input(format!(
+            "start_row {} is out of range for fragment with {} physical rows",
+            start_row, physical_rows
+        )));
+    }
+
+    // Adjust num_rows if it exceeds available rows
+    let effective_num_rows = num_rows.min(physical_rows - start_row);
+
+    if effective_num_rows == 0 {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return Ok(empty.into_reader(None));
+    }
+
+    // Use scanner with the specific fragment and limit/offset
+    let batch = dataset
+        .scan()
+        .with_fragments(vec![fragment_meta])
+        .project(&[column])?
+        .with_row_id()
+        .limit(Some(effective_num_rows as i64), Some(start_row as i64))?
+        .try_into_batch()
+        .await?;
+
+    // Extract row IDs
+    let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| {
+        Error::invalid_input("_rowid column not found in scan result".to_string())
+    })?;
+    let row_ids = rowid_col.as_primitive::<UInt64Type>();
+    let row_id_vec: Vec<u64> = row_ids.values().to_vec();
+
+    // Extract hashes
+    let hash_col = batch
+        .column_by_name(column)
+        .ok_or_else(|| Error::invalid_input(format!("Column '{}' not found in result", column)))?;
+    let hashes_arr = hash_col.as_fixed_size_list();
+    let hashes = extract_hashes_from_fixed_list(hashes_arr)?;
+
+    if hashes.len() < 2 {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return Ok(empty.into_reader(None));
+    }
+
+    // Compute pairwise hamming distances
+    let pairwise =
+        pairwise_hamming_distance_parallel(&hashes, Some(&row_id_vec), Some(hamming_threshold));
+
+    // Cluster edges
+    let clustering = cluster_pairwise_result(&pairwise);
+
+    Ok(clustering.into_reader(None))
+}
+
+/// Perform pairwise hamming distance clustering on provided hashes (no I/O).
+///
+/// This is useful for benchmarking the pure compute performance without I/O.
+/// Logs timing, throughput and cluster counts via `tracing::info!`.
+///
+/// # Arguments
+///
+/// * `hashes` - Vector of 64-bit hash values
+/// * `row_ids` - Optional row IDs (defaults to indices if None)
+/// * `hamming_threshold` - Maximum hamming distance to consider as similar
+///
+/// # Returns
+///
+/// A `RecordBatchReader` (empty when fewer than two hashes are given) with columns:
+/// - `representative`: UInt64 - The representative row ID for each cluster
+/// - `duplicates`: `List<UInt64>` - List of duplicate row IDs in each cluster
+pub fn hamming_clustering_from_hashes(
+    hashes: &[u64],
+    row_ids: Option<&[u64]>,
+    hamming_threshold: u32,
+) -> Box<dyn RecordBatchReader + Send> {
+    let num_rows = hashes.len();
+    if num_rows < 2 {
+        let empty = ClusteringResult {
+            clusters: Vec::new(),
+        };
+        return empty.into_reader(None);
+    }
+
+    let total_pairs = (num_rows as u64) * (num_rows as u64 - 1) / 2;
+
+    // Compute pairwise hamming distances
+    let t_compute_start = Instant::now();
+    let pairwise = pairwise_hamming_distance_parallel(hashes, row_ids, Some(hamming_threshold));
+    let compute_time = t_compute_start.elapsed();
+
+    // Cluster edges
+    let t_cluster_start = Instant::now();
+    let clustering = cluster_pairwise_result(&pairwise);
+    let cluster_time = t_cluster_start.elapsed();
+
+    // Throughput in pairs/second; reported as 0 when the measured compute time is zero
+    let pairs_per_sec = if compute_time.as_secs_f64() > 0.0 {
+        total_pairs as f64 / compute_time.as_secs_f64()
+    } else {
+        0.0
+    };
+    tracing::info!(
+        num_rows,
+        total_pairs,
+        edges = pairwise.len(),
+        compute_time_ms = compute_time.as_millis(),
+        cluster_time_ms = cluster_time.as_millis(),
+        pairs_per_sec_millions = pairs_per_sec / 1_000_000.0,
+        num_clusters = clustering.num_clusters(),
+        num_duplicates = clustering.num_duplicates(),
+        "Hamming clustering completed"
+    );
+
+    clustering.into_reader(None)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_array::Array;
+
+    /// Drain a reader into `(representative, duplicates)` pairs, one per result row.
+    fn collect_clusters(reader: Box<dyn RecordBatchReader + Send>) -> Vec<(u64, Vec<u64>)> {
+        let mut clusters = Vec::new();
+        for batch in reader {
+            let batch = batch.unwrap();
+            let reps = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<arrow_array::UInt64Array>()
+                .unwrap();
+            let dups = batch
+                .column(1)
+                .as_any()
+                .downcast_ref::<arrow_array::ListArray>()
+                .unwrap();
+
+            for i in 0..batch.num_rows() {
+                let rep = reps.value(i);
+                let dup_arr = dups.value(i);
+                let dup_values = dup_arr
+                    .as_any()
+                    .downcast_ref::<arrow_array::UInt64Array>()
+                    .unwrap();
+                let duplicates: Vec<u64> = dup_values.values().to_vec();
+                clusters.push((rep, duplicates));
+            }
+        }
+        clusters
+    }
+
+    #[test]
+    fn test_hamming_clustering_from_hashes_basic() {
+        // Create some test hashes with known distances
+        let hashes = vec![
+            0b0000u64, // hash 0
+            0b0001u64, // hash 1 - distance 1 from hash 0
+            0b0011u64, // hash 2 - distance 1 from hash 1, distance 2 from hash 0
+            0b1111u64, // hash 3 - distance 2 from hash 2, distance 4 from hash 0
+        ];
+
+        let reader = hamming_clustering_from_hashes(&hashes, None, 1);
+        let clusters = collect_clusters(reader);
+
+        // With threshold 1, pairs (0,1) and (1,2) should be connected
+        // This forms one cluster: {0, 1, 2}
+        assert_eq!(clusters.len(), 1);
+        assert_eq!(clusters[0].1.len(), 2); // 2 duplicates in the cluster
+    }
+
+    #[test]
+    fn test_hamming_clustering_from_hashes_no_clusters() {
+        // All hashes are far apart
+        let hashes = vec![
+            0x0000000000000000u64,
+            0xFFFFFFFFFFFFFFFFu64,
+            0xAAAAAAAAAAAAAAAAu64,
+        ];
+
+        let reader = hamming_clustering_from_hashes(&hashes, None, 5);
+        let clusters = collect_clusters(reader);
+
+        // With threshold 5, no pairs should be connected (min distance is 32)
+        assert_eq!(clusters.len(), 0);
+    }
+
+    #[test]
+    fn test_hamming_clustering_from_hashes_with_row_ids() {
+        let hashes = vec![0b0000u64, 0b0001u64];
+        let row_ids = vec![100u64, 200u64];
+
+        let reader = hamming_clustering_from_hashes(&hashes, Some(&row_ids), 1);
+        let clusters = collect_clusters(reader);
+
+        assert_eq!(clusters.len(), 1);
+        assert_eq!(clusters[0].0, 100); // representative
+        assert_eq!(clusters[0].1, vec![200]); // duplicates
+    }
+
+    #[tokio::test]
+    async fn test_hamming_clustering_for_ivf_partition() {
+        use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array};
+        use arrow_schema::{Field, Schema};
+        use lance_arrow::FixedSizeListArrayExt;
+        use lance_index::vector::ivf::IvfBuildParams;
+        use std::sync::Arc;
+        use tempfile::tempdir;
+
+        // Create test data with hash column (FixedSizeList<UInt8, 8>)
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "hash",
+            arrow_schema::DataType::FixedSizeList(
+                Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)),
+                8,
+            ),
+            false,
+        )]));
+
+        // Generate hashes with some duplicates (similar hashes)
+        let num_rows = 100;
+        let mut hash_bytes = Vec::with_capacity(num_rows * 8);
+        for i in 0..num_rows {
+            // Create groups of similar hashes
+            let base = (i / 10) as u64; // 10 groups
+            let variation = (i % 10) as u64;
+            let hash = base.wrapping_mul(0x123456789) ^ variation;
+            hash_bytes.extend_from_slice(&hash.to_le_bytes());
+        }
+        let values = UInt8Array::from(hash_bytes);
+        let hash_array =
+            FixedSizeListArray::try_new_from_values(values, 8).expect("create hash array");
+
+        let batch =
+            arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap();
+
+        // Write dataset
+        let temp_dir = tempdir().unwrap();
+        let uri = temp_dir.path().to_str().unwrap();
+
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        let mut dataset = crate::Dataset::write(reader, uri, None).await.unwrap();
+
+        // Create IVF_FLAT index with 4 partitions
+        let ivf_params = IvfBuildParams::new(4);
+        let params = crate::index::vector::VectorIndexParams::with_ivf_flat_params(
+            lance_linalg::distance::MetricType::Hamming,
+            ivf_params,
+        );
+
+        dataset
+            .create_index(
+                &["hash"],
+                crate::index::IndexType::Vector,
+                None,
+                &params,
+                false,
+            )
+            .await
+            .unwrap();
+
+        // Load and test
+        let dataset = crate::Dataset::open(uri).await.unwrap();
+        let indices = dataset.load_indices().await.unwrap();
+        let index_name = &indices[0].name;
+
+        // Test clustering on partition 0
+        let reader = hamming_clustering_for_ivf_partition(&dataset, index_name, 0, 10)
+            .await
+            .unwrap();
+        let clusters = collect_clusters(reader);
+
+        // Verify we get valid results (may or may not have clusters depending on data distribution)
+        // At minimum, verify no panics and that the column types match what the helper expects
+        for (rep, dups) in &clusters {
+            assert!(*rep < num_rows as u64 * 10); // row IDs should be reasonable
+            for dup in dups {
+                assert!(*dup < num_rows as u64 * 10);
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_hamming_clustering_for_ivf_partition_invalid_index() {
+        use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array};
+        use arrow_schema::{Field, Schema};
+        use lance_arrow::FixedSizeListArrayExt;
+        use std::sync::Arc;
+        use tempfile::tempdir;
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "hash",
+            arrow_schema::DataType::FixedSizeList(
+                Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)),
+                8,
+            ),
+            false,
+        )]));
+
+        let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes
+        let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+        let batch =
+            arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap();
+
+        let temp_dir = tempdir().unwrap();
+        let uri = temp_dir.path().to_str().unwrap();
+
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        let dataset = crate::Dataset::write(reader, uri, None).await.unwrap();
+
+        // Test with non-existent index
+        let result = hamming_clustering_for_ivf_partition(&dataset, "nonexistent", 0, 10).await;
+        assert!(result.is_err());
+        let err = result.err().unwrap();
+        assert!(err.to_string().contains("not found"), "Error: {}", err);
+    }
+
+    #[tokio::test]
+    async fn test_hamming_clustering_for_sample_integration() {
+        use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array};
+        use arrow_schema::{Field, Schema};
+        use lance_arrow::FixedSizeListArrayExt;
+        use std::sync::Arc;
+        use tempfile::tempdir;
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "hash",
+            arrow_schema::DataType::FixedSizeList(
+                Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)),
+                8,
+            ),
+            false,
+        )]));
+
+        // Create 50 rows with some duplicate hashes
+        let num_rows = 50;
+        let mut hash_bytes = Vec::with_capacity(num_rows * 8);
+        for i in 0..num_rows {
+            // Create some identical hashes (groups of 5)
+            let hash = (i / 5) as u64;
+            hash_bytes.extend_from_slice(&hash.to_le_bytes());
+        }
+        let values = UInt8Array::from(hash_bytes);
+        let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+        let batch =
+            arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap();
+
+        let temp_dir = tempdir().unwrap();
+        let uri = temp_dir.path().to_str().unwrap();
+
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        crate::Dataset::write(reader, uri, None).await.unwrap();
+
+        let dataset = crate::Dataset::open(uri).await.unwrap();
+
+        // Test full scan (no sampling)
+        let reader = hamming_clustering_for_sample(&dataset, "hash", None, 0)
+            .await
+            .unwrap();
+        let clusters = collect_clusters(reader);
+
+        // With threshold 0 (exact match) and groups of 5 identical hashes,
+        // we should have 10 clusters with 4 duplicates each
+        assert_eq!(clusters.len(), 10);
+        for (_, dups) in &clusters {
+            assert_eq!(dups.len(), 4);
+        }
+
+        // Test with sampling
+        let reader = hamming_clustering_for_sample(&dataset, "hash", Some(20), 0)
+            .await
+            .unwrap();
+        let clusters = collect_clusters(reader);
+        // With sampling, we may get fewer clusters
+        assert!(clusters.len() <= 10);
+    }
+
+    #[tokio::test]
+    async fn test_hamming_clustering_for_range_integration() {
+        use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array};
+        use arrow_schema::{Field, Schema};
+        use lance_arrow::FixedSizeListArrayExt;
+        use std::sync::Arc;
+        use tempfile::tempdir;
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "hash",
+            arrow_schema::DataType::FixedSizeList(
+                Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)),
+                8,
+            ),
+            false,
+        )]));
+
+        // Create 50 rows with some duplicate hashes (groups of 5 identical hashes)
+        let num_rows = 50;
+        let mut hash_bytes = Vec::with_capacity(num_rows * 8);
+        for i in 0..num_rows {
+            let hash = (i / 5) as u64;
+            hash_bytes.extend_from_slice(&hash.to_le_bytes());
+        }
+        let values = UInt8Array::from(hash_bytes);
+        let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+        let batch =
+            arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap();
+
+        let temp_dir = tempdir().unwrap();
+        let uri = temp_dir.path().to_str().unwrap();
+
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        crate::Dataset::write(reader, uri, None).await.unwrap();
+
+        let dataset = crate::Dataset::open(uri).await.unwrap();
+
+        // Get fragment info
+        let fragments = dataset.get_fragments();
+        assert_eq!(fragments.len(), 1);
+        let fragment_id = fragments[0].id() as usize;
+
+        // Test reading range from the fragment
+        // Reading 25 rows from offset 0 covers groups 0-4 (5 groups, each with 5 rows)
+        let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 0, 25, 0)
+            .await
+            .unwrap();
+        let clusters = collect_clusters(reader);
+
+        // With threshold 0 and 25 rows (groups 0-4), we should have 5 clusters
+        // Each cluster has 4 duplicates (5 identical hashes - 1 representative = 4 duplicates)
+        assert_eq!(clusters.len(), 5);
+        for (_, dups) in &clusters {
+            assert_eq!(dups.len(), 4);
+        }
+
+        // Test reading the second half (25 rows starting at offset 25)
+        let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 25, 25, 0)
+            .await
+            .unwrap();
+        let clusters = collect_clusters(reader);
+
+        // Should have 5 clusters (groups 5-9)
+        assert_eq!(clusters.len(), 5);
+        for (_, dups) in &clusters {
+            assert_eq!(dups.len(), 4);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_hamming_clustering_for_range_invalid_fragment() {
+        use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array};
+        use arrow_schema::{Field, Schema};
+        use lance_arrow::FixedSizeListArrayExt;
+        use std::sync::Arc;
+        use tempfile::tempdir;
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "hash",
+            arrow_schema::DataType::FixedSizeList(
+                Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)),
+                8,
+            ),
+            false,
+        )]));
+
+        let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes
+        let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap();
+        let batch =
+            arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap();
+
+        let temp_dir = tempdir().unwrap();
+        let uri = temp_dir.path().to_str().unwrap();
+
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
+        crate::Dataset::write(reader, uri, None).await.unwrap();
+
+        let dataset = crate::Dataset::open(uri).await.unwrap();
+
+        // Test with non-existent fragment
+        let result = hamming_clustering_for_range(&dataset, "hash", 999, 0, 10, 0).await;
+        assert!(result.is_err());
+        let err = result.err().unwrap();
+        assert!(err.to_string().contains("not found"), "Error: {}", err);
+
+        // Test with out-of-range start_row
+        let result = hamming_clustering_for_range(&dataset, "hash", 0, 1000, 10, 0).await;
+        assert!(result.is_err());
+        let err = result.err().unwrap();
+        assert!(err.to_string().contains("out of range"), "Error: {}", err);
+    }
+}
diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs
index 579990fc03b..fb01339ead9 100644
--- a/rust/lance/src/index/vector/ivf.rs
+++ b/rust/lance/src/index/vector/ivf.rs
@@ -6266,7 +6266,7 @@ mod tests {
         );
 
         // PQ code is on residual space
-        let pq_store = ivf_idx.load_partition_storage(0).await.unwrap();
+        let pq_store = ivf_idx.load_partition_storage(0, None).await.unwrap();
         pq_store
             .codebook()
             .values()
diff --git a/rust/lance/src/index/vector/ivf/partition_serde.rs b/rust/lance/src/index/vector/ivf/partition_serde.rs
index 83ced18c598..ad737620a94 100644
--- a/rust/lance/src/index/vector/ivf/partition_serde.rs
+++ b/rust/lance/src/index/vector/ivf/partition_serde.rs
@@ -3,32 +3,17 @@
 
 //! Serialization and zero-copy deserialization for IVF partition cache entries.
 //!
-//! The format is:
-//!
-//! ```text
-//! [header_len: u64 LE]
-//! [header: JSON bytes]
-//! [sub_index Arrow IPC stream]
-//! [... quantizer-specific IPC streams ...]
-//! [storage Arrow IPC stream]
-//! ```
-//!
-//! Each IPC section is a self-delimiting Arrow IPC stream (schema + batches + EOS
-//! marker), written directly to the underlying writer without buffering. On
-//! deserialization, each message is read into a per-message buffer and zero-copy
-//! decoded via [`lance_arrow::ipc`].
+//! Each entry is a protobuf header (see `lance-index/protos-cache/cache.proto`; the
+//! distance, rotation, and RabitQ query-estimator types travel as proto enums)
+//! followed by 64-byte-aligned Arrow IPC sections in a fixed, version-keyed order:
+//! the sub-index, then any quantizer-specific arrays (PQ codebook, RabitQ Matrix rotation),
+//! then the quantizer storage batches. Sections decode zero-copy via [`lance_arrow::ipc`].
 
-use std::io::Write;
 use std::sync::Arc;
 
 use arrow_array::{FixedSizeListArray, RecordBatch};
 use arrow_schema::{DataType, Field, Schema};
-use bytes::Bytes;
-use lance_arrow::ipc::{
-    read_ipc_stream_at, read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream,
-    write_ipc_stream_batches, write_len_prefixed_bytes,
-};
-use lance_core::cache::CacheCodecImpl;
+use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter};
 use lance_core::{Error, Result};
 use lance_index::vector::bq::RQRotationType;
 use lance_index::vector::bq::builder::RabitQuantizer;
@@ -38,11 +23,15 @@ use lance_index::vector::pq::ProductQuantizer;
 use lance_index::vector::pq::storage::ProductQuantizationMetadata;
 use lance_index::vector::quantizer::{Quantization, QuantizerStorage};
 use lance_index::vector::sq::ScalarQuantizer;
-use lance_index::vector::sq::storage::ScalarQuantizationMetadata;
 use lance_index::vector::storage::VectorStore;
 use lance_index::vector::v3::subindex::IvfSubIndex;
 use lance_linalg::distance::DistanceType;
-use serde::{Deserialize, Serialize};
+
+use lance_index::cache_pb::{
+    DistanceType as PbDistanceType, FlatPartitionHeader, PqPartitionHeader, RabitPartitionHeader,
+    RabitQueryEstimator as PbRabitQueryEstimator, RotationType as PbRotationType,
+    SqPartitionHeader,
+};
 
 use super::v2::PartitionEntry;
 
@@ -68,7 +57,7 @@ type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
 
 fn serialize_partition_entry<S, Concrete>(
     any: &ArcAny,
-    writer: &mut dyn Write,
+    writer: &mut CacheEntryWriter<'_>,
 ) -> lance_core::Result<()>
 where
     S: IvfSubIndex + 'static,
@@ -81,14 +70,16 @@ where
     concrete.serialize(writer)
 }
 
-fn deserialize_partition_entry<S, Q, Concrete>(data: &Bytes) -> lance_core::Result<ArcAny>
+fn deserialize_partition_entry<S, Q, Concrete>(
+    reader: &mut CacheEntryReader<'_>,
+) -> lance_core::Result<ArcAny>
 where
     S: IvfSubIndex + 'static,
     Q: Quantization + 'static,
     Concrete: Quantization + 'static,
     PartitionEntry<S, Concrete>: CacheCodecImpl,
 {
-    let concrete = PartitionEntry::<S, Concrete>::deserialize(data)?;
+    let concrete = PartitionEntry::<S, Concrete>::deserialize(reader)?;
     let any: ArcAny = Arc::new(concrete);
     Ok(any
         .downcast::<PartitionEntry<S, Q>>()
@@ -109,6 +100,8 @@ where
     PartitionEntry<S, Concrete>: CacheCodecImpl,
 {
     lance_core::cache::CacheCodec::new(
+        <PartitionEntry<S, Concrete> as CacheCodecImpl>::TYPE_ID,
+        <PartitionEntry<S, Concrete> as CacheCodecImpl>::CURRENT_VERSION,
         serialize_partition_entry::<S, Concrete>,
         deserialize_partition_entry::<S, Q, Concrete>,
     )
@@ -118,51 +111,64 @@ where
 // Common helpers
 // ---------------------------------------------------------------------------
 
-fn distance_type_to_u8(dt: DistanceType) -> u8 {
+// Distance, rotation, and query-estimator discriminants travel as proto enums
+// in the header; the helpers below map them to/from the in-memory Rust enums.
+
+fn distance_type_to_proto(dt: DistanceType) -> PbDistanceType {
+    match dt {
+        DistanceType::L2 => PbDistanceType::L2,
+        DistanceType::Cosine => PbDistanceType::Cosine,
+        DistanceType::Dot => PbDistanceType::Dot,
+        DistanceType::Hamming => PbDistanceType::Hamming,
+    }
+}
+
+fn proto_to_distance_type(dt: PbDistanceType) -> DistanceType {
     match dt {
-        DistanceType::L2 => 0,
-        DistanceType::Cosine => 1,
-        DistanceType::Dot => 2,
-        DistanceType::Hamming => 3,
+        PbDistanceType::L2 => DistanceType::L2,
+        PbDistanceType::Cosine => DistanceType::Cosine,
+        PbDistanceType::Dot => DistanceType::Dot,
+        PbDistanceType::Hamming => DistanceType::Hamming,
     }
 }
 
-fn u8_to_distance_type(v: u8) -> Result<DistanceType> {
-    match v {
-        0 => Ok(DistanceType::L2),
-        1 => Ok(DistanceType::Cosine),
-        2 => Ok(DistanceType::Dot),
-        3 => Ok(DistanceType::Hamming),
-        _ => Err(Error::io(format!("unknown distance type: {v}"))),
+fn rotation_type_to_proto(rt: RQRotationType) -> PbRotationType {
+    match rt {
+        RQRotationType::Matrix => PbRotationType::Matrix,
+        RQRotationType::Fast => PbRotationType::Fast,
     }
 }
 
-fn rotation_type_to_u8(rt: RQRotationType) -> u8 {
+fn proto_to_rotation_type(rt: PbRotationType) -> RQRotationType {
     match rt {
-        RQRotationType::Matrix => 0,
-        RQRotationType::Fast => 1,
+        PbRotationType::Matrix => RQRotationType::Matrix,
+        PbRotationType::Fast => RQRotationType::Fast,
     }
 }
 
-fn u8_to_rotation_type(v: u8) -> Result<RQRotationType> {
-    match v {
-        0 => Ok(RQRotationType::Matrix),
-        1 => Ok(RQRotationType::Fast),
-        _ => Err(Error::io(format!("unknown rotation type: {v}"))),
+fn query_estimator_to_proto(qe: RabitQueryEstimator) -> PbRabitQueryEstimator {
+    match qe {
+        RabitQueryEstimator::ResidualQuery => PbRabitQueryEstimator::ResidualQuery,
+        RabitQueryEstimator::RawQuery => PbRabitQueryEstimator::RawQuery,
     }
 }
 
-/// Write a JSON-serializable header using [`write_len_prefixed_bytes`].
-fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> {
-    let header_json = serde_json::to_vec(header)?;
-    write_len_prefixed_bytes(writer, &header_json)?;
-    Ok(())
+fn proto_to_query_estimator(qe: PbRabitQueryEstimator) -> RabitQueryEstimator {
+    match qe {
+        PbRabitQueryEstimator::ResidualQuery => RabitQueryEstimator::ResidualQuery,
+        PbRabitQueryEstimator::RawQuery => RabitQueryEstimator::RawQuery,
+    }
 }
 
-/// Read a JSON header written by [`write_json_header`].
-fn read_json_header<T: serde::de::DeserializeOwned>(data: &Bytes, offset: &mut usize) -> Result<T> {
-    let bytes = read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?;
-    serde_json::from_slice(&bytes).map_err(|e| Error::io(e.to_string()))
+/// Read a storage section expected to hold exactly one batch.
+fn read_single_storage_batch(r: &mut CacheEntryReader<'_>) -> Result<RecordBatch> {
+    let mut batches = r.read_ipc_batches()?;
+    match batches.len() {
+        1 => Ok(batches.remove(0)),
+        n => Err(Error::io(format!(
+            "expected exactly 1 storage batch, got {n}"
+        ))),
+    }
 }
 
 /// Wrap a `FixedSizeListArray` in a single-column `RecordBatch` with the given
@@ -202,17 +208,11 @@ fn batch_to_codebook(batch: &RecordBatch) -> Result<FixedSizeListArray> {
 // PQ
 // ---------------------------------------------------------------------------
 
-#[derive(Serialize, Deserialize)]
-struct PqPartitionHeader {
-    distance_type: u8,
-    nbits: u32,
-    num_sub_vectors: usize,
-    dimension: usize,
-    transposed: bool,
-}
-
 impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ProductQuantizer> {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
+    const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.PQ";
+    const CURRENT_VERSION: u32 = 1;
+
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         let metadata = self.storage.metadata();
         let distance_type = self.storage.distance_type();
 
@@ -221,32 +221,28 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ProductQuantizer> {
         })?;
 
         let header = PqPartitionHeader {
-            distance_type: distance_type_to_u8(distance_type),
+            distance_type: distance_type_to_proto(distance_type) as i32,
             nbits: metadata.nbits,
-            num_sub_vectors: metadata.num_sub_vectors,
-            dimension: metadata.dimension,
+            num_sub_vectors: metadata.num_sub_vectors as u64,
+            dimension: metadata.dimension as u64,
             transposed: metadata.transposed,
         };
 
-        write_json_header(writer, &header)?;
-        write_ipc_stream(&self.index.to_batch()?, writer)?;
-        write_ipc_stream(&codebook_to_batch(codebook)?, writer)?;
-        write_ipc_stream_batches(self.storage.to_batches()?, writer)?;
+        w.write_header(&header)?;
+        w.write_ipc(&self.index.to_batch()?)?;
+        w.write_ipc(&codebook_to_batch(codebook)?)?;
+        w.write_ipc_batches(self.storage.to_batches()?)?;
 
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let header: PqPartitionHeader = read_json_header(data, &mut offset)?;
-        let distance_type = u8_to_distance_type(header.distance_type)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: PqPartitionHeader = r.read_header()?;
+        let distance_type = proto_to_distance_type(header.distance_type());
 
-        let sub_index_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
-        let codebook_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
-        let storage_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let sub_index_batch = r.read_ipc()?;
+        let codebook_batch = r.read_ipc()?;
+        let storage_batch = read_single_storage_batch(r)?;
 
         let index = S::load(sub_index_batch)?;
         let codebook = batch_to_codebook(&codebook_batch)?;
@@ -254,8 +250,8 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ProductQuantizer> {
         let metadata = ProductQuantizationMetadata {
             codebook_position: 0,
             nbits: header.nbits,
-            num_sub_vectors: header.num_sub_vectors,
-            dimension: header.dimension,
+            num_sub_vectors: header.num_sub_vectors as usize,
+            dimension: header.dimension as usize,
             codebook: Some(codebook),
             codebook_tensor: Vec::new(),
             transposed: header.transposed,
@@ -276,41 +272,35 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ProductQuantizer> {
 // Flat (Float32)
 // ---------------------------------------------------------------------------
 
-#[derive(Serialize, Deserialize)]
-struct FlatPartitionHeader {
-    distance_type: u8,
-    dim: usize,
-}
-
 impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, FlatQuantizer> {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        let metadata = self.storage.metadata();
-        let distance_type = self.storage.distance_type();
+    const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Flat";
+    const CURRENT_VERSION: u32 = 1;
 
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let metadata = self.storage.metadata();
         let header = FlatPartitionHeader {
-            distance_type: distance_type_to_u8(distance_type),
-            dim: metadata.dim,
+            distance_type: distance_type_to_proto(self.storage.distance_type()) as i32,
+            dim: metadata.dim as u64,
         };
 
-        write_json_header(writer, &header)?;
-        write_ipc_stream(&self.index.to_batch()?, writer)?;
-        write_ipc_stream_batches(self.storage.to_batches()?, writer)?;
+        w.write_header(&header)?;
+        w.write_ipc(&self.index.to_batch()?)?;
+        w.write_ipc_batches(self.storage.to_batches()?)?;
 
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let header: FlatPartitionHeader = read_json_header(data, &mut offset)?;
-        let distance_type = u8_to_distance_type(header.distance_type)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: FlatPartitionHeader = r.read_header()?;
+        let distance_type = proto_to_distance_type(header.distance_type());
 
-        let sub_index_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
-        let storage_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let sub_index_batch = r.read_ipc()?;
+        let storage_batch = read_single_storage_batch(r)?;
 
         let index = S::load(sub_index_batch)?;
-        let metadata = FlatMetadata { dim: header.dim };
+        let metadata = FlatMetadata {
+            dim: header.dim as usize,
+        };
         let storage = <FlatQuantizer as Quantization>::Storage::try_from_batch(
             storage_batch,
             &metadata,
@@ -327,34 +317,34 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, FlatQuantizer> {
 // ---------------------------------------------------------------------------
 
 impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, FlatBinQuantizer> {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        let metadata = self.storage.metadata();
-        let distance_type = self.storage.distance_type();
+    const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.FlatBin";
+    const CURRENT_VERSION: u32 = 1;
 
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let metadata = self.storage.metadata();
         let header = FlatPartitionHeader {
-            distance_type: distance_type_to_u8(distance_type),
-            dim: metadata.dim,
+            distance_type: distance_type_to_proto(self.storage.distance_type()) as i32,
+            dim: metadata.dim as u64,
         };
 
-        write_json_header(writer, &header)?;
-        write_ipc_stream(&self.index.to_batch()?, writer)?;
-        write_ipc_stream_batches(self.storage.to_batches()?, writer)?;
+        w.write_header(&header)?;
+        w.write_ipc(&self.index.to_batch()?)?;
+        w.write_ipc_batches(self.storage.to_batches()?)?;
 
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let header: FlatPartitionHeader = read_json_header(data, &mut offset)?;
-        let distance_type = u8_to_distance_type(header.distance_type)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: FlatPartitionHeader = r.read_header()?;
+        let distance_type = proto_to_distance_type(header.distance_type());
 
-        let sub_index_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
-        let storage_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let sub_index_batch = r.read_ipc()?;
+        let storage_batch = read_single_storage_batch(r)?;
 
         let index = S::load(sub_index_batch)?;
-        let metadata = FlatMetadata { dim: header.dim };
+        let metadata = FlatMetadata {
+            dim: header.dim as usize,
+        };
         let storage = <FlatBinQuantizer as Quantization>::Storage::try_from_batch(
             storage_batch,
             &metadata,
@@ -370,56 +360,41 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, FlatBinQuantizer> {
 // SQ
 // ---------------------------------------------------------------------------
 
-#[derive(Serialize, Deserialize)]
-struct SqPartitionHeader {
-    distance_type: u8,
-    num_bits: u16,
-    dim: usize,
-    bounds_start: f64,
-    bounds_end: f64,
-}
-
 impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ScalarQuantizer> {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        let metadata = self.storage.metadata();
-        let distance_type = self.storage.distance_type();
+    const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.SQ";
+    const CURRENT_VERSION: u32 = 1;
 
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let metadata = self.storage.metadata();
         let header = SqPartitionHeader {
-            distance_type: distance_type_to_u8(distance_type),
-            num_bits: metadata.num_bits,
-            dim: metadata.dim,
+            distance_type: distance_type_to_proto(self.storage.distance_type()) as i32,
+            num_bits: metadata.num_bits as u32,
+            dim: metadata.dim as u64,
             bounds_start: metadata.bounds.start,
             bounds_end: metadata.bounds.end,
         };
 
-        write_json_header(writer, &header)?;
-        write_ipc_stream(&self.index.to_batch()?, writer)?;
-        // SQ storage may contain multiple batches; stream them all in one IPC stream.
-        write_ipc_stream_batches(self.storage.to_batches()?, writer)?;
+        w.write_header(&header)?;
+        w.write_ipc(&self.index.to_batch()?)?;
+        // SQ storage may contain multiple batches; write them all in one section.
+        w.write_ipc_batches(self.storage.to_batches()?)?;
 
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let header: SqPartitionHeader = read_json_header(data, &mut offset)?;
-        let distance_type = u8_to_distance_type(header.distance_type)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: SqPartitionHeader = r.read_header()?;
+        let distance_type = proto_to_distance_type(header.distance_type());
 
-        let sub_index_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
-        let storage_batches =
-            read_ipc_stream_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let sub_index_batch = r.read_ipc()?;
+        let storage_batches = r.read_ipc_batches()?;
 
         let index = S::load(sub_index_batch)?;
-        let metadata = ScalarQuantizationMetadata {
-            dim: header.dim,
-            num_bits: header.num_bits,
-            bounds: header.bounds_start..header.bounds_end,
-        };
+        let num_bits = header.num_bits as u16;
         let storage = <ScalarQuantizer as Quantization>::Storage::try_new(
-            metadata.num_bits,
+            num_bits,
             distance_type,
-            metadata.bounds,
+            header.bounds_start..header.bounds_end,
             storage_batches,
             None,
         )?;
@@ -432,88 +407,69 @@ impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, ScalarQuantizer> {
 // RabitQ
 // ---------------------------------------------------------------------------
 
-#[derive(Serialize, Deserialize)]
-struct RabitPartitionHeader {
-    distance_type: u8,
-    num_bits: u8,
-    code_dim: u32,
-    #[serde(default = "default_rabit_query_estimator")]
-    query_estimator: RabitQueryEstimator,
-    /// 0 = Matrix, 1 = Fast
-    rotation_type: u8,
-    /// Fast rotation signs (only set when rotation_type == Fast).
-    fast_rotation_signs: Option<Vec<u8>>,
-}
-
-fn default_rabit_query_estimator() -> RabitQueryEstimator {
-    RabitQueryEstimator::ResidualQuery
-}
-
 impl<S: IvfSubIndex> CacheCodecImpl for PartitionEntry<S, RabitQuantizer> {
-    fn serialize(&self, writer: &mut dyn Write) -> Result<()> {
-        let metadata = self.storage.metadata();
-        let distance_type = self.storage.distance_type();
+    const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Rabit";
+    const CURRENT_VERSION: u32 = 1;
 
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        let metadata = self.storage.metadata();
         let header = RabitPartitionHeader {
-            distance_type: distance_type_to_u8(distance_type),
-            num_bits: metadata.num_bits,
+            distance_type: distance_type_to_proto(self.storage.distance_type()) as i32,
+            num_bits: metadata.num_bits as u32,
             code_dim: metadata.code_dim,
-            query_estimator: metadata.query_estimator,
-            rotation_type: rotation_type_to_u8(metadata.rotation_type),
+            rotation_type: rotation_type_to_proto(metadata.rotation_type) as i32,
+            query_estimator: query_estimator_to_proto(metadata.query_estimator) as i32,
             fast_rotation_signs: metadata.fast_rotation_signs.clone(),
         };
 
-        write_json_header(writer, &header)?;
+        w.write_header(&header)?;
+        w.write_ipc(&self.index.to_batch()?)?;
 
-        write_ipc_stream(&self.index.to_batch()?, writer)?;
-
-        // Write the rotation matrix IPC stream only for Matrix rotation; the
-        // Fast rotation case stores its signs compactly in the JSON header.
+        // Write the rotation matrix IPC section only for Matrix rotation; the
+        // Fast rotation case stores its signs compactly in the proto header.
         if metadata.rotation_type == RQRotationType::Matrix {
             let mat = metadata.rotate_mat.as_ref().ok_or_else(|| {
                 Error::io(
                     "RabitQ Matrix metadata missing rotate_mat during serialization".to_string(),
                 )
             })?;
-            write_ipc_stream(&fsl_to_batch(mat, "rotate_mat")?, writer)?;
+            w.write_ipc(&fsl_to_batch(mat, "rotate_mat")?)?;
         }
 
-        write_ipc_stream_batches(self.storage.to_batches()?, writer)?;
+        w.write_ipc_batches(self.storage.to_batches()?)?;
 
         Ok(())
     }
 
-    fn deserialize(data: &Bytes) -> Result<Self> {
-        let mut offset = 0;
-        let header: RabitPartitionHeader = read_json_header(data, &mut offset)?;
-        let distance_type = u8_to_distance_type(header.distance_type)?;
-        let rotation_type = u8_to_rotation_type(header.rotation_type)?;
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        let header: RabitPartitionHeader = r.read_header()?;
+        let distance_type = proto_to_distance_type(header.distance_type());
+        let rotation_type = proto_to_rotation_type(header.rotation_type());
 
-        let sub_index_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let sub_index_batch = r.read_ipc()?;
 
         let rotate_mat = if rotation_type == RQRotationType::Matrix {
-            let mat_batch = read_ipc_stream_single_at(data, &mut offset)
-                .map_err(|e| Error::io(e.to_string()))?;
+            let mat_batch = r.read_ipc()?;
             Some(batch_to_fsl(&mat_batch)?)
         } else {
             None
         };
 
-        let storage_batch =
-            read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?;
+        let storage_batch = read_single_storage_batch(r)?;
 
         let index = S::load(sub_index_batch)?;
+        // Call the enum accessor first; it borrows `header`, whose fields are moved out below.
+        let query_estimator = proto_to_query_estimator(header.query_estimator());
         let metadata = RabitQuantizationMetadata {
             rotate_mat,
             rotate_mat_position: None,
             fast_rotation_signs: header.fast_rotation_signs,
             rotation_type,
             code_dim: header.code_dim,
-            num_bits: header.num_bits,
+            num_bits: header.num_bits as u8,
             // The storage batch already has packed codes; skip re-packing.
             packed: true,
-            query_estimator: header.query_estimator,
+            query_estimator,
         };
         let storage = <RabitQuantizer as Quantization>::Storage::try_from_batch(
             storage_batch,
@@ -551,6 +507,21 @@ mod tests {
     use lance_index::vector::flat::storage::FlatFloatStorage;
     use lance_index::vector::sq::storage::ScalarQuantizationStorage;
 
+    /// Serialize a codec body (no envelope) for tests.
+    fn ser_body<T: CacheCodecImpl>(entry: &T) -> Vec<u8> {
+        let mut buf = Vec::new();
+        entry
+            .serialize(&mut CacheEntryWriter::new(&mut buf))
+            .unwrap();
+        buf
+    }
+
+    /// Deserialize a codec body (no envelope) at the current build's version.
+    fn de_body<T: CacheCodecImpl>(bytes: Vec<u8>) -> Result<T> {
+        let data = bytes::Bytes::from(bytes);
+        T::deserialize(&mut CacheEntryReader::new(&data, 0, T::CURRENT_VERSION))
+    }
+
     // ----- PQ helpers -------------------------------------------------------
 
     fn make_test_codebook(dim: usize, num_sub_vectors: usize) -> FixedSizeListArray {
@@ -618,12 +589,9 @@ mod tests {
             storage,
         };
 
-        let mut serialized = Vec::new();
-        entry.serialize(&mut serialized).unwrap();
-        let deserialized = PartitionEntry::<FlatIndex, ProductQuantizer>::deserialize(
-            &bytes::Bytes::from(serialized),
-        )
-        .unwrap();
+        let serialized = ser_body(&entry);
+        let deserialized =
+            de_body::<PartitionEntry<FlatIndex, ProductQuantizer>>(serialized).unwrap();
 
         assert_eq!(entry.storage, deserialized.storage);
     }
@@ -671,12 +639,8 @@ mod tests {
                 storage,
             };
 
-            let mut bytes = Vec::new();
-            entry.serialize(&mut bytes).unwrap();
-            let restored = PartitionEntry::<FlatIndex, ProductQuantizer>::deserialize(
-                &bytes::Bytes::from(bytes),
-            )
-            .unwrap();
+            let bytes = ser_body(&entry);
+            let restored = de_body::<PartitionEntry<FlatIndex, ProductQuantizer>>(bytes).unwrap();
             assert_eq!(
                 restored.storage.distance_type(),
                 entry.storage.distance_type()
@@ -694,12 +658,9 @@ mod tests {
             storage,
         };
 
-        let mut serialized = Vec::new();
-        entry.serialize(&mut serialized).unwrap();
-        let deserialized = PartitionEntry::<FlatIndex, ProductQuantizer>::deserialize(
-            &bytes::Bytes::from(serialized),
-        )
-        .unwrap();
+        let serialized = ser_body(&entry);
+        let deserialized =
+            de_body::<PartitionEntry<FlatIndex, ProductQuantizer>>(serialized).unwrap();
         assert_eq!(entry.storage, deserialized.storage);
     }
 
@@ -712,13 +673,9 @@ mod tests {
             index: FlatIndex::default(),
             storage,
         };
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
+        let mut bytes = ser_body(&entry);
         bytes.truncate(3);
-        assert!(
-            PartitionEntry::<FlatIndex, ProductQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .is_err()
-        );
+        assert!(de_body::<PartitionEntry<FlatIndex, ProductQuantizer>>(bytes).is_err());
     }
 
     // ----- Flat helpers -----------------------------------------------------
@@ -756,11 +713,8 @@ mod tests {
             storage,
         };
 
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, FlatQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, FlatQuantizer>>(bytes).unwrap();
 
         assert_eq!(
             restored.storage.metadata().dim,
@@ -786,11 +740,8 @@ mod tests {
                 index: FlatIndex::default(),
                 storage,
             };
-            let mut bytes = Vec::new();
-            entry.serialize(&mut bytes).unwrap();
-            let restored =
-                PartitionEntry::<FlatIndex, FlatQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                    .unwrap();
+            let bytes = ser_body(&entry);
+            let restored = de_body::<PartitionEntry<FlatIndex, FlatQuantizer>>(bytes).unwrap();
             assert_eq!(restored.storage.distance_type(), dt);
         }
     }
@@ -803,11 +754,8 @@ mod tests {
             storage,
         };
 
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, FlatQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, FlatQuantizer>>(bytes).unwrap();
 
         let restored_batch = restored.storage.to_batches().unwrap().next().unwrap();
         let schema = restored_batch.schema();
@@ -828,11 +776,8 @@ mod tests {
             storage,
         };
 
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, FlatQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, FlatQuantizer>>(bytes).unwrap();
 
         let restored_batch = restored.storage.to_batches().unwrap().next().unwrap();
         let schema = restored_batch.schema();
@@ -884,11 +829,8 @@ mod tests {
             storage,
         };
 
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, ScalarQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, ScalarQuantizer>>(bytes).unwrap();
 
         let m = entry.storage.metadata();
         let rm = restored.storage.metadata();
@@ -914,12 +856,8 @@ mod tests {
                 index: FlatIndex::default(),
                 storage,
             };
-            let mut bytes = Vec::new();
-            entry.serialize(&mut bytes).unwrap();
-            let restored = PartitionEntry::<FlatIndex, ScalarQuantizer>::deserialize(
-                &bytes::Bytes::from(bytes),
-            )
-            .unwrap();
+            let bytes = ser_body(&entry);
+            let restored = de_body::<PartitionEntry<FlatIndex, ScalarQuantizer>>(bytes).unwrap();
             assert_eq!(restored.storage.distance_type(), dt);
         }
     }
@@ -960,11 +898,8 @@ mod tests {
             index: FlatIndex::default(),
             storage,
         };
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, ScalarQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, ScalarQuantizer>>(bytes).unwrap();
 
         assert_eq!(restored.storage.len(), 30);
         let orig_ids: Vec<u64> = entry.storage.row_ids().copied().collect();
@@ -978,14 +913,27 @@ mod tests {
         num_rows: usize,
         code_dim: usize,
         distance_type: DistanceType,
+    ) -> <RabitQuantizer as Quantization>::Storage {
+        make_rabit_storage(
+            num_rows,
+            code_dim,
+            distance_type,
+            RQRotationType::Fast,
+            RabitQueryEstimator::ResidualQuery,
+        )
+    }
+
+    fn make_rabit_storage(
+        num_rows: usize,
+        code_dim: usize,
+        distance_type: DistanceType,
+        rotation_type: RQRotationType,
+        query_estimator: RabitQueryEstimator,
     ) -> <RabitQuantizer as Quantization>::Storage {
         use lance_arrow::FixedSizeListArrayExt;
 
-        let quantizer = RabitQuantizer::new_with_rotation::<Float32Type>(
-            1,
-            code_dim as i32,
-            RQRotationType::Fast,
-        );
+        let quantizer =
+            RabitQuantizer::new_with_rotation::<Float32Type>(1, code_dim as i32, rotation_type);
         let values: Vec<f32> = (0..num_rows * code_dim)
             .map(|i| (i % 100) as f32 / 100.0 - 0.5)
             .collect();
@@ -997,7 +945,8 @@ mod tests {
             .as_fixed_size_list()
             .clone();
 
-        let metadata = quantizer.metadata(None);
+        let mut metadata = quantizer.metadata(None);
+        metadata.query_estimator = query_estimator;
         let batch = RecordBatch::try_from_iter(vec![
             (
                 lance_core::ROW_ID,
@@ -1044,11 +993,8 @@ mod tests {
             storage,
         };
 
-        let mut bytes = Vec::new();
-        entry.serialize(&mut bytes).unwrap();
-        let restored =
-            PartitionEntry::<FlatIndex, RabitQuantizer>::deserialize(&bytes::Bytes::from(bytes))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<PartitionEntry<FlatIndex, RabitQuantizer>>(bytes).unwrap();
 
         let m = entry.storage.metadata();
         let rm = restored.storage.metadata();
@@ -1082,22 +1028,125 @@ mod tests {
     fn test_rabitq_distance_types() {
         for dt in [DistanceType::L2, DistanceType::Cosine, DistanceType::Dot] {
             let storage = make_rabit_storage_fast(10, 32, dt);
-            let expected_distance_type = if dt == DistanceType::Cosine {
-                DistanceType::L2
-            } else {
-                dt
-            };
             let entry = PartitionEntry::<FlatIndex, RabitQuantizer> {
                 index: FlatIndex::default(),
                 storage,
             };
-            let mut bytes = Vec::new();
-            entry.serialize(&mut bytes).unwrap();
-            let restored = PartitionEntry::<FlatIndex, RabitQuantizer>::deserialize(
-                &bytes::Bytes::from(bytes),
-            )
+            let bytes = ser_body(&entry);
+            let restored = de_body::<PartitionEntry<FlatIndex, RabitQuantizer>>(bytes).unwrap();
+            // The codec round-trips the distance type faithfully.
+            assert_eq!(
+                restored.storage.distance_type(),
+                entry.storage.distance_type()
+            );
+        }
+    }
+
+    #[test]
+    fn test_roundtrip_rabitq_raw_query_estimator() {
+        // Build the fixture with `RawQuery` rather than the `ResidualQuery` used by
+        // `make_rabit_storage_fast`; the estimator must survive the round trip so
+        // raw-query search keeps working after a cache reload.
+        type Entry = PartitionEntry<FlatIndex, RabitQuantizer>;
+        let expected = RabitQueryEstimator::RawQuery;
+
+        let storage = make_rabit_storage(
+            40,
+            32,
+            DistanceType::L2,
+            RQRotationType::Fast,
+            RabitQueryEstimator::RawQuery,
+        );
+        // Sanity-check the fixture before it goes through the codec.
+        assert_eq!(storage.metadata().query_estimator, expected);
+
+        let original = Entry {
+            index: FlatIndex::default(),
+            storage,
+        };
+        let encoded = ser_body(&original);
+        let decoded = de_body::<Entry>(encoded).unwrap();
+
+        assert_eq!(decoded.storage.metadata().query_estimator, expected);
+    }
+
+    /// With `RQRotationType::Matrix` the metadata carries a `rotate_mat`, which is
+    /// written as an extra IPC section between the sub-index and storage
+    /// sections; check that the codec brings it back intact.
+    #[test]
+    fn test_roundtrip_flat_rabitq_matrix() {
+        type Entry = PartitionEntry<FlatIndex, RabitQuantizer>;
+
+        let original = Entry {
+            index: FlatIndex::default(),
+            storage: make_rabit_storage(
+                40,
+                32,
+                DistanceType::L2,
+                RQRotationType::Matrix,
+                RabitQueryEstimator::ResidualQuery,
+            ),
+        };
+        let decoded = de_body::<Entry>(ser_body(&original)).unwrap();
+
+        let before = original.storage.metadata();
+        let after = decoded.storage.metadata();
+        assert_eq!(after.rotation_type, RQRotationType::Matrix);
+        assert_eq!(after.code_dim, before.code_dim);
+        assert_eq!(after.num_bits, before.num_bits);
+
+        // Compare the raw `f32` contents of the rotation matrix on both sides.
+        let expected_mat = before
+            .rotate_mat
+            .as_ref()
+            .expect("matrix rotation has rotate_mat");
+        let actual_mat = after
+            .rotate_mat
+            .as_ref()
+            .expect("restored matrix rotation has rotate_mat");
+        let expected_values = expected_mat.values().as_primitive::<Float32Type>().values();
+        let actual_values = actual_mat.values().as_primitive::<Float32Type>().values();
+        assert_eq!(expected_values, actual_values);
+    }
+
+    /// SQ storage (a multi-batch IPC section) must decode zero-copy through the
+    /// full envelope even though the proto header and sub-index section push it
+    /// to a non-aligned starting offset.
+    #[test]
+    fn test_partition_storage_is_zero_copy_through_envelope() {
+        use lance_core::cache::CacheCodec;
+        const ALIGN: usize = 64;
+
+        let entry = PartitionEntry::<FlatIndex, ScalarQuantizer> {
+            index: FlatIndex::default(),
+            storage: make_sq_storage(64, 32, DistanceType::L2),
+        };
+        let codec = CacheCodec::from_impl::<PartitionEntry<FlatIndex, ScalarQuantizer>>();
+        let any: Arc<dyn std::any::Any + Send + Sync> = Arc::new(entry);
+        let mut buf = Vec::new();
+        codec.serialize(&any, &mut buf).unwrap();
+
+        let mut v = vec![0u8; buf.len() + ALIGN];
+        let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN;
+        v[pad..pad + buf.len()].copy_from_slice(&buf);
+        let data = bytes::Bytes::from(v).slice(pad..pad + buf.len());
+
+        let restored = codec.deserialize(&data).hit().unwrap();
+        let restored = restored
+            .downcast::<PartitionEntry<FlatIndex, ScalarQuantizer>>()
             .unwrap();
-            assert_eq!(restored.storage.distance_type(), expected_distance_type);
+
+        let base = data.as_ptr() as usize;
+        let end = base + data.len();
+        let first = restored.storage.to_batches().unwrap().next().unwrap();
+        for col in first.columns() {
+            for buffer in col.to_data().buffers() {
+                let ptr = buffer.as_ptr() as usize;
+                assert!(
+                    ptr >= base && ptr < end,
+                    "storage buffer was realigned out of the input — misaligned IPC section",
+                );
+            }
         }
     }
 
@@ -1135,17 +1184,12 @@ mod tests {
 
         let entry = IvfStateEntryBox(Arc::new(state));
 
-        let mut bytes = Vec::new();
-        CacheCodecImpl::serialize(&entry, &mut bytes).unwrap();
-
-        let restored =
-            <IvfStateEntryBox as CacheCodecImpl>::deserialize(&bytes::Bytes::from(bytes.clone()))
-                .unwrap();
+        let bytes = ser_body(&entry);
+        let restored = de_body::<IvfStateEntryBox>(bytes.clone()).unwrap();
 
         // Re-serialize the restored entry and compare bytes — a stronger check
         // than field-by-field comparison and avoids needing to downcast.
-        let mut restored_bytes = Vec::new();
-        CacheCodecImpl::serialize(&restored, &mut restored_bytes).unwrap();
+        let restored_bytes = ser_body(&restored);
         assert_eq!(bytes, restored_bytes);
     }
 }
diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs
index 4ea076ed420..5b29752f7c1 100644
--- a/rust/lance/src/index/vector/ivf/v2.rs
+++ b/rust/lance/src/index/vector/ivf/v2.rs
@@ -3,10 +3,10 @@
 
 //! IVF - Inverted File index.
 
-use std::io::Write as IoWrite;
 use std::marker::PhantomData;
 use std::{
     any::Any,
+    borrow::Cow,
     collections::{BinaryHeap, HashMap},
     sync::{Arc, Mutex},
 };
@@ -25,8 +25,10 @@ use futures::future::BoxFuture;
 use futures::prelude::stream::{self, TryStreamExt};
 use futures::{StreamExt, TryFutureExt};
 use lance_arrow::RecordBatchExt;
-use lance_arrow::ipc::write_len_prefixed_bytes;
-use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache};
+use lance_core::cache::{
+    CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache,
+    WeakLanceCache,
+};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu};
 use lance_core::utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS};
@@ -34,12 +36,14 @@ use lance_core::{Error, ROW_ID, Result};
 use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
 use lance_file::LanceEncodingsIo;
 use lance_file::reader::{CachedFileMetadata, FileReader, FileReaderOptions};
+use lance_index::cache_pb::IvfStateHeader;
 use lance_index::frag_reuse::FragReuseIndex;
 use lance_index::metrics::{LocalMetricsCollector, MetricsCollector, NoOpMetricsCollector};
 use lance_index::vector::VectorIndexCacheEntry;
 use lance_index::vector::bq::builder::RabitQuantizer;
+use lance_index::vector::bq::ex_dot::{blocked_ex_code_bytes, padded_query_len};
+use lance_index::vector::bq::rabit_ex_bits;
 use lance_index::vector::bq::storage::{RabitQueryEstimator, SEGMENT_NUM_CODES};
-use lance_index::vector::bq::{rabit_ex_bits, rabit_ex_code_bytes};
 use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer};
 use lance_index::vector::graph::OrderedNode;
 use lance_index::vector::hnsw::HNSW;
@@ -64,7 +68,7 @@ use lance_index::{
 };
 use lance_index::{INDEX_METADATA_SCHEMA_KEY, IndexMetadata};
 use lance_io::local::to_local_path;
-use lance_io::scheduler::SchedulerConfig;
+use lance_io::scheduler::{IoStats, ScanStats, SchedulerConfig};
 use lance_io::utils::CachedFileSize;
 use lance_io::{
     ReadBatchParams, object_store::ObjectStore, scheduler::ScanScheduler, traits::Reader,
@@ -152,16 +156,18 @@ fn rotated_partition_centroid_slice(
     cache.rotated_centroids.get(start..end)
 }
 
-fn rabit_ex_dist_table_len(dim: usize, num_bits: u8) -> usize {
-    rabit_ex_bits(num_bits)
-        .map(|ex_bits| {
-            if ex_bits == 0 {
-                0
-            } else {
-                dim * (1usize << usize::from(ex_bits))
-            }
-        })
-        .unwrap_or(dim * 256)
+/// `f32` scratch needed by the ex-bit query state: one zero-padded query copy,
+/// required only when ex bits are in use and the rotated dim is not a multiple
+/// of the 64-dim kernel block (the FastScan ex LUT is built directly from the
+/// query, with no f32 table). An error from `rabit_ex_bits` counts as multi-bit.
+fn rabit_ex_scratch_len(dim: usize, num_bits: u8) -> usize {
+    // `Ok(0)` is the only single-bit answer; an `Err` falls on the multi-bit side.
+    let single_bit = matches!(rabit_ex_bits(num_bits), Ok(0));
+    let block_aligned = dim.is_multiple_of(64);
+    match (single_bit, block_aligned) {
+        (false, false) => padded_query_len(dim),
+        _ => 0,
+    }
 }
 
 fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize {
@@ -169,7 +175,7 @@ fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize {
     let ex_dist_table_len = rabit_ex_bits(num_bits)
         .ok()
         .and_then(|ex_bits| match ex_bits {
-            2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(),
+            2 | 4 | 8 => Some(blocked_ex_code_bytes(dim, ex_bits)),
             _ => None,
         })
         .map(|ex_code_len| ex_code_len * 2 * SEGMENT_NUM_CODES)
@@ -183,12 +189,12 @@ fn rabit_query_scratch_capacity(
     num_bits: u8,
 ) -> QueryScratchCapacity {
     let dist_table_len = dim * 4;
-    let ex_dist_table_len = rabit_ex_dist_table_len(dim, num_bits);
+    let ex_scratch_len = rabit_ex_scratch_len(dim, num_bits);
     let u8_scratch_len = rabit_u8_scratch_len(dim, num_bits);
 
     QueryScratchCapacity::new(
         max_partition_len,
-        dim + dist_table_len + ex_dist_table_len,
+        dim + dist_table_len + ex_scratch_len,
         max_partition_len.max(dist_table_len),
         u8_scratch_len,
     )
@@ -213,28 +219,6 @@ impl<Q: Quantization> DeepSizeOf for IvfIndexState<Q> {
     }
 }
 
-/// Serialization header for the `IvfIndexState` wire format.
-///
-/// Kept as a flat, non-generic struct so the JSON header format is stable
-/// regardless of `Q`. `quantizer_metadata_json` holds the serialized
-/// `Q::Metadata`; large blobs (PQ codebook, RQ matrix) follow as raw bytes.
-#[derive(serde::Serialize, serde::Deserialize)]
-struct IvfIndexStateHeader {
-    index_file_path: String,
-    uuid: String,
-    distance_type: String,
-    sub_index_metadata: Vec<String>,
-    sub_index_type: String,
-    quantization_type: String,
-    quantizer_metadata_json: String,
-    #[serde(default)]
-    cache_key_prefix: String,
-    #[serde(default)]
-    index_file_size: u64,
-    #[serde(default)]
-    aux_file_size: u64,
-}
-
 /// Object-safe interface for a type-erased `IvfIndexState<Q>`.
 ///
 /// Stored as `Arc<dyn IvfStateEntry>` inside [`IvfStateEntryBox`], which is
@@ -242,13 +226,14 @@ struct IvfIndexStateHeader {
 /// wrapper lets the cache infrastructure work with a sized type while the
 /// hot paths call `reconstruct` without knowing `Q`.
 pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static {
-    fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()>;
+    fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()>;
 
     fn reconstruct<'a>(
         &'a self,
         object_store: Arc<ObjectStore>,
         file_metadata_cache: &'a LanceCache,
         index_cache: LanceCache,
+        frag_reuse_index: Option<Arc<FragReuseIndex>>,
     ) -> BoxFuture<'a, Result<Arc<dyn VectorIndex>>>;
 }
 
@@ -266,42 +251,39 @@ impl DeepSizeOf for IvfStateEntryBox {
     }
 }
 
-/// Wire format (unchanged from the non-generic `IvfIndexState`):
-/// `[header_json_len: u64 LE][header JSON][ivf_pb_len: u64 LE][ivf protobuf]
-///  [extra_len: u64 LE][extra bytes][aux_ivf_pb_len: u64 LE][aux_ivf protobuf]`
+/// Wire format:
+/// ```text
+/// HEADER   : IvfStateHeader proto (paths, types, quantizer metadata JSON)
+/// RAW_BLOB : IVF model protobuf
+/// RAW_BLOB : quantizer extra-metadata buffer (may be empty)
+/// RAW_BLOB : auxiliary IVF model protobuf
+/// ```
 impl CacheCodecImpl for IvfStateEntryBox {
-    fn serialize(&self, writer: &mut dyn IoWrite) -> Result<()> {
-        self.0.serialize_state(writer)
-    }
+    const TYPE_ID: &'static str = "lance.vector.ivf.IvfState";
+    const CURRENT_VERSION: u32 = 1;
 
-    fn deserialize(data: &bytes::Bytes) -> Result<Self> {
-        use lance_arrow::ipc::read_len_prefixed_bytes_at;
+    fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
+        self.0.serialize_state(w)
+    }
 
-        // Parse the common wire format, then dispatch on quantization_type to
+    fn deserialize(r: &mut CacheEntryReader<'_>) -> Result<Self> {
+        // Parse the common header, then dispatch on quantization_type to
         // construct the right IvfIndexState<Q>.
-        let mut offset = 0;
-        let header_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
-        let header: IvfIndexStateHeader = serde_json::from_slice(&header_bytes)
-            .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?;
+        let header: IvfStateHeader = r.read_header()?;
 
-        let ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+        let ivf_bytes = r.read_raw()?;
         let ivf = IvfModel::try_from(
             pb::Ivf::decode(ivf_bytes.as_ref())
                 .map_err(|e| lance_core::Error::io(format!("IvfIndexState IVF decode: {e}")))?,
         )?;
 
-        let extra_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+        let extra_bytes = r.read_raw()?;
 
-        // aux_ivf was added after initial deployment; fall back to ivf on
-        // clean EOF (legacy format without the field).
-        let aux_ivf = if offset + 8 <= data.len() {
-            let aux_ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?;
+        let aux_ivf_bytes = r.read_raw()?;
+        let aux_ivf =
             IvfModel::try_from(pb::Ivf::decode(aux_ivf_bytes.as_ref()).map_err(|e| {
                 lance_core::Error::io(format!("IvfIndexState aux IVF decode: {e}"))
-            })?)?
-        } else {
-            ivf.clone()
-        };
+            })?)?;
 
         let distance_type = DistanceType::try_from(header.distance_type.as_str())?;
         let sub_index_type = SubIndexType::try_from(header.sub_index_type.as_str())?;
@@ -310,7 +292,7 @@ impl CacheCodecImpl for IvfStateEntryBox {
         // Helper: parse Q::Metadata from the JSON+extra_bytes in the header,
         // then build an IvfStateEntryBox wrapping IvfIndexState<Q>.
         fn make_entry<Q: Quantization + 'static>(
-            header: IvfIndexStateHeader,
+            header: IvfStateHeader,
             ivf: IvfModel,
             aux_ivf: IvfModel,
             extra_bytes: bytes::Bytes,
@@ -396,13 +378,13 @@ impl CacheCodecImpl for IvfStateEntryBox {
 }
 
 impl<Q: Quantization + 'static> IvfStateEntry for IvfIndexState<Q> {
-    fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()> {
+    fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> {
         let quantizer_metadata_json = serde_json::to_string(&self.metadata)
             .map_err(|e| lance_core::Error::io(format!("IvfIndexState metadata: {e}")))?;
         let extra = self.metadata.extra_metadata()?;
         let extra = extra.as_deref().unwrap_or(&[]);
 
-        let header = IvfIndexStateHeader {
+        let header = IvfStateHeader {
             index_file_path: self.index_file_path.clone(),
             uuid: self.uuid.to_string(),
             distance_type: self.distance_type.to_string(),
@@ -414,15 +396,13 @@ impl<Q: Quantization + 'static> IvfStateEntry for IvfIndexState<Q> {
             index_file_size: self.index_file_size,
             aux_file_size: self.aux_file_size,
         };
-        let header_json = serde_json::to_vec(&header)
-            .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?;
         let ivf_bytes = pb::Ivf::try_from(&self.ivf)?.encode_to_vec();
         let aux_ivf_bytes = pb::Ivf::try_from(&self.aux_ivf)?.encode_to_vec();
 
-        write_len_prefixed_bytes(writer, &header_json)?;
-        write_len_prefixed_bytes(writer, &ivf_bytes)?;
-        write_len_prefixed_bytes(writer, extra)?;
-        write_len_prefixed_bytes(writer, &aux_ivf_bytes)?;
+        w.write_header(&header)?;
+        w.write_raw(&ivf_bytes)?;
+        w.write_raw(extra)?;
+        w.write_raw(&aux_ivf_bytes)?;
         Ok(())
     }
 
@@ -431,6 +411,7 @@ impl<Q: Quantization + 'static> IvfStateEntry for IvfIndexState<Q> {
         object_store: Arc<ObjectStore>,
         file_metadata_cache: &'a LanceCache,
         index_cache: LanceCache,
+        frag_reuse_index: Option<Arc<FragReuseIndex>>,
     ) -> BoxFuture<'a, Result<Arc<dyn VectorIndex>>> {
         Box::pin(async move {
             match self.sub_index_type {
@@ -440,6 +421,7 @@ impl<Q: Quantization + 'static> IvfStateEntry for IvfIndexState<Q> {
                         object_store,
                         file_metadata_cache,
                         index_cache,
+                        frag_reuse_index,
                     )
                     .await
                 }
@@ -449,6 +431,7 @@ impl<Q: Quantization + 'static> IvfStateEntry for IvfIndexState<Q> {
                         object_store,
                         file_metadata_cache,
                         index_cache,
+                        frag_reuse_index,
                     )
                     .await
                 }
@@ -614,6 +597,11 @@ pub struct IVFIndex<S: IvfSubIndex + 'static, Q: Quantization + 'static> {
     index_cache: WeakLanceCache,
 
     io_parallelism: usize,
+    /// Cumulative I/O performed while opening this index (file footers, IVF
+    /// centroids, quantization metadata).  Captured once in `try_new`; exposed
+    /// via [`VectorIndex::open_io_stats`] so the opening query can attribute the
+    /// one-time open cost to its plan metrics.
+    open_io_stats: ScanStats,
     scratch_pool: Arc<QueryScratchPool>,
     use_query_residual: bool,
     use_residual_scratch: bool,
@@ -1090,6 +1078,12 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
         let use_residual_scratch = Self::use_residual_scratch(&ivf, use_query_residual);
         let rq_search_cache = Self::build_rq_search_cache(&ivf, &storage)?;
 
+        // The scheduler is freshly created above and, at this point, has served
+        // only the open-time reads (file footers, IVF centroids, quantization
+        // metadata) -- partition reads happen later, during queries.  So its
+        // cumulative stats are exactly the one-time index-open I/O.
+        let open_io_stats = scheduler.stats();
+
         Ok(Self {
             uri: to_local_path(&uri),
             index_path: uri.as_ref().to_string(),
@@ -1105,6 +1099,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
             distance_type,
             index_cache: WeakLanceCache::from(&index_cache),
             io_parallelism,
+            open_io_stats,
             _marker: PhantomData,
         })
     }
@@ -1142,6 +1137,10 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
             distance_type,
             index_cache: WeakLanceCache::from(&index_cache),
             io_parallelism,
+            // Reconstruction from cached state re-opens readers on its own path;
+            // the open-time I/O is not attributed here (it is a one-time cost,
+            // and the first open via `try_new` already accounts for it).
+            open_io_stats: ScanStats::default(),
             _marker: PhantomData,
         }
     }
@@ -1169,7 +1168,8 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
                 .get_or_insert_with_key(cache_key, || async {
                     info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id);
                     metrics.record_part_load();
-                    self.load_partition_entry(partition_id).await
+                    self.load_partition_entry(partition_id, metrics.io_stats())
+                        .await
                 })
                 .await?;
             Ok(entry as Arc<dyn VectorIndexCacheEntry>)
@@ -1179,11 +1179,18 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
             }
             info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id);
             metrics.record_part_load();
-            Ok(Arc::new(self.load_partition_entry(partition_id).await?))
+            Ok(Arc::new(
+                self.load_partition_entry(partition_id, metrics.io_stats())
+                    .await?,
+            ))
         }
     }
 
-    async fn load_partition_entry(&self, partition_id: usize) -> Result<PartitionEntry<S, Q>> {
+    async fn load_partition_entry(
+        &self,
+        partition_id: usize,
+        io_stats: Option<IoStats>,
+    ) -> Result<PartitionEntry<S, Q>> {
         let schema = Arc::new(self.reader.schema().as_ref().into());
         let batch = match self.reader.metadata().num_rows {
             0 => RecordBatch::new_empty(schema),
@@ -1192,8 +1199,17 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
                 if row_range.is_empty() {
                     RecordBatch::new_empty(schema)
                 } else {
-                    let batches = self
-                        .reader
+                    // When I/O is being measured, read through a reader whose
+                    // scheduler also records into the per-query sink (a cheap
+                    // clone sharing all cached metadata, no file re-open).
+                    // Otherwise borrow the shared reader as-is, with no clone.
+                    let reader = match &io_stats {
+                        Some(io_stats) => {
+                            Cow::Owned(self.reader.with_io_stats(io_stats.recorder()))
+                        }
+                        None => Cow::Borrowed(&self.reader),
+                    };
+                    let batches = reader
                         .read_stream(
                             ReadBatchParams::Range(row_range),
                             u32::MAX,
@@ -1212,15 +1228,19 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
             self.sub_index_metadata[partition_id].clone(),
         )?;
         let idx = S::load(batch)?;
-        let storage = self.load_partition_storage(partition_id).await?;
+        let storage = self.load_partition_storage(partition_id, io_stats).await?;
         Ok(PartitionEntry {
             index: idx,
             storage,
         })
     }
 
-    pub async fn load_partition_storage(&self, partition_id: usize) -> Result<Q::Storage> {
-        self.storage.load_partition(partition_id).await
+    pub async fn load_partition_storage(
+        &self,
+        partition_id: usize,
+        io_stats: Option<IoStats>,
+    ) -> Result<Q::Storage> {
+        self.storage.load_partition(partition_id, io_stats).await
     }
 
     /// preprocess the query vector given the partition id.
@@ -1800,6 +1820,10 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> VectorIndex for IVFInd
     fn metric_type(&self) -> DistanceType {
         self.distance_type
     }
+
+    fn open_io_stats(&self) -> Option<ScanStats> {
+        Some(self.open_io_stats)
+    }
 }
 
 pub type IvfFlatIndex = IVFIndex<FlatIndex, FlatQuantizer>;
@@ -1812,6 +1836,7 @@ async fn reconstruct_typed<S: IvfSubIndex + 'static, Q: Quantization + 'static>(
     object_store: Arc<ObjectStore>,
     file_metadata_cache: &LanceCache,
     index_cache: LanceCache,
+    frag_reuse_index: Option<Arc<FragReuseIndex>>,
 ) -> Result<Arc<dyn VectorIndex>> {
     let io_parallelism = object_store.io_parallelism();
 
@@ -1867,7 +1892,7 @@ async fn reconstruct_typed<S: IvfSubIndex + 'static, Q: Quantization + 'static>(
         state.aux_ivf.clone(),
         state.metadata.clone(),
         state.distance_type,
-        None,
+        frag_reuse_index,
     );
     let rq_search_cache = IVFIndex::<S, Q>::rq_search_cache_from_state(state, &storage)?;
 
@@ -1908,7 +1933,8 @@ mod tests {
     use lance_arrow::FixedSizeListArrayExt;
     use lance_index::vector::bq::{
         RQBuildParams, RQRotationType,
-        storage::{RABIT_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator},
+        ex_dot::{blocked_ex_code_bytes, padded_query_len},
+        storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator},
         transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN},
     };
     use lance_index::vector::storage::VectorStore;
@@ -1983,14 +2009,17 @@ mod tests {
     }
 
     #[test]
-    fn test_rabit_ex_dist_table_len_uses_num_bits() {
+    fn test_rabit_ex_scratch_len_uses_num_bits() {
+        // Block-aligned dims read the rotated query in place.
         let dim = 960;
+        for num_bits in [1, 3, 5, 7, 9] {
+            assert_eq!(super::rabit_ex_scratch_len(dim, num_bits), 0);
+        }
 
-        assert_eq!(super::rabit_ex_dist_table_len(dim, 1), 0);
-        assert_eq!(super::rabit_ex_dist_table_len(dim, 3), dim * 4);
-        assert_eq!(super::rabit_ex_dist_table_len(dim, 5), dim * 16);
-        assert_eq!(super::rabit_ex_dist_table_len(dim, 7), dim * 64);
-        assert_eq!(super::rabit_ex_dist_table_len(dim, 9), dim * 256);
+        // Unaligned multi-bit queries add one padded query copy.
+        let dim = 968;
+        assert_eq!(super::rabit_ex_scratch_len(dim, 1), 0);
+        assert_eq!(super::rabit_ex_scratch_len(dim, 7), padded_query_len(dim));
     }
 
     #[test]
@@ -2012,7 +2041,7 @@ mod tests {
         let capacity = super::rabit_query_scratch_capacity(dim, max_partition_len, 5);
 
         assert_eq!(capacity.distances, max_partition_len);
-        assert_eq!(capacity.query_f32, dim + dim * 4 + dim * 16);
+        assert_eq!(capacity.query_f32, dim + dim * 4);
         assert_eq!(capacity.u16, max_partition_len);
         assert_eq!(capacity.u8, dim * 16);
         assert_eq!(capacity.u32, 0);
@@ -2723,7 +2752,7 @@ mod tests {
     async fn load_partition_row_ids(index: &IvfPq, partition_idx: usize) -> Vec<u64> {
         index
             .storage
-            .load_partition(partition_idx)
+            .load_partition(partition_idx, None)
             .await
             .unwrap()
             .row_ids()
@@ -4403,18 +4432,24 @@ mod tests {
     }
 
     #[rstest]
-    #[case::l2(DistanceType::L2)]
-    #[case::cosine(DistanceType::Cosine)]
+    #[case::l2(DistanceType::L2, 9)]
+    #[case::cosine(DistanceType::Cosine, 9)]
+    // ex_bits=3 and ex_bits=5 have no FastScan support and use the bit-plane
+    // repack, so these searches go through the exact ex-dot rerank kernels
+    // end to end.
+    #[case::l2_plane_repack_3bit(DistanceType::L2, 4)]
+    #[case::l2_plane_repack_5bit(DistanceType::L2, 6)]
     #[tokio::test]
     async fn test_build_ivf_rq_multi_bit_persists_split_codes_and_searches(
         #[case] distance_type: DistanceType,
+        #[case] num_bits: u8,
     ) {
         let test_dir = TempStrDir::default();
         let test_uri = test_dir.as_str();
         let (mut dataset, vectors) = generate_test_dataset::<Float32Type>(test_uri, 0.0..1.0).await;
 
         let ivf_params = IvfBuildParams::new(4);
-        let rq_params = RQBuildParams::with_rotation_type(9, RQRotationType::Fast);
+        let rq_params = RQBuildParams::with_rotation_type(num_bits, RQRotationType::Fast);
         let params = VectorIndexParams::with_ivf_rq_params(distance_type, ivf_params, rq_params);
         dataset
             .create_index(&["vector"], IndexType::Vector, None, &params, true)
@@ -4427,16 +4462,18 @@ mod tests {
         let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing());
         let index_uuid = indices[0].uuid.to_string();
         let rq_meta = get_rq_metadata(&dataset, scheduler.clone(), &index_uuid).await;
-        assert_eq!(rq_meta.num_bits, 9);
+        assert_eq!(rq_meta.num_bits, num_bits);
         assert_eq!(rq_meta.query_estimator, RabitQueryEstimator::RawQuery);
 
         let reader = open_rq_aux_reader(&dataset, scheduler, &index_uuid).await;
         let schema = reader.schema();
-        let ex_field = schema.field(RABIT_EX_CODE_COLUMN).unwrap();
+        let ex_field = schema.field(RABIT_BLOCKED_EX_CODE_COLUMN).unwrap();
         let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else {
             panic!("RQ ex-code field should be FixedSizeList");
         };
-        assert_eq!(ex_code_bytes, 32);
+        let expected_ex_code_bytes =
+            blocked_ex_code_bytes(rq_meta.rotated_dim(), num_bits - 1) as i32;
+        assert_eq!(ex_code_bytes, expected_ex_code_bytes);
         assert!(schema.field(EX_ADD_FACTORS_COLUMN).is_some());
         assert!(schema.field(EX_SCALE_FACTORS_COLUMN).is_some());
 
@@ -6178,11 +6215,9 @@ mod tests {
             // Try serialized store first
             let guard = self.serialized.lock().await;
             if let Some((bytes, stored_codec, _)) = guard.get(key) {
-                return Some(
-                    stored_codec
-                        .deserialize(&bytes::Bytes::copy_from_slice(bytes))
-                        .expect("deserialization should succeed"),
-                );
+                return stored_codec
+                    .deserialize(&bytes::Bytes::copy_from_slice(bytes))
+                    .hit();
             }
             drop(guard);
             // Fall through to passthrough
diff --git a/rust/lance/src/io/commit/external_manifest.rs b/rust/lance/src/io/commit/external_manifest.rs
index df2b84a4878..eee4fbf07b6 100644
--- a/rust/lance/src/io/commit/external_manifest.rs
+++ b/rust/lance/src/io/commit/external_manifest.rs
@@ -365,6 +365,32 @@ mod test {
         assert_eq!(ds.version().version, 6);
         assert_eq!(ds.count_rows(None).await.unwrap(), 60);
 
+        {
+            inner_store.lock().await.remove(&(ds.base.to_string(), 6));
+        }
+        assert!(
+            handler
+                .version_exists(
+                    &ds.base,
+                    6,
+                    ds.object_store.inner.as_ref(),
+                    ds.manifest_location().naming_scheme,
+                )
+                .await
+                .unwrap()
+        );
+        assert!(
+            !handler
+                .version_exists(
+                    &ds.base,
+                    7,
+                    ds.object_store.inner.as_ref(),
+                    ds.manifest_location().naming_scheme,
+                )
+                .await
+                .unwrap()
+        );
+
         // Open without external store handler again, should see the newly sync'd commit
         let ds = DatasetBuilder::from_uri(ds_uri).load().await.unwrap();
         assert_eq!(ds.version().version, 6);
diff --git a/rust/lance/src/io/commit/namespace_manifest.rs b/rust/lance/src/io/commit/namespace_manifest.rs
index 92d5e7bc789..f4f012adcca 100644
--- a/rust/lance/src/io/commit/namespace_manifest.rs
+++ b/rust/lance/src/io/commit/namespace_manifest.rs
@@ -14,8 +14,24 @@ use lance_table::io::commit::{ManifestLocation, ManifestNamingScheme};
 use object_store::ObjectStore as OSObjectStore;
 use object_store::path::Path;
 
+use lance_namespace::error::NamespaceError;
+
 use crate::dataset::branch_location::BranchLocation;
 
+/// Returns true when `e` is a namespace error reporting a missing table or branch,
+/// and false for every other error (including non-namespace failures).
+fn is_chain_not_found(e: &lance_core::Error) -> bool {
+    // Errors that are not the Namespace variant never describe a missing chain.
+    let lance_core::Error::Namespace { source, .. } = e else {
+        return false;
+    };
+    // The source must downcast to one of the two not-found variants.
+    matches!(
+        source.downcast_ref::<NamespaceError>(),
+        Some(NamespaceError::TableNotFound { .. } | NamespaceError::TableBranchNotFound { .. })
+    )
+}
+
 #[derive(Debug)]
 pub struct LanceNamespaceExternalManifestStore {
     namespace_client: Arc<dyn LanceNamespace>,
@@ -90,7 +106,15 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore {
             ..Default::default()
         };
 
-        let response = self.namespace_client.list_table_versions(request).await?;
+        let response = match self.namespace_client.list_table_versions(request).await {
+            Ok(response) => response,
+            // A chain that does not exist yet (e.g. probing a branch location
+            // before the branch is created) has no latest version; the
+            // ExternalManifestStore contract reports that as None, not an
+            // error, so existence checks can treat it as a missing dataset.
+            Err(e) if is_chain_not_found(&e) => return Ok(None),
+            Err(e) => return Err(e),
+        };
 
         if response.versions.is_empty() {
             return Ok(None);
@@ -182,3 +206,93 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore {
         ))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use lance_namespace::models::ListTableVersionsResponse;
+
+    /// A namespace whose list_table_versions always fails with the configured
+    /// error, to pin how get_latest_version classifies failures.
+    #[derive(Debug)]
+    struct FailingNamespace {
+        error: fn() -> lance_core::Error,
+    }
+
+    #[async_trait]
+    impl LanceNamespace for FailingNamespace {
+        fn namespace_id(&self) -> String {
+            "failing".to_string()
+        }
+
+        async fn list_table_versions(
+            &self,
+            _request: ListTableVersionsRequest,
+        ) -> Result<ListTableVersionsResponse> {
+            Err((self.error)())
+        }
+    }
+
+    fn store_with(error: fn() -> lance_core::Error) -> LanceNamespaceExternalManifestStore {
+        LanceNamespaceExternalManifestStore::new(
+            Arc::new(FailingNamespace { error }),
+            vec!["t".to_string()],
+            Path::parse("data/t.lance").unwrap(),
+        )
+    }
+
+    /// A chain that does not exist (missing table or branch) has no latest
+    /// version; everything else is a real failure and must propagate so an
+    /// outage is never mistaken for an absent dataset.
+    #[tokio::test]
+    async fn test_get_latest_version_error_classification() {
+        use lance_namespace::error::NamespaceError;
+
+        let absent = [
+            store_with(|| {
+                NamespaceError::TableNotFound {
+                    message: "missing table".to_string(),
+                }
+                .into()
+            }),
+            store_with(|| {
+                NamespaceError::TableBranchNotFound {
+                    message: "missing branch".to_string(),
+                }
+                .into()
+            }),
+        ];
+        for store in absent {
+            let latest = store.get_latest_version("data/t.lance/tree/dev").await;
+            assert!(
+                matches!(latest, Ok(None)),
+                "a missing chain must read as no latest version, got: {:?}",
+                latest
+            );
+        }
+
+        let failures = [
+            store_with(|| {
+                NamespaceError::Internal {
+                    message: "server error".to_string(),
+                }
+                .into()
+            }),
+            store_with(|| {
+                NamespaceError::Throttling {
+                    message: "slow down".to_string(),
+                }
+                .into()
+            }),
+            store_with(|| lance_core::Error::io("connection reset".to_string())),
+        ];
+        for store in failures {
+            let latest = store.get_latest_version("data/t.lance/tree/dev").await;
+            assert!(
+                latest.is_err(),
+                "a real failure must propagate, got: {:?}",
+                latest
+            );
+        }
+    }
+}
diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs
index c4c79dcee5e..0ceddf7c5ee 100644
--- a/rust/lance/src/io/exec/knn.rs
+++ b/rust/lance/src/io/exec/knn.rs
@@ -926,6 +926,9 @@ impl ExecutionPlan for ANNIvfPartitionExec {
             })
             .buffered(self.index_uuids.len().min(target_partitions).max(1))
             .finally(move || {
+                // Publish index-open reads (centroids/metadata) seen on a cold
+                // query; zero once the index is served from the index cache.
+                metrics_clone.index_metrics.flush_io();
                 metrics_clone.baseline_metrics.done();
                 metrics_clone
                     .baseline_metrics
@@ -1627,6 +1630,9 @@ impl ExecutionPlan for ANNIvfSubIndexExec {
                 // will not start until the early search is complete across all deltas.
                 .try_flatten_unordered(None)
                 .finally(move || {
+                    // Publish the exact index-file I/O measured for this query
+                    // (cache misses only) to the iops/requests/bytes_read gauges.
+                    metrics_clone.index_metrics.flush_io();
                     metrics_clone
                         .baseline_metrics
                         .elapsed_compute()
@@ -2919,6 +2925,128 @@ mod tests {
         assert_find_partitions_elapsed_recorded(&stats);
     }
 
+    /// The ANN operators report the exact index-file I/O performed for a query
+    /// (bytes_read / iops), measured only on cache misses.  A cold search loads
+    /// partitions from storage and reports non-zero I/O; an immediately
+    /// following warm search serves every partition from the index cache and
+    /// reports zero -- which is the cache-effectiveness signal the metric adds.
+    #[tokio::test]
+    async fn test_io_metrics_cold_vs_warm() {
+        let fixture = NprobesTestFixture::new(100, 1).await;
+        let q = fixture.get_centroid(0);
+
+        let run = |holder: &StatsHolder| {
+            let setter = holder.get_setter();
+            async {
+                fixture
+                    .dataset
+                    .scan()
+                    .nearest("vector", q.as_ref(), 10)
+                    .unwrap()
+                    .minimum_nprobes(10)
+                    .scan_stats_callback(setter)
+                    .project(&Vec::<String>::new())
+                    .unwrap()
+                    .with_row_id()
+                    .try_into_batch()
+                    .await
+                    .unwrap()
+            }
+        };
+
+        // Cold: a freshly opened dataset has an empty index cache, so the
+        // sub-index search must read partitions (and their quantization storage)
+        // from disk.  Those reads flow through the per-query I/O sink.
+        let cold_holder = StatsHolder::default();
+        run(&cold_holder).await;
+        let cold = cold_holder.consume();
+        assert!(
+            cold.parts_loaded > 0,
+            "cold search should load partitions, got parts_loaded={}",
+            cold.parts_loaded
+        );
+        assert!(
+            cold.bytes_read > 0,
+            "cold search should report index-file I/O, got bytes_read={}",
+            cold.bytes_read
+        );
+        assert!(
+            cold.iops > 0,
+            "cold search should report index-file IOPS, got iops={}",
+            cold.iops
+        );
+
+        // Warm: the same query on the same dataset finds every partition it
+        // needs already cached, so no index-file I/O is performed.
+        let warm_holder = StatsHolder::default();
+        run(&warm_holder).await;
+        let warm = warm_holder.consume();
+        assert_eq!(
+            warm.parts_loaded, 0,
+            "warm search should not reload partitions, got parts_loaded={}",
+            warm.parts_loaded
+        );
+        assert_eq!(
+            warm.bytes_read, 0,
+            "warm search should report no index-file I/O, got bytes_read={}",
+            warm.bytes_read
+        );
+    }
+
+    /// The new I/O metrics must actually surface in `EXPLAIN ANALYZE` text on
+    /// the ANN operators: non-zero on a cold query (partition reads on
+    /// `ANNSubIndex`, index-open reads on `ANNIvfPartition`) and zero on a warm
+    /// query (everything served from the index cache).
+    #[tokio::test]
+    async fn test_io_metrics_visible_in_explain_analyze() {
+        // Returns the value of `metric=` from the analyzed-plan line for `node`.
+        fn node_metric<'a>(plan: &'a str, node: &str, metric: &str) -> &'a str {
+            let line = plan
+                .lines()
+                .find(|l| l.trim_start().starts_with(node))
+                .unwrap_or_else(|| panic!("plan missing node {node}:\n{plan}"));
+            let after = line
+                .split_once(&format!("{metric}="))
+                .unwrap_or_else(|| panic!("node {node} line missing {metric}=:\n{line}"))
+                .1;
+            after.split([',', ']']).next().unwrap().trim()
+        }
+
+        let fixture = NprobesTestFixture::new(100, 1).await;
+        let q = fixture.get_centroid(0);
+
+        // Cold: a freshly opened dataset must show real index-file I/O.
+        let cold = fixture
+            .dataset
+            .scan()
+            .nearest("vector", q.as_ref(), 10)
+            .unwrap()
+            .minimum_nprobes(10)
+            .analyze_plan()
+            .await
+            .unwrap();
+        // Sub-index partition reads.
+        assert_ne!(node_metric(&cold, "ANNSubIndex", "bytes_read"), "0");
+        assert_ne!(node_metric(&cold, "ANNSubIndex", "iops"), "0");
+        // Index-open reads (centroids/metadata) now attributed to the partition
+        // operator -- the value this part of the change adds.
+        assert_ne!(node_metric(&cold, "ANNIvfPartition", "bytes_read"), "0");
+        assert_ne!(node_metric(&cold, "ANNIvfPartition", "iops"), "0");
+
+        // Warm: same query, everything cache-resident -> zero index-file I/O.
+        let warm = fixture
+            .dataset
+            .scan()
+            .nearest("vector", q.as_ref(), 10)
+            .unwrap()
+            .minimum_nprobes(10)
+            .analyze_plan()
+            .await
+            .unwrap();
+        assert_eq!(node_metric(&warm, "ANNSubIndex", "bytes_read"), "0");
+        assert_eq!(node_metric(&warm, "ANNIvfPartition", "bytes_read"), "0");
+    }
+
     #[rstest]
     #[tokio::test]
     async fn test_no_prefilter_results(#[values(1, 20)] num_deltas: usize) {
diff --git a/rust/lance/src/io/exec/take.rs b/rust/lance/src/io/exec/take.rs
index 977a9c88dce..c3642cdb043 100644
--- a/rust/lance/src/io/exec/take.rs
+++ b/rust/lance/src/io/exec/take.rs
@@ -4,6 +4,7 @@
 use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::sync::{Arc, Mutex};
+use std::task::Poll;
 
 use arrow::array::AsArray;
 use arrow::compute::{TakeOptions, concat_batches};
@@ -27,6 +28,7 @@ use lance_arrow::RecordBatchExt;
 use lance_core::datatypes::{Field, OnMissing, Projection};
 use lance_core::error::{DataFusionResult, LanceOptionExt};
 use lance_core::utils::address::RowAddress;
+use lance_core::utils::futures::FinallyStreamExt;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{ROW_ADDR, ROW_ID};
 use lance_io::scheduler::{ScanScheduler, SchedulerConfig};
@@ -353,10 +355,6 @@ impl TakeStream {
             (None, None) => {}
         }
 
-        self.metrics
-            .baseline_metrics
-            .record_output(new_data.num_rows());
-        self.metrics.batches_processed.add(1);
         Ok(batch.merge_with_schema(&new_data, self.output_schema.as_ref())?)
     }
 
@@ -364,8 +362,10 @@ impl TakeStream {
         self: Arc<Self>,
         input: S,
     ) -> impl Stream<Item = Result<RecordBatch>> {
-        let scan_scheduler = self.scan_scheduler.clone();
-        let metrics = self.metrics.clone();
+        let result_scan_scheduler = self.scan_scheduler.clone();
+        let final_scan_scheduler = self.scan_scheduler.clone();
+        let result_metrics = self.metrics.clone();
+        let final_metrics = self.metrics.clone();
         let batches = input
             .enumerate()
             .map(move |(batch_index, batch)| {
@@ -378,8 +378,24 @@ impl TakeStream {
             })
             .boxed();
         batches
-            .inspect_ok(move |_| metrics.io_metrics.record(&scan_scheduler))
             .try_buffered(get_num_compute_intensive_cpus())
+            .map(move |result| {
+                if result.is_ok() {
+                    result_metrics.batches_processed.add(1);
+                }
+                result_metrics.io_metrics.record(&result_scan_scheduler);
+                match result_metrics
+                    .baseline_metrics
+                    .record_poll(Poll::Ready(Some(result)))
+                {
+                    Poll::Ready(Some(result)) => result,
+                    _ => unreachable!("record_poll returned a different poll state"),
+                }
+            })
+            .finally(move || {
+                final_metrics.baseline_metrics.done();
+                final_metrics.io_metrics.record(&final_scan_scheduler);
+            })
     }
 }
 
@@ -839,6 +855,80 @@ mod tests {
         }
     }
 
+    #[tokio::test(flavor = "current_thread")]
+    async fn test_take_records_output_and_io_metrics() {
+        use datafusion::physical_plan::metrics::MetricValue;
+        use lance_datafusion::utils::{BYTES_READ_METRIC, IOPS_METRIC, REQUESTS_METRIC};
+        let TestFixture {
+            dataset,
+            _tmp_dir_guard,
+        } = test_fixture().await;
+
+        let row_addrs = UInt64Array::from(vec![0_u64, 1, 2, 3, 4]);
+        let schema = Arc::new(ArrowSchema::new(vec![Field::new(
+            ROW_ADDR,
+            DataType::UInt64,
+            true,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(row_addrs)]).unwrap();
+        let stream = futures::stream::iter(vec![Ok(batch)]);
+        let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream));
+        let input = Arc::new(OneShotExec::new(stream));
+
+        let projection = dataset
+            .empty_projection()
+            .union_column("s", OnMissing::Error)
+            .unwrap();
+
+        let take_exec = TakeExec::try_new(dataset, input, projection)
+            .unwrap()
+            .unwrap();
+
+        let stream = take_exec
+            .execute(0, Arc::new(TaskContext::default()))
+            .unwrap();
+        let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
+        assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 5);
+
+        let metrics = take_exec.metrics().unwrap();
+
+        let output_batches: usize = metrics
+            .iter()
+            .filter_map(|m| match m.value() {
+                MetricValue::OutputBatches(count) => Some(count.value()),
+                _ => None,
+            })
+            .sum();
+
+        let output_bytes: usize = metrics
+            .iter()
+            .filter_map(|m| match m.value() {
+                MetricValue::OutputBytes(count) => Some(count.value()),
+                _ => None,
+            })
+            .sum();
+
+        let gauge = |name: &str| -> usize {
+            metrics
+                .iter_gauges()
+                .find_map(|(metric_name, gauge)| {
+                    (metric_name.as_ref() == name).then(|| gauge.value())
+                })
+                .unwrap_or(0)
+        };
+
+        let bytes_read = gauge(BYTES_READ_METRIC);
+        let iops = gauge(IOPS_METRIC);
+        let requests = gauge(REQUESTS_METRIC);
+
+        assert_eq!(metrics.output_rows(), Some(5));
+        assert_eq!(metrics.find_count("batches_processed").unwrap().value(), 1);
+        assert!(
+            output_batches > 0 && output_bytes > 0 && bytes_read > 0 && iops > 0 && requests > 0,
+            "expected positive TakeExec metrics, got output_batches={output_batches}, output_bytes={output_bytes}, bytes_read={bytes_read}, iops={iops}, requests={requests}"
+        );
+    }
+
     #[tokio::test]
     async fn test_take_order() {
         let TestFixture {
diff --git a/rust/lance/src/io/exec/utils.rs b/rust/lance/src/io/exec/utils.rs
index af3c5095f75..6e2d50d3736 100644
--- a/rust/lance/src/io/exec/utils.rs
+++ b/rust/lance/src/io/exec/utils.rs
@@ -6,7 +6,7 @@ use lance_datafusion::utils::{
     IOPS_METRIC, PARTS_LOADED_METRIC, REQUESTS_METRIC,
 };
 use lance_index::metrics::MetricsCollector;
-use lance_io::scheduler::ScanScheduler;
+use lance_io::scheduler::{IoStats, ScanScheduler, ScanStats};
 use lance_table::format::IndexMetadata;
 use pin_project::pin_project;
 use std::future::Future;
@@ -502,12 +502,17 @@ impl IoMetrics {
     }
 
     pub fn record(&self, scan_scheduler: &ScanScheduler) {
-        let current_stats = scan_scheduler.stats();
+        self.record_stats(scan_scheduler.stats());
+    }
 
-        // Use set_max to ensure gauge always shows the highest value seen
-        self.iops.set_max(current_stats.iops as usize);
-        self.requests.set_max(current_stats.requests as usize);
-        self.bytes_read.set_max(current_stats.bytes_read as usize);
+    /// Record a snapshot of cumulative I/O statistics.
+    ///
+    /// Uses `set_max` because the underlying counters are cumulative; the gauge
+    /// always reflects the highest (i.e. final) value seen.
+    pub fn record_stats(&self, stats: ScanStats) {
+        self.iops.set_max(stats.iops as usize);
+        self.requests.set_max(stats.requests as usize);
+        self.bytes_read.set_max(stats.bytes_read as usize);
     }
 }
 
@@ -516,6 +521,12 @@ pub struct IndexMetrics {
     indices_loaded: Count,
     parts_loaded: Count,
     index_comparisons: Count,
+    /// Per-query sink that accumulates exact index-file I/O as partitions are
+    /// loaded from storage.  Shared by all clones of this `IndexMetrics`, so
+    /// concurrent partition loads all funnel into the same counters.  Published
+    /// to `io_metrics` for display via [`IndexMetrics::flush_io`].
+    io_stats: IoStats,
+    io_metrics: IoMetrics,
 }
 
 impl IndexMetrics {
@@ -524,8 +535,18 @@ impl IndexMetrics {
             indices_loaded: metrics.new_count(INDICES_LOADED_METRIC, partition),
             parts_loaded: metrics.new_count(PARTS_LOADED_METRIC, partition),
             index_comparisons: metrics.new_count(INDEX_COMPARISONS_METRIC, partition),
+            io_stats: IoStats::new(),
+            io_metrics: IoMetrics::new(metrics, partition),
         }
     }
+
+    /// Publish the I/O accumulated in the per-query sink to the displayed
+    /// `iops`/`requests`/`bytes_read` metrics.  Call once when the operator's
+    /// stream finishes; the sink only accumulates on cache misses, so a fully
+    /// cache-resident query publishes zeros.
+    pub fn flush_io(&self) {
+        self.io_metrics.record_stats(self.io_stats.snapshot());
+    }
 }
 
 impl MetricsCollector for IndexMetrics {
@@ -538,6 +559,9 @@ impl MetricsCollector for IndexMetrics {
     fn record_comparisons(&self, num_comparisons: usize) {
         self.index_comparisons.add(num_comparisons);
     }
+    fn io_stats(&self) -> Option<IoStats> {
+        Some(self.io_stats.clone())
+    }
 }
 
 #[cfg(test)]
diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs
index 284e10a9b6f..729cf2ffbe7 100644
--- a/rust/lance/src/lib.rs
+++ b/rust/lance/src/lib.rs
@@ -90,7 +90,7 @@ pub mod pb {
     include!(concat!(env!("OUT_DIR"), "/lance.pb.rs"));
 }
 
-pub use blob::{BlobArrayBuilder, blob_field};
+pub use blob::{BlobArrayBuilder, BlobFieldOptions, blob_field, blob_field_with_options};
 pub use dataset::Dataset;
 use lance_index::vector::DIST_COL;
 
diff --git a/rust/lance/src/session.rs b/rust/lance/src/session.rs
index 484d53c066a..8d5e9717570 100644
--- a/rust/lance/src/session.rs
+++ b/rust/lance/src/session.rs
@@ -4,7 +4,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use lance_core::cache::{CacheBackend, LanceCache};
+use lance_core::cache::{CacheBackend, CacheKeyIterator, LanceCache};
 use lance_core::deepsize::DeepSizeOf;
 use lance_core::{Error, Result};
 use lance_index::IndexType;
@@ -209,6 +209,44 @@ impl Session {
     pub async fn index_cache_stats(&self) -> lance_core::cache::CacheStats {
         self.index_cache.0.stats().await
     }
+
+    /// Return an iterator over keys currently held by the index cache.
+    ///
+    /// Returns `None` when the index cache backend does not support key
+    /// inventory.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use lance::session::Session;
+    /// # async fn example() {
+    /// let session = Session::default();
+    /// let keys = session.index_cache_keys().await;
+    /// assert!(keys.is_some());
+    /// # }
+    /// ```
+    pub async fn index_cache_keys(&self) -> Option<CacheKeyIterator<'_>> {
+        self.index_cache.0.keys().await
+    }
+
+    /// Return an iterator over keys currently held by the metadata cache.
+    ///
+    /// Returns `None` when the metadata cache backend does not support key
+    /// inventory.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use lance::session::Session;
+    /// # async fn example() {
+    /// let session = Session::default();
+    /// let keys = session.metadata_cache_keys().await;
+    /// assert!(keys.is_some());
+    /// # }
+    /// ```
+    pub async fn metadata_cache_keys(&self) -> Option<CacheKeyIterator<'_>> {
+        self.metadata_cache.0.keys().await
+    }
 }
 
 impl Default for Session {
@@ -224,10 +262,23 @@ impl Default for Session {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use lance_core::cache::UnsizedCacheKey;
+    use lance_core::cache::{CacheKey, UnsizedCacheKey};
     use lance_index::vector::VectorIndex;
     use std::borrow::Cow;
 
+    struct TestKey(&'static str);
+    impl CacheKey for TestKey {
+        type ValueType = Vec<i32>;
+
+        fn key(&self) -> Cow<'_, str> {
+            Cow::Borrowed(self.0)
+        }
+
+        fn type_name() -> &'static str {
+            "TestVec"
+        }
+    }
+
     struct TestUnsizedKey(&'static str);
     impl UnsizedCacheKey for TestUnsizedKey {
         type ValueType = dyn VectorIndex;
@@ -251,4 +302,41 @@ mod tests {
                 .is_none()
         );
     }
+
+    #[tokio::test]
+    async fn test_session_cache_keys() {
+        let session = Session::new(10_000, 10_000, Default::default());
+
+        session
+            .index_cache
+            .insert_with_key(&TestKey("index-key"), Arc::new(vec![1]))
+            .await;
+        session
+            .metadata_cache
+            .0
+            .insert_with_key(&TestKey("metadata-key"), Arc::new(vec![2]))
+            .await;
+
+        let index_keys = session
+            .index_cache_keys()
+            .await
+            .unwrap()
+            .collect::<Vec<_>>();
+        assert_eq!(index_keys.len(), 1);
+        assert_eq!(index_keys[0].prefix(), "");
+        assert_eq!(index_keys[0].key(), "index-key");
+        assert_eq!(index_keys[0].type_name(), "TestVec");
+
+        let metadata_keys = session
+            .metadata_cache_keys()
+            .await
+            .unwrap()
+            .collect::<Vec<_>>();
+        assert_eq!(metadata_keys.len(), 1);
+        assert_eq!(metadata_keys[0].prefix(), "");
+        assert_eq!(metadata_keys[0].key(), "metadata-key");
+        assert_eq!(metadata_keys[0].type_name(), "TestVec");
+
+        assert_ne!(index_keys, metadata_keys);
+    }
 }