diff --git a/Cargo.lock b/Cargo.lock index 8b494caf..702996a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -629,7 +629,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fff-c" -version = "0.5.1" +version = "0.5.2" dependencies = [ "fff-query-parser", "fff-search", @@ -642,7 +642,7 @@ dependencies = [ [[package]] name = "fff-grep" -version = "0.5.1" +version = "0.5.2" dependencies = [ "bstr", "memchr", @@ -650,7 +650,7 @@ dependencies = [ [[package]] name = "fff-mcp" -version = "0.5.1" +version = "0.5.2" dependencies = [ "clap", "fff-query-parser", @@ -667,7 +667,7 @@ dependencies = [ [[package]] name = "fff-nvim" -version = "0.5.1" +version = "0.5.2" dependencies = [ "ahash", "blake3", @@ -700,7 +700,7 @@ dependencies = [ [[package]] name = "fff-query-parser" -version = "0.5.1" +version = "0.5.2" dependencies = [ "criterion", "zlob", @@ -708,7 +708,7 @@ dependencies = [ [[package]] name = "fff-search" -version = "0.5.1" +version = "0.5.2" dependencies = [ "ahash", "aho-corasick", diff --git a/crates/fff-core/src/bigram_filter.rs b/crates/fff-core/src/bigram_filter.rs index 233437c5..744f5213 100644 --- a/crates/fff-core/src/bigram_filter.rs +++ b/crates/fff-core/src/bigram_filter.rs @@ -3,6 +3,19 @@ use std::sync::atomic::{AtomicU16, AtomicU64, AtomicUsize, Ordering}; use ahash::AHashMap; +/// Query interface for bigram-based file filtering. +/// +/// Implemented by `BigramFilter` (owned) and can be implemented by external +/// types (e.g. a zero-copy mmap-backed index). +pub trait BigramQuery { + /// AND the posting lists for all query bigrams. + /// Returns `None` if the pattern is too short or no bigrams are tracked. + fn query(&self, pattern: &[u8]) -> Option>; + + /// Whether the index has been populated with at least one file. + fn is_ready(&self) -> bool; +} + /// Maximum number of distinct bigrams tracked in the inverted index. 
/// 95 printable ASCII chars (32..=126) after lowercasing → ~70 distinct → 4900 possible. /// We cap at 5000 to cover all printable bigrams with margin. @@ -269,10 +282,20 @@ fn bitset_and(result: &mut [u64], bitset: &[u64]) { .for_each(|(r, b)| *r &= *b); } +impl BigramQuery for BigramFilter { + fn query(&self, pattern: &[u8]) -> Option> { + self.query_inner(pattern) + } + + fn is_ready(&self) -> bool { + self.populated > 0 + } +} + impl BigramFilter { /// AND the posting lists for all query bigrams (consecutive + skip). /// Returns None if no query bigrams are tracked. - pub fn query(&self, pattern: &[u8]) -> Option> { + fn query_inner(&self, pattern: &[u8]) -> Option> { if pattern.len() < 2 { return None; } @@ -360,10 +383,6 @@ impl BigramFilter { candidates.iter().map(|w| w.count_ones() as usize).sum() } - pub fn is_ready(&self) -> bool { - self.populated > 0 - } - pub fn file_count(&self) -> usize { self.file_count } @@ -562,3 +581,61 @@ impl BigramOverlay { } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn build_test_index() -> BigramFilter { + let builder = BigramIndexBuilder::new(3); + let skip_builder = BigramIndexBuilder::new(3); + builder.add_file_content(&skip_builder, 0, b"fn main() { hello_world(); }"); + builder.add_file_content(&skip_builder, 1, b"struct Foo { bar: i32 }"); + builder.add_file_content(&skip_builder, 2, b"fn test_hello() { assert!(true); }"); + builder.compress(None) + } + + #[test] + fn test_bigram_query_trait_matches_inherent() { + let index = build_test_index(); + + // Inherent method + let inherent_result = index.query_inner(b"hello"); + // Trait method + let trait_result = BigramQuery::query(&index, b"hello"); + + assert_eq!(inherent_result, trait_result); + } + + #[test] + fn test_bigram_query_via_dyn() { + let index = build_test_index(); + let dyn_ref: &dyn BigramQuery = &index; + + assert!(dyn_ref.is_ready()); + + let candidates = dyn_ref.query(b"hello").expect("should match"); + // Files 0 and 2 contain "hello" + 
assert!(BigramFilter::is_candidate(&candidates, 0)); + assert!(BigramFilter::is_candidate(&candidates, 2)); + // File 1 does not + assert!(!BigramFilter::is_candidate(&candidates, 1)); + } + + #[test] + fn test_bigram_query_not_ready() { + let empty = BigramFilter::new(vec![NO_COLUMN; 65536], vec![], 0, 0, 0, 0); + assert!(!BigramQuery::is_ready(&empty)); + } + + #[test] + fn test_bigram_query_short_pattern() { + let index = build_test_index(); + let dyn_ref: &dyn BigramQuery = &index; + + // Single byte: too short for bigrams + assert!(dyn_ref.query(b"x").is_none()); + // Empty + assert!(dyn_ref.query(b"").is_none()); + } +} diff --git a/crates/fff-core/src/file_list_view.rs b/crates/fff-core/src/file_list_view.rs new file mode 100644 index 00000000..c4c4ecea --- /dev/null +++ b/crates/fff-core/src/file_list_view.rs @@ -0,0 +1,285 @@ +//! Zero-copy file list backed by a memory-mapped buffer. +//! +//! [`FileRecord`] is a `repr(C)` fixed-size record describing one file. +//! [`FileListView`] holds an array of records plus a string table, both +//! borrowed from an mmap. Provides indexed access to file metadata +//! without constructing owned [`FileItem`](crate::types::FileItem)s. + +use crate::types::FileItem; +use std::path::{Path, PathBuf}; + +/// Fixed-size, `repr(C)` file metadata record for mmap-friendly storage. +/// +/// Fields are ordered to avoid padding on both 32-bit and 64-bit platforms. +/// The `name_len` high bit stores the `is_binary` flag (max component length +/// is 255 on most filesystems, so 15 bits is sufficient). +#[repr(C)] +#[derive(Clone, Copy, Debug)] +pub struct FileRecord { + /// Byte offset of `relative_path` in the string table. + pub path_offset: u32, + /// Length of `relative_path` in bytes. + pub path_len: u16, + /// Length of `file_name` (the last path component). High bit = is_binary. + pub name_len_and_flags: u16, + /// File size in bytes. + pub size: u64, + /// Last modification time (seconds since UNIX epoch). 
+    pub modified: u64,
+}
+
+const BINARY_FLAG: u16 = 0x8000;
+
+impl FileRecord {
+    /// Create a new record.
+    ///
+    /// `name_len` shares its 16-bit field with the `is_binary` flag, so only
+    /// the low 15 bits are stored. Passing a larger value is a caller bug
+    /// (checked in debug builds); the stray high bit is masked off so it can
+    /// never be read back as the binary flag.
+    pub fn new(
+        path_offset: u32,
+        path_len: u16,
+        name_len: u16,
+        is_binary: bool,
+        size: u64,
+        modified: u64,
+    ) -> Self {
+        debug_assert!(
+            name_len & BINARY_FLAG == 0,
+            "name_len {name_len} does not fit in 15 bits"
+        );
+        let binary_bit = if is_binary { BINARY_FLAG } else { 0 };
+        Self {
+            path_offset,
+            path_len,
+            name_len_and_flags: (name_len & !BINARY_FLAG) | binary_bit,
+            size,
+            modified,
+        }
+    }
+
+    /// Length of the file name component (low 15 bits of the packed field).
+    #[inline]
+    pub fn name_len(&self) -> u16 {
+        self.name_len_and_flags & !BINARY_FLAG
+    }
+
+    /// Whether the file was detected as binary (high bit of the packed field).
+    #[inline]
+    pub fn is_binary(&self) -> bool {
+        self.name_len_and_flags & BINARY_FLAG != 0
+    }
+
+    /// Size of this struct in bytes (for serialization stride).
+    pub const SIZE: usize = std::mem::size_of::<FileRecord>();
+}
+
+// Verify repr(C) layout is what we expect.
+const _: () = assert!(FileRecord::SIZE == 24);
+
+/// Read-only view over a flat file list stored as [`FileRecord`]s + a string table.
+///
+/// Both the records and strings are borrowed — typically from an mmap.
+/// This avoids all heap allocation during load. Call
+/// [`to_file_items`](Self::to_file_items) when you need owned [`FileItem`]s
+/// for the search pipeline.
+pub struct FileListView<'a> {
+    records: &'a [FileRecord],
+    strings: &'a [u8],
+}
+
+impl<'a> FileListView<'a> {
+    /// Construct a view from raw record and string table slices.
+    ///
+    /// # Safety
+    /// The caller must ensure `records` was produced from a `repr(C)`
+    /// `FileRecord` array and `strings` contains valid UTF-8 at all
+    /// offsets referenced by the records.
+    pub unsafe fn new(records: &'a [FileRecord], strings: &'a [u8]) -> Self {
+        Self { records, strings }
+    }
+
+    /// Number of files in this view.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.records.len()
+    }
+
+    /// Whether the view is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.records.is_empty()
+    }
+
+    /// Get the record at index `i`.
+    #[inline]
+    pub fn record(&self, i: usize) -> &'a FileRecord {
+        &self.records[i]
+    }
+
+    /// Get the relative path for file `i` as a `&str`.
+    ///
+    /// # Panics
+    /// Panics if `i` is out of bounds or the record's byte range lies outside
+    /// the string table (both are ordinary checked slice indexing).
+    #[inline]
+    pub fn relative_path(&self, i: usize) -> &'a str {
+        let r = &self.records[i];
+        let start = r.path_offset as usize;
+        let end = start + r.path_len as usize;
+        // SAFETY: the contract of `FileListView::new` makes the caller guarantee
+        // that every path range referenced by a record is valid UTF-8.
+        unsafe { std::str::from_utf8_unchecked(&self.strings[start..end]) }
+    }
+
+    /// Get the file name for file `i` (last component of relative path).
+    ///
+    /// Returns the whole relative path when the recorded name length is zero
+    /// or longer than the path (the same fallback `to_file_items` uses), or
+    /// when it would not start on a UTF-8 character boundary. The slice never
+    /// reaches outside the record's own path bytes.
+    #[inline]
+    pub fn file_name(&self, i: usize) -> &'a str {
+        let path = self.relative_path(i);
+        let name_len = self.records[i].name_len() as usize;
+        path.len()
+            .checked_sub(name_len)
+            .filter(|_| name_len > 0)
+            // `str::get` yields `None` for a start that splits a character.
+            .and_then(|name_start| path.get(name_start..))
+            .unwrap_or(path)
+    }
+
+    /// Convert to owned `FileItem`s for the search pipeline.
+    /// Allocates strings and PathBufs from a sequential scan of the mmap'd data.
+    pub fn to_file_items(&self, base_path: &Path) -> Vec<FileItem> {
+        self.records
+            .iter()
+            .map(|r| {
+                let start = r.path_offset as usize;
+                let path_bytes = &self.strings[start..start + r.path_len as usize];
+                // SAFETY: the contract of `FileListView::new` requires every
+                // path range referenced by a record to be valid UTF-8.
+                let relative_path = unsafe { std::str::from_utf8_unchecked(path_bytes) };
+
+                // The name is the trailing `name_len` bytes of the path; use the
+                // whole path when that length is missing or inconsistent.
+                let name_len = r.name_len() as usize;
+                let file_name = relative_path
+                    .len()
+                    .checked_sub(name_len)
+                    .filter(|_| name_len > 0)
+                    .and_then(|name_start| relative_path.get(name_start..))
+                    .unwrap_or(relative_path);
+
+                // Let `PathBuf` insert the platform separator (and skip it when
+                // `base_path` already ends with one) instead of splicing raw
+                // bytes around a hard-coded `/`. Capacity is reserved up front
+                // so the two pushes share a single allocation.
+                let mut full_path =
+                    PathBuf::with_capacity(base_path.as_os_str().len() + 1 + path_bytes.len());
+                full_path.push(base_path);
+                full_path.push(relative_path);
+
+                FileItem::new_raw(
+                    full_path,
+                    relative_path.to_owned(),
+                    file_name.to_owned(),
+                    r.size,
+                    r.modified,
+                    None,
+                    r.is_binary(),
+                )
+            })
+            .collect()
+    }
+}
+
+/// Build a `FileRecord` array and string table from a slice of `FileItem`s.
+///
+/// Returns `(records, string_table)` suitable for writing to disk or
+/// constructing a `FileListView`.
+pub fn build_file_records(files: &[FileItem]) -> (Vec<FileRecord>, Vec<u8>) {
+    let string_table_size: usize = files.iter().map(|f| f.relative_path.len()).sum();
+    let mut records = Vec::with_capacity(files.len());
+    let mut strings = Vec::with_capacity(string_table_size);
+
+    for file in files {
+        // Checked narrowing: a silently truncated offset or length would make the
+        // record address the wrong bytes of the string table, which the view then
+        // hands to `from_utf8_unchecked`. Panicking here is the safe outcome.
+        let path_offset = u32::try_from(strings.len())
+            .expect("string table exceeds the 4 GiB addressable by a u32 path_offset");
+        let path_len = u16::try_from(file.relative_path.len())
+            .expect("relative path is longer than u16::MAX bytes");
+        let name_len = u16::try_from(file.file_name.len())
+            .ok()
+            .filter(|len| len & BINARY_FLAG == 0)
+            .expect("file name does not fit in the 15 bits reserved for name_len");
+        strings.extend_from_slice(file.relative_path.as_bytes());
+
+        records.push(FileRecord::new(
+            path_offset,
+            path_len,
+            name_len,
+            file.is_binary,
+            file.size,
+            file.modified,
+        ));
+    }
+
+    (records, strings)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_file_record_size() {
+        assert_eq!(FileRecord::SIZE, 24);
+    }
+
+    #[test]
+    fn test_file_record_flags() {
+        let r = FileRecord::new(0, 10, 5, false, 100, 200);
+        assert_eq!(r.name_len(), 5);
+        assert!(!r.is_binary());
+
+        let r = FileRecord::new(0, 10, 5, true, 100, 200);
+        assert_eq!(r.name_len(), 5);
+        assert!(r.is_binary());
+    }
+
+    #[test]
+    fn test_round_trip() {
+        let files = vec![
+            FileItem::new_raw(
+                PathBuf::from("/base/src/main.rs"),
+                "src/main.rs".to_string(),
+                "main.rs".to_string(),
+                1024,
+                1000000,
+                None,
+                false,
+            ),
+            FileItem::new_raw(
+                PathBuf::from("/base/tests/test.rs"),
+                "tests/test.rs".to_string(),
+                "test.rs".to_string(),
+                512,
+                2000000,
+                None,
+                false,
+            ),
+            FileItem::new_raw(
+                PathBuf::from("/base/image.png"),
+                "image.png".to_string(),
+                "image.png".to_string(),
+                8192,
+                3000000,
+                None,
+                true,
+            ),
+        ];
+
+        let (records, strings) = build_file_records(&files);
+        let view = unsafe { FileListView::new(&records, &strings) };
+
+        assert_eq!(view.len(), 3);
+
+        assert_eq!(view.relative_path(0), "src/main.rs");
+        assert_eq!(view.file_name(0), "main.rs");
+        assert_eq!(view.record(0).size, 1024);
+        assert_eq!(view.record(0).modified, 1000000);
+        assert!(!view.record(0).is_binary());
+
+        assert_eq!(view.relative_path(1), "tests/test.rs");
+
assert_eq!(view.file_name(1), "test.rs"); + + assert_eq!(view.relative_path(2), "image.png"); + assert!(view.record(2).is_binary()); + + // Test to_file_items round-trip + let items = view.to_file_items(Path::new("/base")); + assert_eq!(items.len(), 3); + assert_eq!(items[0].relative_path, "src/main.rs"); + assert_eq!(items[0].file_name, "main.rs"); + assert_eq!(items[0].size, 1024); + assert!(items[0].path.ends_with("src/main.rs")); + assert!(!items[0].is_binary); + assert!(items[2].is_binary); + } +} diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs index c4f98c19..5d77b532 100644 --- a/crates/fff-core/src/file_picker.rs +++ b/crates/fff-core/src/file_picker.rs @@ -31,7 +31,7 @@ //! the file index, so read-heavy search workloads rarely contend. use crate::background_watcher::BackgroundWatcher; -use crate::bigram_filter::{BigramFilter, BigramIndexBuilder, BigramOverlay}; +use crate::bigram_filter::{BigramFilter, BigramIndexBuilder, BigramOverlay, BigramQuery}; use crate::error::Error; use crate::frecency::FrecencyTracker; use crate::git::GitStatusCache; @@ -627,7 +627,7 @@ impl FilePicker { query, options, self.cache_budget(), - self.bigram_index.as_deref(), + self.bigram_index.as_deref().map(|b| b as &dyn BigramQuery), overlay_guard.as_deref(), Some(&self.cancelled), ) @@ -645,7 +645,7 @@ impl FilePicker { query, options, self.cache_budget(), - self.bigram_index.as_deref(), + self.bigram_index.as_deref().map(|b| b as &dyn BigramQuery), None, Some(&self.cancelled), ) diff --git a/crates/fff-core/src/grep.rs b/crates/fff-core/src/grep.rs index bbea8db5..516d1586 100644 --- a/crates/fff-core/src/grep.rs +++ b/crates/fff-core/src/grep.rs @@ -6,7 +6,7 @@ //! termination once enough results are collected. 
use crate::{ - BigramFilter, BigramOverlay, + BigramFilter, BigramOverlay, BigramQuery, constraints::apply_constraints, extract_bigrams, sort_buffer::sort_with_buffer, @@ -1676,7 +1676,7 @@ pub fn grep_search<'a>( query: &FFFQuery<'_>, options: &GrepSearchOptions, budget: &ContentCacheBudget, - bigram_index: Option<&BigramFilter>, + bigram_index: Option<&dyn BigramQuery>, bigram_overlay: Option<&BigramOverlay>, is_cancelled: Option<&AtomicBool>, ) -> GrepResult<'a> { diff --git a/crates/fff-core/src/lib.rs b/crates/fff-core/src/lib.rs index bf0fa148..abca55b0 100644 --- a/crates/fff-core/src/lib.rs +++ b/crates/fff-core/src/lib.rs @@ -137,6 +137,9 @@ pub mod query_tracker; /// Core data types shared across the crate. pub mod types; +/// Zero-copy file list view backed by repr(C) records and a string table. +pub mod file_list_view; + mod ignore; /// Thread-safe shared handles for [`FilePicker`], [`FrecencyTracker`], /// and [`QueryTracker`]. diff --git a/crates/fff-nvim/src/bin/bench_grep_query.rs b/crates/fff-nvim/src/bin/bench_grep_query.rs index 5f2833a1..cc241b42 100644 --- a/crates/fff-nvim/src/bin/bench_grep_query.rs +++ b/crates/fff-nvim/src/bin/bench_grep_query.rs @@ -38,7 +38,15 @@ fn run_grep(files: &[fff::FileItem], index: Option<&fff::BigramFilter>, query: & for i in 0..iters { let t = Instant::now(); - let result = grep_search(files, &parsed, &options, &budget, index, None, None); + let result = grep_search( + files, + &parsed, + &options, + &budget, + index.map(|b| b as &dyn fff::BigramQuery), + None, + None, + ); let us = t.elapsed().as_micros(); times_us.push(us); diff --git a/crates/fff-nvim/src/bin/grep_profiler.rs b/crates/fff-nvim/src/bin/grep_profiler.rs index b6ce231d..fa95c3a9 100644 --- a/crates/fff-nvim/src/bin/grep_profiler.rs +++ b/crates/fff-nvim/src/bin/grep_profiler.rs @@ -159,7 +159,7 @@ impl<'a> GrepBench<'a> { &parsed, &self.options, &ContentCacheBudget::default(), - self.bigram_index, + self.bigram_index.map(|b| b as &dyn 
fff::BigramQuery), None, None, );