Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 82 additions & 5 deletions crates/fff-core/src/bigram_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@ use std::sync::atomic::{AtomicU16, AtomicU64, AtomicUsize, Ordering};

use ahash::AHashMap;

/// Query interface for bigram-based file filtering.
///
/// Implemented by `BigramFilter` (owned) and can be implemented by external
/// types (e.g. a zero-copy mmap-backed index).
///
/// Both methods take `&self` and are non-generic, so the trait can be used
/// through a trait object (`&dyn BigramQuery`).
pub trait BigramQuery {
/// AND the posting lists for all query bigrams.
///
/// `pattern` is the raw query as bytes.
///
/// Returns `None` if the pattern is too short (the `BigramFilter`
/// implementation requires at least two bytes) or no bigrams are tracked.
///
/// NOTE(review): the returned `Vec<u64>` looks like a bitset of candidate
/// files that callers probe with `BigramFilter::is_candidate` — confirm the
/// exact bit layout against that helper.
fn query(&self, pattern: &[u8]) -> Option<Vec<u64>>;

/// Whether the index has been populated with at least one file.
///
/// Callers can use this to check for an unpopulated index before querying.
fn is_ready(&self) -> bool;
}

/// Maximum number of distinct bigrams tracked in the inverted index.
/// 95 printable ASCII chars (32..=126) after lowercasing → ~70 distinct → 4900 possible.
/// We cap at 5000 to cover all printable bigrams with margin.
Expand Down Expand Up @@ -269,10 +282,20 @@ fn bitset_and(result: &mut [u64], bitset: &[u64]) {
.for_each(|(r, b)| *r &= *b);
}

/// `BigramQuery` implementation for the owned `BigramFilter`.
///
/// Both methods are thin adapters over the filter's own state; no extra work
/// is done at the trait boundary.
impl BigramQuery for BigramFilter {
/// Forwards to the inherent `query_inner`, so trait callers and in-module
/// callers get the same result for the same pattern.
fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
self.query_inner(pattern)
}

/// Returns `true` when the `populated` field is greater than zero.
fn is_ready(&self) -> bool {
self.populated > 0
}
}

impl BigramFilter {
/// AND the posting lists for all query bigrams (consecutive + skip).
/// Returns `None` if the pattern is shorter than two bytes or if no query bigrams are tracked.
pub fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
fn query_inner(&self, pattern: &[u8]) -> Option<Vec<u64>> {
if pattern.len() < 2 {
return None;
}
Expand Down Expand Up @@ -360,10 +383,6 @@ impl BigramFilter {
candidates.iter().map(|w| w.count_ones() as usize).sum()
}

/// Returns `true` when the `populated` field is greater than zero.
///
/// NOTE(review): presumably `populated` counts the entries written into the
/// index, so `false` means nothing has been indexed yet — confirm against the
/// constructor.
pub fn is_ready(&self) -> bool {
self.populated > 0
}

/// Returns the value of the `file_count` field.
///
/// NOTE(review): presumably the number of files the index was built for —
/// confirm against the constructor.
pub fn file_count(&self) -> usize {
self.file_count
}
Expand Down Expand Up @@ -562,3 +581,61 @@ impl BigramOverlay {
}
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the shared three-file fixture index.
    ///
    /// Files 0 and 2 contain the text `hello`; file 1 does not.
    fn build_test_index() -> BigramFilter {
        let consecutive = BigramIndexBuilder::new(3);
        let skip = BigramIndexBuilder::new(3);

        consecutive.add_file_content(&skip, 0, b"fn main() { hello_world(); }");
        consecutive.add_file_content(&skip, 1, b"struct Foo { bar: i32 }");
        consecutive.add_file_content(&skip, 2, b"fn test_hello() { assert!(true); }");

        consecutive.compress(None)
    }

    /// The trait method must return exactly what the inherent query returns.
    #[test]
    fn test_bigram_query_trait_matches_inherent() {
        let filter = build_test_index();

        let via_inherent = filter.query_inner(b"hello");
        let via_trait = BigramQuery::query(&filter, b"hello");

        assert_eq!(via_inherent, via_trait);
    }

    /// The trait must be usable through a `&dyn BigramQuery` trait object.
    #[test]
    fn test_bigram_query_via_dyn() {
        let filter = build_test_index();
        let query: &dyn BigramQuery = &filter;

        assert!(query.is_ready());

        let hits = query.query(b"hello").expect("should match");

        // Files 0 and 2 contain "hello" and must survive the filter.
        for matching_file in [0, 2] {
            assert!(BigramFilter::is_candidate(&hits, matching_file));
        }
        // File 1 has no "hello" and must be filtered out.
        assert!(!BigramFilter::is_candidate(&hits, 1));
    }

    /// A filter constructed with no populated entries reports not-ready.
    #[test]
    fn test_bigram_query_not_ready() {
        let unpopulated = BigramFilter::new(vec![NO_COLUMN; 65536], vec![], 0, 0, 0, 0);
        let ready = BigramQuery::is_ready(&unpopulated);

        assert!(!ready);
    }

    /// Patterns shorter than one bigram (a single byte, or empty) yield `None`.
    #[test]
    fn test_bigram_query_short_pattern() {
        let filter = build_test_index();
        let query: &dyn BigramQuery = &filter;

        let too_short: [&[u8]; 2] = [b"x", b""];
        for pattern in too_short {
            assert!(query.query(pattern).is_none());
        }
    }
}
Loading
Loading