Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions ordvec-python/python/ordvec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@
caller-supplied data, validate or sandbox the path first, exactly as you would
before a bare ``open()``.

Threading: ``search`` / ``search_asymmetric`` / ``add`` and the candidate
generators release the GIL during the heavy Rust scan, so other Python threads
run concurrently. The input arrays are *read in place* (not copied) for that
window — do not mutate an array from another thread while a call that received
it is in progress, or the scan races the write and may return inconsistent
results. This is the standard contract for GIL-releasing numeric extensions
(NumPy itself behaves this way).
Threading: ``search`` / ``search_asymmetric`` / ``add`` and the dense scoring /
candidate generator methods release the GIL during the heavy Rust scan, so other
Python threads run concurrently. The input arrays are *read in place* (not
copied) for that window — do not mutate an array from another thread while a
call that received it is in progress, or the scan races the write and may return
inconsistent results. This is the standard contract for GIL-releasing numeric
extensions (NumPy itself behaves this way).
"""

from ._ordvec import (
Expand Down
44 changes: 44 additions & 0 deletions ordvec-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,50 @@ impl SignBitmap {
.into_pyarray(py))
}

/// Score every indexed document against a single query.
///
/// `query` is converted through `as_f32_1d`, which is given the index
/// dimension as the expected length. The result is a 1-D `uint32` array of
/// sign-agreement scores with one entry per document id (`len(index)` total).
/// The Rust scan itself runs inside `py.detach`, so the GIL is released for
/// its duration.
///
/// Raises `ValueError` when the converted array is not C-contiguous.
fn score_all<'py>(
    &self,
    py: Python<'py>,
    query: &Bound<'py, PyAny>,
) -> PyResult<Bound<'py, PyArray1<u32>>> {
    let expected_dim = self.inner.dim();
    let query_f32 = as_f32_1d(query, Some(expected_dim))?;
    let view = query_f32.as_array();
    // `as_slice` only succeeds for a contiguous, standard-layout view; anything
    // else is reported to Python instead of being copied silently.
    let Some(values) = view.as_slice() else {
        return Err(pyo3::exceptions::PyValueError::new_err(
            "array must be C-contiguous; call np.ascontiguousarray() first",
        ));
    };
    let dense = py.detach(|| self.inner.score_all(values));
    Ok(dense.into_pyarray(py))
}

/// Score every indexed document against each row of a query batch.
///
/// `queries` is converted through `as_f32_2d` using the index dimension. The
/// result is a 2-D `uint32` array of shape `(batch, len(index))`: row `i`
/// holds the sign-agreement scores of query `i`, columns follow document id.
/// The Rust scan runs inside `py.detach`, so the GIL is released for its
/// duration.
///
/// Raises `ValueError` when the converted array is not C-contiguous, or when
/// `batch * max(len(index), dim / 64)` does not fit in `usize`.
fn score_all_batched<'py>(
    &self,
    py: Python<'py>,
    queries: &Bound<'py, PyAny>,
) -> PyResult<Bound<'py, PyArray2<u32>>> {
    let dim = self.inner.dim();
    let queries_f32 = as_f32_2d(queries, dim)?;
    let view = queries_f32.as_array();
    let rows = view.nrows();
    let Some(values) = view.as_slice() else {
        return Err(pyo3::exceptions::PyValueError::new_err(
            "array must be C-contiguous; call np.ascontiguousarray() first",
        ));
    };

    // Reject sizes whose buffer lengths would overflow before handing off to
    // the core scan, which treats such an overflow as a panic rather than a
    // Python exception. Both the query-bitmap buffer (`rows * dim / 64`) and
    // the score buffer (`rows * cols`) are covered by taking the larger factor.
    let cols = self.inner.len();
    let words_per_vec = dim / 64;
    if rows.checked_mul(cols.max(words_per_vec)).is_none() {
        return Err(pyo3::exceptions::PyValueError::new_err(
            "batch * index size overflows usize",
        ));
    }

    let flat = py.detach(|| self.inner.score_all_batched_flat(values));
    // The flat buffer is either `rows * cols` long or empty when one of the
    // two extents is zero, so it always matches the requested shape.
    let grid = numpy::ndarray::Array2::from_shape_vec((rows, cols), flat)
        .expect("internal: batched dense score flatten shape invariant");
    Ok(grid.into_pyarray(py))
}
Comment thread
Fieldnote-Echo marked this conversation as resolved.

/// Build the query-side sign bitmap from an FP32 query, returned as a 1-D
/// `uint64` array of `dim / 64` words (`bit j` set iff `q[j] > 0`).
fn build_query_bitmap<'py>(
Expand Down
44 changes: 44 additions & 0 deletions ordvec-python/tests/test_sign_bitmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ def unit_vectors(n: int, dim: int, seed: int = 0) -> np.ndarray:
return v


def sign_agreement_reference(vectors: np.ndarray, query: np.ndarray) -> np.ndarray:
    """NumPy reference for sign agreement.

    For each row of ``vectors``, count the coordinates whose "strictly
    positive" flag equals that of the same coordinate in ``query``. Returns
    the per-row counts as ``uint32``.
    """
    doc_positive = vectors > 0.0
    query_positive = query > 0.0
    agreement = doc_positive == query_positive
    return agreement.sum(axis=1).astype(np.uint32)


def test_new_reports_dim_and_is_empty():
idx = SignBitmap(dim=128)
assert idx.dim == 128
Expand Down Expand Up @@ -118,6 +122,46 @@ def test_empty_batch_against_empty_index_yields_zero_columns():
assert batched.shape == (0, 0)


def test_score_all_shape_dtype_and_reference_values():
    """``score_all`` returns one ``uint32`` per document, equal to the NumPy reference."""
    n_docs, dim = 25, 128
    corpus = unit_vectors(n_docs, dim, seed=21)
    probe = unit_vectors(1, dim, seed=22)[0]

    idx = SignBitmap(dim=dim)
    idx.add(corpus)
    got = idx.score_all(probe)

    assert got.shape == (n_docs,)
    assert got.dtype == np.uint32
    want = sign_agreement_reference(corpus, probe)
    np.testing.assert_array_equal(got, want)


def test_score_all_batched_shape_and_matches_single_query():
    """Each row of ``score_all_batched`` equals ``score_all`` for that query."""
    n_docs, n_queries, dim = 30, 4, 128
    corpus = unit_vectors(n_docs, dim, seed=23)
    probes = unit_vectors(n_queries, dim, seed=24)

    idx = SignBitmap(dim=dim)
    idx.add(corpus)
    got = idx.score_all_batched(probes)

    assert got.shape == (n_queries, n_docs)
    assert got.dtype == np.uint32

    # Build the expectation one query at a time through the single-query path.
    per_query = []
    for probe in probes:
        per_query.append(idx.score_all(probe))
    np.testing.assert_array_equal(got, np.vstack(per_query))


def test_score_all_empty_shapes():
    """Result shapes stay well-formed when the index or the batch is empty."""
    dim = 64
    idx = SignBitmap(dim=dim)

    # Empty index: zero documents to score, for one query or several.
    single = unit_vectors(1, dim, seed=25)[0]
    assert idx.score_all(single).shape == (0,)

    batch = unit_vectors(3, dim, seed=26)
    assert idx.score_all_batched(batch).shape == (3, 0)

    # Populated index, empty batch: zero rows, but one column per document.
    idx.add(unit_vectors(5, dim, seed=27))
    no_rows = np.empty((0, dim), dtype=np.float32)
    assert idx.score_all_batched(no_rows).shape == (0, 5)


def test_dim_not_multiple_of_64_rejected():
# The binding validates that dim is a positive multiple of 64 -> ValueError.
with pytest.raises(ValueError, match="multiple of 64"):
Expand Down
166 changes: 166 additions & 0 deletions src/sign_bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,83 @@ impl SignBitmap {
.collect()
}

/// Compute a dense sign-agreement score for every indexed document against
/// a single query.
///
/// The returned vector has one entry per document, in document-id order,
/// where `scores[di] = dim - popcount(q_bits ^ doc_bits[di])`; a larger
/// value means more coordinates share the query's sign. Nothing is selected
/// or sorted here: this is a full-corpus scoring primitive, not a top-k
/// retrieval helper.
///
/// # Panics
///
/// Panics if the index dimension does not fit in `u32`.
#[must_use = "this scans the corpus to score every document; dropping the result discards that work"]
pub fn score_all(&self, q: &[f32]) -> Vec<u32> {
    let query_bits = self.build_query_bitmap(q);

    // The scan writes Hamming distances into the buffer; they are converted
    // to agreement counts in place below.
    let mut out = vec![0u32; self.n_vectors];
    sign_scan_collect(
        &self.bitmaps,
        self.n_vectors,
        self.qwords_per_vec,
        &query_bits,
        &mut out,
    );

    let total_bits = u32::try_from(self.dim).expect("sign bitmap dim fits u32");
    out.par_iter_mut().for_each(|slot| {
        let hamming = *slot;
        *slot = total_bits - hamming;
    });
    out
}

/// Batched dense scoring into a single flat buffer.
///
/// `queries` is a row-major concatenation of `batch` queries of `dim`
/// floats each. The result is row-major with length `batch * len(index)`:
/// row `bi` holds the sign-agreement scores of query `bi`, columns follow
/// document id, and nothing is sorted. An empty vector is returned when
/// either the batch or the index is empty.
///
/// # Panics
///
/// Panics if `queries.len()` is not a multiple of the index dimension, if
/// `batch * qwords_per_vec` or `batch * n_vectors` overflows `usize`, or if
/// the index dimension does not fit in `u32`.
#[must_use = "this scans the corpus to score every document per query; dropping the result discards that work"]
pub fn score_all_batched_flat(&self, queries: &[f32]) -> Vec<u32> {
    let dim = self.dim;
    let batch = queries.len() / dim;
    assert_eq!(queries.len(), batch * dim);
    if batch == 0 {
        return Vec::new();
    }
    let doc_count = self.n_vectors;
    let words_per_query = self.qwords_per_vec;

    // Pack every query's sign bitmap back to back so the batched scan can
    // address query `bi` at word offset `bi * words_per_query`.
    let packed_len = batch
        .checked_mul(words_per_query)
        .expect("batched query-bitmap buffer length (batch * qpv) overflows usize");
    let mut packed_queries = vec![0u64; packed_len];
    for (bi, query) in queries.chunks_exact(dim).enumerate() {
        let start = bi * words_per_query;
        packed_queries[start..start + words_per_query]
            .copy_from_slice(&self.build_query_bitmap(query));
    }

    if doc_count == 0 {
        return Vec::new();
    }

    // The scan writes Hamming distances; they are converted to agreement
    // counts in place afterwards.
    let out_len = batch
        .checked_mul(doc_count)
        .expect("batched dense score buffer length (batch * n) overflows usize");
    let mut out = vec![0u32; out_len];
    sign_scan_collect_batched(
        &self.bitmaps,
        doc_count,
        words_per_query,
        &packed_queries,
        batch,
        &mut out,
    );

    let total_bits = u32::try_from(dim).expect("sign bitmap dim fits u32");
    out.par_chunks_mut(doc_count).for_each(|row| {
        for slot in row.iter_mut() {
            *slot = total_bits - *slot;
        }
    });
    out
}

/// Batched dense scoring with one owned row per query.
///
/// Delegates to [`Self::score_all_batched_flat`] and splits the flat buffer
/// into `batch` vectors of `len(index)` sign-agreement scores each, with
/// columns in document-id order and no sorting. For an empty index every
/// query still gets a (zero-length) row.
///
/// # Panics
///
/// Panics if `queries.len()` is not a multiple of the index dimension.
#[must_use = "this scans the corpus to score every document per query; dropping the result discards that work"]
pub fn score_all_batched(&self, queries: &[f32]) -> Vec<Vec<u32>> {
    let query_count = queries.len() / self.dim;
    assert_eq!(queries.len(), query_count * self.dim);

    let flat = self.score_all_batched_flat(queries);
    match self.n_vectors {
        // The flat buffer is empty here, so chunking it would lose the row
        // count; emit one empty row per query instead.
        0 => vec![Vec::new(); query_count],
        doc_count => flat.chunks(doc_count).map(<[u32]>::to_vec).collect(),
    }
}
Comment thread
Fieldnote-Echo marked this conversation as resolved.

/// Returns the number of vectors held by the index (the `n_vectors` field).
///
/// This is also the length of the score vector produced by
/// [`Self::score_all`], which allocates one slot per stored vector.
pub fn len(&self) -> usize {
self.n_vectors
}
Expand Down Expand Up @@ -562,6 +639,95 @@ mod tests {
}
}

#[test]
fn score_all_returns_sign_agreement_by_doc_id() {
    // Fixture: 37 seeded documents and one seeded query with components
    // drawn from [-1, 1).
    let n = 37;
    let mut idx = SignBitmap::new(D);
    idx.add(&make_corpus(27, n));
    let mut rng = ChaCha8Rng::seed_from_u64(28);
    let query: Vec<f32> = std::iter::repeat_with(|| rng.random_range(-1.0..1.0))
        .take(D)
        .collect();

    let scores = idx.score_all(&query);
    assert_eq!(scores.len(), n);

    // Recompute every score from the stored bitmaps with the scalar Hamming
    // helper and compare position by position.
    let qbm = idx.build_query_bitmap(&query);
    let words = idx.qwords_per_vec;
    for (di, got) in scores.iter().copied().enumerate() {
        let start = di * words;
        let doc_bits = &idx.bitmaps[start..start + words];
        let expected = D as u32 - scalar_hamming(&qbm, doc_bits);
        assert_eq!(
            got, expected,
            "score_all must return sign agreement for doc {di}",
        );
    }
}

#[test]
fn score_all_batched_matches_single_query() {
    // Fixture: 75 seeded documents and a batch of 6 seeded queries.
    let n = 75;
    let batch = 6;
    let mut idx = SignBitmap::new(D);
    idx.add(&make_corpus(29, n));
    let mut rng = ChaCha8Rng::seed_from_u64(30);
    let queries: Vec<f32> = std::iter::repeat_with(|| rng.random_range(-1.0..1.0))
        .take(batch * D)
        .collect();

    let batched = idx.score_all_batched(&queries);
    assert_eq!(batched.len(), batch);

    // Every batched row must equal the single-query result for that query.
    for (bi, query) in queries.chunks_exact(D).enumerate() {
        let single = idx.score_all(query);
        assert_eq!(
            batched[bi], single,
            "batched dense scoring diverged at batch idx {bi}",
        );
    }
}
Comment thread
Fieldnote-Echo marked this conversation as resolved.

#[test]
fn score_all_batched_flat_matches_single_query() {
    // Fixture: 75 seeded documents and a batch of 6 seeded queries.
    let n = 75;
    let batch = 6;
    let mut idx = SignBitmap::new(D);
    idx.add(&make_corpus(31, n));
    let mut rng = ChaCha8Rng::seed_from_u64(32);
    let queries: Vec<f32> = std::iter::repeat_with(|| rng.random_range(-1.0..1.0))
        .take(batch * D)
        .collect();

    let batched = idx.score_all_batched_flat(&queries);
    assert_eq!(batched.len(), batch * n);

    // Row `bi` of the flat buffer spans `n` scores and must equal the
    // single-query result for query `bi`.
    let rows = batched.chunks_exact(n);
    let per_query = queries.chunks_exact(D);
    for (bi, (row, query)) in rows.zip(per_query).enumerate() {
        assert_eq!(
            row,
            idx.score_all(query),
            "flat batched dense scoring diverged at batch idx {bi}",
        );
    }
}

#[test]
fn score_all_empty_shapes() {
    // Empty index, non-empty input: no documents to score.
    let empty_index = SignBitmap::new(D);
    let one_query = vec![1.0f32; D];
    assert_eq!(empty_index.score_all(&one_query).len(), 0);

    let two_queries = vec![1.0f32; 2 * D];
    assert_eq!(empty_index.score_all_batched_flat(&two_queries).len(), 0);
    // The nested form still reports one (empty) row per query.
    assert_eq!(
        empty_index.score_all_batched(&two_queries),
        vec![Vec::<u32>::new(); 2]
    );

    // Empty index, empty batch.
    let no_queries = Vec::<f32>::new();
    assert_eq!(empty_index.score_all_batched_flat(&no_queries).len(), 0);
    assert_eq!(empty_index.score_all_batched(&no_queries).len(), 0);

    // Populated index, empty batch: still nothing to return.
    let mut populated = SignBitmap::new(D);
    populated.add(&make_corpus(33, 5));
    assert_eq!(populated.score_all_batched_flat(&no_queries).len(), 0);
    assert_eq!(populated.score_all_batched(&no_queries).len(), 0);
}
Comment thread
Fieldnote-Echo marked this conversation as resolved.

#[test]
fn large_dim_above_u16_max_roundtrips() {
// Regression for the Codex stop-time finding: SignBitmap::new
Expand Down
Loading