From 68cf532ba9b70423e363e5e2632a40b9e27a6eea Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Thu, 4 Jun 2026 01:14:25 -0500 Subject: [PATCH 1/3] Tighten subset ordering contract coverage Signed-off-by: Nelson Spence --- docs/RANK_MODES.md | 3 +++ fuzz/fuzz_targets/search_rankquant.rs | 26 +++++++++++-------- .../signbitmap_rankquant_twostage.rs | 18 +++++++++++-- ordvec-ffi/include/ordvec.h | 9 +++++++ ordvec-ffi/src/lib.rs | 7 +++++ ordvec-go/README.md | 4 +++ ordvec-go/doc.go | 4 +++ ordvec-go/ordvec.go | 4 +++ ordvec-python/src/lib.rs | 11 +++++--- ordvec-python/tests/test_rank_quant.py | 19 ++++++++++++-- src/quant.rs | 3 +++ tests/index/two_stage.rs | 24 +++++++++++------ 12 files changed, 106 insertions(+), 26 deletions(-) diff --git a/docs/RANK_MODES.md b/docs/RANK_MODES.md index f36fed1..f95b4ac 100644 --- a/docs/RANK_MODES.md +++ b/docs/RANK_MODES.md @@ -427,6 +427,9 @@ serialisers living in [`src/rank_io.rs`](../src/rank_io.rs) and [`src/sign_bitmap.rs`](../src/sign_bitmap.rs). `RankQuant` additionally exposes `search_asymmetric_subset` for scoring a precomputed candidate set — the rerank half of the two-stage pattern. +Candidate IDs are global row ordinals; duplicate candidates are scored as +separate entries and can produce duplicate hits, so callers that need +unique output rows should deduplicate candidate lists before reranking. `RankQuantFastscan` (re-exported `#[doc(hidden)]`) is an optional single-pass b=2 fast path; it supports `add`/`search` but not diff --git a/fuzz/fuzz_targets/search_rankquant.rs b/fuzz/fuzz_targets/search_rankquant.rs index 6c7b386..cfddcae 100644 --- a/fuzz/fuzz_targets/search_rankquant.rs +++ b/fuzz/fuzz_targets/search_rankquant.rs @@ -9,8 +9,8 @@ //! huge value. Invalid dimensions, non-finite floats, and ragged vector lengths //! are caller contract violations, so this target avoids them and treats any //! panic as a compute-path bug. Assertions stay structural: shape, finite -//! 
scores, valid doc IDs, score-descending rows, and repeat determinism in one -//! process. +//! scores, valid doc IDs, score-descending/doc-ID-ascending rows, and repeat +//! determinism in one process. #![no_main] use libfuzzer_sys::{ @@ -105,15 +105,19 @@ fn assert_results(label: &str, res: &SearchResults, nq: usize, k_eff: usize, n: "{label}: doc id {id} out of range for n={n} at query {qi} slot {slot}", ); } - for slot in 1..k_eff { - let prev = (scores[slot - 1], ids[slot - 1]); - let cur = (scores[slot], ids[slot]); - assert!( - cur.0 <= prev.0, - "{label}: row {qi} not sorted at slots {} and {slot}", - slot - 1, - ); - } + assert_score_then_id_order(label, qi, scores, ids); + } +} + +fn assert_score_then_id_order(label: &str, qi: usize, scores: &[f32], ids: &[i64]) { + for slot in 1..scores.len() { + let prev = (scores[slot - 1], ids[slot - 1]); + let cur = (scores[slot], ids[slot]); + assert!( + cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1), + "{label}: row {qi} violates score-desc/doc-id-asc order at slots {} and {slot}", + slot - 1, + ); } } diff --git a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs index 3a45c4b..776e57d 100644 --- a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs +++ b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs @@ -11,7 +11,9 @@ //! reranking agrees with a full RankQuant search. //! //! Contract: no panic, abort, or out-of-bounds access on any in-range candidate -//! input, and full-corpus candidate reranking must match full RankQuant search. +//! input, subset reranking must preserve score-descending/doc-ID-ascending +//! ordering, and full-corpus candidate reranking must match full RankQuant +//! search. 
#![no_main] use libfuzzer_sys::{ @@ -31,6 +33,18 @@ struct TwoStageInput { payload: Vec, } +fn assert_score_then_id_order(scores: &[f32], ids: &[i64]) { + for slot in 1..scores.len() { + let prev = (scores[slot - 1], ids[slot - 1]); + let cur = (scores[slot], ids[slot]); + assert!( + cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1), + "subset rerank violates score-desc/doc-id-asc order at slots {} and {slot}", + slot - 1, + ); + } +} + impl<'a> Arbitrary<'a> for TwoStageInput { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let dim = *u.choose(&[64usize, 128, 256, 512])?; @@ -108,7 +122,7 @@ fuzz_target!(|input: TwoStageInput| { assert_eq!(scores.len(), k_eff); assert_eq!(ids.len(), k_eff); assert!(scores.iter().all(|score| score.is_finite())); - assert!(scores.windows(2).all(|pair| pair[0] >= pair[1])); + assert_score_then_id_order(&scores, &ids); for &id in &ids { assert!(id >= 0); assert!(subset_candidates.contains(&(id as u32))); diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 36f34cb..b02bc6f 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -52,6 +52,10 @@ typedef struct { const float *query; uint64_t dim; uint64_t k; + /** + * Optional subset row IDs. These are entry lists, not sets: duplicate + * candidates are scored independently and can produce duplicate hits. + */ const uint32_t *candidate_rows; uint64_t candidate_count; uint64_t flags; @@ -224,6 +228,11 @@ void ordvec_index_free(ordvec_index_t *index); /** * Run a synchronous single-query search. * + * When `params.candidate_rows` is supplied, those IDs are global row ordinals + * and may be unsorted or duplicated. Duplicate candidates are scored as + * separate entries and can produce duplicate hits; callers that need unique + * output rows must deduplicate before calling. + * * # Safety * * `index` must be a live handle returned by `ordvec_index_load`. 
All non-null diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 6b35d48..d1f0097 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -73,6 +73,8 @@ pub struct ordvec_search_params_t { pub query: *const f32, pub dim: u64, pub k: u64, + /// Optional subset row IDs. These are entry lists, not sets: duplicate + /// candidates are scored independently and can produce duplicate hits. pub candidate_rows: *const u32, pub candidate_count: u64, pub flags: u64, @@ -871,6 +873,11 @@ pub unsafe extern "C" fn ordvec_index_free(index: *mut ordvec_index_t) { #[no_mangle] /// Run a synchronous single-query search. /// +/// When `params.candidate_rows` is supplied, those IDs are global row ordinals +/// and may be unsorted or duplicated. Duplicate candidates are scored as +/// separate entries and can produce duplicate hits; callers that need unique +/// output rows must deduplicate before calling. +/// /// # Safety /// /// `index` must be a live handle returned by `ordvec_index_load`. All non-null diff --git a/ordvec-go/README.md b/ordvec-go/README.md index 8fde2ef..15cdb4c 100644 --- a/ordvec-go/README.md +++ b/ordvec-go/README.md @@ -19,3 +19,7 @@ Search with `nil` options or `nil` `SearchOptions.Candidates` performs a full search. An empty, non-nil `Candidates` slice is treated as an explicit empty subset and returns a typed `StatusBadArgument`, matching the C ABI v1 pointer/count contract. + +`SearchOptions.Candidates` is an entry list of global row ordinals, not a set. +Duplicate candidates are scored independently and can produce duplicate hits; +deduplicate before searching when unique row IDs are required. diff --git a/ordvec-go/doc.go b/ordvec-go/doc.go index fb79eff..eaf3458 100644 --- a/ordvec-go/doc.go +++ b/ordvec-go/doc.go @@ -6,4 +6,8 @@ // Search pins and passes caller-owned query and candidate slices to the C ABI // without copying them. Callers must not mutate those slices until Search // returns. 
+// +// Candidate slices are entry lists, not sets. Duplicate candidate IDs are scored +// independently and can produce duplicate hits; callers that require unique row +// IDs should deduplicate before Search. package ordvec diff --git a/ordvec-go/ordvec.go b/ordvec-go/ordvec.go index 1566cb9..820b678 100644 --- a/ordvec-go/ordvec.go +++ b/ordvec-go/ordvec.go @@ -145,6 +145,10 @@ type Stats struct { } type SearchOptions struct { + // Candidates restricts the search to these global row ordinals. It is an + // entry list, not a set: duplicate candidates are scored independently and + // can produce duplicate hits. Deduplicate before calling if unique rows are + // required. Candidates []uint32 UserTag uint64 } diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index ce03c29..e6d7365 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -731,9 +731,14 @@ impl RankQuant { /// Asymmetric scoring restricted to a candidate subset (e.g. the top-M /// shortlist from a [`Bitmap`] or [`SignBitmap`] probe). Returns /// ``(scores, global_ids)`` where ``global_ids`` are the original doc - /// indices (mapped from the local candidate slot); slots that could not be - /// filled are returned as ``-1``. Uses the same AVX-512 → AVX2 → scalar - /// dispatch as ``search_asymmetric``. + /// indices (mapped from the local candidate slot). ``k`` is capped to the + /// candidate-list length; the subset path does not add sentinel padding. + /// Uses the same AVX-512 → AVX2 → scalar dispatch as ``search_asymmetric``. + /// + /// ``candidates`` may be unsorted and may contain duplicates. Duplicate + /// candidate IDs are scored as separate entries and can produce duplicate + /// hits; callers that require unique row IDs should deduplicate before + /// calling. 
/// /// If the shortlist came from [`Bitmap`], this is the exact RankQuant /// rerank stage over that survivor set; it does not itself apply or diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index 21bf830..4e97ca8 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -310,8 +310,9 @@ def test_search_asymmetric_subset_returns_global_ids(): assert ids.dtype == np.int64 # Self-query against a candidate set containing self → top-1 is self. assert int(ids[0]) == 0 - # All returned ids are from the candidate set (or sentinel -1). - candidate_set = set(candidates.tolist()) | {-1} + # All returned ids are from the candidate set; k is capped instead of + # sentinel-padding unfilled slots. + candidate_set = set(candidates.tolist()) for i in ids: assert int(i) in candidate_set @@ -347,6 +348,20 @@ def test_search_asymmetric_subset_ties_use_global_row_ids(): np.testing.assert_array_equal(scores, np.array([0.0, 0.0], dtype=np.float32)) +def test_search_asymmetric_subset_duplicates_remain_duplicate_entries(): + vectors = np.ones((12, 64), dtype=np.float32) + idx = RankQuant(dim=64, bits=2) + idx.add(vectors) + + candidates = np.array([7, 8, 7], dtype=np.uint32) + scores, ids = idx.search_asymmetric_subset( + np.zeros(64, dtype=np.float32), candidates, k=2 + ) + + np.testing.assert_array_equal(ids, np.array([7, 7], dtype=np.int64)) + np.testing.assert_array_equal(scores, np.array([0.0, 0.0], dtype=np.float32)) + + def test_search_asymmetric_subset_k_caps_at_candidate_count(): # k > len(candidates) should silently cap — no panic, no sentinel # padding beyond the candidate-set size. diff --git a/src/quant.rs b/src/quant.rs index f770043..87f0087 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -539,6 +539,9 @@ impl RankQuant { /// to global IDs before returning). 
Results are ordered by score /// descending, then global row ID ascending, matching the full-index /// search tie policy even when `candidates` is unsorted. + /// Duplicate candidate IDs are scored as separate entries and can + /// produce duplicate hits; callers that require unique row IDs should + /// deduplicate before calling. /// /// Uses the same AVX-512 → AVX2 → scalar dispatch as /// [`Self::search_asymmetric`] and the same centre-drop math, just diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs index d434234..cedbf67 100644 --- a/tests/index/two_stage.rs +++ b/tests/index/two_stage.rs @@ -19,6 +19,18 @@ fn assert_two_stage_invariants(sign: &SignBitmap, rankquant: &RankQuant) { assert_eq!(sign.len(), N); } +fn assert_score_then_id_order(scores: &[f32], ids: &[i64]) { + for slot in 1..scores.len() { + let prev = (scores[slot - 1], ids[slot - 1]); + let cur = (scores[slot], ids[slot]); + assert!( + cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1), + "results violate score-desc/doc-id-asc order at slots {} and {slot}", + slot - 1, + ); + } +} + #[test] fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() { let (sign, rankquant, _corpus) = build_two_stage(2); @@ -51,6 +63,7 @@ fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() { assert_eq!(scores.len(), shortlist.len()); assert_eq!(ids.len(), shortlist.len()); assert!(ids.iter().all(|&id| shortlist.contains(&(id as u32)))); + assert_score_then_id_order(&scores, &ids); } #[test] @@ -64,15 +77,10 @@ fn sign_rankquant_full_candidate_set_matches_full_rankquant_search() { let full = rankquant.search_asymmetric(query, 16); let (subset_scores, subset_ids) = rankquant.search_asymmetric_subset(query, &candidates, 16); - assert!(subset_ids - .iter() - .all(|&id| candidates.contains(&(id as u32)))); + assert_eq!(subset_ids, full.indices_for_query(0)); assert_eq!(subset_scores.len(), full.scores_for_query(0).len()); - let mut subset_scores_sorted = subset_scores; - let mut 
full_scores_sorted = full.scores_for_query(0).to_vec(); - subset_scores_sorted.sort_by(|left, right| left.total_cmp(right)); - full_scores_sorted.sort_by(|left, right| left.total_cmp(right)); - for (subset, full) in subset_scores_sorted.iter().zip(&full_scores_sorted) { + assert_score_then_id_order(&subset_scores, &subset_ids); + for (subset, full) in subset_scores.iter().zip(full.scores_for_query(0)) { assert!( (subset - full).abs() <= 1e-6, "subset score {subset} diverged from full score {full}" From 14392b3fce3012eb7674d7fd1a5f7130338cad56 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Thu, 4 Jun 2026 09:02:03 -0500 Subject: [PATCH 2/3] Fix visible RankQuant tie ordering after score offsets Signed-off-by: Nelson Spence --- src/quant.rs | 35 +++++++++++------------------------ src/util.rs | 32 +++++++++++++++++++++++++++++++- tests/index/two_stage.rs | 29 +++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/src/quant.rs b/src/quant.rs index 87f0087..285fb10 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -397,25 +397,16 @@ impl RankQuant { &mut top, ); - top.finalize_into(out_scores, out_indices); - if centre_drop_used { // The asym kernels drop the per-lane `- centre` term from - // the hot loop; it is a query-constant shift, re-applied - // here. Guarded by `is_finite` so it lands only on filled - // slots: when fewer than `k` docs were scored the trailing - // top-k positions stay at the `f32::NEG_INFINITY` sentinel, - // and `NEG_INFINITY + offset` would wrongly turn a sentinel - // into a finite score. (Real scores are always finite — the - // finite-input policy guarantees it — so the guard only ever - // skips sentinels, never a genuine result.) + // the hot loop; apply the query-constant shift before the + // final visible-score sort so rounding-collapse ties still + // use the public row-id tie key. 
let q_sum: f32 = q_unit.iter().sum(); let offset = -centre * q_sum * inv_norm; - for s in out_scores.iter_mut() { - if s.is_finite() { - *s += offset; - } - } + top.finalize_with_score_offset_into(out_scores, out_indices, offset); + } else { + top.finalize_into(out_scores, out_indices); } let _ = bytes_per_vec; // shape clarity @@ -659,17 +650,13 @@ impl RankQuant { let mut scores = vec![f32::NEG_INFINITY; k_eff]; let mut local_indices = vec![-1i64; k_eff]; - top.finalize_into(&mut scores, &mut local_indices); if centre_drop_used { // Re-apply the per-query centre shift dropped from the kernel hot - // loop; the `is_finite` guard skips unfilled top-k slots (still at - // the `f32::NEG_INFINITY` sentinel) so a sentinel never becomes a - // finite score. See the matching note in `search_asymmetric`. - for s in scores.iter_mut() { - if s.is_finite() { - *s += centre_offset; - } - } + // loop before final sorting so visible-score ties are still ordered + // by global row ID. + top.finalize_with_score_offset_into(&mut scores, &mut local_indices, centre_offset); + } else { + top.finalize_into(&mut scores, &mut local_indices); } // Map local → global doc IDs. let global_indices: Vec = local_indices diff --git a/src/util.rs b/src/util.rs index 0229f72..7055731 100644 --- a/src/util.rs +++ b/src/util.rs @@ -474,6 +474,22 @@ impl TopK { /// user-requested `k`; positions beyond `self.filled` are left as /// sentinels. pub(crate) fn finalize_into(&self, out_scores: &mut [f32], out_indices: &mut [i64]) { + self.finalize_with_score_offset_into(out_scores, out_indices, 0.0); + } + + /// Drain into `out_scores` / `out_indices`, applying a query-constant score + /// offset before the final `(score desc, tie_key asc)` ordering. + /// + /// SIMD RankQuant asymmetric kernels drop a query-constant centre term from + /// the hot loop and re-apply it at finalize time. 
Adding that offset can + /// collapse two distinct finite `f32` scores into one visible output score, + /// so the public tie order must be computed after the offset is applied. + pub(crate) fn finalize_with_score_offset_into( + &self, + out_scores: &mut [f32], + out_indices: &mut [i64], + score_offset: f32, + ) { debug_assert_eq!(out_scores.len(), out_indices.len()); for s in out_scores.iter_mut() { *s = f32::NEG_INFINITY; @@ -488,7 +504,7 @@ impl TopK { .zip(self.tie_keys.iter()) .enumerate() .take(self.filled) - .map(|(slot, ((&s, &i), &tie_key))| (s, i, tie_key, slot)) + .map(|(slot, ((&s, &i), &tie_key))| (s + score_offset, i, tie_key, slot)) .collect(); // Composite key: score descending, then tie key ascending. The kept // slot is only a final deterministic tie-break when duplicate @@ -578,6 +594,20 @@ mod tests { assert_eq!(indices, [0, 1]); } + #[test] + fn topk_offset_sort_uses_visible_score_ties() { + let mut top = TopK::new_with_tie_keys(2, &[10, 5]); + top.maybe_insert(1.0 + f32::EPSILON, 0); + top.maybe_insert(1.0, 1); + + let mut scores = [f32::NEG_INFINITY; 2]; + let mut indices = [-1; 2]; + top.finalize_with_score_offset_into(&mut scores, &mut indices, 100_000_000.0); + + assert_eq!(scores, [100_000_000.0, 100_000_000.0]); + assert_eq!(indices, [1, 0]); + } + #[test] fn checked_new_len_accepts_up_to_max() { use crate::rank_io::MAX_VECTORS; diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs index cedbf67..613ea56 100644 --- a/tests/index/two_stage.rs +++ b/tests/index/two_stage.rs @@ -87,3 +87,32 @@ fn sign_rankquant_full_candidate_set_matches_full_rankquant_search() { ); } } + +#[test] +fn sign_rankquant_subset_orders_visible_ties_after_centre_offset() { + let dim = 128usize; + let n_vectors = 5usize; + let bits = 4u8; + let payload = [ + 158u8, 158, 158, 158, 158, 158, 158, 158, 158, 158, 137, 10, 10, + ]; + let floats: Vec<f32> = (0..((n_vectors + 1) * dim)) + .map(|i| payload[i % payload.len()] as f32 - 128.0) + .collect(); + let 
(corpus, query) = floats.split_at(n_vectors * dim); + + let mut sign = SignBitmap::new(dim); + let mut rankquant = RankQuant::new(dim, bits); + sign.add(corpus); + rankquant.add(corpus); + + let candidates = sign.top_m_candidates(query, n_vectors); + assert_eq!(candidates.len(), n_vectors); + + let (scores, ids) = rankquant.search_asymmetric_subset(query, &candidates, n_vectors + 1); + + assert_eq!(scores.len(), n_vectors); + assert_eq!(ids.len(), n_vectors); + assert!(scores.iter().all(|score| score.is_finite())); + assert_score_then_id_order(&scores, &ids); +} From b0e91a9c62bf1c51c7c236b72273bb41fc7d755b Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Thu, 4 Jun 2026 09:42:11 -0500 Subject: [PATCH 3/3] Fix RankQuant offset eviction ordering Signed-off-by: Nelson Spence --- fuzz/fuzz_targets/search_rankquant.rs | 2 +- src/quant.rs | 56 +++++++++------------------ src/util.rs | 50 ++++++++++++------------ 3 files changed, 44 insertions(+), 64 deletions(-) diff --git a/fuzz/fuzz_targets/search_rankquant.rs b/fuzz/fuzz_targets/search_rankquant.rs index cfddcae..3854a83 100644 --- a/fuzz/fuzz_targets/search_rankquant.rs +++ b/fuzz/fuzz_targets/search_rankquant.rs @@ -114,7 +114,7 @@ fn assert_score_then_id_order(label: &str, qi: usize, scores: &[f32], ids: &[i64 let prev = (scores[slot - 1], ids[slot - 1]); let cur = (scores[slot], ids[slot]); assert!( - cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1), + cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 > prev.1), "{label}: row {qi} violates score-desc/doc-id-asc order at slots {} and {slot}", slot - 1, ); diff --git a/src/quant.rs b/src/quant.rs index 285fb10..aef5038 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -338,10 +338,10 @@ impl RankQuant { #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))] let simd_tier = select_simd_tier(dim, bits); - // For the AVX2 path we drop the per-lane centre subtract from - // the hot loop and add it back as a per-query constant offset - // to the top-k scores 
at finalize time. Ranking is invariant - // to this constant; absolute scores stay exact. + // SIMD asymmetric kernels drop the per-lane centre subtract from the + // hot loop. Apply the query-constant offset before TopK insertion so + // retention and final ordering use the same public visible score key. + #[cfg(target_arch = "x86_64")] let centre = ((1u32 << bits) as f32 - 1.0) / 2.0; queries @@ -351,27 +351,27 @@ impl RankQuant { .for_each(|((q, out_scores), out_indices)| { let q_unit = l2_normalise(q); let mut top = TopK::new(k_eff); - #[cfg_attr(not(target_arch = "x86_64"), allow(unused_mut))] - let mut centre_drop_used = false; + #[cfg(target_arch = "x86_64")] + let centre_offset = -centre * q_unit.iter().sum::<f32>() * inv_norm; #[cfg(target_arch = "x86_64")] unsafe { match (simd_tier, bits) { (SimdTier::Avx512, 2) => { + top.set_score_offset(centre_offset); scan_b2_asym_avx512(&self.packed, n, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx512, 4) => { + top.set_score_offset(centre_offset); scan_b4_asym_avx512(&self.packed, n, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx2, 2) => { + top.set_score_offset(centre_offset); scan_b2_asym_avx2(&self.packed, n, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx2, 4) => { + top.set_score_offset(centre_offset); scan_b4_asym_avx2(&self.packed, n, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } _ => scan_via_lut_scalar( &self.packed, @@ -397,17 +397,7 @@ impl RankQuant { &mut top, ); - if centre_drop_used { - // The asym kernels drop the per-lane `- centre` term from - // the hot loop; apply the query-constant shift before the - // final visible-score sort so rounding-collapse ties still - // use the public row-id tie key. 
- let q_sum: f32 = q_unit.iter().sum(); - let offset = -centre * q_sum * inv_norm; - top.finalize_with_score_offset_into(out_scores, out_indices, offset); - } else { - top.finalize_into(out_scores, out_indices); - } + top.finalize_into(out_scores, out_indices); let _ = bytes_per_vec; // shape clarity }); @@ -579,12 +569,13 @@ impl RankQuant { let norm = rankquant_norm(dim, bits); let inv_norm = 1.0_f32 / norm; + #[cfg(target_arch = "x86_64")] let centre = ((1u32 << bits) as f32 - 1.0) / 2.0; // L2-normalise the query and gather centre-correction. let q_unit = l2_normalise(query); - let q_sum: f32 = q_unit.iter().sum(); - let centre_offset = -centre * q_sum * inv_norm; + #[cfg(target_arch = "x86_64")] + let centre_offset = -centre * q_unit.iter().sum::<f32>() * inv_norm; // Pack the candidate docs' bytes into a contiguous buffer so // the SIMD kernels can scan them as if they were a small dense @@ -603,26 +594,24 @@ impl RankQuant { #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))] let simd_tier = select_simd_tier(dim, bits); let mut top = TopK::new_with_tie_keys(k_eff, candidates); - #[cfg_attr(not(target_arch = "x86_64"), allow(unused_mut))] - let mut centre_drop_used = false; #[cfg(target_arch = "x86_64")] unsafe { match (simd_tier, bits) { (SimdTier::Avx512, 2) => { + top.set_score_offset(centre_offset); scan_b2_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx512, 4) => { + top.set_score_offset(centre_offset); scan_b4_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx2, 2) => { + top.set_score_offset(centre_offset); scan_b2_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } (SimdTier::Avx2, 4) => { + top.set_score_offset(centre_offset); scan_b4_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top); - centre_drop_used = true; } _ => scan_via_lut_scalar( &sub_packed, @@ -650,14 +639,7 @@ impl 
RankQuant { let mut scores = vec![f32::NEG_INFINITY; k_eff]; let mut local_indices = vec![-1i64; k_eff]; - if centre_drop_used { - // Re-apply the per-query centre shift dropped from the kernel hot - // loop before final sorting so visible-score ties are still ordered - // by global row ID. - top.finalize_with_score_offset_into(&mut scores, &mut local_indices, centre_offset); - } else { - top.finalize_into(&mut scores, &mut local_indices); - } + top.finalize_into(&mut scores, &mut local_indices); // Map local → global doc IDs. let global_indices: Vec = local_indices .iter() diff --git a/src/util.rs b/src/util.rs index 7055731..d0dae5c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -368,6 +368,12 @@ pub(crate) struct TopK { indices: Vec, tie_keys: Vec, tie_key_by_index: Option>, + /// Query-constant score offset applied before insertion/eviction. + /// + /// RankQuant SIMD asymmetric kernels can drop a per-query centre term from + /// the hot loop. Applying it here keeps TopK's retention key identical to + /// the public visible score key, including f32 rounding-collapse ties. + score_offset: f32, filled: usize, /// Slot holding the worst kept entry under `(score asc, tie_key /// desc)` — the next to be evicted. @@ -387,6 +393,7 @@ impl TopK { indices: vec![-1; k], tie_keys: vec![i64::MAX; k], tie_key_by_index: None, + score_offset: 0.0, filled: 0, worst_pos: 0, worst_val: f32::INFINITY, @@ -406,8 +413,14 @@ impl TopK { top } + #[cfg_attr(not(target_arch = "x86_64"), allow(dead_code))] + pub(crate) fn set_score_offset(&mut self, score_offset: f32) { + self.score_offset = score_offset; + } + #[inline] pub(crate) fn maybe_insert(&mut self, score: f32, idx: usize) { + let score = score + self.score_offset; // Convert the doc_id to its i64 storage form once, up front. 
doc_ids // are `< n_vectors ≤ MAX_VECTORS` (2^26) by the `add` cap, so this // never fails in practice; the checked conversion makes the "a doc_id @@ -474,22 +487,6 @@ impl TopK { /// user-requested `k`; positions beyond `self.filled` are left as /// sentinels. pub(crate) fn finalize_into(&self, out_scores: &mut [f32], out_indices: &mut [i64]) { - self.finalize_with_score_offset_into(out_scores, out_indices, 0.0); - } - - /// Drain into `out_scores` / `out_indices`, applying a query-constant score - /// offset before the final `(score desc, tie_key asc)` ordering. - /// - /// SIMD RankQuant asymmetric kernels drop a query-constant centre term from - /// the hot loop and re-apply it at finalize time. Adding that offset can - /// collapse two distinct finite `f32` scores into one visible output score, - /// so the public tie order must be computed after the offset is applied. - pub(crate) fn finalize_with_score_offset_into( - &self, - out_scores: &mut [f32], - out_indices: &mut [i64], - score_offset: f32, - ) { debug_assert_eq!(out_scores.len(), out_indices.len()); for s in out_scores.iter_mut() { *s = f32::NEG_INFINITY; @@ -504,7 +501,7 @@ impl TopK { .zip(self.tie_keys.iter()) .enumerate() .take(self.filled) - .map(|(slot, ((&s, &i), &tie_key))| (s + score_offset, i, tie_key, slot)) + .map(|(slot, ((&s, &i), &tie_key))| (s, i, tie_key, slot)) .collect(); // Composite key: score descending, then tie key ascending. 
The kept // slot is only a final deterministic tie-break when duplicate @@ -595,17 +592,18 @@ mod tests { } #[test] - fn topk_offset_sort_uses_visible_score_ties() { - let mut top = TopK::new_with_tie_keys(2, &[10, 5]); - top.maybe_insert(1.0 + f32::EPSILON, 0); - top.maybe_insert(1.0, 1); + fn topk_score_offset_is_part_of_eviction_key() { + let mut top = TopK::new_with_tie_keys(1, &[10, 3]); + top.set_score_offset(16_777_216.0); + top.maybe_insert(1.0, 0); + top.maybe_insert(0.0, 1); - let mut scores = [f32::NEG_INFINITY; 2]; - let mut indices = [-1; 2]; - top.finalize_with_score_offset_into(&mut scores, &mut indices, 100_000_000.0); + let mut scores = [f32::NEG_INFINITY; 1]; + let mut indices = [-1; 1]; + top.finalize_into(&mut scores, &mut indices); - assert_eq!(scores, [100_000_000.0, 100_000_000.0]); - assert_eq!(indices, [1, 0]); + assert_eq!(scores, [16_777_216.0]); + assert_eq!(indices, [1]); } #[test]