From 68cf532ba9b70423e363e5e2632a40b9e27a6eea Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Thu, 4 Jun 2026 01:14:25 -0500
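Subject: [PATCH 1/3] Tighten subset ordering contract coverage

Document across the Rust, C ABI, Go, and Python surfaces that subset
candidate lists are entry lists of global row ordinals, not sets:
duplicate candidates are scored independently and can produce duplicate
hits. Extend the fuzz targets and two-stage tests to assert
score-descending/doc-ID-ascending ordering, and require full-candidate
subset reranking to return the same IDs as a full RankQuant search.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>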
---
 docs/RANK_MODES.md                            |  3 +++
 fuzz/fuzz_targets/search_rankquant.rs         | 26 +++++++++++--------
 .../signbitmap_rankquant_twostage.rs          | 18 +++++++++++--
 ordvec-ffi/include/ordvec.h                   |  9 +++++++
 ordvec-ffi/src/lib.rs                         |  7 +++++
 ordvec-go/README.md                           |  4 +++
 ordvec-go/doc.go                              |  4 +++
 ordvec-go/ordvec.go                           |  4 +++
 ordvec-python/src/lib.rs                      | 11 +++++---
 ordvec-python/tests/test_rank_quant.py        | 19 ++++++++++++--
 src/quant.rs                                  |  3 +++
 tests/index/two_stage.rs                      | 24 +++++++++++------
 12 files changed, 106 insertions(+), 26 deletions(-)

diff --git a/docs/RANK_MODES.md b/docs/RANK_MODES.md
index f36fed1..f95b4ac 100644
--- a/docs/RANK_MODES.md
+++ b/docs/RANK_MODES.md
@@ -427,6 +427,9 @@ serialisers living in [`src/rank_io.rs`](../src/rank_io.rs) and
 [`src/sign_bitmap.rs`](../src/sign_bitmap.rs). `RankQuant`
 additionally exposes `search_asymmetric_subset` for scoring a
 precomputed candidate set — the rerank half of the two-stage pattern.
+Candidate IDs are global row ordinals; duplicate candidates are scored as
+separate entries and can produce duplicate hits, so callers that need
+unique output rows should deduplicate candidate lists before reranking.
 
 `RankQuantFastscan` (re-exported `#[doc(hidden)]`) is an optional
 single-pass b=2 fast path; it supports `add`/`search` but not
diff --git a/fuzz/fuzz_targets/search_rankquant.rs b/fuzz/fuzz_targets/search_rankquant.rs
index 6c7b386..cfddcae 100644
--- a/fuzz/fuzz_targets/search_rankquant.rs
+++ b/fuzz/fuzz_targets/search_rankquant.rs
@@ -9,8 +9,8 @@
 //! huge value. Invalid dimensions, non-finite floats, and ragged vector lengths
 //! are caller contract violations, so this target avoids them and treats any
 //! panic as a compute-path bug. Assertions stay structural: shape, finite
-//! scores, valid doc IDs, score-descending rows, and repeat determinism in one
-//! process.
+//! scores, valid doc IDs, score-descending/doc-ID-ascending rows, and repeat
+//! determinism in one process.
 #![no_main]
 
 use libfuzzer_sys::{
@@ -105,15 +105,19 @@ fn assert_results(label: &str, res: &SearchResults, nq: usize, k_eff: usize, n:
                 "{label}: doc id {id} out of range for n={n} at query {qi} slot {slot}",
             );
         }
-        for slot in 1..k_eff {
-            let prev = (scores[slot - 1], ids[slot - 1]);
-            let cur = (scores[slot], ids[slot]);
-            assert!(
-                cur.0 <= prev.0,
-                "{label}: row {qi} not sorted at slots {} and {slot}",
-                slot - 1,
-            );
-        }
+        assert_score_then_id_order(label, qi, scores, ids);
+    }
+}
+
+fn assert_score_then_id_order(label: &str, qi: usize, scores: &[f32], ids: &[i64]) {
+    for slot in 1..scores.len() {
+        let prev = (scores[slot - 1], ids[slot - 1]);
+        let cur = (scores[slot], ids[slot]);
+        assert!(
+            cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1),
+            "{label}: row {qi} violates score-desc/doc-id-asc order at slots {} and {slot}",
+            slot - 1,
+        );
     }
 }
 
diff --git a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs
index 3a45c4b..776e57d 100644
--- a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs
+++ b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs
@@ -11,7 +11,9 @@
 //! reranking agrees with a full RankQuant search.
 //!
 //! Contract: no panic, abort, or out-of-bounds access on any in-range candidate
-//! input, and full-corpus candidate reranking must match full RankQuant search.
+//! input, subset reranking must preserve score-descending/doc-ID-ascending
+//! ordering, and full-corpus candidate reranking must match full RankQuant
+//! search.
 #![no_main]
 
 use libfuzzer_sys::{
@@ -31,6 +33,18 @@ struct TwoStageInput {
     payload: Vec<u8>,
 }
 
+fn assert_score_then_id_order(scores: &[f32], ids: &[i64]) {
+    for slot in 1..scores.len() {
+        let prev = (scores[slot - 1], ids[slot - 1]);
+        let cur = (scores[slot], ids[slot]);
+        assert!(
+            cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1),
+            "subset rerank violates score-desc/doc-id-asc order at slots {} and {slot}",
+            slot - 1,
+        );
+    }
+}
+
 impl<'a> Arbitrary<'a> for TwoStageInput {
     fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
         let dim = *u.choose(&[64usize, 128, 256, 512])?;
@@ -108,7 +122,7 @@ fuzz_target!(|input: TwoStageInput| {
     assert_eq!(scores.len(), k_eff);
     assert_eq!(ids.len(), k_eff);
     assert!(scores.iter().all(|score| score.is_finite()));
-    assert!(scores.windows(2).all(|pair| pair[0] >= pair[1]));
+    assert_score_then_id_order(&scores, &ids);
     for &id in &ids {
         assert!(id >= 0);
         assert!(subset_candidates.contains(&(id as u32)));
diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h
index 36f34cb..b02bc6f 100644
--- a/ordvec-ffi/include/ordvec.h
+++ b/ordvec-ffi/include/ordvec.h
@@ -52,6 +52,10 @@ typedef struct {
   const float *query;
   uint64_t dim;
   uint64_t k;
+  /**
+   * Optional subset row IDs. These are entry lists, not sets: duplicate
+   * candidates are scored independently and can produce duplicate hits.
+   */
   const uint32_t *candidate_rows;
   uint64_t candidate_count;
   uint64_t flags;
@@ -224,6 +228,11 @@ void ordvec_index_free(ordvec_index_t *index);
 /**
  * Run a synchronous single-query search.
  *
+ * When `params.candidate_rows` is supplied, those IDs are global row ordinals
+ * and may be unsorted or duplicated. Duplicate candidates are scored as
+ * separate entries and can produce duplicate hits; callers that need unique
+ * output rows must deduplicate before calling.
+ *
  * # Safety
  *
  * `index` must be a live handle returned by `ordvec_index_load`. All non-null
diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs
index 6b35d48..d1f0097 100644
--- a/ordvec-ffi/src/lib.rs
+++ b/ordvec-ffi/src/lib.rs
@@ -73,6 +73,8 @@ pub struct ordvec_search_params_t {
     pub query: *const f32,
     pub dim: u64,
     pub k: u64,
+    /// Optional subset row IDs. These are entry lists, not sets: duplicate
+    /// candidates are scored independently and can produce duplicate hits.
     pub candidate_rows: *const u32,
     pub candidate_count: u64,
     pub flags: u64,
@@ -871,6 +873,11 @@ pub unsafe extern "C" fn ordvec_index_free(index: *mut ordvec_index_t) {
 #[no_mangle]
 /// Run a synchronous single-query search.
 ///
+/// When `params.candidate_rows` is supplied, those IDs are global row ordinals
+/// and may be unsorted or duplicated. Duplicate candidates are scored as
+/// separate entries and can produce duplicate hits; callers that need unique
+/// output rows must deduplicate before calling.
+///
 /// # Safety
 ///
 /// `index` must be a live handle returned by `ordvec_index_load`. All non-null
diff --git a/ordvec-go/README.md b/ordvec-go/README.md
index 8fde2ef..15cdb4c 100644
--- a/ordvec-go/README.md
+++ b/ordvec-go/README.md
@@ -19,3 +19,7 @@ Search with `nil` options or `nil` `SearchOptions.Candidates` performs a full
 search. An empty, non-nil `Candidates` slice is treated as an explicit empty
 subset and returns a typed `StatusBadArgument`, matching the C ABI v1
 pointer/count contract.
+
+`SearchOptions.Candidates` is an entry list of global row ordinals, not a set.
+Duplicate candidates are scored independently and can produce duplicate hits;
+deduplicate before searching when unique row IDs are required.
diff --git a/ordvec-go/doc.go b/ordvec-go/doc.go
index fb79eff..eaf3458 100644
--- a/ordvec-go/doc.go
+++ b/ordvec-go/doc.go
@@ -6,4 +6,8 @@
 // Search pins and passes caller-owned query and candidate slices to the C ABI
 // without copying them. Callers must not mutate those slices until Search
 // returns.
+//
+// Candidate slices are entry lists, not sets. Duplicate candidate IDs are scored
+// independently and can produce duplicate hits; callers that require unique row
+// IDs should deduplicate before Search.
 package ordvec
diff --git a/ordvec-go/ordvec.go b/ordvec-go/ordvec.go
index 1566cb9..820b678 100644
--- a/ordvec-go/ordvec.go
+++ b/ordvec-go/ordvec.go
@@ -145,6 +145,10 @@ type Stats struct {
 }
 
 type SearchOptions struct {
+	// Candidates restricts the search to these global row ordinals. It is an
+	// entry list, not a set: duplicate candidates are scored independently and
+	// can produce duplicate hits. Deduplicate before calling if unique rows are
+	// required.
 	Candidates []uint32
 	UserTag    uint64
 }
diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs
index ce03c29..e6d7365 100644
--- a/ordvec-python/src/lib.rs
+++ b/ordvec-python/src/lib.rs
@@ -731,9 +731,14 @@ impl RankQuant {
     /// Asymmetric scoring restricted to a candidate subset (e.g. the top-M
     /// shortlist from a [`Bitmap`] or [`SignBitmap`] probe). Returns
     /// ``(scores, global_ids)`` where ``global_ids`` are the original doc
-    /// indices (mapped from the local candidate slot); slots that could not be
-    /// filled are returned as ``-1``. Uses the same AVX-512 → AVX2 → scalar
-    /// dispatch as ``search_asymmetric``.
+    /// indices (mapped from the local candidate slot). ``k`` is capped to the
+    /// candidate-list length; the subset path does not add sentinel padding.
+    /// Uses the same AVX-512 → AVX2 → scalar dispatch as ``search_asymmetric``.
+    ///
+    /// ``candidates`` may be unsorted and may contain duplicates. Duplicate
+    /// candidate IDs are scored as separate entries and can produce duplicate
+    /// hits; callers that require unique row IDs should deduplicate before
+    /// calling.
     ///
     /// If the shortlist came from [`Bitmap`], this is the exact RankQuant
     /// rerank stage over that survivor set; it does not itself apply or
diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py
index 21bf830..4e97ca8 100644
--- a/ordvec-python/tests/test_rank_quant.py
+++ b/ordvec-python/tests/test_rank_quant.py
@@ -310,8 +310,9 @@ def test_search_asymmetric_subset_returns_global_ids():
     assert ids.dtype == np.int64
     # Self-query against a candidate set containing self → top-1 is self.
     assert int(ids[0]) == 0
-    # All returned ids are from the candidate set (or sentinel -1).
-    candidate_set = set(candidates.tolist()) | {-1}
+    # All returned ids are from the candidate set; k is capped instead of
+    # sentinel-padding unfilled slots.
+    candidate_set = set(candidates.tolist())
     for i in ids:
         assert int(i) in candidate_set
 
@@ -347,6 +348,20 @@ def test_search_asymmetric_subset_ties_use_global_row_ids():
     np.testing.assert_array_equal(scores, np.array([0.0, 0.0], dtype=np.float32))
 
 
+def test_search_asymmetric_subset_duplicates_remain_duplicate_entries():
+    vectors = np.ones((12, 64), dtype=np.float32)
+    idx = RankQuant(dim=64, bits=2)
+    idx.add(vectors)
+
+    candidates = np.array([7, 8, 7], dtype=np.uint32)
+    scores, ids = idx.search_asymmetric_subset(
+        np.zeros(64, dtype=np.float32), candidates, k=2
+    )
+
+    np.testing.assert_array_equal(ids, np.array([7, 7], dtype=np.int64))
+    np.testing.assert_array_equal(scores, np.array([0.0, 0.0], dtype=np.float32))
+
+
 def test_search_asymmetric_subset_k_caps_at_candidate_count():
     # k > len(candidates) should silently cap — no panic, no sentinel
     # padding beyond the candidate-set size.
diff --git a/src/quant.rs b/src/quant.rs
index f770043..87f0087 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -539,6 +539,9 @@ impl RankQuant {
     /// to global IDs before returning). Results are ordered by score
     /// descending, then global row ID ascending, matching the full-index
     /// search tie policy even when `candidates` is unsorted.
+    /// Duplicate candidate IDs are scored as separate entries and can
+    /// produce duplicate hits; callers that require unique row IDs should
+    /// deduplicate before calling.
     ///
     /// Uses the same AVX-512 → AVX2 → scalar dispatch as
     /// [`Self::search_asymmetric`] and the same centre-drop math, just
diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs
index d434234..cedbf67 100644
--- a/tests/index/two_stage.rs
+++ b/tests/index/two_stage.rs
@@ -19,6 +19,18 @@ fn assert_two_stage_invariants(sign: &SignBitmap, rankquant: &RankQuant) {
     assert_eq!(sign.len(), N);
 }
 
+fn assert_score_then_id_order(scores: &[f32], ids: &[i64]) {
+    for slot in 1..scores.len() {
+        let prev = (scores[slot - 1], ids[slot - 1]);
+        let cur = (scores[slot], ids[slot]);
+        assert!(
+            cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1),
+            "results violate score-desc/doc-id-asc order at slots {} and {slot}",
+            slot - 1,
+        );
+    }
+}
+
 #[test]
 fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() {
     let (sign, rankquant, _corpus) = build_two_stage(2);
@@ -51,6 +63,7 @@ fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() {
     assert_eq!(scores.len(), shortlist.len());
     assert_eq!(ids.len(), shortlist.len());
     assert!(ids.iter().all(|&id| shortlist.contains(&(id as u32))));
+    assert_score_then_id_order(&scores, &ids);
 }
 
 #[test]
@@ -64,15 +77,10 @@ fn sign_rankquant_full_candidate_set_matches_full_rankquant_search() {
     let full = rankquant.search_asymmetric(query, 16);
     let (subset_scores, subset_ids) = rankquant.search_asymmetric_subset(query, &candidates, 16);
 
-    assert!(subset_ids
-        .iter()
-        .all(|&id| candidates.contains(&(id as u32))));
+    assert_eq!(subset_ids, full.indices_for_query(0));
     assert_eq!(subset_scores.len(), full.scores_for_query(0).len());
-    let mut subset_scores_sorted = subset_scores;
-    let mut full_scores_sorted = full.scores_for_query(0).to_vec();
-    subset_scores_sorted.sort_by(|left, right| left.total_cmp(right));
-    full_scores_sorted.sort_by(|left, right| left.total_cmp(right));
-    for (subset, full) in subset_scores_sorted.iter().zip(&full_scores_sorted) {
+    assert_score_then_id_order(&subset_scores, &subset_ids);
+    for (subset, full) in subset_scores.iter().zip(full.scores_for_query(0)) {
         assert!(
             (subset - full).abs() <= 1e-6,
             "subset score {subset} diverged from full score {full}"

From 14392b3fce3012eb7674d7fd1a5f7130338cad56 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Thu, 4 Jun 2026 09:02:03 -0500
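Subject: [PATCH 2/3] Fix visible RankQuant tie ordering after score offsets

The SIMD asymmetric kernels drop the query-constant centre term from the
hot loop, and the offset was re-added only after TopK had already sorted
its output. Adding the offset can round two distinct f32 scores to the
same visible value, which left such ties ordered by their pre-offset
scores rather than by row ID. Apply the offset inside the finalize sort
so the public (score desc, row ID asc) order is computed on the scores
callers actually see.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>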
---
 src/quant.rs             | 35 +++++++++++------------------------
 src/util.rs              | 32 +++++++++++++++++++++++++++++++-
 tests/index/two_stage.rs | 29 +++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/src/quant.rs b/src/quant.rs
index 87f0087..285fb10 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -397,25 +397,16 @@ impl RankQuant {
                     &mut top,
                 );
 
-                top.finalize_into(out_scores, out_indices);
-
                 if centre_drop_used {
                     // The asym kernels drop the per-lane `- centre` term from
-                    // the hot loop; it is a query-constant shift, re-applied
-                    // here. Guarded by `is_finite` so it lands only on filled
-                    // slots: when fewer than `k` docs were scored the trailing
-                    // top-k positions stay at the `f32::NEG_INFINITY` sentinel,
-                    // and `NEG_INFINITY + offset` would wrongly turn a sentinel
-                    // into a finite score. (Real scores are always finite — the
-                    // finite-input policy guarantees it — so the guard only ever
-                    // skips sentinels, never a genuine result.)
+                    // the hot loop; apply the query-constant shift before the
+                    // final visible-score sort so rounding-collapse ties still
+                    // use the public row-id tie key.
                     let q_sum: f32 = q_unit.iter().sum();
                     let offset = -centre * q_sum * inv_norm;
-                    for s in out_scores.iter_mut() {
-                        if s.is_finite() {
-                            *s += offset;
-                        }
-                    }
+                    top.finalize_with_score_offset_into(out_scores, out_indices, offset);
+                } else {
+                    top.finalize_into(out_scores, out_indices);
                 }
 
                 let _ = bytes_per_vec; // shape clarity
@@ -659,17 +650,13 @@ impl RankQuant {
 
         let mut scores = vec![f32::NEG_INFINITY; k_eff];
         let mut local_indices = vec![-1i64; k_eff];
-        top.finalize_into(&mut scores, &mut local_indices);
         if centre_drop_used {
             // Re-apply the per-query centre shift dropped from the kernel hot
-            // loop; the `is_finite` guard skips unfilled top-k slots (still at
-            // the `f32::NEG_INFINITY` sentinel) so a sentinel never becomes a
-            // finite score. See the matching note in `search_asymmetric`.
-            for s in scores.iter_mut() {
-                if s.is_finite() {
-                    *s += centre_offset;
-                }
-            }
+            // loop before final sorting so visible-score ties are still ordered
+            // by global row ID.
+            top.finalize_with_score_offset_into(&mut scores, &mut local_indices, centre_offset);
+        } else {
+            top.finalize_into(&mut scores, &mut local_indices);
         }
         // Map local → global doc IDs.
         let global_indices: Vec<i64> = local_indices
diff --git a/src/util.rs b/src/util.rs
index 0229f72..7055731 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -474,6 +474,22 @@ impl TopK {
     /// user-requested `k`; positions beyond `self.filled` are left as
     /// sentinels.
     pub(crate) fn finalize_into(&self, out_scores: &mut [f32], out_indices: &mut [i64]) {
+        self.finalize_with_score_offset_into(out_scores, out_indices, 0.0);
+    }
+
+    /// Drain into `out_scores` / `out_indices`, applying a query-constant score
+    /// offset before the final `(score desc, tie_key asc)` ordering.
+    ///
+    /// SIMD RankQuant asymmetric kernels drop a query-constant centre term from
+    /// the hot loop and re-apply it at finalize time. Adding that offset can
+    /// collapse two distinct finite `f32` scores into one visible output score,
+    /// so the public tie order must be computed after the offset is applied.
+    pub(crate) fn finalize_with_score_offset_into(
+        &self,
+        out_scores: &mut [f32],
+        out_indices: &mut [i64],
+        score_offset: f32,
+    ) {
         debug_assert_eq!(out_scores.len(), out_indices.len());
         for s in out_scores.iter_mut() {
             *s = f32::NEG_INFINITY;
@@ -488,7 +504,7 @@ impl TopK {
             .zip(self.tie_keys.iter())
             .enumerate()
             .take(self.filled)
-            .map(|(slot, ((&s, &i), &tie_key))| (s, i, tie_key, slot))
+            .map(|(slot, ((&s, &i), &tie_key))| (s + score_offset, i, tie_key, slot))
             .collect();
         // Composite key: score descending, then tie key ascending. The kept
         // slot is only a final deterministic tie-break when duplicate
@@ -578,6 +594,20 @@ mod tests {
         assert_eq!(indices, [0, 1]);
     }
 
+    #[test]
+    fn topk_offset_sort_uses_visible_score_ties() {
+        let mut top = TopK::new_with_tie_keys(2, &[10, 5]);
+        top.maybe_insert(1.0 + f32::EPSILON, 0);
+        top.maybe_insert(1.0, 1);
+
+        let mut scores = [f32::NEG_INFINITY; 2];
+        let mut indices = [-1; 2];
+        top.finalize_with_score_offset_into(&mut scores, &mut indices, 100_000_000.0);
+
+        assert_eq!(scores, [100_000_000.0, 100_000_000.0]);
+        assert_eq!(indices, [1, 0]);
+    }
+
     #[test]
     fn checked_new_len_accepts_up_to_max() {
         use crate::rank_io::MAX_VECTORS;
diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs
index cedbf67..613ea56 100644
--- a/tests/index/two_stage.rs
+++ b/tests/index/two_stage.rs
@@ -87,3 +87,32 @@ fn sign_rankquant_full_candidate_set_matches_full_rankquant_search() {
         );
     }
 }
+
+#[test]
+fn sign_rankquant_subset_orders_visible_ties_after_centre_offset() {
+    let dim = 128usize;
+    let n_vectors = 5usize;
+    let bits = 4u8;
+    let payload = [
+        158u8, 158, 158, 158, 158, 158, 158, 158, 158, 158, 137, 10, 10,
+    ];
+    let floats: Vec<f32> = (0..((n_vectors + 1) * dim))
+        .map(|i| payload[i % payload.len()] as f32 - 128.0)
+        .collect();
+    let (corpus, query) = floats.split_at(n_vectors * dim);
+
+    let mut sign = SignBitmap::new(dim);
+    let mut rankquant = RankQuant::new(dim, bits);
+    sign.add(corpus);
+    rankquant.add(corpus);
+
+    let candidates = sign.top_m_candidates(query, n_vectors);
+    assert_eq!(candidates.len(), n_vectors);
+
+    let (scores, ids) = rankquant.search_asymmetric_subset(query, &candidates, n_vectors + 1);
+
+    assert_eq!(scores.len(), n_vectors);
+    assert_eq!(ids.len(), n_vectors);
+    assert!(scores.iter().all(|score| score.is_finite()));
+    assert_score_then_id_order(&scores, &ids);
+}

From b0e91a9c62bf1c51c7c236b72273bb41fc7d755b Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Thu, 4 Jun 2026 09:42:11 -0500
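Subject: [PATCH 3/3] Fix RankQuant offset eviction ordering

Applying the centre offset only at finalize time still let TopK decide
insertion and eviction on pre-offset scores, so a candidate whose score
ties the worst kept entry once the offset is added could be dropped even
when it has the smaller tie key. Store the offset on TopK and add it in
maybe_insert so retention and final ordering share one key, drop the
now-redundant finalize_with_score_offset_into, and tighten the
search_rankquant fuzz assertion to strictly ascending doc IDs on ties.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>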
---
 fuzz/fuzz_targets/search_rankquant.rs |  2 +-
 src/quant.rs                          | 56 +++++++++------------------
 src/util.rs                           | 50 ++++++++++++------------
 3 files changed, 44 insertions(+), 64 deletions(-)

diff --git a/fuzz/fuzz_targets/search_rankquant.rs b/fuzz/fuzz_targets/search_rankquant.rs
index cfddcae..3854a83 100644
--- a/fuzz/fuzz_targets/search_rankquant.rs
+++ b/fuzz/fuzz_targets/search_rankquant.rs
@@ -114,7 +114,7 @@ fn assert_score_then_id_order(label: &str, qi: usize, scores: &[f32], ids: &[i64
         let prev = (scores[slot - 1], ids[slot - 1]);
         let cur = (scores[slot], ids[slot]);
         assert!(
-            cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 >= prev.1),
+            cur.0 < prev.0 || (cur.0 == prev.0 && cur.1 > prev.1),
             "{label}: row {qi} violates score-desc/doc-id-asc order at slots {} and {slot}",
             slot - 1,
         );
diff --git a/src/quant.rs b/src/quant.rs
index 285fb10..aef5038 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -338,10 +338,10 @@ impl RankQuant {
         #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))]
         let simd_tier = select_simd_tier(dim, bits);
 
-        // For the AVX2 path we drop the per-lane centre subtract from
-        // the hot loop and add it back as a per-query constant offset
-        // to the top-k scores at finalize time. Ranking is invariant
-        // to this constant; absolute scores stay exact.
+        // SIMD asymmetric kernels drop the per-lane centre subtract from the
+        // hot loop. Apply the query-constant offset before TopK insertion so
+        // retention and final ordering use the same public visible score key.
+        #[cfg(target_arch = "x86_64")]
         let centre = ((1u32 << bits) as f32 - 1.0) / 2.0;
 
         queries
@@ -351,27 +351,27 @@ impl RankQuant {
             .for_each(|((q, out_scores), out_indices)| {
                 let q_unit = l2_normalise(q);
                 let mut top = TopK::new(k_eff);
-                #[cfg_attr(not(target_arch = "x86_64"), allow(unused_mut))]
-                let mut centre_drop_used = false;
+                #[cfg(target_arch = "x86_64")]
+                let centre_offset = -centre * q_unit.iter().sum::<f32>() * inv_norm;
 
                 #[cfg(target_arch = "x86_64")]
                 unsafe {
                     match (simd_tier, bits) {
                         (SimdTier::Avx512, 2) => {
+                            top.set_score_offset(centre_offset);
                             scan_b2_asym_avx512(&self.packed, n, dim, &q_unit, inv_norm, &mut top);
-                            centre_drop_used = true;
                         }
                         (SimdTier::Avx512, 4) => {
+                            top.set_score_offset(centre_offset);
                             scan_b4_asym_avx512(&self.packed, n, dim, &q_unit, inv_norm, &mut top);
-                            centre_drop_used = true;
                         }
                         (SimdTier::Avx2, 2) => {
+                            top.set_score_offset(centre_offset);
                             scan_b2_asym_avx2(&self.packed, n, dim, &q_unit, inv_norm, &mut top);
-                            centre_drop_used = true;
                         }
                         (SimdTier::Avx2, 4) => {
+                            top.set_score_offset(centre_offset);
                             scan_b4_asym_avx2(&self.packed, n, dim, &q_unit, inv_norm, &mut top);
-                            centre_drop_used = true;
                         }
                         _ => scan_via_lut_scalar(
                             &self.packed,
@@ -397,17 +397,7 @@ impl RankQuant {
                     &mut top,
                 );
 
-                if centre_drop_used {
-                    // The asym kernels drop the per-lane `- centre` term from
-                    // the hot loop; apply the query-constant shift before the
-                    // final visible-score sort so rounding-collapse ties still
-                    // use the public row-id tie key.
-                    let q_sum: f32 = q_unit.iter().sum();
-                    let offset = -centre * q_sum * inv_norm;
-                    top.finalize_with_score_offset_into(out_scores, out_indices, offset);
-                } else {
-                    top.finalize_into(out_scores, out_indices);
-                }
+                top.finalize_into(out_scores, out_indices);
 
                 let _ = bytes_per_vec; // shape clarity
             });
@@ -579,12 +569,13 @@ impl RankQuant {
 
         let norm = rankquant_norm(dim, bits);
         let inv_norm = 1.0_f32 / norm;
+        #[cfg(target_arch = "x86_64")]
         let centre = ((1u32 << bits) as f32 - 1.0) / 2.0;
 
         // L2-normalise the query and gather centre-correction.
         let q_unit = l2_normalise(query);
-        let q_sum: f32 = q_unit.iter().sum();
-        let centre_offset = -centre * q_sum * inv_norm;
+        #[cfg(target_arch = "x86_64")]
+        let centre_offset = -centre * q_unit.iter().sum::<f32>() * inv_norm;
 
         // Pack the candidate docs' bytes into a contiguous buffer so
         // the SIMD kernels can scan them as if they were a small dense
@@ -603,26 +594,24 @@ impl RankQuant {
         #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))]
         let simd_tier = select_simd_tier(dim, bits);
         let mut top = TopK::new_with_tie_keys(k_eff, candidates);
-        #[cfg_attr(not(target_arch = "x86_64"), allow(unused_mut))]
-        let mut centre_drop_used = false;
         #[cfg(target_arch = "x86_64")]
         unsafe {
             match (simd_tier, bits) {
                 (SimdTier::Avx512, 2) => {
+                    top.set_score_offset(centre_offset);
                     scan_b2_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                    centre_drop_used = true;
                 }
                 (SimdTier::Avx512, 4) => {
+                    top.set_score_offset(centre_offset);
                     scan_b4_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                    centre_drop_used = true;
                 }
                 (SimdTier::Avx2, 2) => {
+                    top.set_score_offset(centre_offset);
                     scan_b2_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                    centre_drop_used = true;
                 }
                 (SimdTier::Avx2, 4) => {
+                    top.set_score_offset(centre_offset);
                     scan_b4_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                    centre_drop_used = true;
                 }
                 _ => scan_via_lut_scalar(
                     &sub_packed,
@@ -650,14 +639,7 @@ impl RankQuant {
 
         let mut scores = vec![f32::NEG_INFINITY; k_eff];
         let mut local_indices = vec![-1i64; k_eff];
-        if centre_drop_used {
-            // Re-apply the per-query centre shift dropped from the kernel hot
-            // loop before final sorting so visible-score ties are still ordered
-            // by global row ID.
-            top.finalize_with_score_offset_into(&mut scores, &mut local_indices, centre_offset);
-        } else {
-            top.finalize_into(&mut scores, &mut local_indices);
-        }
+        top.finalize_into(&mut scores, &mut local_indices);
         // Map local → global doc IDs.
         let global_indices: Vec<i64> = local_indices
             .iter()
diff --git a/src/util.rs b/src/util.rs
index 7055731..d0dae5c 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -368,6 +368,12 @@ pub(crate) struct TopK {
     indices: Vec<i64>,
     tie_keys: Vec<i64>,
     tie_key_by_index: Option<Vec<i64>>,
+    /// Query-constant score offset applied before insertion/eviction.
+    ///
+    /// RankQuant SIMD asymmetric kernels can drop a per-query centre term from
+    /// the hot loop. Applying it here keeps TopK's retention key identical to
+    /// the public visible score key, including f32 rounding-collapse ties.
+    score_offset: f32,
     filled: usize,
     /// Slot holding the worst kept entry under `(score asc, tie_key
     /// desc)` — the next to be evicted.
@@ -387,6 +393,7 @@ impl TopK {
             indices: vec![-1; k],
             tie_keys: vec![i64::MAX; k],
             tie_key_by_index: None,
+            score_offset: 0.0,
             filled: 0,
             worst_pos: 0,
             worst_val: f32::INFINITY,
@@ -406,8 +413,14 @@ impl TopK {
         top
     }
 
+    #[cfg_attr(not(target_arch = "x86_64"), allow(dead_code))]
+    pub(crate) fn set_score_offset(&mut self, score_offset: f32) {
+        self.score_offset = score_offset;
+    }
+
     #[inline]
     pub(crate) fn maybe_insert(&mut self, score: f32, idx: usize) {
+        let score = score + self.score_offset;
         // Convert the doc_id to its i64 storage form once, up front. doc_ids
         // are `< n_vectors ≤ MAX_VECTORS` (2^26) by the `add` cap, so this
         // never fails in practice; the checked conversion makes the "a doc_id
@@ -474,22 +487,6 @@ impl TopK {
     /// user-requested `k`; positions beyond `self.filled` are left as
     /// sentinels.
     pub(crate) fn finalize_into(&self, out_scores: &mut [f32], out_indices: &mut [i64]) {
-        self.finalize_with_score_offset_into(out_scores, out_indices, 0.0);
-    }
-
-    /// Drain into `out_scores` / `out_indices`, applying a query-constant score
-    /// offset before the final `(score desc, tie_key asc)` ordering.
-    ///
-    /// SIMD RankQuant asymmetric kernels drop a query-constant centre term from
-    /// the hot loop and re-apply it at finalize time. Adding that offset can
-    /// collapse two distinct finite `f32` scores into one visible output score,
-    /// so the public tie order must be computed after the offset is applied.
-    pub(crate) fn finalize_with_score_offset_into(
-        &self,
-        out_scores: &mut [f32],
-        out_indices: &mut [i64],
-        score_offset: f32,
-    ) {
         debug_assert_eq!(out_scores.len(), out_indices.len());
         for s in out_scores.iter_mut() {
             *s = f32::NEG_INFINITY;
@@ -504,7 +501,7 @@ impl TopK {
             .zip(self.tie_keys.iter())
             .enumerate()
             .take(self.filled)
-            .map(|(slot, ((&s, &i), &tie_key))| (s + score_offset, i, tie_key, slot))
+            .map(|(slot, ((&s, &i), &tie_key))| (s, i, tie_key, slot))
             .collect();
         // Composite key: score descending, then tie key ascending. The kept
         // slot is only a final deterministic tie-break when duplicate
@@ -595,17 +592,18 @@ mod tests {
     }
 
     #[test]
-    fn topk_offset_sort_uses_visible_score_ties() {
-        let mut top = TopK::new_with_tie_keys(2, &[10, 5]);
-        top.maybe_insert(1.0 + f32::EPSILON, 0);
-        top.maybe_insert(1.0, 1);
+    fn topk_score_offset_is_part_of_eviction_key() {
+        let mut top = TopK::new_with_tie_keys(1, &[10, 3]);
+        top.set_score_offset(16_777_216.0);
+        top.maybe_insert(1.0, 0);
+        top.maybe_insert(0.0, 1);
 
-        let mut scores = [f32::NEG_INFINITY; 2];
-        let mut indices = [-1; 2];
-        top.finalize_with_score_offset_into(&mut scores, &mut indices, 100_000_000.0);
+        let mut scores = [f32::NEG_INFINITY; 1];
+        let mut indices = [-1; 1];
+        top.finalize_into(&mut scores, &mut indices);
 
-        assert_eq!(scores, [100_000_000.0, 100_000_000.0]);
-        assert_eq!(indices, [1, 0]);
+        assert_eq!(scores, [16_777_216.0]);
+        assert_eq!(indices, [1]);
     }
 
     #[test]