From bb132eff9c9ecdba9970ea0dbd5fa7b74ffaa54c Mon Sep 17 00:00:00 2001
From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com>
Date: Wed, 27 May 2026 14:16:36 +0200
Subject: [PATCH] docs(base): canonical SearchResult score contract; align
 faiss l2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #9 noted that Chroma's score was metric-blind. The Chroma fix
landed earlier via score_from_distance(); this change closes the loop:

- vd/base.py: a new "Score semantics" section pins the cross-backend
  contract — higher-is-better, per-metric canonical similarity (cosine
  ∈ [-1, 1], dot raw inner product, l2 squashed to (0, 1]). The
  Collection.search docstring references it.
- vd/backends/_helpers.py: score_from_distance docstring spells out the
  formula per metric and which adapters route through it vs return a
  native combined score (ES / MongoDB / Pinecone).
- vd/backends/faiss.py: route the l2 path through score_from_distance
  (was inline 1/(1+v)); add a comment explaining FAISS IndexFlatIP for
  cosine/dot is already canonical so passthrough is correct.
- vd/backends/qdrant.py: behaviour unchanged, comment flags the Euclid
  case for future revisit if Qdrant changes its score sign convention.
- tests/test_contract.py: pin the canonical contract on the in-memory
  reference adapter (identical-vector → metric-max, cosine-orthogonal
  → 0) and the helper's documented formula table.

Refs #9
---
 tests/test_contract.py  | 63 +++++++++++++++++++++++++++++++++++++++++
 vd/backends/_helpers.py | 42 +++++++++++++++++++++++----
 vd/backends/faiss.py    | 24 +++++++++++++---
 vd/backends/qdrant.py   |  9 ++++++
 vd/base.py              | 45 ++++++++++++++++++++++++++++-
 5 files changed, 173 insertions(+), 10 deletions(-)

diff --git a/tests/test_contract.py b/tests/test_contract.py
index 28e52ff..f9a016f 100644
--- a/tests/test_contract.py
+++ b/tests/test_contract.py
@@ -127,3 +127,66 @@ def test_chroma_rejects_unsupported_operator():
     # Chroma's filter subset has no $exists -> a clear vd error, pre-query.
     with pytest.raises(UnsupportedFilterError):
         list(col.search([0.1, 0.2, 0.3], filter={"missing": {"$exists": True}}))
+
+
+# --------------------------------------------------------------------------- #
+# Score semantics — the cross-backend contract documented in vd.base.
+# Pin the canonical scale on the in-memory reference adapter; distance-
+# returning adapters route through score_from_distance and therefore match
+# automatically. See vd/base.py "Score semantics" and issue #9.
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.parametrize(
+    "metric,query,expected_top_score",
+    [
+        # Identical vectors → cosine similarity = 1.0 (max of [-1, 1]).
+        ("cosine", [1.0, 0.0, 0.0], 1.0),
+        # Unit vector with itself → inner product = 1.0 (dot is unbounded in general).
+        ("dot", [1.0, 0.0, 0.0], 1.0),
+        # Identical vectors → euclidean distance = 0 → score = 1/(1+0) = 1.0.
+        ("l2", [1.0, 0.0, 0.0], 1.0),
+    ],
+)
+def test_score_contract_identical_query_returns_max_canonical_score(
+    metric, query, expected_top_score
+):
+    """An identical query vector gets the canonical maximum for its metric."""
+    col = vd.connect("memory").create_collection(
+        f"score_contract_{metric}", metric=metric
+    )
+    col["a"] = Document(id="a", text="match", vector=query)
+    col["b"] = Document(id="b", text="other", vector=[0.0, 1.0, 0.0])
+    hits = list(col.search(query, limit=2))
+    assert hits[0]["id"] == "a"
+    assert hits[0]["score"] == pytest.approx(expected_top_score)
+
+
+def test_score_contract_cosine_orthogonal_is_zero():
+    """vd canonical cosine score for orthogonal vectors is exactly 0.0."""
+    col = vd.connect("memory").create_collection(
+        "score_contract_cos_ortho", metric="cosine"
+    )
+    col["a"] = Document(id="a", text="x", vector=[1.0, 0.0])
+    col["b"] = Document(id="b", text="y", vector=[0.0, 1.0])
+    hits = list(col.search([1.0, 0.0], limit=2))
+    by_id = {h["id"]: h["score"] for h in hits}
+    assert by_id["a"] == pytest.approx(1.0)
+    assert by_id["b"] == pytest.approx(0.0)
+
+
+def test_score_from_distance_helper_matches_documented_table():
+    """The reference helper produces exactly the formulas documented in vd.base."""
+    from vd.backends._helpers import score_from_distance
+
+    # cosine: 1 - d, d ∈ [0, 2] -> score ∈ [-1, 1]
+    assert score_from_distance(0.0, "cosine") == 1.0
+    assert score_from_distance(1.0, "cosine") == 0.0
+    assert score_from_distance(2.0, "cosine") == -1.0
+    # dot: -d (un-negate backends' negated inner product convention)
+    assert score_from_distance(-0.7, "dot") == pytest.approx(0.7)
+    assert score_from_distance(2.5, "dot") == pytest.approx(-2.5)
+    # l2: 1/(1+d), d ∈ [0, inf) -> score ∈ (0, 1]
+    assert score_from_distance(0.0, "l2") == 1.0
+    assert score_from_distance(1.0, "l2") == pytest.approx(0.5)
+    assert score_from_distance(9.0, "l2") == pytest.approx(0.1)
diff --git a/vd/backends/_helpers.py b/vd/backends/_helpers.py
index f9d4966..73f6263 100644
--- a/vd/backends/_helpers.py
+++ b/vd/backends/_helpers.py
@@ -21,14 +21,42 @@
 
 def score_from_distance(distance: float, metric: str) -> float:
     """
-    Convert a raw backend distance to a higher-is-better similarity score.
+    Convert a raw backend distance to ``vd``'s canonical similarity score.
+
+    Reference implementation of the cross-backend score contract documented
+    in :mod:`vd.base` ("Score semantics"): every ``SearchResult`` ``score``
+    is **higher-is-better** on a per-metric canonical scale, so fusion /
+    dedup / threshold logic works the same way across adapters.
+
+    Per-metric output:
+
+    ============  ===============================  ======================
+    metric        formula                          range
+    ============  ===============================  ======================
+    ``cosine``    ``1 - distance``                 ``[-1, 1]``
+    ``dot``       ``-distance``                    ``(-inf, +inf)``
+    ``l2``        ``1 / (1 + distance)``           ``(0, 1]``
+    ============  ===============================  ======================
+
+    Adapters whose backend already returns a higher-is-better number on a
+    *different* per-metric scale (Elasticsearch kNN ``_score``, MongoDB
+    Atlas ``vectorSearchScore``, Pinecone ``match.score``) **do not** route
+    through this helper — they pass the native score through and document
+    the deviation. Adapters whose backend returns a lower-is-better distance
+    (Chroma, DuckDB, FAISS L2, LanceDB, Milvus L2, pgvector, Redis,
+    sqlite-vec, Turbopuffer, Weaviate) call this helper to canonicalize.
 
     Parameters
     ----------
     distance : float
-        The backend's raw distance (lower = closer).
+        The backend's raw distance (lower = closer). For ``dot``, this is
+        the negated inner product, which some backends report so that a
+        smaller "distance" still means closer; for ``cosine``, the cosine
+        distance in ``[0, 2]``; for ``l2``, the Euclidean (or squared
+        Euclidean) distance in ``[0, +inf)``.
     metric : str
-        ``"cosine"``, ``"dot"``, or ``"l2"``.
+        ``"cosine"``, ``"dot"``, or ``"l2"``. Unknown metrics fall through
+        to the ``l2`` formula, which is bounded for non-negative distances.
 
     Examples
     --------
@@ -36,12 +64,16 @@ def score_from_distance(distance: float, metric: str) -> float:
     1.0
     >>> round(score_from_distance(1.0, 'l2'), 3)
     0.5
+    >>> score_from_distance(-0.7, 'dot')
+    0.7
     """
     if metric == "cosine":
-        # Cosine distance is in [0, 2]; similarity = 1 - distance.
+        # Cosine distance is in [0, 2]; similarity = 1 - distance ∈ [-1, 1].
         return 1.0 - distance
     if metric == "dot":
-        # Many backends report negative inner product as the "distance".
+        # Many backends report negative inner product as the "distance" so
+        # the smallest "distance" is the most similar; un-negate to recover
+        # the raw inner product (vd's canonical dot score).
         return -distance
     # l2 (and any unknown): squash a non-negative distance into (0, 1].
     return 1.0 / (1.0 + distance)
diff --git a/vd/backends/faiss.py b/vd/backends/faiss.py
index a3e6dd9..46cfdc2 100644
--- a/vd/backends/faiss.py
+++ b/vd/backends/faiss.py
@@ -31,7 +31,11 @@
         "Install with: pip install faiss-cpu numpy"
     ) from e
 
-from vd.backends._helpers import apply_client_filter, overfetch_limit
+from vd.backends._helpers import (
+    apply_client_filter,
+    overfetch_limit,
+    score_from_distance,
+)
 from vd.base import (
     AbstractClient,
     AbstractCollection,
@@ -150,13 +154,25 @@ def _query(
             if doc_id is None:
                 continue
             doc = self._docs[doc_id]
-            # IndexFlatIP -> higher is better; IndexFlatL2 -> lower is better.
-            value = float(score)
+            # FAISS conventions vs. vd's canonical score (vd.base "Score
+            # semantics"):
+            #   - IndexFlatIP (cosine/dot): returns inner product directly,
+            #     higher-is-better. For cosine, vectors are L2-normalized at
+            #     write time, so the inner product IS cosine similarity in
+            #     [-1, 1] — matches vd's cosine score. For dot, the raw
+            #     inner product matches vd's dot score (note: vd's dot
+            #     convention is the raw inner product, NOT the negated form
+            #     that `score_from_distance(d, "dot")` takes as its input).
+            #   - IndexFlatL2: returns *squared* L2 distance, lower-is-better.
+            #     Funnel through score_from_distance (as other distance-returning
+            #     adapters do) to get 1/(1+d) ∈ (0, 1]; d is the squared distance.
+            raw = float(score)
+            value = score_from_distance(raw, "l2") if self.metric == "l2" else raw
             results.append(
                 {
                     "id": doc_id,
                     "text": doc.text,
-                    "score": value if self.metric != "l2" else 1.0 / (1.0 + value),
+                    "score": value,
                     "metadata": dict(doc.metadata),
                 }
             )
diff --git a/vd/backends/qdrant.py b/vd/backends/qdrant.py
index 9bf2871..6c50fa0 100644
--- a/vd/backends/qdrant.py
+++ b/vd/backends/qdrant.py
@@ -239,6 +239,15 @@ def _query(
         for point in response.points:
             payload = point.payload or {}
             score = point.score
+            # Qdrant `point.score` per metric (see vd.base "Score semantics"):
+            #   - cosine: cosine similarity in [-1, 1]  → matches vd canonical
+            #   - dot:    raw inner product              → matches vd canonical
+            #   - euclid: a *distance* value (lower-is-better); Qdrant's
+            #     own sort orders ascending in that case. The existing
+            #     transform 1/(1+d) matches vd's canonical l2 score directly
+            #     (no un-negation), so leave it as-is. If a future Qdrant
+            #     client version switches Euclid to higher-is-better, this
+            #     branch must be revisited.
             results.append(
                 {
                     "id": payload.get(_ID_KEY, str(point.id)),
diff --git a/vd/base.py b/vd/base.py
index ef4286a..1937a0b 100644
--- a/vd/base.py
+++ b/vd/base.py
@@ -67,6 +67,43 @@
 #: spellings (e.g. ``"l2"`` -> Qdrant ``Distance.EUCLID``).
 METRICS = frozenset({"cosine", "dot", "l2"})
 
+# --------------------------------------------------------------------------- #
+# Score semantics — the cross-backend contract for SearchResult["score"]
+# --------------------------------------------------------------------------- #
+#
+# Every :data:`SearchResult` carries a ``score`` field. ``vd``'s contract for
+# that number is **higher-is-better, per-metric canonical similarity**:
+#
+# ============  ===============================  ======================
+# metric        canonical score                  range
+# ============  ===============================  ======================
+# ``cosine``    ``1 - cosine_distance``          ``[-1, 1]``
+# ``dot``       raw inner product                ``(-inf, +inf)``
+# ``l2``        ``1 / (1 + euclidean_distance)`` ``(0, 1]``
+# ============  ===============================  ======================
+#
+# Rationale:
+#
+# - **Same backend, different metrics** stay comparable (all three are
+#   higher-better).
+# - **Same metric, different backends** stay comparable: ``vd``'s own
+#   ``reciprocal_rank_fusion`` / ``deduplicate_results`` / ``multi_query_search``
+#   helpers and consumers like ``ef.SearchHit`` all assume this scale, so an
+#   adapter that returns ``1 / (1 + raw_distance)`` for cosine instead of
+#   ``1 - raw_distance`` would still rank its own hits correctly, yet would
+#   mis-rank across adapters and confuse score-threshold logic.
+#
+# The reference implementations are :func:`vd.backends.memory._similarity`
+# (in-memory adapter) and :func:`vd.backends._helpers.score_from_distance`
+# (distance-returning adapters). Adapters whose backend natively returns a
+# higher-is-better score on a *different* per-metric scale (e.g. Elasticsearch
+# kNN, MongoDB Atlas, Pinecone) **document the deviation in their adapter
+# docstring** rather than silently rescaling, because rescaling a backend's
+# own combined-ranking score can change ordering for ties. The deviation is
+# the cost of using that backend's native scoring.
+#
+# See issue #9 for the history of this contract.
+
 # Re-exported from vd.filters; imported lazily inside methods to avoid a cycle
 # (vd.filters imports UnsupportedFilterError from this module).
 
@@ -705,7 +742,13 @@ def search(
         ------
         dict
             ``{"id", "text", "score", "metadata"}`` — or whatever ``egress``
-            returns.
+            returns. ``score`` is a higher-is-better, per-metric canonical
+            similarity (see the "Score semantics" table at the top of
+            :mod:`vd.base`): cosine in ``[-1, 1]``, dot in ``(-inf, +inf)``,
+            l2 squashed to ``(0, 1]``. Adapters whose backend returns a
+            native combined-ranking score on a different scale (e.g.
+            Elasticsearch, Atlas, Pinecone) document the deviation in
+            their own docstring.
         """
         from vd.filters import validate_filter