From bb132eff9c9ecdba9970ea0dbd5fa7b74ffaa54c Mon Sep 17 00:00:00 2001 From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com> Date: Wed, 27 May 2026 14:16:36 +0200 Subject: [PATCH] docs(base): canonical SearchResult score contract; align faiss l2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #9 noted that Chroma's score was metric-blind. The Chroma fix landed earlier via score_from_distance(); this change closes the loop: - vd/base.py: a new "Score semantics" section pins the cross-backend contract — higher-is-better, per-metric canonical similarity (cosine ∈ [-1, 1], dot raw inner product, l2 squashed to (0, 1]). Collection .search docstring references it. - vd/backends/_helpers.py: score_from_distance docstring spells out the formula per metric and which adapters route through it vs return a native combined score (ES / MongoDB / Pinecone). - vd/backends/faiss.py: route the l2 path through score_from_distance (was inline 1/(1+v)); add a comment explaining FAISS IndexFlatIP for cosine/dot is already canonical so passthrough is correct. - vd/backends/qdrant.py: behaviour unchanged, comment flags the Euclid case for future revisit if Qdrant changes its score sign convention. - tests/test_contract.py: pin the canonical contract on the in-memory reference adapter (identical-vector → metric-max, cosine-orthogonal → 0) and the helper's documented formula table. 
Refs #9 --- tests/test_contract.py | 63 +++++++++++++++++++++++++++++++++++++++++ vd/backends/_helpers.py | 42 +++++++++++++++++++++++---- vd/backends/faiss.py | 24 +++++++++++++--- vd/backends/qdrant.py | 9 ++++++ vd/base.py | 45 ++++++++++++++++++++++++++++- 5 files changed, 173 insertions(+), 10 deletions(-) diff --git a/tests/test_contract.py b/tests/test_contract.py index 28e52ff..f9a016f 100644 --- a/tests/test_contract.py +++ b/tests/test_contract.py @@ -127,3 +127,66 @@ def test_chroma_rejects_unsupported_operator(): # Chroma's filter subset has no $exists -> a clear vd error, pre-query. with pytest.raises(UnsupportedFilterError): list(col.search([0.1, 0.2, 0.3], filter={"missing": {"$exists": True}})) + + +# --------------------------------------------------------------------------- # +# Score semantics — the cross-backend contract documented in vd.base. +# Pin the canonical scale on the in-memory reference adapter; distance- +# returning adapters route through score_from_distance and therefore match +# automatically. See vd/base.py "Score semantics" and issue #9. +# --------------------------------------------------------------------------- # + + +@pytest.mark.parametrize( + "metric,query,expected_top_score", + [ + # Identical vectors → cosine similarity = 1.0 (max of [-1, 1]). + ("cosine", [1.0, 0.0, 0.0], 1.0), + # Identical vectors → inner product = 1.0 (no upper bound; equality here). + ("dot", [1.0, 0.0, 0.0], 1.0), + # Identical vectors → euclidean distance = 0 → score = 1/(1+0) = 1.0. 
+ ("l2", [1.0, 0.0, 0.0], 1.0), + ], +) +def test_score_contract_identical_query_returns_max_canonical_score( + metric, query, expected_top_score +): + """An identical query vector gets the canonical maximum for its metric.""" + col = vd.connect("memory").create_collection( + f"score_contract_{metric}", metric=metric + ) + col["a"] = Document(id="a", text="match", vector=query) + col["b"] = Document(id="b", text="other", vector=[0.0, 1.0, 0.0]) + hits = list(col.search(query, limit=2)) + assert hits[0]["id"] == "a" + assert hits[0]["score"] == pytest.approx(expected_top_score) + + +def test_score_contract_cosine_orthogonal_is_zero(): + """vd canonical cosine score for orthogonal vectors is exactly 0.0.""" + col = vd.connect("memory").create_collection( + "score_contract_cos_ortho", metric="cosine" + ) + col["a"] = Document(id="a", text="x", vector=[1.0, 0.0]) + col["b"] = Document(id="b", text="y", vector=[0.0, 1.0]) + hits = list(col.search([1.0, 0.0], limit=2)) + by_id = {h["id"]: h["score"] for h in hits} + assert by_id["a"] == pytest.approx(1.0) + assert by_id["b"] == pytest.approx(0.0) + + +def test_score_from_distance_helper_matches_documented_table(): + """The reference helper produces exactly the formulas documented in vd.base.""" + from vd.backends._helpers import score_from_distance + + # cosine: 1 - d, d ∈ [0, 2] -> score ∈ [-1, 1] + assert score_from_distance(0.0, "cosine") == 1.0 + assert score_from_distance(1.0, "cosine") == 0.0 + assert score_from_distance(2.0, "cosine") == -1.0 + # dot: -d (un-negate backends' negated inner product convention) + assert score_from_distance(-0.7, "dot") == pytest.approx(0.7) + assert score_from_distance(2.5, "dot") == pytest.approx(-2.5) + # l2: 1/(1+d), d ∈ [0, inf) -> score ∈ (0, 1] + assert score_from_distance(0.0, "l2") == 1.0 + assert score_from_distance(1.0, "l2") == pytest.approx(0.5) + assert score_from_distance(9.0, "l2") == pytest.approx(0.1) diff --git a/vd/backends/_helpers.py b/vd/backends/_helpers.py 
index f9d4966..73f6263 100644 --- a/vd/backends/_helpers.py +++ b/vd/backends/_helpers.py @@ -21,14 +21,42 @@ def score_from_distance(distance: float, metric: str) -> float: """ - Convert a raw backend distance to a higher-is-better similarity score. + Convert a raw backend distance to ``vd``'s canonical similarity score. + + Reference implementation of the cross-backend score contract documented + in :mod:`vd.base` ("Score semantics"): every ``SearchResult`` ``score`` + is **higher-is-better** on a per-metric canonical scale, so fusion / + dedup / threshold logic works the same way across adapters. + + Per-metric output: + + ============ =============================== ====================== + metric formula range + ============ =============================== ====================== + ``cosine`` ``1 - distance`` ``[-1, 1]`` + ``dot`` ``-distance`` ``(-inf, +inf)`` + ``l2`` ``1 / (1 + distance)`` ``(0, 1]`` + ============ =============================== ====================== + + Adapters whose backend already returns a higher-is-better number on a + *different* per-metric scale (Elasticsearch kNN ``_score``, MongoDB + Atlas ``vectorSearchScore``, Pinecone ``match.score``) **do not** route + through this helper — they pass the native score through and document + the deviation. Adapters whose backend returns a lower-is-better distance + (Chroma, DuckDB, FAISS L2, LanceDB, Milvus L2, pgvector, Redis, + sqlite-vec, Turbopuffer, Weaviate) call this helper to canonicalize. Parameters ---------- distance : float - The backend's raw distance (lower = closer). + The backend's raw distance (lower = closer). For ``dot``, this is + the convention some backends use of negating the inner product so + that "distance" sorts the same way; for ``cosine``, the cosine + distance in ``[0, 2]``; for ``l2``, the Euclidean (or squared + Euclidean) distance in ``[0, +inf)``. metric : str - ``"cosine"``, ``"dot"``, or ``"l2"``. + ``"cosine"``, ``"dot"``, or ``"l2"``. 
Unknown metrics fall through + to the ``l2`` formula (bounded for ``distance >= 0``). Examples -------- @@ -36,12 +64,16 @@ def score_from_distance(distance: float, metric: str) -> float: 1.0 >>> round(score_from_distance(1.0, 'l2'), 3) 0.5 + >>> score_from_distance(-0.7, 'dot') + 0.7 """ if metric == "cosine": - # Cosine distance is in [0, 2]; similarity = 1 - distance. + # Cosine distance is in [0, 2]; similarity = 1 - distance ∈ [-1, 1]. return 1.0 - distance if metric == "dot": - # Many backends report negative inner product as the "distance". + # Many backends report negative inner product as the "distance" so + # the smallest "distance" is the most similar; un-negate to recover + # the raw inner product (vd's canonical dot score). return -distance # l2 (and any unknown): squash a non-negative distance into (0, 1]. return 1.0 / (1.0 + distance) diff --git a/vd/backends/faiss.py b/vd/backends/faiss.py index a3e6dd9..46cfdc2 100644 --- a/vd/backends/faiss.py +++ b/vd/backends/faiss.py @@ -31,7 +31,11 @@ "Install with: pip install faiss-cpu numpy" ) from e -from vd.backends._helpers import apply_client_filter, overfetch_limit +from vd.backends._helpers import ( + apply_client_filter, + overfetch_limit, + score_from_distance, +) from vd.base import ( AbstractClient, AbstractCollection, @@ -150,13 +154,25 @@ def _query( if doc_id is None: continue doc = self._docs[doc_id] - # IndexFlatIP -> higher is better; IndexFlatL2 -> lower is better. - value = float(score) + # FAISS conventions vs. vd's canonical score (vd.base "Score + # semantics"): + # - IndexFlatIP (cosine/dot): returns inner product directly, + # higher-is-better. For cosine, vectors are L2-normalized at + # write time, so the inner product IS cosine similarity in + # [-1, 1] — matches vd's cosine score. For dot, the raw + # inner product matches vd's dot score (note: vd's dot + # convention is the raw inner product, NOT the negated form + # that `score_from_distance(..., "dot")` expects as input). 
+ # - IndexFlatL2: returns *squared* L2 distance, lower-is-better. + # Funnel through score_from_distance to canonicalize to + # 1/(1+d) ∈ (0, 1] like every other distance-returning adapter. + raw = float(score) + value = score_from_distance(raw, "l2") if self.metric == "l2" else raw results.append( { "id": doc_id, "text": doc.text, - "score": value if self.metric != "l2" else 1.0 / (1.0 + value), + "score": value, "metadata": dict(doc.metadata), } ) diff --git a/vd/backends/qdrant.py b/vd/backends/qdrant.py index 9bf2871..6c50fa0 100644 --- a/vd/backends/qdrant.py +++ b/vd/backends/qdrant.py @@ -239,6 +239,15 @@ def _query( for point in response.points: payload = point.payload or {} score = point.score + # Qdrant `point.score` per metric (see vd.base "Score semantics"): + # - cosine: cosine similarity in [-1, 1] → matches vd canonical + # - dot: raw inner product → matches vd canonical + # - euclid: a *distance* value (lower-is-better); Qdrant's + # own sort orders ascending in that case. The existing + # transform 1/(1+d) matches vd's canonical l2 score directly + # (no un-negation), so leave it as-is. If a future Qdrant + # client version switches Euclid to higher-is-better, this + # branch must be revisited. results.append( { "id": payload.get(_ID_KEY, str(point.id)), diff --git a/vd/base.py b/vd/base.py index ef4286a..1937a0b 100644 --- a/vd/base.py +++ b/vd/base.py @@ -67,6 +67,43 @@ #: spellings (e.g. ``"l2"`` -> Qdrant ``Distance.EUCLID``). METRICS = frozenset({"cosine", "dot", "l2"}) +# --------------------------------------------------------------------------- # +# Score semantics — the cross-backend contract for SearchResult["score"] +# --------------------------------------------------------------------------- # +# +# Every :data:`SearchResult` carries a ``score`` field. 
``vd``'s contract for +# that number is **higher-is-better, per-metric canonical similarity**: +# +# ============ =============================== ====================== +# metric canonical score range +# ============ =============================== ====================== +# ``cosine`` ``1 - cosine_distance`` ``[-1, 1]`` +# ``dot`` raw inner product ``(-inf, +inf)`` +# ``l2`` ``1 / (1 + euclidean_distance)`` ``(0, 1]`` +# ============ =============================== ====================== +# +# Rationale: +# +# - **Same backend, different metrics** stay comparable (all three are +# higher-is-better). +# - **Same metric, different backends** stay comparable: ``vd``'s own +# ``reciprocal_rank_fusion`` / ``deduplicate_results`` / ``multi_query_search`` +# helpers and consumers like ``ef.SearchHit`` all assume this scale, so an +# adapter that returns ``1 / (1 + raw_distance)`` for cosine instead of +# ``1 - raw_distance`` would still rank correctly in isolation, but would +# break cross-adapter fusion and score-threshold logic. +# +# The reference implementations are :func:`vd.backends.memory._similarity` +# (in-memory adapter) and :func:`vd.backends._helpers.score_from_distance` +# (distance-returning adapters). Adapters whose backend natively returns a +# higher-is-better score on a *different* per-metric scale (e.g. Elasticsearch +# kNN, MongoDB Atlas, Pinecone) **document the deviation in their adapter +# docstring** rather than silently rescaling, because rescaling a backend's +# own combined-ranking score can change ordering for ties. The deviation is +# the cost of using that backend's native scoring. +# +# See issue #9 for the history of this contract. + # Re-exported from vd.filters; imported lazily inside methods to avoid a cycle # (vd.filters imports UnsupportedFilterError from this module). @@ -705,7 +742,13 @@ def search( ------ dict ``{"id", "text", "score", "metadata"}`` — or whatever ``egress`` - returns. + returns. 
``score`` is a higher-is-better, per-metric canonical + similarity (see the "Score semantics" table at the top of + :mod:`vd.base`): cosine in ``[-1, 1]``, dot in ``(-inf, +inf)``, + l2 squashed to ``(0, 1]``. Adapters whose backend returns a + native combined-ranking score on a different scale (e.g. + Elasticsearch, MongoDB Atlas, Pinecone) document the deviation in + their own docstring. """ from vd.filters import validate_filter