From bd583a3db8a976fb756513067cc6d6a4c892a306 Mon Sep 17 00:00:00 2001
From: Humphrey Sun <humphreysun98@gmail.com>
Date: Thu, 4 Jun 2026 15:58:42 -0500
Subject: [PATCH] fix: PGVector.add_embeddings silently truncates on length
 mismatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`PGVector.add_embeddings` and `aadd_embeddings` built the SQL insert
payload via `zip(texts, metadatas, embeddings, ids_)`, which silently
truncates to the shortest argument. They then returned the full `ids_`
list regardless of how many rows were actually written, so an upstream
embedder bug that returned fewer embeddings than texts saw N IDs
returned to the caller but only M rows persisted, with no exception
raised. As called out in #300 (and observed downstream of
langchain-google-genai's `gemini-embedding-2` regression in
langchain-ai/langchain-google#1704).

This change validates lengths up-front via `_validate_lengths_match` —
raising a clear `ValueError` before any DB session is opened — and
adds `strict=True` to the trailing `zip` calls as a backstop. Adds
unit-test coverage that bypasses `PGVector.__init__` (no Postgres
needed) for both sync and async paths.

Fixes #300
---
 langchain_postgres/vectorstores.py            | 39 +++++++-
 .../v1/test_pgvector_length_validation.py     | 88 +++++++++++++++++++
 2 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit_tests/v1/test_pgvector_length_validation.py

diff --git a/langchain_postgres/vectorstores.py b/langchain_postgres/vectorstores.py
index a3743e4b..723c69e0 100644
--- a/langchain_postgres/vectorstores.py
+++ b/langchain_postgres/vectorstores.py
@@ -234,6 +234,39 @@ def _results_to_docs(docs_and_scores: Any) -> List[Document]:
     return [doc for doc, _ in docs_and_scores]
 
 
+def _validate_lengths_match(
+    texts: Sequence[str],
+    embeddings: List[List[float]],
+    metadatas: Optional[List[dict]],
+    ids: Optional[List[str]],
+) -> None:
+    """Raise `ValueError` if `embeddings`/`metadatas`/`ids` lengths disagree
+    with `texts`.
+
+    `PGVector.add_embeddings`/`aadd_embeddings` previously built the insert
+    payload via `zip(texts, metadatas, embeddings, ids_)` which silently
+    truncates to the shortest argument and then returns the full `ids_`
+    list — so a caller passing N texts but receiving only M embeddings
+    saw N IDs returned but only M rows persisted, with no error.
+    """
+    n = len(texts)
+    if len(embeddings) != n:
+        msg = (
+            f"Got {n} texts but {len(embeddings)} embeddings; "
+            "every text must have a matching embedding."
+        )
+        raise ValueError(msg)
+    if metadatas is not None and len(metadatas) != n:
+        msg = (
+            f"Got {n} texts but {len(metadatas)} metadatas; "
+            "every text must have a matching metadata entry."
+        )
+        raise ValueError(msg)
+    if ids is not None and len(ids) != n:
+        msg = f"Got {n} texts but {len(ids)} ids; every text must have a matching id."
+        raise ValueError(msg)
+
+
 def _create_vector_extension(conn: Connection) -> None:
     statement = sqlalchemy.text(
         "SELECT pg_advisory_xact_lock(1573678846307946496);"
@@ -763,6 +796,7 @@ def add_embeddings(
             kwargs: vectorstore specific parameters
         """
         assert not self._async_engine, "This method must be called with sync_mode"
+        _validate_lengths_match(texts, embeddings, metadatas, ids)
         if ids is None:
             ids_ = [str(uuid.uuid4()) for _ in texts]
         else:
@@ -784,7 +818,7 @@ def add_embeddings(
                     "cmetadata": metadata or {},
                 }
                 for text, metadata, embedding, id in zip(
-                    texts, metadatas, embeddings, ids_
+                    texts, metadatas, embeddings, ids_, strict=True
                 )
             ]
             stmt = insert(self.EmbeddingStore).values(data)
@@ -821,6 +855,7 @@ async def aadd_embeddings(
             kwargs: vectorstore specific parameters
         """
         await self.__apost_init__()  # Lazy async init
+        _validate_lengths_match(texts, embeddings, metadatas, ids)
 
         if ids is None:
             ids_ = [str(uuid.uuid4()) for _ in texts]
@@ -843,7 +878,7 @@ async def aadd_embeddings(
                     "cmetadata": metadata or {},
                 }
                 for text, metadata, embedding, id in zip(
-                    texts, metadatas, embeddings, ids_
+                    texts, metadatas, embeddings, ids_, strict=True
                 )
             ]
             stmt = insert(self.EmbeddingStore).values(data)
diff --git a/tests/unit_tests/v1/test_pgvector_length_validation.py b/tests/unit_tests/v1/test_pgvector_length_validation.py
new file mode 100644
index 00000000..195de1d3
--- /dev/null
+++ b/tests/unit_tests/v1/test_pgvector_length_validation.py
@@ -0,0 +1,88 @@
+"""Regression tests pinning that `PGVector.add_embeddings` /
+`aadd_embeddings` raise `ValueError` when input lengths disagree.
+
+Previously these methods built the SQL payload with
+`zip(texts, metadatas, embeddings, ids_)`, which silently truncates to
+the shortest argument and then returns the full `ids_` list — so an
+upstream embedder bug that returned fewer embeddings than texts would
+yield N IDs to the caller but only M rows in the database, with no
+exception. See #300.
+
+These tests bypass `PGVector.__init__` (no Postgres needed) by stubbing
+the few attributes the validation path touches.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from langchain_postgres.vectorstores import PGVector
+
+
+def _make_store() -> PGVector:
+    """Build a barebones `PGVector` that's just enough for the early
+    length-validation block to execute."""
+    store = object.__new__(PGVector)
+    # Sync path: `assert not self._async_engine` runs before validation.
+    store._async_engine = False
+    return store
+
+
+def test_add_embeddings_raises_when_embeddings_shorter_than_texts() -> None:
+    store = _make_store()
+    with pytest.raises(ValueError, match="3 texts but 1 embeddings"):
+        store.add_embeddings(
+            texts=["a", "b", "c"],
+            embeddings=[[0.1, 0.2]],
+        )
+
+
+def test_add_embeddings_raises_when_metadatas_length_mismatches() -> None:
+    store = _make_store()
+    with pytest.raises(ValueError, match="2 texts but 3 metadatas"):
+        store.add_embeddings(
+            texts=["a", "b"],
+            embeddings=[[0.1], [0.2]],
+            metadatas=[{}, {}, {}],
+        )
+
+
+def test_add_embeddings_raises_when_ids_length_mismatches() -> None:
+    store = _make_store()
+    with pytest.raises(ValueError, match="2 texts but 1 ids"):
+        store.add_embeddings(
+            texts=["a", "b"],
+            embeddings=[[0.1], [0.2]],
+            ids=["only-one"],
+        )
+
+
+def test_add_embeddings_does_not_raise_when_lengths_match() -> None:
+    """Validation must not fire when shapes line up. We don't reach the DB
+    layer here — the test uses an attribute access on a stubbed session
+    helper to detect that validation passed and execution continued."""
+    store = _make_store()
+    # Validation lives before any DB session is opened, so reaching the
+    # `with self._make_sync_session()` line should raise AttributeError on
+    # our stub. That is the signal we want — the ValueError path was NOT
+    # taken.
+    with pytest.raises(AttributeError):
+        store.add_embeddings(
+            texts=["a", "b"],
+            embeddings=[[0.1], [0.2]],
+            metadatas=[{}, {}],
+            ids=["x", "y"],
+        )
+
+
+@pytest.mark.asyncio
+async def test_aadd_embeddings_raises_when_embeddings_shorter_than_texts() -> None:
+    store = _make_store()
+    # `__apost_init__` short-circuits when `_async_init` is True, so the
+    # async validation block runs without touching the DB engine.
+    store._async_init = True
+    with pytest.raises(ValueError, match="3 texts but 2 embeddings"):
+        await store.aadd_embeddings(
+            texts=["a", "b", "c"],
+            embeddings=[[0.1], [0.2]],
+        )