From 76137d80d69b363cd420a549f28b0af9b38c7190 Mon Sep 17 00:00:00 2001 From: Derek Worthen Date: Mon, 27 Jan 2025 14:49:49 -0800 Subject: [PATCH 01/12] Add vector store id reference to embeddings config. --- .../patch-20250127224919088925.json | 4 +++ graphrag/config/defaults.py | 2 +- graphrag/config/embeddings.py | 14 +++------- graphrag/config/init_content.py | 3 ++- graphrag/config/models/graph_rag_config.py | 26 ++++++++++++++++++- .../config/models/text_embedding_config.py | 4 +++ tests/fixtures/azure/settings.yml | 2 +- tests/fixtures/min-csv/settings.yml | 2 +- tests/fixtures/text/settings.yml | 2 +- tests/unit/config/utils.py | 2 +- 10 files changed, 43 insertions(+), 18 deletions(-) create mode 100644 .semversioner/next-release/patch-20250127224919088925.json diff --git a/.semversioner/next-release/patch-20250127224919088925.json b/.semversioner/next-release/patch-20250127224919088925.json new file mode 100644 index 0000000000..5e0d890434 --- /dev/null +++ b/.semversioner/next-release/patch-20250127224919088925.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Add vector store id reference to embeddings config." +} diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index cb961b8b51..f33985db62 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -106,7 +106,7 @@ VECTOR_STORE_DB_URI = str(Path(OUTPUT_BASE_DIR) / "lancedb") VECTOR_STORE_CONTAINER_NAME = "default" VECTOR_STORE_OVERWRITE = True -VECTOR_STORE_INDEX_NAME = "output" +VECTOR_STORE_DEFAULT_ID = "default_vector_store" # Local Search LOCAL_SEARCH_TEXT_UNIT_PROP = 0.5 diff --git a/graphrag/config/embeddings.py b/graphrag/config/embeddings.py index a322290125..11fa82ef08 100644 --- a/graphrag/config/embeddings.py +++ b/graphrag/config/embeddings.py @@ -57,18 +57,10 @@ def get_embedding_settings( embeddings_llm_settings = settings.get_language_model_config( settings.embeddings.model_id ) - num_entries = len(settings.vector_store) - if num_entries == 1: - store = next(iter(settings.vector_store.values())) - vector_store_settings = store.model_dump() - else: - # The vector_store dict should only have more than one entry for multi-index query - vector_store_settings = None + vector_store_settings = settings.get_vector_store_config( + settings.embeddings.vector_store_id + ).model_dump() - if vector_store_settings is None: - return { - "strategy": settings.embeddings.resolved_strategy(embeddings_llm_settings) - } # # If we get to this point, settings.vector_store is defined, and there's a specific setting for this embedding. # settings.vector_store.base contains connection information, or may be undefined diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py index f510ef7f2b..eccd05e4eb 100644 --- a/graphrag/config/init_content.py +++ b/graphrag/config/init_content.py @@ -40,7 +40,7 @@ # deployment_name: vector_store: - {defs.VECTOR_STORE_INDEX_NAME}: + {defs.VECTOR_STORE_DEFAULT_ID}: type: {defs.VECTOR_STORE_TYPE} db_uri: {defs.VECTOR_STORE_DB_URI} container_name: {defs.VECTOR_STORE_CONTAINER_NAME} @@ -48,6 +48,7 @@ embeddings: model_id: {defs.DEFAULT_EMBEDDING_MODEL_ID} + vector_store_id: {defs.VECTOR_STORE_DEFAULT_ID} ### Input settings ### diff --git a/graphrag/config/models/graph_rag_config.py b/graphrag/config/models/graph_rag_config.py index 9c50714f7d..1e5fd84ce1 100644 --- a/graphrag/config/models/graph_rag_config.py +++ b/graphrag/config/models/graph_rag_config.py @@ -226,7 +226,7 @@ def _validate_update_index_output_base_dir(self) -> None: vector_store: dict[str, VectorStoreConfig] = Field( description="The vector store configuration.", - default={"output": VectorStoreConfig()}, + default={defs.VECTOR_STORE_DEFAULT_ID: VectorStoreConfig()}, ) """The vector store configuration.""" @@ -263,6 +263,30 @@ def get_language_model_config(self, model_id: str) -> LanguageModelConfig: return self.models[model_id] + def get_vector_store_config(self, vector_store_id: str) -> VectorStoreConfig: + """Get a vector store configuration by ID. + + Parameters + ---------- + vector_store_id : str + The ID of the vector store to get. Should match an ID in the vector_store list. + + Returns + ------- + VectorStoreConfig + The vector store configuration if found. + + Raises + ------ + ValueError + If the vector store ID is not found in the configuration. + """ + if vector_store_id not in self.vector_store: + err_msg = f"Vector Store ID {vector_store_id} not found in configuration. Please rerun `graphrag init` and set the vector store configuration." + raise ValueError(err_msg) + + return self.vector_store[vector_store_id] + @model_validator(mode="after") def _validate_model(self): """Validate the model configuration.""" diff --git a/graphrag/config/models/text_embedding_config.py b/graphrag/config/models/text_embedding_config.py index c26e13ee08..9a8763fd12 100644 --- a/graphrag/config/models/text_embedding_config.py +++ b/graphrag/config/models/text_embedding_config.py @@ -34,6 +34,10 @@ class TextEmbeddingConfig(BaseModel): description="The model ID to use for text embeddings.", default=defs.EMBEDDING_MODEL_ID, ) + vector_store_id: str = Field( + description="The vector store ID to use for text embeddings.", + default=defs.VECTOR_STORE_DEFAULT_ID, + ) def resolved_strategy(self, model_config: LanguageModelConfig) -> dict: """Get the resolved text embedding strategy.""" diff --git a/tests/fixtures/azure/settings.yml b/tests/fixtures/azure/settings.yml index 3f054b6717..6303c771c1 100644 --- a/tests/fixtures/azure/settings.yml +++ b/tests/fixtures/azure/settings.yml @@ -3,7 +3,7 @@ claim_extraction: embeddings: vector_store: - output: + default_vector_store: type: "azure_ai_search" url: ${AZURE_AI_SEARCH_URL_ENDPOINT} api_key: ${AZURE_AI_SEARCH_API_KEY} diff --git a/tests/fixtures/min-csv/settings.yml b/tests/fixtures/min-csv/settings.yml index 09642c9260..ebd9b5f31b 100644 --- a/tests/fixtures/min-csv/settings.yml +++ b/tests/fixtures/min-csv/settings.yml @@ -26,7 +26,7 @@ models: async_mode: threaded vector_store: - output: + default_vector_store: type: "lancedb" db_uri: "./tests/fixtures/min-csv/lancedb" container_name: "lancedb_ci" diff --git a/tests/fixtures/text/settings.yml b/tests/fixtures/text/settings.yml index 09b5f13d38..d05d384d97 100644 --- a/tests/fixtures/text/settings.yml +++ b/tests/fixtures/text/settings.yml @@ -26,7 +26,7 @@ models: async_mode: threaded vector_store: - output: + default_vector_store: type: "azure_ai_search" url: ${AZURE_AI_SEARCH_URL_ENDPOINT} api_key: ${AZURE_AI_SEARCH_API_KEY} diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index d231b5c277..6535f448e9 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -50,7 +50,7 @@ DEFAULT_GRAPHRAG_CONFIG_SETTINGS = { "models": DEFAULT_MODEL_CONFIG, "vector_store": { - "output": { + defs.VECTOR_STORE_DEFAULT_ID: { "type": defs.VECTOR_STORE_TYPE, "db_uri": defs.VECTOR_STORE_DB_URI, "container_name": defs.VECTOR_STORE_CONTAINER_NAME, From 75a5aebc793b61c9e1416a150f44b22ec4732f48 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Thu, 27 Mar 2025 16:36:05 -0400 Subject: [PATCH 02/12] generated initial vector store pytests --- tests/integration/vector_stores/__init__.py | 4 + .../vector_stores/test_azure_ai_search.py | 142 +++++++++++++ .../vector_stores/test_cosmosdb.py | 131 ++++++++++++ .../integration/vector_stores/test_lancedb.py | 187 ++++++++++++++++++ 4 files changed, 464 insertions(+) create mode 100644 tests/integration/vector_stores/__init__.py create mode 100644 tests/integration/vector_stores/test_azure_ai_search.py create mode 100644 tests/integration/vector_stores/test_cosmosdb.py create mode 100644 tests/integration/vector_stores/test_lancedb.py diff --git a/tests/integration/vector_stores/__init__.py b/tests/integration/vector_stores/__init__.py new file mode 100644 index 0000000000..742cd43d0f --- /dev/null +++ b/tests/integration/vector_stores/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Integration tests for vector store implementations.""" \ No newline at end of file diff --git a/tests/integration/vector_stores/test_azure_ai_search.py b/tests/integration/vector_stores/test_azure_ai_search.py new file mode 100644 index 0000000000..5b1ebca387 --- /dev/null +++ b/tests/integration/vector_stores/test_azure_ai_search.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Integration tests for Azure AI Search vector store implementation.""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + +from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore +from graphrag.vector_stores.base import VectorStoreDocument + +# This could be set to real testing values when running in a pipeline +TEST_AZURE_AI_SEARCH_URL = os.environ.get("TEST_AZURE_AI_SEARCH_URL", "https://test-url.search.windows.net") +TEST_AZURE_AI_SEARCH_KEY = os.environ.get("TEST_AZURE_AI_SEARCH_KEY", "test_api_key") + +@pytest.mark.integration +class TestAzureAISearchVectorStore: + """Test class for AzureAISearchVectorStore.""" + + @pytest.fixture + def mock_search_client(self): + """Create a mock Azure AI Search client.""" + with patch("graphrag.vector_stores.azure_ai_search.SearchClient") as mock_client: + yield mock_client.return_value + + @pytest.fixture + def mock_index_client(self): + """Create a mock Azure AI Search index client.""" + with patch("graphrag.vector_stores.azure_ai_search.SearchIndexClient") as mock_client: + yield mock_client.return_value + + @pytest.fixture + def vector_store(self, mock_search_client, mock_index_client): + """Create an Azure AI Search vector store instance.""" + vector_store = AzureAISearchVectorStore(collection_name="test_vectors") + + # Create the necessary mocks first + vector_store.db_connection = mock_search_client + vector_store.index_client = mock_index_client + + vector_store.connect( + url=TEST_AZURE_AI_SEARCH_URL, + api_key=TEST_AZURE_AI_SEARCH_KEY, + vector_size=5 + ) + return vector_store + + @pytest.fixture + def sample_documents(self): + """Create sample documents for testing.""" + return [ + VectorStoreDocument( + id="doc1", + text="This is document 1", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Doc 1", "category": "test"}, + ), + VectorStoreDocument( + id="doc2", + text="This is document 2", + vector=[0.2, 0.3, 0.4, 0.5, 0.6], + attributes={"title": "Doc 2", "category": "test"}, + ), + ] + + async def test_vector_store_operations(self, vector_store, sample_documents, mock_search_client, mock_index_client): + """Test basic vector store operations with Azure AI Search.""" + # Setup mock responses + mock_index_client.list_index_names.return_value = [] + mock_index_client.create_or_update_index = MagicMock() + mock_search_client.upload_documents = MagicMock() + + search_results = [ + { + "id": "doc1", + "text": "This is document 1", + "vector": [0.1, 0.2, 0.3, 0.4, 0.5], + "attributes": '{"title": "Doc 1", "category": "test"}', + "@search.score": 0.9 + }, + { + "id": "doc2", + "text": "This is document 2", + "vector": [0.2, 0.3, 0.4, 0.5, 0.6], + "attributes": '{"title": "Doc 2", "category": "test"}', + "@search.score": 0.8 + } + ] + mock_search_client.search.return_value = search_results + + mock_search_client.get_document.return_value = { + "id": "doc1", + "text": "This is document 1", + "vector": [0.1, 0.2, 0.3, 0.4, 0.5], + "attributes": '{"title": "Doc 1", "category": "test"}' + } + + # Test loading documents + vector_store.load_documents(sample_documents) + assert mock_index_client.create_or_update_index.called + assert mock_search_client.upload_documents.called + + # Test filter_by_id + filter_query = vector_store.filter_by_id(["doc1", "doc2"]) + assert filter_query == "search.in(id, 'doc1,doc2', ',')" + + # Test vector similarity search + vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + assert len(vector_results) == 2 + assert vector_results[0].document.id == "doc1" + assert vector_results[0].score == 0.9 + + # Define a simple text embedder function for testing + def mock_embedder(text: str) -> list[float]: + return [0.1, 0.2, 0.3, 0.4, 0.5] # Return fixed embedding + + # Test text similarity search + text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + assert len(text_results) == 2 + + # Test search by ID + doc = vector_store.search_by_id("doc1") + assert doc.id == "doc1" + assert doc.text == "This is document 1" + assert doc.attributes["title"] == "Doc 1" + + async def test_empty_embedding(self, vector_store, mock_search_client): + """Test similarity search by text with empty embedding.""" + # Create a mock embedder that returns None + def none_embedder(text: str) -> None: + return None + + # Test the search + results = vector_store.similarity_search_by_text("test query", none_embedder, k=1) + + # Verify no search was performed + assert not mock_search_client.search.called + + # Verify empty results + assert len(results) == 0 \ No newline at end of file diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py new file mode 100644 index 0000000000..235cb62996 --- /dev/null +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Integration tests for CosmosDB vector store implementation.""" + +import sys +import json +from datetime import datetime + +import pytest + +from graphrag.vector_stores.base import VectorStoreDocument +from graphrag.vector_stores.cosmosdb import CosmosDBVectoreStore + +# cspell:disable-next-line well-known-key +WELL_KNOWN_COSMOS_CONNECTION_STRING = "AccountEndpoint=https://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" + +# the cosmosdb emulator is only available on windows runners at this time +if not sys.platform.startswith("win"): + pytest.skip( + "encountered windows-only tests -- will skip for now", allow_module_level=True + ) + +async def test_vector_store_operations(): + """Test basic vector store operations with CosmosDB.""" + vector_store = CosmosDBVectoreStore( + collection_name="testvector", + ) + + try: + vector_store.connect( + connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, + database_name="testdb", + ) + + # Create test documents + docs = [ + VectorStoreDocument( + id="doc1", + text="This is document 1", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Doc 1", "category": "test"}, + ), + VectorStoreDocument( + id="doc2", + text="This is document 2", + vector=[0.2, 0.3, 0.4, 0.5, 0.6], + attributes={"title": "Doc 2", "category": "test"}, + ), + ] + + # Load documents + vector_store.load_documents(docs) + + # Test filtering by ID + vector_store.filter_by_id(["doc1"]) + + # Test search by ID + doc = vector_store.search_by_id("doc1") + assert doc.id == "doc1" + assert doc.text == "This is document 1" + assert doc.vector == [0.1, 0.2, 0.3, 0.4, 0.5] + assert doc.attributes["title"] == "Doc 1" + + # Define a simple text embedder function for testing + def mock_embedder(text: str) -> list[float]: + return [0.1, 0.2, 0.3, 0.4, 0.5] # Return fixed embedding + + # Test vector similarity search + vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + assert len(vector_results) > 0 + + # Test text similarity search + text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + assert len(text_results) > 0 + finally: + # Clean up + await vector_store.clear() + + +async def test_child(): + """Test child container functionality.""" + parent = CosmosDBVectoreStore( + collection_name="testparent", + ) + try: + parent.connect( + connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, + database_name="testchild", + ) + + # Test that child returns the correct type + child = parent.child("testchild") + assert isinstance(child, CosmosDBVectoreStore) + finally: + await parent.clear() + + +async def test_clear(): + """Test clearing the vector store.""" + vector_store = CosmosDBVectoreStore( + collection_name="testclear", + ) + try: + vector_store.connect( + connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, + database_name="testclear", + ) + + # Create a document + doc = VectorStoreDocument( + id="test", + text="Test document", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Test Doc"}, + ) + + # Load document and verify + vector_store.load_documents([doc]) + result = vector_store.search_by_id("test") + assert result.id == "test" + + # Clear and verify document is removed + await vector_store.clear() + + # After clear, container should be gone, so search_by_id would fail + # We just verify container client is None as evidence of cleanup + assert vector_store._container_client is None + assert vector_store._database_client is None + finally: + await vector_store.clear() \ No newline at end of file diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py new file mode 100644 index 0000000000..cc25d1231b --- /dev/null +++ b/tests/integration/vector_stores/test_lancedb.py @@ -0,0 +1,187 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Integration tests for LanceDB vector store implementation.""" + + +import shutil +import tempfile + +import numpy as np + +from graphrag.vector_stores.base import VectorStoreDocument +from graphrag.vector_stores.lancedb import LanceDBVectorStore + + +def test_vector_store_operations(): + """Test basic vector store operations with LanceDB.""" + # Create a temporary directory for the test database + temp_dir = tempfile.mkdtemp() + try: + # Initialize the vector store + vector_store = LanceDBVectorStore(collection_name="test_collection") + vector_store.connect(db_uri=temp_dir) + + # Create test documents + docs = [ + VectorStoreDocument( + id="1", + text="This is document 1", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Doc 1", "category": "test"}, + ), + VectorStoreDocument( + id="2", + text="This is document 2", + vector=[0.2, 0.3, 0.4, 0.5, 0.6], + attributes={"title": "Doc 2", "category": "test"}, + ), + VectorStoreDocument( + id="3", + text="This is document 3", + vector=[0.3, 0.4, 0.5, 0.6, 0.7], + attributes={"title": "Doc 3", "category": "test"}, + ), + ] + + # Load documents + vector_store.load_documents(docs[:2]) + + # Test collection exists + assert vector_store.collection_name in vector_store.db_connection.table_names() + + # Test search by ID + doc = vector_store.search_by_id("1") + assert doc.id == "1" + assert doc.text == "This is document 1" + + # Changed to compare vectors using np.allclose for approximate equality + assert doc.vector is not None + assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5], rtol=1e-5) + assert doc.attributes["title"] == "Doc 1" + + # Test filter by ID + filter_query = vector_store.filter_by_id(["1"]) + assert filter_query == "id in ('1')" + + # Test vector similarity search + results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + + # Modified to be more flexible with result count since we only have 2 documents total + assert 1 <= len(results) <= 2 + assert isinstance(results[0].score, float) + + # Test append mode + vector_store.load_documents([docs[2]], overwrite=False) + result = vector_store.search_by_id("3") + assert result.id == "3" + assert result.text == "This is document 3" + + # Define a simple text embedder function for testing + def mock_embedder(text: str) -> list[float]: + return [0.1, 0.2, 0.3, 0.4, 0.5] + + # Test text similarity search + text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + assert 1 <= len(text_results) <= 2 + assert isinstance(text_results[0].score, float) + + # Test non-existent document + non_existent = vector_store.search_by_id("nonexistent") + assert non_existent.id == "nonexistent" + assert non_existent.text is None + assert non_existent.vector is None + finally: + # Clean up - remove the temporary directory + shutil.rmtree(temp_dir) + + +def test_empty_collection(): + """Test creating an empty collection.""" + # Create a temporary directory for the test database + temp_dir = tempfile.mkdtemp() + try: + # Initialize the vector store + vector_store = LanceDBVectorStore(collection_name="empty_collection") + vector_store.connect(db_uri=temp_dir) + + # First create a schema with a sample document, then delete it + sample_doc = VectorStoreDocument( + id="tmp", + text="Temporary document to create schema", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Tmp"} + ) + vector_store.load_documents([sample_doc]) + + # Now clear and check the collection still exists + vector_store.db_connection.open_table(vector_store.collection_name).delete("id = 'tmp'") + + # Should still have the collection + assert vector_store.collection_name in vector_store.db_connection.table_names() + + # Add a document after creating an empty collection + doc = VectorStoreDocument( + id="1", + text="This is document 1", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"title": "Doc 1"}, + ) + vector_store.load_documents([doc], overwrite=False) + + result = vector_store.search_by_id("1") + assert result.id == "1" + assert result.text == "This is document 1" + finally: + # Clean up - remove the temporary directory + shutil.rmtree(temp_dir) + + +def test_filter_search(): + """Test filtered search with LanceDB.""" + # Create a temporary directory for the test database + temp_dir = tempfile.mkdtemp() + try: + # Initialize the vector store + vector_store = LanceDBVectorStore(collection_name="filter_collection") + vector_store.connect(db_uri=temp_dir) + + # Create test documents with different categories + docs = [ + VectorStoreDocument( + id="1", + text="Document about cats", + vector=[0.1, 0.2, 0.3, 0.4, 0.5], + attributes={"category": "animals"}, + ), + VectorStoreDocument( + id="2", + text="Document about dogs", + vector=[0.2, 0.3, 0.4, 0.5, 0.6], + attributes={"category": "animals"}, + ), + VectorStoreDocument( + id="3", + text="Document about cars", + vector=[0.3, 0.4, 0.5, 0.6, 0.7], + attributes={"category": "vehicles"}, + ), + ] + + # Load documents + vector_store.load_documents(docs) + + # Filter to include only documents about animals + vector_store.filter_by_id(["1", "2"]) + + # Search with the filter applied + results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=3) + + # Should return at most 2 documents (the filtered ones) + assert len(results) <= 2 + ids = [result.document.id for result in results] + assert "3" not in ids + assert set(ids).issubset({"1", "2"}) + finally: + # Clean up - remove the temporary directory + shutil.rmtree(temp_dir) \ No newline at end of file From 8753c8933dff49fd49a23cec83d965af001f001d Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Fri, 28 Mar 2025 12:11:14 -0400 Subject: [PATCH 03/12] cleaned up cosmosdb vector store test --- graphrag/vector_stores/cosmosdb.py | 12 +++++ .../vector_stores/test_cosmosdb.py | 48 ++++--------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py index c674ac4b52..f47bf94ded 100644 --- a/graphrag/vector_stores/cosmosdb.py +++ b/graphrag/vector_stores/cosmosdb.py @@ -214,3 +214,15 @@ def search_by_id(self, id: str) -> VectorStoreDocument: text=item.get("text", ""), attributes=(json.loads(item.get("attributes", "{}"))), ) + + def clear(self) -> None: + """Clear the vector store.""" + if self._database_client is None: + msg = "Database client is not initialized." + raise ValueError(msg) + if self._container_client is None: + msg = "Container client is not initialized." + raise ValueError(msg) + + self._delete_container() + self._delete_database() \ No newline at end of file diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index 235cb62996..f4d588ee27 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -4,8 +4,6 @@ """Integration tests for CosmosDB vector store implementation.""" import sys -import json -from datetime import datetime import pytest @@ -21,7 +19,7 @@ "encountered windows-only tests -- will skip for now", allow_module_level=True ) -async def test_vector_store_operations(): +def test_vector_store_operations(): """Test basic vector store operations with CosmosDB.""" vector_store = CosmosDBVectoreStore( collection_name="testvector", @@ -33,7 +31,7 @@ async def test_vector_store_operations(): database_name="testdb", ) - # Create test documents + # Create test documents and load docs = [ VectorStoreDocument( id="doc1", @@ -42,14 +40,12 @@ async def test_vector_store_operations(): attributes={"title": "Doc 1", "category": "test"}, ), VectorStoreDocument( - id="doc2", - text="This is document 2", + id="doc2", + text="This is document 2", vector=[0.2, 0.3, 0.4, 0.5, 0.6], attributes={"title": "Doc 2", "category": "test"}, ), ] - - # Load documents vector_store.load_documents(docs) # Test filtering by ID @@ -74,29 +70,10 @@ def mock_embedder(text: str) -> list[float]: text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) assert len(text_results) > 0 finally: - # Clean up - await vector_store.clear() - - -async def test_child(): - """Test child container functionality.""" - parent = CosmosDBVectoreStore( - collection_name="testparent", - ) - try: - parent.connect( - connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, - database_name="testchild", - ) - - # Test that child returns the correct type - child = parent.child("testchild") - assert isinstance(child, CosmosDBVectoreStore) - finally: - await parent.clear() + vector_store.clear() -async def test_clear(): +def test_clear(): """Test clearing the vector store.""" vector_store = CosmosDBVectoreStore( collection_name="testclear", @@ -107,7 +84,6 @@ async def test_clear(): database_name="testclear", ) - # Create a document doc = VectorStoreDocument( id="test", text="Test document", @@ -115,17 +91,13 @@ async def test_clear(): attributes={"title": "Test Doc"}, ) - # Load document and verify vector_store.load_documents([doc]) result = vector_store.search_by_id("test") assert result.id == "test" # Clear and verify document is removed - await vector_store.clear() - - # After clear, container should be gone, so search_by_id would fail - # We just verify container client is None as evidence of cleanup - assert vector_store._container_client is None - assert vector_store._database_client is None + vector_store.clear() + assert vector_store._database_exists() is False # noqa: SLF001 + assert vector_store._container_exists() is False # noqa: SLF001 finally: - await vector_store.clear() \ No newline at end of file + pass \ No newline at end of file From d6fb4e2360d0c4228fc403f6d85e09a08019331d Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Fri, 28 Mar 2025 15:38:42 -0400 Subject: [PATCH 04/12] fixed class name typo and debugged cosmosdb vector store test --- graphrag/vector_stores/cosmosdb.py | 47 ++++++++++++++----- graphrag/vector_stores/factory.py | 4 +- .../vector_stores/test_azure_ai_search.py | 1 - .../vector_stores/test_cosmosdb.py | 9 ++-- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py index f47bf94ded..1bb86edf88 100644 --- a/graphrag/vector_stores/cosmosdb.py +++ b/graphrag/vector_stores/cosmosdb.py @@ -7,6 +7,7 @@ from typing import Any from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy +from azure.cosmos.exceptions import CosmosHttpResponseError from azure.cosmos.partition_key import PartitionKey from azure.identity import DefaultAzureCredential @@ -19,7 +20,7 @@ ) -class CosmosDBVectoreStore(BaseVectorStore): +class CosmosDBVectorStore(BaseVectorStore): """Azure CosmosDB vector storage implementation.""" _cosmos_client: CosmosClient @@ -157,13 +158,40 @@ def similarity_search_by_vector( msg = "Container client is not initialized." raise ValueError(msg) - query = f"SELECT TOP {k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)" # noqa: S608 - query_params = [{"name": "@embedding", "value": query_embedding}] - items = self._container_client.query_items( - query=query, - parameters=query_params, - enable_cross_partition_query=True, - ) + try: + query = f"SELECT TOP {k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)" # noqa: S608 + query_params = [{"name": "@embedding", "value": query_embedding}] + items = list(self._container_client.query_items( + query=query, + parameters=query_params, + enable_cross_partition_query=True, + )) + except (CosmosHttpResponseError, ValueError): + # Currently, the CosmosDB emulator does not support the VectorDistance function. + # For emulator or test environments - fetch all items and calculate distance locally + query = "SELECT c.id, c.text, c.vector, c.attributes FROM c" + items = list(self._container_client.query_items( + query=query, + enable_cross_partition_query=True, + )) + + # Calculate cosine similarity locally (1 - cosine distance) + from numpy import dot + from numpy.linalg import norm + + def cosine_similarity(a, b): + if norm(a) * norm(b) == 0: + return 0.0 + return dot(a, b) / (norm(a) * norm(b)) + + # Calculate scores for all items + for item in items: + item_vector = item.get("vector", []) + similarity = cosine_similarity(query_embedding, item_vector) + item["SimilarityScore"] = similarity + + # Sort by similarity score (higher is better) and take top k + items = sorted(items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True)[:k] return [ VectorStoreSearchResult( @@ -217,9 +245,6 @@ def search_by_id(self, id: str) -> VectorStoreDocument: def clear(self) -> None: """Clear the vector store.""" - if self._database_client is None: - msg = "Database client is not initialized." - raise ValueError(msg) if self._container_client is None: msg = "Container client is not initialized." raise ValueError(msg) diff --git a/graphrag/vector_stores/factory.py b/graphrag/vector_stores/factory.py index 1c37316d0c..d1dd3e42e3 100644 --- a/graphrag/vector_stores/factory.py +++ b/graphrag/vector_stores/factory.py @@ -8,7 +8,7 @@ from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore from graphrag.vector_stores.base import BaseVectorStore -from graphrag.vector_stores.cosmosdb import CosmosDBVectoreStore +from graphrag.vector_stores.cosmosdb import CosmosDBVectorStore from graphrag.vector_stores.lancedb import LanceDBVectorStore @@ -44,7 +44,7 @@ def create_vector_store( case VectorStoreType.AzureAISearch: return AzureAISearchVectorStore(**kwargs) case VectorStoreType.CosmosDB: - return CosmosDBVectoreStore(**kwargs) + return CosmosDBVectorStore(**kwargs) case _: if vector_store_type in cls.vector_store_types: return cls.vector_store_types[vector_store_type](**kwargs) diff --git a/tests/integration/vector_stores/test_azure_ai_search.py b/tests/integration/vector_stores/test_azure_ai_search.py index 5b1ebca387..349f470ec1 100644 --- a/tests/integration/vector_stores/test_azure_ai_search.py +++ b/tests/integration/vector_stores/test_azure_ai_search.py @@ -15,7 +15,6 @@ TEST_AZURE_AI_SEARCH_URL = os.environ.get("TEST_AZURE_AI_SEARCH_URL", "https://test-url.search.windows.net") TEST_AZURE_AI_SEARCH_KEY = os.environ.get("TEST_AZURE_AI_SEARCH_KEY", "test_api_key") -@pytest.mark.integration class TestAzureAISearchVectorStore: """Test class for AzureAISearchVectorStore.""" diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index f4d588ee27..c8ef8d3084 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -8,10 +8,10 @@ import pytest from graphrag.vector_stores.base import VectorStoreDocument -from graphrag.vector_stores.cosmosdb import CosmosDBVectoreStore +from graphrag.vector_stores.cosmosdb import CosmosDBVectorStore # cspell:disable-next-line well-known-key -WELL_KNOWN_COSMOS_CONNECTION_STRING = "AccountEndpoint=https://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" +WELL_KNOWN_COSMOS_CONNECTION_STRING = "AccountEndpoint=http://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" # the cosmosdb emulator is only available on windows runners at this time if not sys.platform.startswith("win"): @@ -21,7 +21,7 @@ def test_vector_store_operations(): """Test basic vector store operations with CosmosDB.""" - vector_store = CosmosDBVectoreStore( + vector_store = CosmosDBVectorStore( collection_name="testvector", ) @@ -75,7 +75,7 @@ def mock_embedder(text: str) -> list[float]: def test_clear(): """Test clearing the vector store.""" - vector_store = CosmosDBVectoreStore( + vector_store = CosmosDBVectorStore( collection_name="testclear", ) try: @@ -98,6 +98,5 @@ def test_clear(): # Clear and verify document is removed vector_store.clear() assert vector_store._database_exists() is False # noqa: SLF001 - assert vector_store._container_exists() is False # noqa: SLF001 finally: pass \ No newline at end of file From 72fd984abe074c4897923149cf5dbf04ca438972 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Fri, 28 Mar 2025 15:39:40 -0400 Subject: [PATCH 05/12] reset emulator connection string --- tests/integration/vector_stores/test_cosmosdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index c8ef8d3084..2402fd76e1 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -11,7 +11,7 @@ from graphrag.vector_stores.cosmosdb import CosmosDBVectorStore # cspell:disable-next-line well-known-key -WELL_KNOWN_COSMOS_CONNECTION_STRING = "AccountEndpoint=http://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" +WELL_KNOWN_COSMOS_CONNECTION_STRING = "AccountEndpoint=https://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" # the cosmosdb emulator is only available on windows runners at this time if not sys.platform.startswith("win"): From 0acab7d4a98c6b559b71ea5f68a2c0f339f5b0f9 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 11:18:24 -0400 Subject: [PATCH 06/12] remove unneccessary comments --- .../vector_stores/test_azure_ai_search.py | 1 - .../vector_stores/test_cosmosdb.py | 5 ---- .../integration/vector_stores/test_lancedb.py | 23 +------------------ 3 files changed, 1 insertion(+), 28 deletions(-) diff --git a/tests/integration/vector_stores/test_azure_ai_search.py b/tests/integration/vector_stores/test_azure_ai_search.py index 349f470ec1..c3132a2ade 100644 --- a/tests/integration/vector_stores/test_azure_ai_search.py +++ b/tests/integration/vector_stores/test_azure_ai_search.py @@ -11,7 +11,6 @@ from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore from graphrag.vector_stores.base import VectorStoreDocument -# This could be set to real testing values when running in a pipeline TEST_AZURE_AI_SEARCH_URL = os.environ.get("TEST_AZURE_AI_SEARCH_URL", "https://test-url.search.windows.net") TEST_AZURE_AI_SEARCH_KEY = os.environ.get("TEST_AZURE_AI_SEARCH_KEY", "test_api_key") diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index 2402fd76e1..a1fe5a8490 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -31,7 +31,6 @@ def test_vector_store_operations(): database_name="testdb", ) - # Create test documents and load docs = [ VectorStoreDocument( id="doc1", @@ -48,10 +47,8 @@ def test_vector_store_operations(): ] vector_store.load_documents(docs) - # Test filtering by ID vector_store.filter_by_id(["doc1"]) - # Test search by ID doc = vector_store.search_by_id("doc1") assert doc.id == "doc1" assert doc.text == "This is document 1" @@ -62,11 +59,9 @@ def test_vector_store_operations(): def mock_embedder(text: str) -> list[float]: return [0.1, 0.2, 0.3, 0.4, 0.5] # Return fixed embedding - # Test vector similarity search vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) assert len(vector_results) > 0 - # Test text similarity search text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) assert len(text_results) > 0 finally: diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py index cc25d1231b..05823893d1 100644 --- a/tests/integration/vector_stores/test_lancedb.py +++ b/tests/integration/vector_stores/test_lancedb.py @@ -18,11 +18,9 @@ def test_vector_store_operations(): # Create a temporary directory for the test database temp_dir = tempfile.mkdtemp() try: - # Initialize the vector store vector_store = LanceDBVectorStore(collection_name="test_collection") vector_store.connect(db_uri=temp_dir) - # Create test documents docs = [ VectorStoreDocument( id="1", @@ -43,14 +41,10 @@ def test_vector_store_operations(): attributes={"title": "Doc 3", "category": "test"}, ), ] - - # Load documents vector_store.load_documents(docs[:2]) - # Test collection exists assert vector_store.collection_name in vector_store.db_connection.table_names() - # Test search by ID doc = vector_store.search_by_id("1") assert doc.id == "1" assert doc.text == "This is document 1" @@ -60,14 +54,10 @@ def test_vector_store_operations(): assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5], rtol=1e-5) assert doc.attributes["title"] == "Doc 1" - # Test filter by ID filter_query = vector_store.filter_by_id(["1"]) assert filter_query == "id in ('1')" - # Test vector similarity search results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) - - # Modified to be more flexible with result count since we only have 2 documents total assert 1 <= len(results) <= 2 assert isinstance(results[0].score, float) @@ -81,7 +71,6 @@ def test_vector_store_operations(): def mock_embedder(text: str) -> list[float]: return [0.1, 0.2, 0.3, 0.4, 0.5] - # Test text similarity search text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) assert 1 <= len(text_results) <= 2 assert isinstance(text_results[0].score, float) @@ -92,7 +81,6 @@ def mock_embedder(text: str) -> list[float]: assert non_existent.text is None assert non_existent.vector is None finally: - # Clean up - remove the temporary directory shutil.rmtree(temp_dir) @@ -101,11 +89,10 @@ def test_empty_collection(): # Create a temporary directory for the test database temp_dir = tempfile.mkdtemp() try: - # Initialize the vector store vector_store = LanceDBVectorStore(collection_name="empty_collection") vector_store.connect(db_uri=temp_dir) - # First create a schema with a sample document, then delete it + # Load the vector store with a document, then delete it sample_doc = VectorStoreDocument( id="tmp", text="Temporary document to create schema", @@ -113,8 +100,6 @@ def test_empty_collection(): attributes={"title": "Tmp"} ) vector_store.load_documents([sample_doc]) - - # Now clear and check the collection still exists vector_store.db_connection.open_table(vector_store.collection_name).delete("id = 'tmp'") # Should still have the collection @@ -142,7 +127,6 @@ def test_filter_search(): # Create a temporary directory for the test database temp_dir = tempfile.mkdtemp() try: - # Initialize the vector store vector_store = LanceDBVectorStore(collection_name="filter_collection") vector_store.connect(db_uri=temp_dir) @@ -167,14 +151,10 @@ def test_filter_search(): attributes={"category": "vehicles"}, ), ] - - # Load documents vector_store.load_documents(docs) # Filter to include only documents about animals vector_store.filter_by_id(["1", "2"]) - - # Search with the filter applied results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=3) # Should return at most 2 documents (the filtered ones) @@ -183,5 +163,4 @@ def test_filter_search(): assert "3" not in ids assert set(ids).issubset({"1", "2"}) finally: - # Clean up - remove the temporary directory shutil.rmtree(temp_dir) \ No newline at end of file From 74e9a3a812a1c6aa31f0a6707778acd8a8f2589d Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 14:41:13 -0400 Subject: [PATCH 07/12] removed extra comments from azure ai search test --- .../vector_stores/test_azure_ai_search.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/integration/vector_stores/test_azure_ai_search.py b/tests/integration/vector_stores/test_azure_ai_search.py index c3132a2ade..ade386e832 100644 --- a/tests/integration/vector_stores/test_azure_ai_search.py +++ b/tests/integration/vector_stores/test_azure_ai_search.py @@ -95,16 +95,13 @@ async def test_vector_store_operations(self, vector_store, sample_documents, moc "attributes": '{"title": "Doc 1", "category": "test"}' } - # Test loading documents vector_store.load_documents(sample_documents) assert mock_index_client.create_or_update_index.called assert mock_search_client.upload_documents.called - # Test filter_by_id filter_query = vector_store.filter_by_id(["doc1", "doc2"]) assert filter_query == "search.in(id, 'doc1,doc2', ',')" - # Test vector similarity search vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) assert len(vector_results) == 2 assert vector_results[0].document.id == "doc1" @@ -112,13 +109,11 @@ async def test_vector_store_operations(self, vector_store, sample_documents, moc # Define a simple text embedder function for testing def mock_embedder(text: str) -> list[float]: - return [0.1, 0.2, 0.3, 0.4, 0.5] # Return fixed embedding + return [0.1, 0.2, 0.3, 0.4, 0.5] - # Test text similarity search text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) assert len(text_results) == 2 - # Test search by ID doc = vector_store.search_by_id("doc1") assert doc.id == "doc1" assert doc.text == "This is document 1" @@ -126,15 +121,10 @@ def mock_embedder(text: str) -> list[float]: async def test_empty_embedding(self, vector_store, mock_search_client): """Test similarity search by text with empty embedding.""" - # Create a mock embedder that returns None + # Create a mock embedder that returns None and verify that no results are produced def none_embedder(text: str) -> None: return None - # Test the search results = vector_store.similarity_search_by_text("test query", none_embedder, k=1) - - # Verify no search was performed assert not mock_search_client.search.called - - # Verify empty results assert len(results) == 0 \ No newline at end of file From 0ba7eb86b5a85b56e488d583581f085e741dd7ec Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 14:41:53 -0400 Subject: [PATCH 08/12] ruff --- graphrag/vector_stores/cosmosdb.py | 30 +++++---- tests/integration/vector_stores/__init__.py | 2 +- .../vector_stores/test_azure_ai_search.py | 66 ++++++++++++------- .../vector_stores/test_cosmosdb.py | 31 +++++---- .../integration/vector_stores/test_lancedb.py | 53 ++++++++------- 5 files changed, 108 insertions(+), 74 deletions(-) diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py index 1bb86edf88..43e17953d4 100644 --- a/graphrag/vector_stores/cosmosdb.py +++ b/graphrag/vector_stores/cosmosdb.py @@ -161,24 +161,28 @@ def similarity_search_by_vector( try: query = f"SELECT TOP {k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)" # noqa: S608 query_params = [{"name": "@embedding", "value": query_embedding}] - items = list(self._container_client.query_items( - query=query, - parameters=query_params, - enable_cross_partition_query=True, - )) + items = list( + self._container_client.query_items( + query=query, + parameters=query_params, + enable_cross_partition_query=True, + ) + ) except (CosmosHttpResponseError, ValueError): # Currently, the CosmosDB emulator does not support the VectorDistance function. # For emulator or test environments - fetch all items and calculate distance locally query = "SELECT c.id, c.text, c.vector, c.attributes FROM c" - items = list(self._container_client.query_items( - query=query, - enable_cross_partition_query=True, - )) + items = list( + self._container_client.query_items( + query=query, + enable_cross_partition_query=True, + ) + ) # Calculate cosine similarity locally (1 - cosine distance) from numpy import dot from numpy.linalg import norm - + def cosine_similarity(a, b): if norm(a) * norm(b) == 0: return 0.0 @@ -191,7 +195,9 @@ def cosine_similarity(a, b): item["SimilarityScore"] = similarity # Sort by similarity score (higher is better) and take top k - items = sorted(items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True)[:k] + items = sorted( + items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True + )[:k] return [ VectorStoreSearchResult( @@ -250,4 +256,4 @@ def clear(self) -> None: raise ValueError(msg) self._delete_container() - self._delete_database() \ No newline at end of file + self._delete_database() diff --git a/tests/integration/vector_stores/__init__.py b/tests/integration/vector_stores/__init__.py index 742cd43d0f..9e8b989971 100644 --- a/tests/integration/vector_stores/__init__.py +++ b/tests/integration/vector_stores/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""Integration tests for vector store implementations.""" \ No newline at end of file +"""Integration tests for vector store implementations.""" diff --git a/tests/integration/vector_stores/test_azure_ai_search.py b/tests/integration/vector_stores/test_azure_ai_search.py index ade386e832..8887551cb3 100644 --- a/tests/integration/vector_stores/test_azure_ai_search.py +++ b/tests/integration/vector_stores/test_azure_ai_search.py @@ -11,37 +11,44 @@ from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore from graphrag.vector_stores.base import VectorStoreDocument -TEST_AZURE_AI_SEARCH_URL = os.environ.get("TEST_AZURE_AI_SEARCH_URL", "https://test-url.search.windows.net") +TEST_AZURE_AI_SEARCH_URL = os.environ.get( + "TEST_AZURE_AI_SEARCH_URL", "https://test-url.search.windows.net" +) TEST_AZURE_AI_SEARCH_KEY = os.environ.get("TEST_AZURE_AI_SEARCH_KEY", "test_api_key") + class TestAzureAISearchVectorStore: """Test class for AzureAISearchVectorStore.""" @pytest.fixture def mock_search_client(self): """Create a mock Azure AI Search client.""" - with patch("graphrag.vector_stores.azure_ai_search.SearchClient") as mock_client: + with patch( + "graphrag.vector_stores.azure_ai_search.SearchClient" + ) as mock_client: yield mock_client.return_value @pytest.fixture def mock_index_client(self): """Create a mock Azure AI Search index client.""" - with patch("graphrag.vector_stores.azure_ai_search.SearchIndexClient") as mock_client: + with patch( + "graphrag.vector_stores.azure_ai_search.SearchIndexClient" + ) as mock_client: yield mock_client.return_value @pytest.fixture def vector_store(self, mock_search_client, mock_index_client): """Create an Azure AI Search vector store instance.""" vector_store = AzureAISearchVectorStore(collection_name="test_vectors") - + # Create the necessary mocks first vector_store.db_connection = mock_search_client vector_store.index_client = mock_index_client - + vector_store.connect( url=TEST_AZURE_AI_SEARCH_URL, api_key=TEST_AZURE_AI_SEARCH_KEY, - vector_size=5 + vector_size=5, ) return vector_store @@ -63,68 +70,77 @@ def sample_documents(self): ), ] - async def test_vector_store_operations(self, vector_store, sample_documents, mock_search_client, mock_index_client): + async def test_vector_store_operations( + self, vector_store, sample_documents, mock_search_client, mock_index_client + ): """Test basic vector store operations with Azure AI Search.""" # Setup mock responses mock_index_client.list_index_names.return_value = [] mock_index_client.create_or_update_index = MagicMock() mock_search_client.upload_documents = MagicMock() - + search_results = [ { "id": "doc1", "text": "This is document 1", "vector": [0.1, 0.2, 0.3, 0.4, 0.5], "attributes": '{"title": "Doc 1", "category": "test"}', - "@search.score": 0.9 + "@search.score": 0.9, }, { "id": "doc2", "text": "This is document 2", "vector": [0.2, 0.3, 0.4, 0.5, 0.6], "attributes": '{"title": "Doc 2", "category": "test"}', - "@search.score": 0.8 - } + "@search.score": 0.8, + }, ] mock_search_client.search.return_value = search_results - + mock_search_client.get_document.return_value = { "id": "doc1", "text": "This is document 1", "vector": [0.1, 0.2, 0.3, 0.4, 0.5], - "attributes": '{"title": "Doc 1", "category": "test"}' + "attributes": '{"title": "Doc 1", "category": "test"}', } - + vector_store.load_documents(sample_documents) assert mock_index_client.create_or_update_index.called assert mock_search_client.upload_documents.called - + filter_query = vector_store.filter_by_id(["doc1", "doc2"]) assert filter_query == "search.in(id, 'doc1,doc2', ',')" - - vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + + vector_results = vector_store.similarity_search_by_vector( + [0.1, 0.2, 0.3, 0.4, 0.5], k=2 + ) assert len(vector_results) == 2 assert vector_results[0].document.id == "doc1" assert vector_results[0].score == 0.9 - + # Define a simple text embedder function for testing def mock_embedder(text: str) -> list[float]: return [0.1, 0.2, 0.3, 0.4, 0.5] - - text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + + text_results = vector_store.similarity_search_by_text( + "test query", mock_embedder, k=2 + ) assert len(text_results) == 2 - + doc = vector_store.search_by_id("doc1") assert doc.id == "doc1" assert doc.text == "This is document 1" assert doc.attributes["title"] == "Doc 1" - + async def test_empty_embedding(self, vector_store, mock_search_client): """Test similarity search by text with empty embedding.""" + # Create a mock embedder that returns None and verify that no results are produced def none_embedder(text: str) -> None: return None - - results = vector_store.similarity_search_by_text("test query", none_embedder, k=1) + + results = vector_store.similarity_search_by_text( + "test query", none_embedder, k=1 + ) assert not mock_search_client.search.called - assert len(results) == 0 \ No newline at end of file + assert len(results) == 0 diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index a1fe5a8490..a5fb0c968c 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -19,18 +19,19 @@ "encountered windows-only tests -- will skip for now", allow_module_level=True ) + def test_vector_store_operations(): """Test basic vector store operations with CosmosDB.""" vector_store = CosmosDBVectorStore( collection_name="testvector", ) - + try: vector_store.connect( connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, database_name="testdb", ) - + docs = [ VectorStoreDocument( id="doc1", @@ -46,23 +47,27 @@ def test_vector_store_operations(): ), ] vector_store.load_documents(docs) - + vector_store.filter_by_id(["doc1"]) - + doc = vector_store.search_by_id("doc1") assert doc.id == "doc1" assert doc.text == "This is document 1" assert doc.vector == [0.1, 0.2, 0.3, 0.4, 0.5] assert doc.attributes["title"] == "Doc 1" - + # Define a simple text embedder function for testing def mock_embedder(text: str) -> list[float]: return [0.1, 0.2, 0.3, 0.4, 0.5] # Return fixed embedding - - vector_results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + + vector_results = vector_store.similarity_search_by_vector( + [0.1, 0.2, 0.3, 0.4, 0.5], k=2 + ) assert len(vector_results) > 0 - - text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + + text_results = vector_store.similarity_search_by_text( + "test query", mock_embedder, k=2 + ) assert len(text_results) > 0 finally: vector_store.clear() @@ -78,20 +83,20 @@ def test_clear(): connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, database_name="testclear", ) - + doc = VectorStoreDocument( id="test", text="Test document", vector=[0.1, 0.2, 0.3, 0.4, 0.5], attributes={"title": "Test Doc"}, ) - + vector_store.load_documents([doc]) result = vector_store.search_by_id("test") assert result.id == "test" - + # Clear and verify document is removed vector_store.clear() assert vector_store._database_exists() is False # noqa: SLF001 finally: - pass \ No newline at end of file + pass diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py index 05823893d1..3295d3e8d4 100644 --- a/tests/integration/vector_stores/test_lancedb.py +++ b/tests/integration/vector_stores/test_lancedb.py @@ -3,7 +3,6 @@ """Integration tests for LanceDB vector store implementation.""" - import shutil import tempfile @@ -20,7 +19,7 @@ def test_vector_store_operations(): try: vector_store = LanceDBVectorStore(collection_name="test_collection") vector_store.connect(db_uri=temp_dir) - + docs = [ VectorStoreDocument( id="1", @@ -42,9 +41,9 @@ def test_vector_store_operations(): ), ] vector_store.load_documents(docs[:2]) - + assert vector_store.collection_name in vector_store.db_connection.table_names() - + doc = vector_store.search_by_id("1") assert doc.id == "1" assert doc.text == "This is document 1" @@ -53,28 +52,32 @@ def test_vector_store_operations(): assert doc.vector is not None assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5], rtol=1e-5) assert doc.attributes["title"] == "Doc 1" - + filter_query = vector_store.filter_by_id(["1"]) assert filter_query == "id in ('1')" - - results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=2) + + results = vector_store.similarity_search_by_vector( + [0.1, 0.2, 0.3, 0.4, 0.5], k=2 + ) assert 1 <= len(results) <= 2 assert isinstance(results[0].score, float) - + # Test append mode vector_store.load_documents([docs[2]], overwrite=False) result = vector_store.search_by_id("3") assert result.id == "3" assert result.text == "This is document 3" - + # Define a simple text embedder function for testing def mock_embedder(text: str) -> list[float]: return [0.1, 0.2, 0.3, 0.4, 0.5] - - text_results = vector_store.similarity_search_by_text("test query", mock_embedder, k=2) + + text_results = vector_store.similarity_search_by_text( + "test query", mock_embedder, k=2 + ) assert 1 <= len(text_results) <= 2 assert isinstance(text_results[0].score, float) - + # Test non-existent document non_existent = vector_store.search_by_id("nonexistent") assert non_existent.id == "nonexistent" @@ -91,20 +94,22 @@ def test_empty_collection(): try: vector_store = LanceDBVectorStore(collection_name="empty_collection") vector_store.connect(db_uri=temp_dir) - + # Load the vector store with a document, then delete it sample_doc = VectorStoreDocument( id="tmp", text="Temporary document to create schema", vector=[0.1, 0.2, 0.3, 0.4, 0.5], - attributes={"title": "Tmp"} + attributes={"title": "Tmp"}, ) vector_store.load_documents([sample_doc]) - vector_store.db_connection.open_table(vector_store.collection_name).delete("id = 'tmp'") - + vector_store.db_connection.open_table(vector_store.collection_name).delete( + "id = 'tmp'" + ) + # Should still have the collection assert vector_store.collection_name in vector_store.db_connection.table_names() - + # Add a document after creating an empty collection doc = VectorStoreDocument( id="1", @@ -113,7 +118,7 @@ def test_empty_collection(): attributes={"title": "Doc 1"}, ) vector_store.load_documents([doc], overwrite=False) - + result = vector_store.search_by_id("1") assert result.id == "1" assert result.text == "This is document 1" @@ -129,7 +134,7 @@ def test_filter_search(): try: vector_store = LanceDBVectorStore(collection_name="filter_collection") vector_store.connect(db_uri=temp_dir) - + # Create test documents with different categories docs = [ VectorStoreDocument( @@ -152,15 +157,17 @@ def test_filter_search(): ), ] vector_store.load_documents(docs) - + # Filter to include only documents about animals vector_store.filter_by_id(["1", "2"]) - results = vector_store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4, 0.5], k=3) - + results = vector_store.similarity_search_by_vector( + [0.1, 0.2, 0.3, 0.4, 0.5], k=3 + ) + # Should return at most 2 documents (the filtered ones) assert len(results) <= 2 ids = [result.document.id for result in results] assert "3" not in ids assert set(ids).issubset({"1", "2"}) finally: - shutil.rmtree(temp_dir) \ No newline at end of file + shutil.rmtree(temp_dir) From e8667d4cdfabe2edbaecfd693f16fdf2c4d5f50a Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 14:43:32 -0400 Subject: [PATCH 09/12] semversioner --- .semversioner/next-release/patch-20250331184323312702.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20250331184323312702.json diff --git a/.semversioner/next-release/patch-20250331184323312702.json b/.semversioner/next-release/patch-20250331184323312702.json new file mode 100644 index 0000000000..6fa5ad4e78 --- /dev/null +++ b/.semversioner/next-release/patch-20250331184323312702.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "add vector store integration tests" +} From b3297e8196d28d98b19bd46d79f23f341a5c9d1a Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 16:36:25 -0400 Subject: [PATCH 10/12] fix cicd issues --- graphrag/vector_stores/cosmosdb.py | 4 ---- tests/integration/vector_stores/test_cosmosdb.py | 2 +- tests/integration/vector_stores/test_lancedb.py | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py index 43e17953d4..58ce40f690 100644 --- a/graphrag/vector_stores/cosmosdb.py +++ b/graphrag/vector_stores/cosmosdb.py @@ -251,9 +251,5 @@ def search_by_id(self, id: str) -> VectorStoreDocument: def clear(self) -> None: """Clear the vector store.""" - if self._container_client is None: - msg = "Container client is not initialized." - raise ValueError(msg) - self._delete_container() self._delete_database() diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index a5fb0c968c..5a1e15ce4c 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -29,7 +29,7 @@ def test_vector_store_operations(): try: vector_store.connect( connection_string=WELL_KNOWN_COSMOS_CONNECTION_STRING, - database_name="testdb", + database_name="test_db", ) docs = [ diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py index 3295d3e8d4..6a525bbb40 100644 --- a/tests/integration/vector_stores/test_lancedb.py +++ b/tests/integration/vector_stores/test_lancedb.py @@ -50,7 +50,7 @@ def test_vector_store_operations(): # Changed to compare vectors using np.allclose for approximate equality assert doc.vector is not None - assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5], rtol=1e-5) + assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5]) assert doc.attributes["title"] == "Doc 1" filter_query = vector_store.filter_by_id(["1"]) From 7ae0a4ad6a985d3e2610e7eae3ed872202dbb065 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 17:08:48 -0400 Subject: [PATCH 11/12] bypass diskANN policy for test env --- graphrag/vector_stores/cosmosdb.py | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py index 58ce40f690..9c736076bf 100644 --- a/graphrag/vector_stores/cosmosdb.py +++ b/graphrag/vector_stores/cosmosdb.py @@ -100,16 +100,32 @@ def _create_container(self) -> None: "automatic": True, "includedPaths": [{"path": "/*"}], "excludedPaths": [{"path": "/_etag/?"}, {"path": "/vector/*"}], - "vectorIndexes": [{"path": "/vector", "type": "diskANN"}], } - # Create the container and container client - self._database_client.create_container_if_not_exists( - id=self._container_name, - partition_key=partition_key, - indexing_policy=indexing_policy, - vector_embedding_policy=vector_embedding_policy, - ) + # Currently, the CosmosDB emulator does not support the diskANN policy. + try: + # First try with the standard diskANN policy + indexing_policy["vectorIndexes"] = [{"path": "/vector", "type": "diskANN"}] + + # Create the container and container client + self._database_client.create_container_if_not_exists( + id=self._container_name, + partition_key=partition_key, + indexing_policy=indexing_policy, + vector_embedding_policy=vector_embedding_policy, + ) + except CosmosHttpResponseError: + # If diskANN fails (likely in emulator), retry without vector indexes + indexing_policy.pop("vectorIndexes", None) + + # Create the container with compatible indexing policy + self._database_client.create_container_if_not_exists( + id=self._container_name, + partition_key=partition_key, + indexing_policy=indexing_policy, + vector_embedding_policy=vector_embedding_policy, + ) + self._container_client = self._database_client.get_container_client( self._container_name ) From a998a8ea61613e78bf946ec6fe6772cfea1e6a9a Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Mon, 31 Mar 2025 18:06:48 -0400 Subject: [PATCH 12/12] handle floating point inprecisions --- tests/integration/vector_stores/test_cosmosdb.py | 4 +++- tests/integration/vector_stores/test_lancedb.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/vector_stores/test_cosmosdb.py b/tests/integration/vector_stores/test_cosmosdb.py index 5a1e15ce4c..ce55f08df1 100644 --- a/tests/integration/vector_stores/test_cosmosdb.py +++ b/tests/integration/vector_stores/test_cosmosdb.py @@ -5,6 +5,7 @@ import sys +import numpy as np import pytest from graphrag.vector_stores.base import VectorStoreDocument @@ -53,7 +54,8 @@ def test_vector_store_operations(): doc = vector_store.search_by_id("doc1") assert doc.id == "doc1" assert doc.text == "This is document 1" - assert doc.vector == [0.1, 0.2, 0.3, 0.4, 0.5] + assert doc.vector is not None + assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5]) assert doc.attributes["title"] == "Doc 1" # Define a simple text embedder function for testing diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py index 6a525bbb40..ce4502317d 100644 --- a/tests/integration/vector_stores/test_lancedb.py +++ b/tests/integration/vector_stores/test_lancedb.py @@ -48,7 +48,6 @@ def test_vector_store_operations(): assert doc.id == "1" assert doc.text == "This is document 1" - # Changed to compare vectors using np.allclose for approximate equality assert doc.vector is not None assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5]) assert doc.attributes["title"] == "Doc 1"