From cbaf5f0604f06d28f58f54589291d945bc563a6a Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 12 Feb 2026 20:38:44 +0000 Subject: [PATCH 1/5] add smoke tests for output csv --- tests/fixtures/text-csv/config.json | 106 +++++++++++++++++++++++++ tests/fixtures/text-csv/input/ABOUT.md | 3 + tests/fixtures/text-csv/settings.yml | 41 ++++++++++ tests/smoke/test_fixtures.py | 54 +++++++------ 4 files changed, 181 insertions(+), 23 deletions(-) create mode 100644 tests/fixtures/text-csv/config.json create mode 100644 tests/fixtures/text-csv/input/ABOUT.md create mode 100644 tests/fixtures/text-csv/settings.yml diff --git a/tests/fixtures/text-csv/config.json b/tests/fixtures/text-csv/config.json new file mode 100644 index 0000000000..08578c5c39 --- /dev/null +++ b/tests/fixtures/text-csv/config.json @@ -0,0 +1,106 @@ +{ + "input_path": "./tests/fixtures/text-csv", + "input_type": "text", + "index_method": "fast", + "workflow_config": { + "load_input_documents": { + "max_runtime": 30 + }, + "create_base_text_units": { + "max_runtime": 30 + }, + "extract_graph_nlp": { + "max_runtime": 30 + }, + "prune_graph": { + "max_runtime": 30 + }, + "finalize_graph": { + "row_range": [ + 10, + 300 + ], + "max_runtime": 30, + "nan_allowed_columns": [ + "description" + ], + "expected_artifacts": [ + "entities.csv", + "relationships.csv" + ] + }, + "create_communities": { + "row_range": [ + 1, + 30 + ], + "max_runtime": 30, + "expected_artifacts": ["communities.csv"] + }, + "create_community_reports_text": { + "row_range": [ + 1, + 30 + ], + "nan_allowed_columns": [ + "title", + "summary", + "full_content", + "full_content_json", + "rank", + "rank_explanation", + "findings", + "period", + "size" + ], + "max_runtime": 2000, + "expected_artifacts": ["community_reports.csv"] + }, + "create_final_text_units": { + "row_range": [ + 1, + 10 + ], + "nan_allowed_columns": [ + "relationship_ids", + "entity_ids", + "covariate_ids" + ], + "max_runtime": 30, + "expected_artifacts": ["text_units.csv"] + }, + "create_final_documents": { + "row_range": [ + 1, + 1 + ], + "nan_allowed_columns": [ + "raw_data" + ], + "max_runtime": 30, + "expected_artifacts": ["documents.csv"] + }, + "generate_text_embeddings": { + "row_range": [ + 1, + 100 + ], + "max_runtime": 150, + "expected_artifacts": [ + "embeddings.text_unit_text.csv", + "embeddings.entity_description.csv", + "embeddings.community_full_content.csv" + ] + } + }, + "query_config": [ + { + "query": "Who is Agent Alex Mercer and what are his goals?", + "method": "local" + }, + { + "query": "What is the major conflict in this story and who are the protagonist and antagonist?", + "method": "global" + } + ] +} diff --git a/tests/fixtures/text-csv/input/ABOUT.md b/tests/fixtures/text-csv/input/ABOUT.md new file mode 100644 index 0000000000..42c316108e --- /dev/null +++ b/tests/fixtures/text-csv/input/ABOUT.md @@ -0,0 +1,3 @@ +# About + +This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. \ No newline at end of file diff --git a/tests/fixtures/text-csv/settings.yml b/tests/fixtures/text-csv/settings.yml new file mode 100644 index 0000000000..3a70ca8418 --- /dev/null +++ b/tests/fixtures/text-csv/settings.yml @@ -0,0 +1,41 @@ +completion_models: + default_completion_model: + model_provider: azure + api_key: ${GRAPHRAG_API_KEY} + api_base: ${GRAPHRAG_API_BASE} + api_version: "2025-04-01-preview" + model: gpt-4.1 + azure_deployment_name: gpt-4.1 + rate_limit: + type: sliding_window + tokens_per_period: 250_000 + requests_per_period: 250 +embedding_models: + default_embedding_model: + model_provider: azure + api_key: ${GRAPHRAG_API_KEY} + api_base: ${GRAPHRAG_API_BASE} + api_version: "2025-04-01-preview" + model: text-embedding-3-large + azure_deployment_name: text-embedding-3-large + rate_limit: + type: sliding_window + tokens_per_period: 250_000 + requests_per_period: 250 + +vector_store: + type: "lancedb" + db_uri: "./tests/fixtures/text-csv/lancedb" + overwrite: True + container_name: "lancedb_ci" + +table_provider: + type: csv + +community_reports: + prompt: "prompts/community_report.txt" + max_length: 2000 + max_input_length: 8000 + +snapshots: + embeddings: true diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index 7d43b0140a..2c3472a2fe 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -178,25 +178,27 @@ def __assert_indexer_outputs( for artifact in workflow_artifacts: if artifact.endswith(".parquet"): output_df = pd.read_parquet(output_path / artifact) + elif artifact.endswith(".csv"): + output_df = pd.read_csv(output_path / artifact) + else: + continue + + # Check number of rows between range + assert ( + config["row_range"][0] <= len(output_df) <= config["row_range"][1] + ), ( + f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}" + ) - # Check number of rows between range - assert ( - config["row_range"][0] - <= len(output_df) - <= config["row_range"][1] - ), ( - f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}" - ) - - # Get non-nan rows - nan_df = output_df.loc[ - :, - ~output_df.columns.isin(config.get("nan_allowed_columns", [])), - ] - nan_df = nan_df[nan_df.isna().any(axis=1)] - assert len(nan_df) == 0, ( - f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}" - ) + # Get non-nan rows + nan_df = output_df.loc[ + :, + ~output_df.columns.isin(config.get("nan_allowed_columns", [])), + ] + nan_df = nan_df[nan_df.isna().any(axis=1)] + assert len(nan_df) == 0, ( + f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}" + ) def __run_query(self, root: Path, query_config: dict[str, str]): command = [ @@ -220,11 +222,17 @@ def __run_query(self, root: Path, query_config: dict[str, str]): @mock.patch.dict( os.environ, { - **os.environ, - "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, - "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, - "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv("AZURE_AI_SEARCH_URL_ENDPOINT"), - "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"), + k: v + for k, v in { + **os.environ, + "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, + "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, + "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv( + "AZURE_AI_SEARCH_URL_ENDPOINT" + ), + "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"), + }.items() + if v is not None }, clear=True, ) From ec5df71eef61d4e96875277f3f8ed4e7792c751d Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 12 Feb 2026 21:11:24 +0000 Subject: [PATCH 2/5] change text fixture test --- tests/fixtures/text-csv/config.json | 106 ------------------------- tests/fixtures/text-csv/input/ABOUT.md | 3 - tests/fixtures/text-csv/settings.yml | 41 ---------- tests/fixtures/text/config.json | 21 ++--- tests/fixtures/text/settings.yml | 3 + tests/smoke/test_fixtures.py | 16 ++-- 6 files changed, 20 insertions(+), 170 deletions(-) delete mode 100644 tests/fixtures/text-csv/config.json delete mode 100644 tests/fixtures/text-csv/input/ABOUT.md delete mode 100644 tests/fixtures/text-csv/settings.yml diff --git a/tests/fixtures/text-csv/config.json b/tests/fixtures/text-csv/config.json deleted file mode 100644 index 08578c5c39..0000000000 --- a/tests/fixtures/text-csv/config.json +++ /dev/null @@ -1,106 +0,0 @@ -{ - "input_path": "./tests/fixtures/text-csv", - "input_type": "text", - "index_method": "fast", - "workflow_config": { - "load_input_documents": { - "max_runtime": 30 - }, - "create_base_text_units": { - "max_runtime": 30 - }, - "extract_graph_nlp": { - "max_runtime": 30 - }, - "prune_graph": { - "max_runtime": 30 - }, - "finalize_graph": { - "row_range": [ - 10, - 300 - ], - "max_runtime": 30, - "nan_allowed_columns": [ - "description" - ], - "expected_artifacts": [ - "entities.csv", - "relationships.csv" - ] - }, - "create_communities": { - "row_range": [ - 1, - 30 - ], - "max_runtime": 30, - "expected_artifacts": ["communities.csv"] - }, - "create_community_reports_text": { - "row_range": [ - 1, - 30 - ], - "nan_allowed_columns": [ - "title", - "summary", - "full_content", - "full_content_json", - "rank", - "rank_explanation", - "findings", - "period", - "size" - ], - "max_runtime": 2000, - "expected_artifacts": ["community_reports.csv"] - }, - "create_final_text_units": { - "row_range": [ - 1, - 10 - ], - "nan_allowed_columns": [ - "relationship_ids", - "entity_ids", - "covariate_ids" - ], - "max_runtime": 30, - "expected_artifacts": ["text_units.csv"] - }, - "create_final_documents": { - "row_range": [ - 1, - 1 - ], - "nan_allowed_columns": [ - "raw_data" - ], - "max_runtime": 30, - "expected_artifacts": ["documents.csv"] - }, - "generate_text_embeddings": { - "row_range": [ - 1, - 100 - ], - "max_runtime": 150, - "expected_artifacts": [ - "embeddings.text_unit_text.csv", - "embeddings.entity_description.csv", - "embeddings.community_full_content.csv" - ] - } - }, - "query_config": [ - { - "query": "Who is Agent Alex Mercer and what are his goals?", - "method": "local" - }, - { - "query": "What is the major conflict in this story and who are the protagonist and antagonist?", - "method": "global" - } - ] -} diff --git a/tests/fixtures/text-csv/input/ABOUT.md b/tests/fixtures/text-csv/input/ABOUT.md deleted file mode 100644 index 42c316108e..0000000000 --- a/tests/fixtures/text-csv/input/ABOUT.md +++ /dev/null @@ -1,3 +0,0 @@ -# About - -This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. \ No newline at end of file diff --git a/tests/fixtures/text-csv/settings.yml b/tests/fixtures/text-csv/settings.yml deleted file mode 100644 index 3a70ca8418..0000000000 --- a/tests/fixtures/text-csv/settings.yml +++ /dev/null @@ -1,41 +0,0 @@ -completion_models: - default_completion_model: - model_provider: azure - api_key: ${GRAPHRAG_API_KEY} - api_base: ${GRAPHRAG_API_BASE} - api_version: "2025-04-01-preview" - model: gpt-4.1 - azure_deployment_name: gpt-4.1 - rate_limit: - type: sliding_window - tokens_per_period: 250_000 - requests_per_period: 250 -embedding_models: - default_embedding_model: - model_provider: azure - api_key: ${GRAPHRAG_API_KEY} - api_base: ${GRAPHRAG_API_BASE} - api_version: "2025-04-01-preview" - model: text-embedding-3-large - azure_deployment_name: text-embedding-3-large - rate_limit: - type: sliding_window - tokens_per_period: 250_000 - requests_per_period: 250 - -vector_store: - type: "lancedb" - db_uri: "./tests/fixtures/text-csv/lancedb" - overwrite: True - container_name: "lancedb_ci" - -table_provider: - type: csv - -community_reports: - prompt: "prompts/community_report.txt" - max_length: 2000 - max_input_length: 8000 - -snapshots: - embeddings: true diff --git a/tests/fixtures/text/config.json b/tests/fixtures/text/config.json index cc69e523ec..0ae2e06efc 100644 --- a/tests/fixtures/text/config.json +++ b/tests/fixtures/text/config.json @@ -21,9 +21,12 @@ 300 ], "max_runtime": 30, + "nan_allowed_columns": [ + "description" + ], "expected_artifacts": [ - "entities.parquet", - "relationships.parquet" + "entities.csv", + "relationships.csv" ] }, "create_communities": { @@ -32,7 +35,7 @@ 30 ], "max_runtime": 30, - "expected_artifacts": ["communities.parquet"] + "expected_artifacts": ["communities.csv"] }, "create_community_reports_text": { "row_range": [ @@ -51,7 +54,7 @@ "size" ], "max_runtime": 2000, - "expected_artifacts": ["community_reports.parquet"] + "expected_artifacts": ["community_reports.csv"] }, "create_final_text_units": { "row_range": [ @@ -64,7 +67,7 @@ "covariate_ids" ], "max_runtime": 30, - "expected_artifacts": ["text_units.parquet"] + "expected_artifacts": ["text_units.csv"] }, "create_final_documents": { "row_range": [ @@ -75,7 +78,7 @@ "raw_data" ], "max_runtime": 30, - "expected_artifacts": ["documents.parquet"] + "expected_artifacts": ["documents.csv"] }, "generate_text_embeddings": { "row_range": [ @@ -84,9 +87,9 @@ ], "max_runtime": 150, "expected_artifacts": [ - "embeddings.text_unit_text.parquet", - "embeddings.entity_description.parquet", - "embeddings.community_full_content.parquet" + "embeddings.text_unit_text.csv", + "embeddings.entity_description.csv", + "embeddings.community_full_content.csv" ] } }, diff --git a/tests/fixtures/text/settings.yml b/tests/fixtures/text/settings.yml index 6cf6f9074d..9f18f7680d 100644 --- a/tests/fixtures/text/settings.yml +++ b/tests/fixtures/text/settings.yml @@ -29,6 +29,9 @@ vector_store: api_key: ${AZURE_AI_SEARCH_API_KEY} container_name: "simple_text_ci" +table_provider: + type: csv + community_reports: prompt: "prompts/community_report.txt" max_length: 2000 diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index 2c3472a2fe..8a2985b380 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -222,17 +222,11 @@ def __run_query(self, root: Path, query_config: dict[str, str]): @mock.patch.dict( os.environ, { - k: v - for k, v in { - **os.environ, - "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, - "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, - "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv( - "AZURE_AI_SEARCH_URL_ENDPOINT" - ), - "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"), - }.items() - if v is not None + **os.environ, + "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, + "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING, + "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv("AZURE_AI_SEARCH_URL_ENDPOINT"), + "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"), }, clear=True, ) From 5a34172447c18cea2c4d5fb37de45b5d11dc1653 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 12 Feb 2026 21:39:08 +0000 Subject: [PATCH 3/5] change test --- .../graphrag_storage/tables/csv_table_provider.py | 2 +- tests/smoke/test_fixtures.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py b/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py index 5de021b8a5..e81202f241 100644 --- a/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py +++ b/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py @@ -64,7 +64,7 @@ async def read_dataframe(self, table_name: str) -> pd.DataFrame: # Handle empty CSV (pandas can't parse files with no columns) if not csv_data or csv_data.strip() == "": return pd.DataFrame() - return pd.read_csv(StringIO(csv_data)) + return pd.read_csv(StringIO(csv_data), keep_default_na=False) except Exception: logger.exception("error loading table from storage: %s", filename) raise diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index 8a2985b380..e6a9680200 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -179,7 +179,7 @@ def __assert_indexer_outputs( if artifact.endswith(".parquet"): output_df = pd.read_parquet(output_path / artifact) elif artifact.endswith(".csv"): - output_df = pd.read_csv(output_path / artifact) + output_df = pd.read_csv(output_path / artifact, keep_default_na=False) else: continue From 53106d190f11d5063aff56c44ca590e55c4b08b5 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 13 Feb 2026 13:07:16 -0300 Subject: [PATCH 4/5] add semver --- .semversioner/next-release/patch-20260213160631396575.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20260213160631396575.json diff --git a/.semversioner/next-release/patch-20260213160631396575.json b/.semversioner/next-release/patch-20260213160631396575.json new file mode 100644 index 0000000000..79a9221724 --- /dev/null +++ b/.semversioner/next-release/patch-20260213160631396575.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "add csv table smoke tests" +} From 49f287b46ce35c0fed0e14f7b7d16cca482f5cc4 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 13 Feb 2026 13:10:33 -0300 Subject: [PATCH 5/5] run format --- tests/smoke/test_fixtures.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index e6a9680200..8624930c14 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -179,7 +179,9 @@ def __assert_indexer_outputs( if artifact.endswith(".parquet"): output_df = pd.read_parquet(output_path / artifact) elif artifact.endswith(".csv"): - output_df = pd.read_csv(output_path / artifact, keep_default_na=False) + output_df = pd.read_csv( + output_path / artifact, keep_default_na=False + ) else: continue