From cbaf5f0604f06d28f58f54589291d945bc563a6a Mon Sep 17 00:00:00 2001
From: Dayenne Souza <ddesouza@microsoft.com>
Date: Thu, 12 Feb 2026 20:38:44 +0000
Subject: [PATCH 1/5] add smoke tests for output csv

---
 tests/fixtures/text-csv/config.json    | 106 +++++++++++++++++++++++++
 tests/fixtures/text-csv/input/ABOUT.md |   3 +
 tests/fixtures/text-csv/settings.yml   |  41 ++++++++++
 tests/smoke/test_fixtures.py           |  54 +++++++------
 4 files changed, 181 insertions(+), 23 deletions(-)
 create mode 100644 tests/fixtures/text-csv/config.json
 create mode 100644 tests/fixtures/text-csv/input/ABOUT.md
 create mode 100644 tests/fixtures/text-csv/settings.yml

diff --git a/tests/fixtures/text-csv/config.json b/tests/fixtures/text-csv/config.json
new file mode 100644
index 0000000000..08578c5c39
--- /dev/null
+++ b/tests/fixtures/text-csv/config.json
@@ -0,0 +1,106 @@
+{
+    "input_path": "./tests/fixtures/text-csv",
+    "input_type": "text",
+    "index_method": "fast",
+    "workflow_config": {
+        "load_input_documents": {
+            "max_runtime": 30
+        },
+        "create_base_text_units": {
+            "max_runtime": 30
+        },
+        "extract_graph_nlp": {
+            "max_runtime": 30
+        },
+        "prune_graph": {
+            "max_runtime": 30
+        },
+        "finalize_graph": {
+            "row_range": [
+                10,
+                300
+            ],
+            "max_runtime": 30,
+            "nan_allowed_columns": [
+                "description"
+            ],
+            "expected_artifacts": [
+                "entities.csv",
+                "relationships.csv"
+            ]
+        },
+        "create_communities": {
+            "row_range": [
+                1,
+                30
+            ],
+            "max_runtime": 30,
+            "expected_artifacts": ["communities.csv"]
+        },
+        "create_community_reports_text": {
+            "row_range": [
+                1,
+                30
+            ],
+            "nan_allowed_columns": [
+                "title",
+                "summary",
+                "full_content",
+                "full_content_json",
+                "rank",
+                "rank_explanation",
+                "findings",
+                "period",
+                "size"
+            ],
+            "max_runtime": 2000,
+            "expected_artifacts": ["community_reports.csv"]
+        },
+        "create_final_text_units": {
+            "row_range": [
+                1,
+                10
+            ],
+            "nan_allowed_columns": [
+                "relationship_ids",
+                "entity_ids",
+                "covariate_ids"
+            ],
+            "max_runtime": 30,
+            "expected_artifacts": ["text_units.csv"]
+        },
+        "create_final_documents": {
+            "row_range": [
+                1,
+                1
+            ],
+            "nan_allowed_columns": [
+                "raw_data"
+            ],
+            "max_runtime": 30,
+            "expected_artifacts": ["documents.csv"]
+        },
+        "generate_text_embeddings": {
+            "row_range": [
+                1,
+                100
+            ],
+            "max_runtime": 150,
+            "expected_artifacts": [
+                "embeddings.text_unit_text.csv",
+                "embeddings.entity_description.csv",
+                "embeddings.community_full_content.csv"
+            ]
+        }
+    },
+    "query_config": [
+        {
+            "query": "Who is Agent Alex Mercer and what are his goals?",
+            "method": "local"
+        },
+        {
+            "query": "What is the major conflict in this story and who are the protagonist and antagonist?",
+            "method": "global"
+        }
+    ]
+}
diff --git a/tests/fixtures/text-csv/input/ABOUT.md b/tests/fixtures/text-csv/input/ABOUT.md
new file mode 100644
index 0000000000..42c316108e
--- /dev/null
+++ b/tests/fixtures/text-csv/input/ABOUT.md
@@ -0,0 +1,3 @@
+# About
+
+This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing.
\ No newline at end of file
diff --git a/tests/fixtures/text-csv/settings.yml b/tests/fixtures/text-csv/settings.yml
new file mode 100644
index 0000000000..3a70ca8418
--- /dev/null
+++ b/tests/fixtures/text-csv/settings.yml
@@ -0,0 +1,41 @@
+completion_models:
+  default_completion_model:
+    model_provider: azure
+    api_key: ${GRAPHRAG_API_KEY}
+    api_base: ${GRAPHRAG_API_BASE}
+    api_version: "2025-04-01-preview"
+    model: gpt-4.1
+    azure_deployment_name: gpt-4.1
+    rate_limit:
+      type: sliding_window
+      tokens_per_period: 250_000
+      requests_per_period: 250
+embedding_models:
+  default_embedding_model:
+    model_provider: azure
+    api_key: ${GRAPHRAG_API_KEY}
+    api_base: ${GRAPHRAG_API_BASE}
+    api_version: "2025-04-01-preview"
+    model: text-embedding-3-large
+    azure_deployment_name: text-embedding-3-large
+    rate_limit:
+      type: sliding_window
+      tokens_per_period: 250_000
+      requests_per_period: 250
+
+vector_store:
+  type: "lancedb"
+  db_uri: "./tests/fixtures/text-csv/lancedb"
+  overwrite: True
+  container_name: "lancedb_ci"
+
+table_provider:
+  type: csv
+
+community_reports:
+  prompt: "prompts/community_report.txt"
+  max_length: 2000
+  max_input_length: 8000
+
+snapshots:
+  embeddings: true
diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py
index 7d43b0140a..2c3472a2fe 100644
--- a/tests/smoke/test_fixtures.py
+++ b/tests/smoke/test_fixtures.py
@@ -178,25 +178,27 @@ def __assert_indexer_outputs(
             for artifact in workflow_artifacts:
                 if artifact.endswith(".parquet"):
                     output_df = pd.read_parquet(output_path / artifact)
+                elif artifact.endswith(".csv"):
+                    output_df = pd.read_csv(output_path / artifact)
+                else:
+                    continue
+
+                # Check number of rows between range
+                assert (
+                    config["row_range"][0] <= len(output_df) <= config["row_range"][1]
+                ), (
+                    f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+                )
 
-                    # Check number of rows between range
-                    assert (
-                        config["row_range"][0]
-                        <= len(output_df)
-                        <= config["row_range"][1]
-                    ), (
-                        f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
-                    )
-
-                    # Get non-nan rows
-                    nan_df = output_df.loc[
-                        :,
-                        ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
-                    ]
-                    nan_df = nan_df[nan_df.isna().any(axis=1)]
-                    assert len(nan_df) == 0, (
-                        f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
-                    )
+                # Get non-nan rows
+                nan_df = output_df.loc[
+                    :,
+                    ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
+                ]
+                nan_df = nan_df[nan_df.isna().any(axis=1)]
+                assert len(nan_df) == 0, (
+                    f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
+                )
 
     def __run_query(self, root: Path, query_config: dict[str, str]):
         command = [
@@ -220,11 +222,17 @@ def __run_query(self, root: Path, query_config: dict[str, str]):
     @mock.patch.dict(
         os.environ,
         {
-            **os.environ,
-            "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
-            "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
-            "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv("AZURE_AI_SEARCH_URL_ENDPOINT"),
-            "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"),
+            k: v
+            for k, v in {
+                **os.environ,
+                "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
+                "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
+                "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv(
+                    "AZURE_AI_SEARCH_URL_ENDPOINT"
+                ),
+                "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"),
+            }.items()
+            if v is not None
         },
         clear=True,
     )

From ec5df71eef61d4e96875277f3f8ed4e7792c751d Mon Sep 17 00:00:00 2001
From: Dayenne Souza <ddesouza@microsoft.com>
Date: Thu, 12 Feb 2026 21:11:24 +0000
Subject: [PATCH 2/5] change text fixture test

---
 tests/fixtures/text-csv/config.json    | 106 -------------------------
 tests/fixtures/text-csv/input/ABOUT.md |   3 -
 tests/fixtures/text-csv/settings.yml   |  41 ----------
 tests/fixtures/text/config.json        |  21 ++---
 tests/fixtures/text/settings.yml       |   3 +
 tests/smoke/test_fixtures.py           |  16 ++--
 6 files changed, 20 insertions(+), 170 deletions(-)
 delete mode 100644 tests/fixtures/text-csv/config.json
 delete mode 100644 tests/fixtures/text-csv/input/ABOUT.md
 delete mode 100644 tests/fixtures/text-csv/settings.yml

diff --git a/tests/fixtures/text-csv/config.json b/tests/fixtures/text-csv/config.json
deleted file mode 100644
index 08578c5c39..0000000000
--- a/tests/fixtures/text-csv/config.json
+++ /dev/null
@@ -1,106 +0,0 @@
-{
-    "input_path": "./tests/fixtures/text-csv",
-    "input_type": "text",
-    "index_method": "fast",
-    "workflow_config": {
-        "load_input_documents": {
-            "max_runtime": 30
-        },
-        "create_base_text_units": {
-            "max_runtime": 30
-        },
-        "extract_graph_nlp": {
-            "max_runtime": 30
-        },
-        "prune_graph": {
-            "max_runtime": 30
-        },
-        "finalize_graph": {
-            "row_range": [
-                10,
-                300
-            ],
-            "max_runtime": 30,
-            "nan_allowed_columns": [
-                "description"
-            ],
-            "expected_artifacts": [
-                "entities.csv",
-                "relationships.csv"
-            ]
-        },
-        "create_communities": {
-            "row_range": [
-                1,
-                30
-            ],
-            "max_runtime": 30,
-            "expected_artifacts": ["communities.csv"]
-        },
-        "create_community_reports_text": {
-            "row_range": [
-                1,
-                30
-            ],
-            "nan_allowed_columns": [
-                "title",
-                "summary",
-                "full_content",
-                "full_content_json",
-                "rank",
-                "rank_explanation",
-                "findings",
-                "period",
-                "size"
-            ],
-            "max_runtime": 2000,
-            "expected_artifacts": ["community_reports.csv"]
-        },
-        "create_final_text_units": {
-            "row_range": [
-                1,
-                10
-            ],
-            "nan_allowed_columns": [
-                "relationship_ids",
-                "entity_ids",
-                "covariate_ids"
-            ],
-            "max_runtime": 30,
-            "expected_artifacts": ["text_units.csv"]
-        },
-        "create_final_documents": {
-            "row_range": [
-                1,
-                1
-            ],
-            "nan_allowed_columns": [
-                "raw_data"
-            ],
-            "max_runtime": 30,
-            "expected_artifacts": ["documents.csv"]
-        },
-        "generate_text_embeddings": {
-            "row_range": [
-                1,
-                100
-            ],
-            "max_runtime": 150,
-            "expected_artifacts": [
-                "embeddings.text_unit_text.csv",
-                "embeddings.entity_description.csv",
-                "embeddings.community_full_content.csv"
-            ]
-        }
-    },
-    "query_config": [
-        {
-            "query": "Who is Agent Alex Mercer and what are his goals?",
-            "method": "local"
-        },
-        {
-            "query": "What is the major conflict in this story and who are the protagonist and antagonist?",
-            "method": "global"
-        }
-    ]
-}
diff --git a/tests/fixtures/text-csv/input/ABOUT.md b/tests/fixtures/text-csv/input/ABOUT.md
deleted file mode 100644
index 42c316108e..0000000000
--- a/tests/fixtures/text-csv/input/ABOUT.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# About
-
-This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing.
\ No newline at end of file
diff --git a/tests/fixtures/text-csv/settings.yml b/tests/fixtures/text-csv/settings.yml
deleted file mode 100644
index 3a70ca8418..0000000000
--- a/tests/fixtures/text-csv/settings.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-completion_models:
-  default_completion_model:
-    model_provider: azure
-    api_key: ${GRAPHRAG_API_KEY}
-    api_base: ${GRAPHRAG_API_BASE}
-    api_version: "2025-04-01-preview"
-    model: gpt-4.1
-    azure_deployment_name: gpt-4.1
-    rate_limit:
-      type: sliding_window
-      tokens_per_period: 250_000
-      requests_per_period: 250
-embedding_models:
-  default_embedding_model:
-    model_provider: azure
-    api_key: ${GRAPHRAG_API_KEY}
-    api_base: ${GRAPHRAG_API_BASE}
-    api_version: "2025-04-01-preview"
-    model: text-embedding-3-large
-    azure_deployment_name: text-embedding-3-large
-    rate_limit:
-      type: sliding_window
-      tokens_per_period: 250_000
-      requests_per_period: 250
-
-vector_store:
-  type: "lancedb"
-  db_uri: "./tests/fixtures/text-csv/lancedb"
-  overwrite: True
-  container_name: "lancedb_ci"
-
-table_provider:
-  type: csv
-
-community_reports:
-  prompt: "prompts/community_report.txt"
-  max_length: 2000
-  max_input_length: 8000
-
-snapshots:
-  embeddings: true
diff --git a/tests/fixtures/text/config.json b/tests/fixtures/text/config.json
index cc69e523ec..0ae2e06efc 100644
--- a/tests/fixtures/text/config.json
+++ b/tests/fixtures/text/config.json
@@ -21,9 +21,12 @@
                 300
             ],
             "max_runtime": 30,
+            "nan_allowed_columns": [
+                "description"
+            ],
             "expected_artifacts": [
-                "entities.parquet",
-                "relationships.parquet"
+                "entities.csv",
+                "relationships.csv"
             ]
         },
         "create_communities": {
@@ -32,7 +35,7 @@
                 30
             ],
             "max_runtime": 30,
-            "expected_artifacts": ["communities.parquet"]
+            "expected_artifacts": ["communities.csv"]
         },
         "create_community_reports_text": {
             "row_range": [
@@ -51,7 +54,7 @@
                 "size"
             ],
             "max_runtime": 2000,
-            "expected_artifacts": ["community_reports.parquet"]
+            "expected_artifacts": ["community_reports.csv"]
         },
         "create_final_text_units": {
             "row_range": [
@@ -64,7 +67,7 @@
                 "covariate_ids"
             ],
             "max_runtime": 30,
-            "expected_artifacts": ["text_units.parquet"]
+            "expected_artifacts": ["text_units.csv"]
         },
         "create_final_documents": {
             "row_range": [
@@ -75,7 +78,7 @@
                 "raw_data"
             ],
             "max_runtime": 30,
-            "expected_artifacts": ["documents.parquet"]
+            "expected_artifacts": ["documents.csv"]
         },
         "generate_text_embeddings": {
             "row_range": [
@@ -84,9 +87,9 @@
             ],
             "max_runtime": 150,
             "expected_artifacts": [
-                "embeddings.text_unit_text.parquet",
-                "embeddings.entity_description.parquet",
-                "embeddings.community_full_content.parquet"
+                "embeddings.text_unit_text.csv",
+                "embeddings.entity_description.csv",
+                "embeddings.community_full_content.csv"
             ]
         }
     },
diff --git a/tests/fixtures/text/settings.yml b/tests/fixtures/text/settings.yml
index 6cf6f9074d..9f18f7680d 100644
--- a/tests/fixtures/text/settings.yml
+++ b/tests/fixtures/text/settings.yml
@@ -29,6 +29,9 @@ vector_store:
   api_key: ${AZURE_AI_SEARCH_API_KEY}
   container_name: "simple_text_ci"
 
+table_provider:
+  type: csv
+
 community_reports:
   prompt: "prompts/community_report.txt"
   max_length: 2000
diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py
index 2c3472a2fe..8a2985b380 100644
--- a/tests/smoke/test_fixtures.py
+++ b/tests/smoke/test_fixtures.py
@@ -222,17 +222,11 @@ def __run_query(self, root: Path, query_config: dict[str, str]):
     @mock.patch.dict(
         os.environ,
         {
-            k: v
-            for k, v in {
-                **os.environ,
-                "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
-                "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
-                "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv(
-                    "AZURE_AI_SEARCH_URL_ENDPOINT"
-                ),
-                "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"),
-            }.items()
-            if v is not None
+            **os.environ,
+            "BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
+            "LOCAL_BLOB_STORAGE_CONNECTION_STRING": WELL_KNOWN_AZURITE_CONNECTION_STRING,
+            "AZURE_AI_SEARCH_URL_ENDPOINT": os.getenv("AZURE_AI_SEARCH_URL_ENDPOINT"),
+            "AZURE_AI_SEARCH_API_KEY": os.getenv("AZURE_AI_SEARCH_API_KEY"),
         },
         clear=True,
     )

From 5a34172447c18cea2c4d5fb37de45b5d11dc1653 Mon Sep 17 00:00:00 2001
From: Dayenne Souza <ddesouza@microsoft.com>
Date: Thu, 12 Feb 2026 21:39:08 +0000
Subject: [PATCH 3/5] read csv tables with keep_default_na=False

---
 .../graphrag_storage/tables/csv_table_provider.py               | 2 +-
 tests/smoke/test_fixtures.py                                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py b/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py
index 5de021b8a5..e81202f241 100644
--- a/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py
+++ b/packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py
@@ -64,7 +64,7 @@ async def read_dataframe(self, table_name: str) -> pd.DataFrame:
             # Handle empty CSV (pandas can't parse files with no columns)
             if not csv_data or csv_data.strip() == "":
                 return pd.DataFrame()
-            return pd.read_csv(StringIO(csv_data))
+            return pd.read_csv(StringIO(csv_data), keep_default_na=False)
         except Exception:
             logger.exception("error loading table from storage: %s", filename)
             raise
diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py
index 8a2985b380..e6a9680200 100644
--- a/tests/smoke/test_fixtures.py
+++ b/tests/smoke/test_fixtures.py
@@ -179,7 +179,7 @@ def __assert_indexer_outputs(
                 if artifact.endswith(".parquet"):
                     output_df = pd.read_parquet(output_path / artifact)
                 elif artifact.endswith(".csv"):
-                    output_df = pd.read_csv(output_path / artifact)
+                    output_df = pd.read_csv(output_path / artifact, keep_default_na=False)
                 else:
                     continue
 

From 53106d190f11d5063aff56c44ca590e55c4b08b5 Mon Sep 17 00:00:00 2001
From: Dayenne Souza <ddesouza@microsoft.com>
Date: Fri, 13 Feb 2026 13:07:16 -0300
Subject: [PATCH 4/5] add semver

---
 .semversioner/next-release/patch-20260213160631396575.json | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .semversioner/next-release/patch-20260213160631396575.json

diff --git a/.semversioner/next-release/patch-20260213160631396575.json b/.semversioner/next-release/patch-20260213160631396575.json
new file mode 100644
index 0000000000..79a9221724
--- /dev/null
+++ b/.semversioner/next-release/patch-20260213160631396575.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add csv table smoke tests; read csv tables with keep_default_na=False"
+}

From 49f287b46ce35c0fed0e14f7b7d16cca482f5cc4 Mon Sep 17 00:00:00 2001
From: Dayenne Souza <ddesouza@microsoft.com>
Date: Fri, 13 Feb 2026 13:10:33 -0300
Subject: [PATCH 5/5] run format

---
 tests/smoke/test_fixtures.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py
index e6a9680200..8624930c14 100644
--- a/tests/smoke/test_fixtures.py
+++ b/tests/smoke/test_fixtures.py
@@ -179,7 +179,9 @@ def __assert_indexer_outputs(
                 if artifact.endswith(".parquet"):
                     output_df = pd.read_parquet(output_path / artifact)
                 elif artifact.endswith(".csv"):
-                    output_df = pd.read_csv(output_path / artifact, keep_default_na=False)
+                    output_df = pd.read_csv(
+                        output_path / artifact, keep_default_na=False
+                    )
                 else:
                     continue