From 50bb20f63ce6f5726164c73b2e7cb507f5eba380 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Tue, 24 Feb 2026 21:19:53 -0300 Subject: [PATCH 1/3] fix --- .../patch-20260225001919068435.json | 4 ++++ packages/graphrag-input/graphrag_input/csv.py | 3 ++- tests/unit/indexing/input/test_csv_loader.py | 21 +++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 .semversioner/next-release/patch-20260225001919068435.json diff --git a/.semversioner/next-release/patch-20260225001919068435.json b/.semversioner/next-release/patch-20260225001919068435.json new file mode 100644 index 0000000000..01df0b46fe --- /dev/null +++ b/.semversioner/next-release/patch-20260225001919068435.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "fix csv reader" +} diff --git a/packages/graphrag-input/graphrag_input/csv.py b/packages/graphrag-input/graphrag_input/csv.py index e041bff275..9f7ddaec52 100644 --- a/packages/graphrag-input/graphrag_input/csv.py +++ b/packages/graphrag-input/graphrag_input/csv.py @@ -4,6 +4,7 @@ """A module containing 'CSVFileReader' model.""" import csv +import io import logging import sys @@ -39,6 +40,6 @@ async def read_file(self, path: str) -> list[TextDocument]: """ file = await self._storage.get(path, encoding=self._encoding) - reader = csv.DictReader(file.splitlines()) + reader = csv.DictReader(io.StringIO(file)) rows = list(reader) return await self.process_data_columns(rows, path) diff --git a/tests/unit/indexing/input/test_csv_loader.py b/tests/unit/indexing/input/test_csv_loader.py index 1a84d82676..b705e1f143 100644 --- a/tests/unit/indexing/input/test_csv_loader.py +++ b/tests/unit/indexing/input/test_csv_loader.py @@ -54,3 +54,24 @@ async def test_csv_loader_multiple_files(): reader = create_input_reader(config, storage) documents = await reader.read_files() assert len(documents) == 4 + + +async def test_csv_loader_preserves_multiline_fields(): + """Multiline quoted CSV fields must retain their internal newlines.""" + config = InputConfig( + type=InputType.Csv, + text_column="text", + title_column="title", + ) + storage = create_storage( + StorageConfig( + base_dir="tests/unit/indexing/input/data/multiline-csv", + ) + ) + reader = create_input_reader(config, storage) + documents = await reader.read_files() + assert len(documents) == 2 + assert documents[0].title == "Post 1" + assert documents[0].text == "Line one.\nLine two.\nLine three." + assert documents[1].title == "Post 2" + assert documents[1].text == "Single line." From e88a616f2eff9e606f63a248a68068e7e5bfb4a5 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Tue, 24 Feb 2026 21:30:56 -0300 Subject: [PATCH 2/3] fix test with inline content --- tests/unit/indexing/input/test_csv_loader.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit/indexing/input/test_csv_loader.py b/tests/unit/indexing/input/test_csv_loader.py index b705e1f143..92a131a949 100644 --- a/tests/unit/indexing/input/test_csv_loader.py +++ b/tests/unit/indexing/input/test_csv_loader.py @@ -56,18 +56,20 @@ async def test_csv_loader_multiple_files(): assert len(documents) == 4 -async def test_csv_loader_preserves_multiline_fields(): +async def test_csv_loader_preserves_multiline_fields(tmp_path): """Multiline quoted CSV fields must retain their internal newlines.""" + csv_content = ( + 'title,text\r\n' + '"Post 1","Line one.\nLine two.\nLine three."\r\n' + '"Post 2","Single line."\r\n' + ) + (tmp_path / "input.csv").write_text(csv_content, encoding="utf-8", newline="") config = InputConfig( type=InputType.Csv, text_column="text", title_column="title", ) - storage = create_storage( - StorageConfig( - base_dir="tests/unit/indexing/input/data/multiline-csv", - ) - ) + storage = create_storage(StorageConfig(base_dir=str(tmp_path))) reader = create_input_reader(config, storage) documents = await reader.read_files() assert len(documents) == 2 From 0120f3825e835ed89bf0026e41a377737dc9abcd Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Tue, 24 Feb 2026 21:32:59 -0300 Subject: [PATCH 3/3] fix format --- tests/unit/indexing/input/test_csv_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/indexing/input/test_csv_loader.py b/tests/unit/indexing/input/test_csv_loader.py index 92a131a949..a2cd171750 100644 --- a/tests/unit/indexing/input/test_csv_loader.py +++ b/tests/unit/indexing/input/test_csv_loader.py @@ -59,11 +59,11 @@ async def test_csv_loader_multiple_files(): async def test_csv_loader_preserves_multiline_fields(tmp_path): """Multiline quoted CSV fields must retain their internal newlines.""" csv_content = ( - 'title,text\r\n' + "title,text\r\n" '"Post 1","Line one.\nLine two.\nLine three."\r\n' '"Post 2","Single line."\r\n' ) - (tmp_path / "input.csv").write_text(csv_content, encoding="utf-8", newline="") + (tmp_path / "input.csv").write_text(csv_content, encoding="utf-8") config = InputConfig( type=InputType.Csv, text_column="text",