From f3ab04737681e3a29ecb43acd06987b6b4cf2ba6 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:39:40 -0700 Subject: [PATCH 1/2] Fix text chunking logic --- graphrag/index/text_splitting/text_splitting.py | 4 ++++ tests/unit/indexing/text_splitting/test_text_splitting.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index 1632904637..2693872b71 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -152,6 +152,8 @@ def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]: while start_idx < len(input_ids): chunk_text = tokenizer.decode(list(chunk_ids)) result.append(chunk_text) # Append chunked text as string + if cur_idx == len(input_ids): # chunk reached end of input; stop so no trailing overlap-only chunk is emitted + break start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] @@ -186,6 +188,8 @@ def split_multiple_texts_on_tokens( chunk_text = tokenizer.decode([id for _, id in chunk_ids]) doc_indices = list({doc_idx for doc_idx, _ in chunk_ids}) result.append(TextChunk(chunk_text, doc_indices, len(chunk_ids))) + if cur_idx == len(input_ids): # chunk reached end of input; stop so no trailing overlap-only chunk is emitted + break start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index da87d47350..10a5a06344 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -136,7 +136,6 @@ def test_split_single_text_on_tokens(): " by this t", "his test o", "est only.", - "nly.", ] result = 
split_single_text_on_tokens(text=text, tokenizer=tokenizer) @@ -197,7 +196,6 @@ def decode(tokens: list[int]) -> str: " this test", " test only", " only.", - ".", ] result = split_single_text_on_tokens(text=text, tokenizer=tokenizer) From c89dc1d60caf6fdef0162a7ff63bd7307bb9c6fa Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:40:04 -0700 Subject: [PATCH 2/2] Semver --- .semversioner/next-release/patch-20250423233959070725.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20250423233959070725.json diff --git a/.semversioner/next-release/patch-20250423233959070725.json b/.semversioner/next-release/patch-20250423233959070725.json new file mode 100644 index 0000000000..78ab81a340 --- /dev/null +++ b/.semversioner/next-release/patch-20250423233959070725.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix text chunking logic: stop after the chunk that reaches the end of the input so no redundant trailing chunk is emitted." +}