From 8c30b36c8807fc9543691ecb3e0a80001cff5333 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 11 Feb 2026 13:04:55 -0800 Subject: [PATCH 1/4] Move document ID, human_readable_id, and raw_data setup from create_final_documents into load workflows Consolidates core document field initialization (human_readable_id index, raw_data default) into load_input_documents and load_update_documents so that create_final_documents only handles the text unit join. Also applies the same setup, together with the id string cast, in the run_pipeline input_documents bypass paths. Note that the id string cast is removed from create_final_documents but is not added to the load workflows themselves. --- packages/graphrag/graphrag/index/run/run_pipeline.py | 8 ++++++++ .../graphrag/index/workflows/create_final_documents.py | 6 ------ .../graphrag/index/workflows/load_input_documents.py | 6 +++++- .../graphrag/index/workflows/load_update_documents.py | 3 +++ 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/graphrag/graphrag/index/run/run_pipeline.py b/packages/graphrag/graphrag/index/run/run_pipeline.py index 401f067a33..49b81fc556 100644 --- a/packages/graphrag/graphrag/index/run/run_pipeline.py +++ b/packages/graphrag/graphrag/index/run/run_pipeline.py @@ -73,6 +73,10 @@ async def run_pipeline( # if the user passes in a df directly, write directly to storage so we can skip finding/parsing later if input_documents is not None: + input_documents["id"] = input_documents["id"].astype(str) + input_documents["human_readable_id"] = input_documents.index + if "raw_data" not in input_documents.columns: + input_documents["raw_data"] = pd.Series(dtype="object") await delta_table_provider.write_dataframe("documents", input_documents) pipeline.remove("load_update_documents") @@ -91,6 +95,10 @@ async def run_pipeline( # if the user passes in a df directly, write directly to storage so we can skip finding/parsing later if input_documents is not None: + input_documents["id"] = input_documents["id"].astype(str) + input_documents["human_readable_id"] = input_documents.index + if "raw_data" not in
input_documents.columns: + input_documents["raw_data"] = pd.Series(dtype="object") await output_table_provider.write_dataframe("documents", input_documents) pipeline.remove("load_input_documents") diff --git a/packages/graphrag/graphrag/index/workflows/create_final_documents.py b/packages/graphrag/graphrag/index/workflows/create_final_documents.py index 57c67229e9..ccbd967821 100644 --- a/packages/graphrag/graphrag/index/workflows/create_final_documents.py +++ b/packages/graphrag/graphrag/index/workflows/create_final_documents.py @@ -65,10 +65,4 @@ def create_final_documents( copy=False, ).reset_index(drop=True) - rejoined["id"] = rejoined["id"].astype(str) - rejoined["human_readable_id"] = rejoined.index - - if "raw_data" not in rejoined.columns: - rejoined["raw_data"] = pd.Series(dtype="object") - return rejoined.loc[:, DOCUMENTS_FINAL_COLUMNS] diff --git a/packages/graphrag/graphrag/index/workflows/load_input_documents.py b/packages/graphrag/graphrag/index/workflows/load_input_documents.py index ed7f83c8e2..8e27ed0a2a 100644 --- a/packages/graphrag/graphrag/index/workflows/load_input_documents.py +++ b/packages/graphrag/graphrag/index/workflows/load_input_documents.py @@ -39,4 +39,8 @@ async def run_workflow( async def load_input_documents(input_reader: InputReader) -> pd.DataFrame: """Load and parse input documents into a standard format.""" - return pd.DataFrame(await input_reader.read_files()) + output = pd.DataFrame(await input_reader.read_files()) + output["human_readable_id"] = output.index + if "raw_data" not in output.columns: + output["raw_data"] = pd.Series(dtype="object") + return output diff --git a/packages/graphrag/graphrag/index/workflows/load_update_documents.py b/packages/graphrag/graphrag/index/workflows/load_update_documents.py index 3f4417d3e1..a61a228493 100644 --- a/packages/graphrag/graphrag/index/workflows/load_update_documents.py +++ b/packages/graphrag/graphrag/index/workflows/load_update_documents.py @@ -51,6 +51,9 @@ async def 
load_update_documents( ) -> pd.DataFrame: """Load and parse update-only input documents into a standard format.""" input_documents = pd.DataFrame(await input_reader.read_files()) + input_documents["human_readable_id"] = input_documents.index + if "raw_data" not in input_documents.columns: + input_documents["raw_data"] = pd.Series(dtype="object") # previous table provider has the output of the previous run # we'll use this to diff the input from the prior delta_documents = await get_delta_docs(input_documents, previous_table_provider) From 32c5b5a5715470baac8cbbf4222026d2a91d794b Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 11 Feb 2026 13:20:56 -0800 Subject: [PATCH 2/4] Remove overzealous input document assignment --- packages/graphrag/graphrag/index/run/run_pipeline.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/packages/graphrag/graphrag/index/run/run_pipeline.py b/packages/graphrag/graphrag/index/run/run_pipeline.py index 49b81fc556..401f067a33 100644 --- a/packages/graphrag/graphrag/index/run/run_pipeline.py +++ b/packages/graphrag/graphrag/index/run/run_pipeline.py @@ -73,10 +73,6 @@ async def run_pipeline( # if the user passes in a df directly, write directly to storage so we can skip finding/parsing later if input_documents is not None: - input_documents["id"] = input_documents["id"].astype(str) - input_documents["human_readable_id"] = input_documents.index - if "raw_data" not in input_documents.columns: - input_documents["raw_data"] = pd.Series(dtype="object") await delta_table_provider.write_dataframe("documents", input_documents) pipeline.remove("load_update_documents") @@ -95,10 +91,6 @@ async def run_pipeline( # if the user passes in a df directly, write directly to storage so we can skip finding/parsing later if input_documents is not None: - input_documents["id"] = input_documents["id"].astype(str) - input_documents["human_readable_id"] = input_documents.index - if "raw_data" not in input_documents.columns: - 
input_documents["raw_data"] = pd.Series(dtype="object") await output_table_provider.write_dataframe("documents", input_documents) pipeline.remove("load_input_documents") From 26ef89fdb2a61b51cfee658efb3c1fe7e6e35cd0 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 11 Feb 2026 13:20:59 -0800 Subject: [PATCH 3/4] Semver --- .semversioner/next-release/patch-20260211211707376370.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20260211211707376370.json diff --git a/.semversioner/next-release/patch-20260211211707376370.json b/.semversioner/next-release/patch-20260211211707376370.json new file mode 100644 index 0000000000..0158f43809 --- /dev/null +++ b/.semversioner/next-release/patch-20260211211707376370.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Move document ID, human_readable_id, and raw_data initialization from create_final_documents into load_input_documents and load_update_documents." +} From a1cee9e1493f3b90b7fbd75ef32cdabb37aeb2b7 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 11 Feb 2026 13:25:53 -0800 Subject: [PATCH 4/4] Format --- docs/examples_notebooks/api_overview.ipynb | 5 ++--- docs/examples_notebooks/input_documents.ipynb | 5 ++--- tests/verbs/test_create_community_reports.py | 8 ++++---- unified-search-app/app/app_logic.py | 3 +-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/docs/examples_notebooks/api_overview.ipynb b/docs/examples_notebooks/api_overview.ipynb index abcd7832fc..2a0c0f15de 100644 --- a/docs/examples_notebooks/api_overview.ipynb +++ b/docs/examples_notebooks/api_overview.ipynb @@ -28,11 +28,10 @@ "from pathlib import Path\n", "from pprint import pprint\n", "\n", + "import graphrag.api as api\n", "import pandas as pd\n", "from graphrag.config.load_config import load_config\n", - "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n", - "\n", - "import graphrag.api as api" + "from 
graphrag.index.typing.pipeline_run_result import PipelineRunResult" ] }, { diff --git a/docs/examples_notebooks/input_documents.ipynb b/docs/examples_notebooks/input_documents.ipynb index 505c0fe1f3..5657770eaf 100644 --- a/docs/examples_notebooks/input_documents.ipynb +++ b/docs/examples_notebooks/input_documents.ipynb @@ -30,11 +30,10 @@ "from pathlib import Path\n", "from pprint import pprint\n", "\n", + "import graphrag.api as api\n", "import pandas as pd\n", "from graphrag.config.load_config import load_config\n", - "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n", - "\n", - "import graphrag.api as api" + "from graphrag.index.typing.pipeline_run_result import PipelineRunResult" ] }, { diff --git a/tests/verbs/test_create_community_reports.py b/tests/verbs/test_create_community_reports.py index 5a40b05449..68d8d1be9c 100644 --- a/tests/verbs/test_create_community_reports.py +++ b/tests/verbs/test_create_community_reports.py @@ -3,14 +3,14 @@ from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS -from graphrag.index.workflows.create_community_reports import ( - run_workflow, -) - from graphrag.index.operations.summarize_communities.community_reports_extractor import ( CommunityReportResponse, FindingModel, ) +from graphrag.index.workflows.create_community_reports import ( + run_workflow, +) + from tests.unit.config.utils import get_default_graphrag_config from .util import ( diff --git a/unified-search-app/app/app_logic.py b/unified-search-app/app/app_logic.py index a573b9daa5..dc64e0e77c 100644 --- a/unified-search-app/app/app_logic.py +++ b/unified-search-app/app/app_logic.py @@ -7,6 +7,7 @@ import logging from typing import TYPE_CHECKING +import graphrag.api as api import streamlit as st from knowledge_loader.data_sources.loader import ( create_datasource, @@ -17,8 +18,6 @@ from state.session_variables import SessionVariables from ui.search import display_search_result -import graphrag.api as api - if 
TYPE_CHECKING: import pandas as pd