Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20250422215029679348.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add option to snapshot raw extracted graph tables."
}
1 change: 1 addition & 0 deletions graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ class SnapshotsDefaults:

embeddings: bool = False
graphml: bool = False
raw_graph: bool = False


@dataclass
Expand Down
4 changes: 4 additions & 0 deletions graphrag/config/models/snapshots_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ class SnapshotsConfig(BaseModel):
description="A flag indicating whether to take snapshots of GraphML.",
default=graphrag_config_defaults.snapshots.graphml,
)
raw_graph: bool = Field(
description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.",
default=graphrag_config_defaults.snapshots.raw_graph,
)
16 changes: 13 additions & 3 deletions graphrag/index/workflows/extract_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ async def run_workflow(
config.root_dir, summarization_llm_settings
)

entities, relationships = await extract_graph(
entities, relationships, raw_entities, raw_relationships = await extract_graph(
text_units=text_units,
callbacks=context.callbacks,
cache=context.cache,
Expand All @@ -58,6 +58,12 @@ async def run_workflow(
await write_table_to_storage(entities, "entities", context.storage)
await write_table_to_storage(relationships, "relationships", context.storage)

if config.snapshots.raw_graph:
await write_table_to_storage(raw_entities, "raw_entities", context.storage)
await write_table_to_storage(
raw_relationships, "raw_relationships", context.storage
)

return WorkflowFunctionOutput(
result={
"entities": entities,
Expand All @@ -76,7 +82,7 @@ async def extract_graph(
entity_types: list[str] | None = None,
summarization_strategy: dict[str, Any] | None = None,
summarization_num_threads: int = 4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""All the steps to create the base entity graph."""
# this returns a graph for each text unit, to be merged later
extracted_entities, extracted_relationships = await extractor(
Expand All @@ -103,6 +109,10 @@ async def extract_graph(
callbacks.error(error_msg)
raise ValueError(error_msg)

# keep as-is copies of the extracted entities and relationships before any summarization
Comment thread
natoverse marked this conversation as resolved.
raw_entities = extracted_entities.copy()
raw_relationships = extracted_relationships.copy()

entities, relationships = await get_summarized_entities_relationships(
extracted_entities=extracted_entities,
extracted_relationships=extracted_relationships,
Expand All @@ -112,7 +122,7 @@ async def extract_graph(
summarization_num_threads=summarization_num_threads,
)

return (entities, relationships)
return (entities, relationships, raw_entities, raw_relationships)


async def get_summarized_entities_relationships(
Expand Down
Loading