From 18ef4a9b9137bf5e74aa44f892395c2aa5e3cf77 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 12:58:45 -0700 Subject: [PATCH 01/15] Update stopwords config --- graphrag/config/defaults.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 3977ed5820..c72d1414bc 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -19,6 +19,7 @@ ReportingType, TextEmbeddingTarget, ) +from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import EN_STOP_WORDS from graphrag.vector_stores.factory import VectorStoreType DEFAULT_OUTPUT_BASE_DIR = "output" @@ -195,7 +196,7 @@ class TextAnalyzerDefaults: max_word_length: int = 15 word_delimiter: str = " " include_named_entities: bool = True - exclude_nouns: None = None + exclude_nouns: list[str] = field(default_factory=lambda: EN_STOP_WORDS) exclude_entity_tags: list[str] = field(default_factory=lambda: ["DATE"]) exclude_pos_tags: list[str] = field( default_factory=lambda: ["DET", "PRON", "INTJ", "X"] From 670ec2366362bc3680008a6d2c1cff2c1918601c Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:35:38 -0700 Subject: [PATCH 02/15] Minor edits --- graphrag/index/operations/build_noun_graph/build_noun_graph.py | 2 +- graphrag/index/operations/prune_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 195d2685d1..cc67996ea4 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -69,7 +69,7 @@ async def extract(row): noun_node_df = text_unit_df.explode("noun_phrases") noun_node_df = noun_node_df.rename( columns={"noun_phrases": "title", "id": "text_unit_id"} - ).drop_duplicates() + ) # group by title and count the number of text units grouped_node_df = ( diff --git a/graphrag/index/operations/prune_graph.py b/graphrag/index/operations/prune_graph.py index 71f76215cd..d826558584 100644 --- a/graphrag/index/operations/prune_graph.py +++ b/graphrag/index/operations/prune_graph.py @@ -21,7 +21,7 @@ def prune_graph( max_node_freq_std: float | None = None, min_node_degree: int = 1, max_node_degree_std: float | None = None, - min_edge_weight_pct: float = 0, + min_edge_weight_pct: float = 40, remove_ego_nodes: bool = False, lcc_only: bool = False, ) -> nx.Graph: From ba180523fdeef88bf2b9d07030aac0554f8d9695 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:39:54 -0700 Subject: [PATCH 03/15] Update PMI --- graphrag/index/operations/build_noun_graph/build_noun_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index cc67996ea4..84d705a92a 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -193,7 +193,7 @@ def _calculate_pmi_edge_weights( .rename(columns={"prop_occurrence": "target_prop"}) ) edges_df[edge_weight_col] = edges_df.apply( - lambda x: math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), + lambda x: x["prop_weight"] * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), axis=1, ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 
1f60a992bb50409bd1abeae4e2774468005973af Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:40:17 -0700 Subject: [PATCH 04/15] Format --- graphrag/config/defaults.py | 4 +++- .../index/operations/build_noun_graph/build_noun_graph.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index c72d1414bc..58375b317a 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -19,7 +19,9 @@ ReportingType, TextEmbeddingTarget, ) -from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import EN_STOP_WORDS +from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import ( + EN_STOP_WORDS, +) from graphrag.vector_stores.factory import VectorStoreType DEFAULT_OUTPUT_BASE_DIR = "output" diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 84d705a92a..7956b2045d 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -193,7 +193,8 @@ def _calculate_pmi_edge_weights( .rename(columns={"prop_occurrence": "target_prop"}) ) edges_df[edge_weight_col] = edges_df.apply( - lambda x: x["prop_weight"] * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), + lambda x: x["prop_weight"] + * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), axis=1, ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 776206658415fa39170692fda60974bef21e5a4a Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 14:07:22 -0700 Subject: [PATCH 05/15] Perf improvements --- .../build_noun_graph/build_noun_graph.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 7956b2045d..a055b45626 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -3,8 +3,9 @@ """Graph extraction using NLP.""" -import math +from itertools import combinations +import numpy as np import pandas as pd from graphrag.cache.noop_pipeline_cache import NoopPipelineCache @@ -98,7 +99,7 @@ def _extract_edges( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) text_units_df["edges"] = text_units_df["title"].apply( - lambda x: _create_relationships(x) + lambda x: list(combinations(x, 2)) ) edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] @@ -122,7 +123,7 @@ def _extract_edges( strict=False, ) - # group by source and target, count the number of text units and collect their ids + # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() ) @@ -140,18 +141,6 @@ def _extract_edges( return grouped_edge_df -def _create_relationships( - noun_phrases: list[str], -) -> list[tuple[str, str]]: - """Create a (source, target) tuple pairwise for all noun phrases in a list.""" - relationships = [] - if len(noun_phrases) >= 2: - for i in range(len(noun_phrases) - 1): - for j in range(i + 1, len(noun_phrases)): - relationships.extend([(noun_phrases[i], noun_phrases[j])]) - return relationships - - def _calculate_pmi_edge_weights( nodes_df: pd.DataFrame, edges_df: pd.DataFrame, @@ 
-192,9 +181,7 @@ def _calculate_pmi_edge_weights( .drop(columns=[node_name_col]) .rename(columns={"prop_occurrence": "target_prop"}) ) - edges_df[edge_weight_col] = edges_df.apply( - lambda x: x["prop_weight"] - * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), - axis=1, + edges_df[edge_weight_col] = edges_df["prop_weight"] * np.log2( + edges_df["prop_weight"] / (edges_df["source_prop"] * edges_df["target_prop"]) ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 6237db9dadd5a2d407e315ee40e9c650466c03e0 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 14:08:05 -0700 Subject: [PATCH 06/15] Semver --- .semversioner/next-release/patch-20250422210800599071.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20250422210800599071.json diff --git a/.semversioner/next-release/patch-20250422210800599071.json b/.semversioner/next-release/patch-20250422210800599071.json new file mode 100644 index 0000000000..9924cade88 --- /dev/null +++ b/.semversioner/next-release/patch-20250422210800599071.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Brings parity with our latest NLP extraction approaches." +} From 92df6ee7f08056fd97dea29027e683c6bc68e13d Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 15:08:15 -0700 Subject: [PATCH 07/15] Remove edge collection apply --- .../operations/build_noun_graph/build_noun_graph.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index a055b45626..7aa0f5c832 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,6 +4,7 @@ """Graph extraction using NLP.""" from itertools import combinations +from typing import Any import numpy as np import pandas as pd @@ -98,9 +99,12 @@ def _extract_edges( text_units_df = ( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) - text_units_df["edges"] = text_units_df["title"].apply( - lambda x: list(combinations(x, 2)) - ) + + titles = text_units_df["title"].tolist() + all_edges: Any = [list(combinations(t, 2)) for t in titles] + + text_units_df = text_units_df.assign(edges=all_edges) + edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] edge_df["source"] = edge_df["edges"].apply( From 6c0b81ead21d27df997ae3f483cf0536e9069ced Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 15:26:58 -0700 Subject: [PATCH 08/15] Remove source/target apply --- .../build_noun_graph/build_noun_graph.py | 23 ++++--------------- 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 7aa0f5c832..200588c5ad 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -107,26 +107,14 @@ def _extract_edges( edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - edge_df["source"] = edge_df["edges"].apply( - lambda x: x[0] if isinstance(x, tuple) else None - ) - edge_df["target"] = edge_df["edges"].apply( - lambda x: x[1] if isinstance(x, tuple) else None - ) + edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df["min_source"] = edge_df[["source", 
"target"]].min(axis=1) + edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) + edge_df = edge_df.drop(columns=["source", "target"]).rename(columns={"min_source": "source", "max_target": "target"}) + edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) - # make sure source is always smaller than target - edge_df["source"], edge_df["target"] = zip( - *edge_df.apply( - lambda x: (x["source"], x["target"]) - if x["source"] < x["target"] - else (x["target"], x["source"]), - axis=1, - ), - strict=False, - ) - # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() @@ -137,7 +125,6 @@ def _extract_edges( grouped_edge_df = grouped_edge_df.loc[ :, ["source", "target", "weight", "text_unit_ids"] ] - if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From da731fdc384ff1f4bc34663380bb5cd0332790c4 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:16:00 -0700 Subject: [PATCH 09/15] Add edge weight to graph snapshot --- graphrag/index/workflows/finalize_graph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphrag/index/workflows/finalize_graph.py b/graphrag/index/workflows/finalize_graph.py index 890bb8515d..a5a94ba062 100644 --- a/graphrag/index/workflows/finalize_graph.py +++ b/graphrag/index/workflows/finalize_graph.py @@ -38,7 +38,8 @@ async def run_workflow( if config.snapshots.graphml: # todo: extract graphs at each level, and add in meta like descriptions - graph = create_graph(relationships) + graph = create_graph(final_relationships, edge_attr=["weight"]) + await snapshot_graphml( graph, name="graph", From 41eda41953bab9df9df1e7c48eaec032fa13ada0 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:20:09 -0700 Subject: [PATCH 10/15] Revert breaking optimizations --- .../build_noun_graph/build_noun_graph.py | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 200588c5ad..e9176ab1ae 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,7 +4,6 @@ """Graph extraction using NLP.""" from itertools import combinations -from typing import Any import numpy as np import pandas as pd @@ -100,20 +99,30 @@ def _extract_edges( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) - titles = text_units_df["title"].tolist() - all_edges: Any = [list(combinations(t, 2)) for t in titles] - - text_units_df = text_units_df.assign(edges=all_edges) + text_units_df["edges"] = text_units_df["title"].apply( + lambda x: list(combinations(x, 2)) + ) edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - edge_df[["source", "target"]] = edge_df["edges"].to_list() - edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) - edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) - edge_df = edge_df.drop(columns=["source", "target"]).rename(columns={"min_source": "source", "max_target": "target"}) - + edge_df["source"] = edge_df["edges"].apply( + lambda x: x[0] if isinstance(x, tuple) else None + ) + edge_df["target"] = edge_df["edges"].apply( + lambda x: x[1] if isinstance(x, 
tuple) else None + ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) + # make sure source is always smaller than target + edge_df["source"], edge_df["target"] = zip( + *edge_df.apply( + lambda x: (x["source"], x["target"]) + if x["source"] < x["target"] + else (x["target"], x["source"]), + axis=1, + ), + strict=False, + ) # group by source and target, count the number of text units grouped_edge_df = ( @@ -121,10 +130,7 @@ def _extract_edges( ) grouped_edge_df = grouped_edge_df.rename(columns={"text_unit_id": "text_unit_ids"}) grouped_edge_df["weight"] = grouped_edge_df["text_unit_ids"].apply(len) - - grouped_edge_df = grouped_edge_df.loc[ - :, ["source", "target", "weight", "text_unit_ids"] - ] + grouped_edge_df = grouped_edge_df[["source", "target", "weight", "text_unit_ids"]] if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From 0f54cb109bc881816853c29e7e4a1bfec9b166d3 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Thu, 24 Apr 2025 15:39:57 -0700 Subject: [PATCH 11/15] Add perf fixes back in --- .../build_noun_graph/build_noun_graph.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index e9176ab1ae..d927d4861e 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,6 +4,7 @@ """Graph extraction using NLP.""" from itertools import combinations +from typing import Any import numpy as np import pandas as pd @@ -31,7 +32,6 @@ async def build_noun_graph( text_units, text_analyzer, num_threads=num_threads, cache=cache ) edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights) - return (nodes_df, edges_df) @@ -95,35 +95,28 @@ def _extract_edges( """ text_units_df = nodes_df.explode("text_unit_ids") text_units_df = text_units_df.rename(columns={"text_unit_ids": "text_unit_id"}) + text_units_df = ( - text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() + text_units_df.groupby("text_unit_id") + .agg({"title": lambda x: list(x) if len(x) > 1 else np.nan}) + .reset_index() ) - - text_units_df["edges"] = text_units_df["title"].apply( - lambda x: list(combinations(x, 2)) + text_units_df = text_units_df.dropna() + titles = text_units_df["title"].tolist() + all_edges: Any = [list(combinations(t, 2)) for t in titles] + + text_units_df = text_units_df.assign(edges=all_edges) + edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]] + + edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) + edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) + edge_df = edge_df.drop(columns=["source", "target"]).rename( + columns={"min_source": "source", "max_target": "target"} ) - edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - - edge_df["source"] = edge_df["edges"].apply( - lambda x: x[0] if isinstance(x, tuple) else None - ) - edge_df["target"] = edge_df["edges"].apply( - lambda x: x[1] if isinstance(x, tuple) else None - ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) - # make sure source is always smaller than target - edge_df["source"], edge_df["target"] = zip( - 
*edge_df.apply( - lambda x: (x["source"], x["target"]) - if x["source"] < x["target"] - else (x["target"], x["source"]), - axis=1, - ), - strict=False, - ) - # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() From efee425d4147542fbc9c8e6dd8f2f961064dfa8a Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Thu, 24 Apr 2025 16:14:04 -0700 Subject: [PATCH 12/15] Format/types --- .../operations/build_noun_graph/build_noun_graph.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index d927d4861e..5036ee9dc6 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,7 +4,6 @@ """Graph extraction using NLP.""" from itertools import combinations -from typing import Any import numpy as np import pandas as pd @@ -103,16 +102,16 @@ def _extract_edges( ) text_units_df = text_units_df.dropna() titles = text_units_df["title"].tolist() - all_edges: Any = [list(combinations(t, 2)) for t in titles] + all_edges: list[list[tuple[str, str]]] = [list(combinations(t, 2)) for t in titles] - text_units_df = text_units_df.assign(edges=all_edges) + text_units_df = text_units_df.assign(edges=all_edges) # type: ignore edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]] - edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list() edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) edge_df = edge_df.drop(columns=["source", "target"]).rename( - columns={"min_source": "source", "max_target": "target"} + columns={"min_source": "source", "max_target": "target"} # type: ignore ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] @@ -123,7 +122,9 @@ def _extract_edges( ) grouped_edge_df = grouped_edge_df.rename(columns={"text_unit_id": "text_unit_ids"}) grouped_edge_df["weight"] = grouped_edge_df["text_unit_ids"].apply(len) - grouped_edge_df = grouped_edge_df[["source", "target", "weight", "text_unit_ids"]] + grouped_edge_df = grouped_edge_df.loc[ + :, ["source", "target", "weight", "text_unit_ids"] + ] if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From 7ab3a0fc0451a098e02fa898a0eef863abb7d8e4 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 13:57:57 -0700 Subject: [PATCH 13/15] Update defaults --- docs/config/yaml.md | 2 +- graphrag/config/defaults.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/config/yaml.md b/docs/config/yaml.md index 791b67341c..00c64e0ad2 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -251,7 +251,7 @@ Parameters for manual graph pruning. This can be used to optimize the modularity - max_node_freq_std **float | None** - The maximum standard deviation of node frequency to allow. - min_node_degree **int** - The minimum node degree to allow. - max_node_degree_std **float | None** - The maximum standard deviation of node degree to allow. -- min_edge_weight_pct **int** - The minimum edge weight percentile to allow. +- min_edge_weight_pct **float** - The minimum edge weight percentile to allow. 
- remove_ego_nodes **bool** - Remove ego nodes. - lcc_only **bool** - Only use largest connected component. diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 5be5925c2c..3f52d0b90e 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -319,8 +319,8 @@ class PruneGraphDefaults: max_node_freq_std: None = None min_node_degree: int = 1 max_node_degree_std: None = None - min_edge_weight_pct: int = 40 - remove_ego_nodes: bool = False + min_edge_weight_pct: float = 40.0 + remove_ego_nodes: bool = True lcc_only: bool = False From fe897108de3f323e7ff1c56887ba1806bdc04dd3 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 14:12:36 -0700 Subject: [PATCH 14/15] Fix source/target ordering --- graphrag/index/operations/graph_to_dataframes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/graphrag/index/operations/graph_to_dataframes.py b/graphrag/index/operations/graph_to_dataframes.py index add48bd3ae..dbc608f640 100644 --- a/graphrag/index/operations/graph_to_dataframes.py +++ b/graphrag/index/operations/graph_to_dataframes.py @@ -21,6 +21,14 @@ def graph_to_dataframes( edges = nx.to_pandas_edgelist(graph) + # we don't deal in directed graphs, but we do need to ensure consistent ordering for df joins + # nx loses the initial ordering + edges["min_source"] = edges[["source", "target"]].min(axis=1) + edges["max_target"] = edges[["source", "target"]].max(axis=1) + edges = edges.drop(columns=["source", "target"]).rename( + columns={"min_source": "source", "max_target": "target"} # type: ignore + ) + if node_columns: nodes = nodes.loc[:, node_columns] From cbc6613d4d7b70cb1cf11a26923750079afd0880 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 14:20:31 -0700 Subject: [PATCH 15/15] Fix test --- tests/verbs/test_prune_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/verbs/test_prune_graph.py b/tests/verbs/test_prune_graph.py index b30546bc92..3801dd27c0 100644 --- a/tests/verbs/test_prune_graph.py +++ b/tests/verbs/test_prune_graph.py @@ -28,4 +28,4 @@ async def test_prune_graph(): nodes_actual = await load_table_from_storage("entities", context.storage) - assert len(nodes_actual) == 21 + assert len(nodes_actual) == 20
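
---

The net effect of patches 03, 05, and 11 is that noun-graph edge weights become a weighted PMI of node co-occurrence — `P(x,y) * log2(P(x,y) / (P(x) * P(y)))` — computed vectorized with numpy instead of a row-wise `DataFrame.apply`. Below is a minimal, illustrative sketch of that calculation only; the proportions are made up and the column names simply mirror those used in `build_noun_graph.py`, so it is not part of the patch series itself.

```python
import numpy as np
import pandas as pd

# Toy co-occurrence data: proportions of text units in which each node and
# each node pair occurs (hypothetical values for illustration only).
edges = pd.DataFrame({
    "source": ["alpha", "alpha"],
    "target": ["beta", "gamma"],
    "prop_weight": [0.10, 0.02],   # P(source, target)
    "source_prop": [0.20, 0.20],   # P(source)
    "target_prop": [0.25, 0.40],   # P(target)
})

# Weighted PMI, vectorized over the whole column as in patch 05:
# P(x,y) * log2(P(x,y) / (P(x) * P(y)))
edges["weight"] = edges["prop_weight"] * np.log2(
    edges["prop_weight"] / (edges["source_prop"] * edges["target_prop"])
)

print(edges[["source", "target", "weight"]])
# alpha-beta co-occurs more than chance (positive weight),
# alpha-gamma less than chance (negative weight).
```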