From 18ef4a9b9137bf5e74aa44f892395c2aa5e3cf77 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 12:58:45 -0700 Subject: [PATCH 01/15] Update stopwords config --- graphrag/config/defaults.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 3977ed5820..c72d1414bc 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -19,6 +19,7 @@ ReportingType, TextEmbeddingTarget, ) +from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import EN_STOP_WORDS from graphrag.vector_stores.factory import VectorStoreType DEFAULT_OUTPUT_BASE_DIR = "output" @@ -195,7 +196,7 @@ class TextAnalyzerDefaults: max_word_length: int = 15 word_delimiter: str = " " include_named_entities: bool = True - exclude_nouns: None = None + exclude_nouns: list[str] = field(default_factory=lambda: EN_STOP_WORDS) exclude_entity_tags: list[str] = field(default_factory=lambda: ["DATE"]) exclude_pos_tags: list[str] = field( default_factory=lambda: ["DET", "PRON", "INTJ", "X"] From 670ec2366362bc3680008a6d2c1cff2c1918601c Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:35:38 -0700 Subject: [PATCH 02/15] Minor edits --- graphrag/index/operations/build_noun_graph/build_noun_graph.py | 2 +- graphrag/index/operations/prune_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 195d2685d1..cc67996ea4 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -69,7 +69,7 @@ async def extract(row): noun_node_df = text_unit_df.explode("noun_phrases") noun_node_df = noun_node_df.rename( columns={"noun_phrases": "title", "id": "text_unit_id"} - ).drop_duplicates() + ) # group by title and count the number of text units grouped_node_df = ( diff --git a/graphrag/index/operations/prune_graph.py b/graphrag/index/operations/prune_graph.py index 71f76215cd..d826558584 100644 --- a/graphrag/index/operations/prune_graph.py +++ b/graphrag/index/operations/prune_graph.py @@ -21,7 +21,7 @@ def prune_graph( max_node_freq_std: float | None = None, min_node_degree: int = 1, max_node_degree_std: float | None = None, - min_edge_weight_pct: float = 0, + min_edge_weight_pct: float = 40, remove_ego_nodes: bool = False, lcc_only: bool = False, ) -> nx.Graph: From ba180523fdeef88bf2b9d07030aac0554f8d9695 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:39:54 -0700 Subject: [PATCH 03/15] Update PMI --- graphrag/index/operations/build_noun_graph/build_noun_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index cc67996ea4..84d705a92a 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -193,7 +193,7 @@ def _calculate_pmi_edge_weights( .rename(columns={"prop_occurrence": "target_prop"}) ) edges_df[edge_weight_col] = edges_df.apply( - lambda x: math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), + lambda x: x["prop_weight"] * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), axis=1, ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 
1f60a992bb50409bd1abeae4e2774468005973af Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 13:40:17 -0700 Subject: [PATCH 04/15] Format --- graphrag/config/defaults.py | 4 +++- .../index/operations/build_noun_graph/build_noun_graph.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index c72d1414bc..58375b317a 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -19,7 +19,9 @@ ReportingType, TextEmbeddingTarget, ) -from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import EN_STOP_WORDS +from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import ( + EN_STOP_WORDS, +) from graphrag.vector_stores.factory import VectorStoreType DEFAULT_OUTPUT_BASE_DIR = "output" diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 84d705a92a..7956b2045d 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -193,7 +193,8 @@ def _calculate_pmi_edge_weights( .rename(columns={"prop_occurrence": "target_prop"}) ) edges_df[edge_weight_col] = edges_df.apply( - lambda x: x["prop_weight"] * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), + lambda x: x["prop_weight"] + * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), axis=1, ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 776206658415fa39170692fda60974bef21e5a4a Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 14:07:22 -0700 Subject: [PATCH 05/15] Perf improvements --- .../build_noun_graph/build_noun_graph.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 7956b2045d..a055b45626 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -3,8 +3,9 @@ """Graph extraction using NLP.""" -import math +from itertools import combinations +import numpy as np import pandas as pd from graphrag.cache.noop_pipeline_cache import NoopPipelineCache @@ -98,7 +99,7 @@ def _extract_edges( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) text_units_df["edges"] = text_units_df["title"].apply( - lambda x: _create_relationships(x) + lambda x: list(combinations(x, 2)) ) edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] @@ -122,7 +123,7 @@ def _extract_edges( strict=False, ) - # group by source and target, count the number of text units and collect their ids + # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() ) @@ -140,18 +141,6 @@ def _extract_edges( return grouped_edge_df -def _create_relationships( - noun_phrases: list[str], -) -> list[tuple[str, str]]: - """Create a (source, target) tuple pairwise for all noun phrases in a list.""" - relationships = [] - if len(noun_phrases) >= 2: - for i in range(len(noun_phrases) - 1): - for j in range(i + 1, len(noun_phrases)): - relationships.extend([(noun_phrases[i], noun_phrases[j])]) - return relationships - - def _calculate_pmi_edge_weights( nodes_df: pd.DataFrame, edges_df: pd.DataFrame, @@ 
-192,9 +181,7 @@ def _calculate_pmi_edge_weights( .drop(columns=[node_name_col]) .rename(columns={"prop_occurrence": "target_prop"}) ) - edges_df[edge_weight_col] = edges_df.apply( - lambda x: x["prop_weight"] - * math.log2(x["prop_weight"] / (x["source_prop"] * x["target_prop"])), - axis=1, + edges_df[edge_weight_col] = edges_df["prop_weight"] * np.log2( + edges_df["prop_weight"] / (edges_df["source_prop"] * edges_df["target_prop"]) ) return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"]) From 6237db9dadd5a2d407e315ee40e9c650466c03e0 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 22 Apr 2025 14:08:05 -0700 Subject: [PATCH 06/15] Semver --- .semversioner/next-release/patch-20250422210800599071.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20250422210800599071.json diff --git a/.semversioner/next-release/patch-20250422210800599071.json b/.semversioner/next-release/patch-20250422210800599071.json new file mode 100644 index 0000000000..9924cade88 --- /dev/null +++ b/.semversioner/next-release/patch-20250422210800599071.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Brings parity with our latest NLP extraction approaches." +} From 92df6ee7f08056fd97dea29027e683c6bc68e13d Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 15:08:15 -0700 Subject: [PATCH 07/15] Remove edge collection apply --- .../operations/build_noun_graph/build_noun_graph.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index a055b45626..7aa0f5c832 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,6 +4,7 @@ """Graph extraction using NLP.""" from itertools import combinations +from typing import Any import numpy as np import pandas as pd @@ -98,9 +99,12 @@ def _extract_edges( text_units_df = ( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) - text_units_df["edges"] = text_units_df["title"].apply( - lambda x: list(combinations(x, 2)) - ) + + titles = text_units_df["title"].tolist() + all_edges: Any = [list(combinations(t, 2)) for t in titles] + + text_units_df = text_units_df.assign(edges=all_edges) + edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] edge_df["source"] = edge_df["edges"].apply( From 6c0b81ead21d27df997ae3f483cf0536e9069ced Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 15:26:58 -0700 Subject: [PATCH 08/15] Remove source/target apply --- .../build_noun_graph/build_noun_graph.py | 23 ++++--------------- 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 7aa0f5c832..200588c5ad 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -107,26 +107,14 @@ def _extract_edges( edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - edge_df["source"] = edge_df["edges"].apply( - lambda x: x[0] if isinstance(x, tuple) else None - ) - edge_df["target"] = edge_df["edges"].apply( - lambda x: x[1] if isinstance(x, tuple) else None - ) + edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df["min_source"] = edge_df[["source", 
"target"]].min(axis=1) + edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) + edge_df = edge_df.drop(columns=["source", "target"]).rename(columns={"min_source": "source", "max_target": "target"}) + edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) - # make sure source is always smaller than target - edge_df["source"], edge_df["target"] = zip( - *edge_df.apply( - lambda x: (x["source"], x["target"]) - if x["source"] < x["target"] - else (x["target"], x["source"]), - axis=1, - ), - strict=False, - ) - # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() @@ -137,7 +125,6 @@ def _extract_edges( grouped_edge_df = grouped_edge_df.loc[ :, ["source", "target", "weight", "text_unit_ids"] ] - if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From da731fdc384ff1f4bc34663380bb5cd0332790c4 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:16:00 -0700 Subject: [PATCH 09/15] Add edge weight to graph snapshot --- graphrag/index/workflows/finalize_graph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphrag/index/workflows/finalize_graph.py b/graphrag/index/workflows/finalize_graph.py index 890bb8515d..a5a94ba062 100644 --- a/graphrag/index/workflows/finalize_graph.py +++ b/graphrag/index/workflows/finalize_graph.py @@ -38,7 +38,8 @@ async def run_workflow( if config.snapshots.graphml: # todo: extract graphs at each level, and add in meta like descriptions - graph = create_graph(relationships) + graph = create_graph(final_relationships, edge_attr=["weight"]) + await snapshot_graphml( graph, name="graph", From 41eda41953bab9df9df1e7c48eaec032fa13ada0 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 23 Apr 2025 16:20:09 -0700 Subject: [PATCH 10/15] Revert breaking optimizations --- .../build_noun_graph/build_noun_graph.py | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 200588c5ad..e9176ab1ae 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,7 +4,6 @@ """Graph extraction using NLP.""" from itertools import combinations -from typing import Any import numpy as np import pandas as pd @@ -100,20 +99,30 @@ def _extract_edges( text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() ) - titles = text_units_df["title"].tolist() - all_edges: Any = [list(combinations(t, 2)) for t in titles] - - text_units_df = text_units_df.assign(edges=all_edges) + text_units_df["edges"] = text_units_df["title"].apply( + lambda x: list(combinations(x, 2)) + ) edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - edge_df[["source", "target"]] = edge_df["edges"].to_list() - edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) - edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) - edge_df = edge_df.drop(columns=["source", "target"]).rename(columns={"min_source": "source", "max_target": "target"}) - + edge_df["source"] = edge_df["edges"].apply( + lambda x: x[0] if isinstance(x, tuple) else None + ) + edge_df["target"] = edge_df["edges"].apply( + lambda x: x[1] if isinstance(x, 
tuple) else None + ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) + # make sure source is always smaller than target + edge_df["source"], edge_df["target"] = zip( + *edge_df.apply( + lambda x: (x["source"], x["target"]) + if x["source"] < x["target"] + else (x["target"], x["source"]), + axis=1, + ), + strict=False, + ) # group by source and target, count the number of text units grouped_edge_df = ( @@ -121,10 +130,7 @@ def _extract_edges( ) grouped_edge_df = grouped_edge_df.rename(columns={"text_unit_id": "text_unit_ids"}) grouped_edge_df["weight"] = grouped_edge_df["text_unit_ids"].apply(len) - - grouped_edge_df = grouped_edge_df.loc[ - :, ["source", "target", "weight", "text_unit_ids"] - ] + grouped_edge_df = grouped_edge_df[["source", "target", "weight", "text_unit_ids"]] if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From 0f54cb109bc881816853c29e7e4a1bfec9b166d3 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Thu, 24 Apr 2025 15:39:57 -0700 Subject: [PATCH 11/15] Add perf fixes back in --- .../build_noun_graph/build_noun_graph.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index e9176ab1ae..d927d4861e 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,6 +4,7 @@ """Graph extraction using NLP.""" from itertools import combinations +from typing import Any import numpy as np import pandas as pd @@ -31,7 +32,6 @@ async def build_noun_graph( text_units, text_analyzer, num_threads=num_threads, cache=cache ) edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights) - return (nodes_df, edges_df) @@ -95,35 +95,28 @@ def _extract_edges( """ text_units_df = nodes_df.explode("text_unit_ids") text_units_df = text_units_df.rename(columns={"text_unit_ids": "text_unit_id"}) + text_units_df = ( - text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index() + text_units_df.groupby("text_unit_id") + .agg({"title": lambda x: list(x) if len(x) > 1 else np.nan}) + .reset_index() ) - - text_units_df["edges"] = text_units_df["title"].apply( - lambda x: list(combinations(x, 2)) + text_units_df = text_units_df.dropna() + titles = text_units_df["title"].tolist() + all_edges: Any = [list(combinations(t, 2)) for t in titles] + + text_units_df = text_units_df.assign(edges=all_edges) + edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]] + + edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) + edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) + edge_df = edge_df.drop(columns=["source", "target"]).rename( + columns={"min_source": "source", "max_target": "target"} ) - edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]] - - edge_df["source"] = edge_df["edges"].apply( - lambda x: x[0] if isinstance(x, tuple) else None - ) - edge_df["target"] = edge_df["edges"].apply( - lambda x: x[1] if isinstance(x, tuple) else None - ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] edge_df = edge_df.drop(columns=["edges"]) - # make sure source is always smaller than target - edge_df["source"], edge_df["target"] = zip( - 
*edge_df.apply( - lambda x: (x["source"], x["target"]) - if x["source"] < x["target"] - else (x["target"], x["source"]), - axis=1, - ), - strict=False, - ) - # group by source and target, count the number of text units grouped_edge_df = ( edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() From efee425d4147542fbc9c8e6dd8f2f961064dfa8a Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Thu, 24 Apr 2025 16:14:04 -0700 Subject: [PATCH 12/15] Format/types --- .../operations/build_noun_graph/build_noun_graph.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index d927d4861e..5036ee9dc6 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -4,7 +4,6 @@ """Graph extraction using NLP.""" from itertools import combinations -from typing import Any import numpy as np import pandas as pd @@ -103,16 +102,16 @@ def _extract_edges( ) text_units_df = text_units_df.dropna() titles = text_units_df["title"].tolist() - all_edges: Any = [list(combinations(t, 2)) for t in titles] + all_edges: list[list[tuple[str, str]]] = [list(combinations(t, 2)) for t in titles] - text_units_df = text_units_df.assign(edges=all_edges) + text_units_df = text_units_df.assign(edges=all_edges) # type: ignore edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]] - edge_df[["source", "target"]] = edge_df["edges"].to_list() + edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list() edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) edge_df = edge_df.drop(columns=["source", "target"]).rename( - columns={"min_source": "source", "max_target": "target"} + columns={"min_source": "source", "max_target": "target"} # type: ignore ) edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] @@ -123,7 +122,9 @@ def _extract_edges( ) grouped_edge_df = grouped_edge_df.rename(columns={"text_unit_id": "text_unit_ids"}) grouped_edge_df["weight"] = grouped_edge_df["text_unit_ids"].apply(len) - grouped_edge_df = grouped_edge_df[["source", "target", "weight", "text_unit_ids"]] + grouped_edge_df = grouped_edge_df.loc[ + :, ["source", "target", "weight", "text_unit_ids"] + ] if normalize_edge_weights: # use PMI weight instead of raw weight grouped_edge_df = _calculate_pmi_edge_weights(nodes_df, grouped_edge_df) From 7ab3a0fc0451a098e02fa898a0eef863abb7d8e4 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 13:57:57 -0700 Subject: [PATCH 13/15] Update defaults --- docs/config/yaml.md | 2 +- graphrag/config/defaults.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/config/yaml.md b/docs/config/yaml.md index 791b67341c..00c64e0ad2 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -251,7 +251,7 @@ Parameters for manual graph pruning. This can be used to optimize the modularity - max_node_freq_std **float | None** - The maximum standard deviation of node frequency to allow. - min_node_degree **int** - The minimum node degree to allow. - max_node_degree_std **float | None** - The maximum standard deviation of node degree to allow. -- min_edge_weight_pct **int** - The minimum edge weight percentile to allow. +- min_edge_weight_pct **float** - The minimum edge weight percentile to allow. 
- remove_ego_nodes **bool** - Remove ego nodes. - lcc_only **bool** - Only use largest connected component. diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 5be5925c2c..3f52d0b90e 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -319,8 +319,8 @@ class PruneGraphDefaults: max_node_freq_std: None = None min_node_degree: int = 1 max_node_degree_std: None = None - min_edge_weight_pct: int = 40 - remove_ego_nodes: bool = False + min_edge_weight_pct: float = 40.0 + remove_ego_nodes: bool = True lcc_only: bool = False From fe897108de3f323e7ff1c56887ba1806bdc04dd3 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 14:12:36 -0700 Subject: [PATCH 14/15] Fix source/target ordering --- graphrag/index/operations/graph_to_dataframes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/graphrag/index/operations/graph_to_dataframes.py b/graphrag/index/operations/graph_to_dataframes.py index add48bd3ae..dbc608f640 100644 --- a/graphrag/index/operations/graph_to_dataframes.py +++ b/graphrag/index/operations/graph_to_dataframes.py @@ -21,6 +21,14 @@ def graph_to_dataframes( edges = nx.to_pandas_edgelist(graph) + # we don't deal in directed graphs, but we do need to ensure consistent ordering for df joins + # nx loses the initial ordering + edges["min_source"] = edges[["source", "target"]].min(axis=1) + edges["max_target"] = edges[["source", "target"]].max(axis=1) + edges = edges.drop(columns=["source", "target"]).rename( + columns={"min_source": "source", "max_target": "target"} # type: ignore + ) + if node_columns: nodes = nodes.loc[:, node_columns] From cbc6613d4d7b70cb1cf11a26923750079afd0880 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 25 Apr 2025 14:20:31 -0700 Subject: [PATCH 15/15] Fix test --- tests/verbs/test_prune_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/verbs/test_prune_graph.py b/tests/verbs/test_prune_graph.py index b30546bc92..3801dd27c0 100644 --- a/tests/verbs/test_prune_graph.py +++ b/tests/verbs/test_prune_graph.py @@ -28,4 +28,4 @@ async def test_prune_graph(): nodes_actual = await load_table_from_storage("entities", context.storage) - assert len(nodes_actual) == 21 + assert len(nodes_actual) == 20
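
---

The net effect of patches 03, 05, and 11 is that noun-graph edge weights become a weighted PMI of node co-occurrence — `P(x,y) * log2(P(x,y) / (P(x) * P(y)))` — computed vectorized with numpy instead of a row-wise `DataFrame.apply`. Below is a minimal, illustrative sketch of that calculation only; the proportions are made up and the column names simply mirror those used in `build_noun_graph.py`, so it is not part of the patch series itself.

```python
import numpy as np
import pandas as pd

# Toy co-occurrence data: proportions of text units in which each node and
# each node pair occurs (hypothetical values for illustration only).
edges = pd.DataFrame({
    "source": ["alpha", "alpha"],
    "target": ["beta", "gamma"],
    "prop_weight": [0.10, 0.02],   # P(source, target)
    "source_prop": [0.20, 0.20],   # P(source)
    "target_prop": [0.25, 0.40],   # P(target)
})

# Weighted PMI, vectorized over the whole column as in patch 05:
# P(x,y) * log2(P(x,y) / (P(x) * P(y)))
edges["weight"] = edges["prop_weight"] * np.log2(
    edges["prop_weight"] / (edges["source_prop"] * edges["target_prop"])
)

print(edges[["source", "target", "weight"]])
# alpha-beta co-occurs more than chance (positive weight),
# alpha-gamma less than chance (negative weight).
```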