Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260212211908142161.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Remove NetworkX dependency from graph utilities; move to DataFrame-based implementations in graphrag.graphs package."
}
2 changes: 1 addition & 1 deletion cspell.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ignorePaths:
- pyproject.toml
- entity_extraction.txt
- package.json
- tests/fixtures/
- tests/
- examples_notebooks/inputs/
- docs/examples_notebooks/inputs/
- "*.csv"
Expand Down
1 change: 1 addition & 0 deletions dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ PROPN
Syntatic
ents
INTJ
rels

# Libraries
Langchain
Expand Down
4 changes: 4 additions & 0 deletions packages/graphrag/graphrag/graphs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Graph utilities that operate on DataFrames instead of NetworkX objects."""
43 changes: 43 additions & 0 deletions packages/graphrag/graphrag/graphs/compute_degree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Compute node degree directly from a relationships DataFrame."""

import pandas as pd


def compute_degree(
    relationships: pd.DataFrame,
    source_column: str = "source",
    target_column: str = "target",
) -> pd.DataFrame:
    """Compute the degree of each node from an edge list DataFrame.

    Degree is the number of edges connected to a node (counting both
    source and target appearances). Duplicate edges — including the same
    pair listed in both directions — are counted once, matching the
    behavior of an undirected NetworkX Graph.

    Parameters
    ----------
    relationships : pd.DataFrame
        Edge list with at least source and target columns.
    source_column : str
        Name of the source node column.
    target_column : str
        Name of the target node column.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns ["title", "degree"].
    """
    pair = relationships[[source_column, target_column]]
    # Normalize edge direction so (A,B) and (B,A) are treated as the same
    # undirected edge. Reduce over the two endpoint columns explicitly:
    # deriving _lo/_hi sequentially on a growing frame would include the
    # first helper column in the second reduction (correct only by accident).
    lo = pair.min(axis=1)
    hi = pair.max(axis=1)
    edges = pair.assign(_lo=lo, _hi=hi).drop_duplicates(subset=["_lo", "_hi"])

    source_counts = edges[source_column].value_counts()
    target_counts = edges[target_column].value_counts()
    # A node's degree is its total appearances across both endpoint columns;
    # fill_value=0 covers nodes that only ever appear on one side.
    degree = source_counts.add(target_counts, fill_value=0).astype(int)
    return pd.DataFrame({"title": degree.index, "degree": degree.to_numpy()})
93 changes: 93 additions & 0 deletions packages/graphrag/graphrag/graphs/connected_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Find connected components and the largest connected component from an edge list DataFrame."""

import pandas as pd


def connected_components(
relationships: pd.DataFrame,
source_column: str = "source",
target_column: str = "target",
) -> list[set[str]]:
"""Return all connected components as a list of node-title sets.

Uses union-find on the deduplicated edge list.

Parameters
----------
relationships : pd.DataFrame
Edge list with at least source and target columns.
source_column : str
Name of the source node column.
target_column : str
Name of the target node column.

Returns
-------
list[set[str]]
Each element is a set of node titles belonging to one component,
sorted by descending component size.
"""
edges = relationships.drop_duplicates(subset=[source_column, target_column])

# Initialize every node as its own parent
all_nodes = pd.concat(
[edges[source_column], edges[target_column]], ignore_index=True
).unique()
parent: dict[str, str] = {node: node for node in all_nodes}

def find(x: str) -> str:
while parent[x] != x:
parent[x] = parent[parent[x]] # path compression
x = parent[x]
return x

def union(a: str, b: str) -> None:
ra, rb = find(a), find(b)
if ra != rb:
parent[ra] = rb

# Union each edge
for src, tgt in zip(edges[source_column], edges[target_column], strict=True):
union(src, tgt)

# Group by root
groups: dict[str, set[str]] = {}
for node in parent:
root = find(node)
groups.setdefault(root, set()).add(node)

return sorted(groups.values(), key=len, reverse=True)


def largest_connected_component(
    relationships: pd.DataFrame,
    source_column: str = "source",
    target_column: str = "target",
) -> set[str]:
    """Return the node titles belonging to the largest connected component.

    Parameters
    ----------
    relationships : pd.DataFrame
        Edge list with at least source and target columns.
    source_column : str
        Name of the source node column.
    target_column : str
        Name of the target node column.

    Returns
    -------
    set[str]
        The set of node titles in the largest connected component, or an
        empty set when the edge list yields no components.
    """
    all_components = connected_components(
        relationships,
        source_column=source_column,
        target_column=target_column,
    )
    # connected_components sorts largest-first, so the answer is the head.
    return all_components[0] if all_components else set()
101 changes: 101 additions & 0 deletions packages/graphrag/graphrag/graphs/edge_weights.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Edge weight calculation utilities (PMI, RRF)."""

import numpy as np
import pandas as pd


def calculate_pmi_edge_weights(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    node_name_col: str = "title",
    node_freq_col: str = "frequency",
    edge_weight_col: str = "weight",
    edge_source_col: str = "source",
    edge_target_col: str = "target",
) -> pd.DataFrame:
    """Calculate pointwise mutual information (PMI) edge weights.

    Uses a variant of PMI that accounts for bias towards low-frequency events.
    pmi(x,y) = p(x,y) * log2(p(x,y) / (p(x)*p(y)))
    p(x,y) = edge_weight(x,y) / total_edge_weights
    p(x) = freq_occurrence(x) / total_freq_occurrences.

    Parameters
    ----------
    nodes_df : pd.DataFrame
        Node table with name and frequency columns. Not modified.
    edges_df : pd.DataFrame
        Edge list with source, target and weight columns. Not modified.
    node_name_col : str
        Node name column in nodes_df (join key for the edge endpoints).
    node_freq_col : str
        Node frequency column in nodes_df.
    edge_weight_col : str
        Edge weight column; replaced with the PMI weight in the result.
    edge_source_col : str
        Source node column in edges_df.
    edge_target_col : str
        Target node column in edges_df.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same columns as edges_df and
        edge_weight_col replaced by the PMI weight.
    """
    # Work on copies: the previous implementation added helper columns to
    # the caller's edges_df and assigned into a non-copied node slice
    # (SettingWithCopyWarning).
    node_props = nodes_df[[node_name_col, node_freq_col]].copy()
    edges_df = edges_df.copy()

    total_edge_weights = edges_df[edge_weight_col].sum()
    total_freq_occurrences = node_props[node_freq_col].sum()
    node_props["prop_occurrence"] = (
        node_props[node_freq_col] / total_freq_occurrences
    )
    node_props = node_props[[node_name_col, "prop_occurrence"]]

    edges_df["prop_weight"] = edges_df[edge_weight_col] / total_edge_weights
    # Attach p(source) and p(target) via two left joins on the node table;
    # left joins preserve the edge row order.
    edges_df = (
        edges_df.merge(
            node_props,
            left_on=edge_source_col,
            right_on=node_name_col,
            how="left",
        )
        .drop(columns=[node_name_col])
        .rename(columns={"prop_occurrence": "source_prop"})
    )
    edges_df = (
        edges_df.merge(
            node_props,
            left_on=edge_target_col,
            right_on=node_name_col,
            how="left",
        )
        .drop(columns=[node_name_col])
        .rename(columns={"prop_occurrence": "target_prop"})
    )
    edges_df[edge_weight_col] = edges_df["prop_weight"] * np.log2(
        edges_df["prop_weight"] / (edges_df["source_prop"] * edges_df["target_prop"])
    )

    return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"])


def calculate_rrf_edge_weights(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    node_name_col: str = "title",
    node_freq_col: str = "freq",
    edge_weight_col: str = "weight",
    edge_source_col: str = "source",
    edge_target_col: str = "target",
    rrf_smoothing_factor: int = 60,
) -> pd.DataFrame:
    """Calculate reciprocal rank fusion (RRF) edge weights.

    Fuses two rankings of each edge — by PMI weight and by raw weight:
    rrf = 1/(k + rank_pmi) + 1/(k + rank_raw), with k the smoothing factor.

    NOTE(review): node_freq_col defaults to "freq" here but "frequency" in
    calculate_pmi_edge_weights — kept for backward compatibility; confirm
    which default callers rely on.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same columns as edges_df and
        edge_weight_col replaced by the fused RRF weight.
    """
    # Rank the RAW weights before the PMI pass overwrites the weight
    # column. (Previously both ranks were computed from the already
    # PMI-transformed column, so they were identical and the fusion was
    # a no-op.) to_numpy() sidesteps index alignment after the merges.
    raw_weight_rank = (
        edges_df[edge_weight_col].rank(method="min", ascending=False).to_numpy()
    )

    edges_df = calculate_pmi_edge_weights(
        nodes_df,
        edges_df,
        node_name_col,
        node_freq_col,
        edge_weight_col,
        edge_source_col,
        edge_target_col,
    )

    pmi_rank = (
        edges_df[edge_weight_col].rank(method="min", ascending=False).to_numpy()
    )
    # Vectorized fusion; row order is preserved by the left merges inside
    # calculate_pmi_edge_weights, so positional alignment is safe.
    edges_df[edge_weight_col] = 1.0 / (rrf_smoothing_factor + pmi_rank) + 1.0 / (
        rrf_smoothing_factor + raw_weight_rank
    )
    return edges_df
54 changes: 54 additions & 0 deletions packages/graphrag/graphrag/graphs/hierarchical_leiden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Hierarchical Leiden clustering on edge lists."""

from typing import Any

import graspologic_native as gn


def hierarchical_leiden(
    edges: list[tuple[str, str, float]],
    max_cluster_size: int = 10,
    random_seed: int | None = 0xDEADBEEF,
) -> list[gn.HierarchicalCluster]:
    """Run hierarchical Leiden clustering over a weighted edge list.

    Thin wrapper around the native graspologic implementation with the
    project's fixed defaults (modularity objective, resolution 1.0,
    randomness 0.001, a single iteration, no starting communities).

    Parameters
    ----------
    edges : list[tuple[str, str, float]]
        Weighted edges as (source, target, weight) triples.
    max_cluster_size : int
        Maximum cluster size passed through to the native implementation.
    random_seed : int | None
        Seed forwarded for reproducible clustering.

    Returns
    -------
    list[gn.HierarchicalCluster]
        The native hierarchical clustering entries.
    """
    return gn.hierarchical_leiden(
        edges=edges,
        starting_communities=None,
        resolution=1.0,
        randomness=0.001,
        iterations=1,
        use_modularity=True,
        max_cluster_size=max_cluster_size,
        seed=random_seed,
    )


def first_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the initial leiden clustering as a dict of node id to community id.

    Returns
    -------
    dict[Any, int]
        The initial leiden algorithm clustering results as a dictionary
        of node id to community id.
    """
    results: dict[Any, int] = {}
    # Only level-0 entries belong to the first clustering pass.
    for entry in hcs:
        if entry.level == 0:
            results[entry.node] = entry.cluster
    return results


def final_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the final leiden clustering as a dict of node id to community id.

    Returns
    -------
    dict[Any, int]
        The last leiden algorithm clustering results as a dictionary
        of node id to community id.
    """
    results: dict[Any, int] = {}
    # Keep only entries the native algorithm flagged as final clusters.
    for entry in hcs:
        if entry.is_final_cluster:
            results[entry.node] = entry.cluster
    return results
Loading