Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260212211908142161.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Remove NetworkX dependency from graph utilities; move to DataFrame-based implementations in graphrag.graphs package."
}
2 changes: 1 addition & 1 deletion cspell.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ignorePaths:
- pyproject.toml
- entity_extraction.txt
- package.json
- tests/fixtures/
- tests/
- examples_notebooks/inputs/
- docs/examples_notebooks/inputs/
- "*.csv"
Expand Down
1 change: 1 addition & 0 deletions dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ PROPN
Syntatic
ents
INTJ
rels

# Libraries
Langchain
Expand Down
4 changes: 4 additions & 0 deletions packages/graphrag/graphrag/graphs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Graph utilities that operate on DataFrames instead of NetworkX objects."""
43 changes: 43 additions & 0 deletions packages/graphrag/graphrag/graphs/compute_degree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Compute node degree directly from a relationships DataFrame."""

import pandas as pd


def compute_degree(
    relationships: pd.DataFrame,
    source_column: str = "source",
    target_column: str = "target",
) -> pd.DataFrame:
    """Compute the degree of each node from an edge list DataFrame.

    Degree is the number of edges connected to a node (counting both
    source and target appearances). Duplicate edges — including the same
    pair listed in both directions — are counted once, matching the
    behavior of an undirected NetworkX Graph.

    Parameters
    ----------
    relationships : pd.DataFrame
        Edge list with at least source and target columns.
    source_column : str
        Name of the source node column.
    target_column : str
        Name of the target node column.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns ["title", "degree"].
    """
    pair = relationships[[source_column, target_column]]
    # Normalize edge direction so (A,B) and (B,A) are treated as the same
    # undirected edge. Reduce over the two endpoint columns explicitly:
    # deriving _lo/_hi sequentially on a growing frame would include the
    # first helper column in the second reduction (correct only by accident).
    lo = pair.min(axis=1)
    hi = pair.max(axis=1)
    edges = pair.assign(_lo=lo, _hi=hi).drop_duplicates(subset=["_lo", "_hi"])

    source_counts = edges[source_column].value_counts()
    target_counts = edges[target_column].value_counts()
    # A node's degree is its total appearances across both endpoint columns;
    # fill_value=0 covers nodes that only ever appear on one side.
    degree = source_counts.add(target_counts, fill_value=0).astype(int)
    return pd.DataFrame({"title": degree.index, "degree": degree.to_numpy()})
93 changes: 93 additions & 0 deletions packages/graphrag/graphrag/graphs/connected_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Find connected components and the largest connected component from an edge list DataFrame."""

import pandas as pd


def connected_components(
relationships: pd.DataFrame,
source_column: str = "source",
target_column: str = "target",
) -> list[set[str]]:
"""Return all connected components as a list of node-title sets.

Uses union-find on the deduplicated edge list.

Parameters
----------
relationships : pd.DataFrame
Edge list with at least source and target columns.
source_column : str
Name of the source node column.
target_column : str
Name of the target node column.

Returns
-------
list[set[str]]
Each element is a set of node titles belonging to one component,
sorted by descending component size.
"""
edges = relationships.drop_duplicates(subset=[source_column, target_column])

# Initialize every node as its own parent
all_nodes = pd.concat(
[edges[source_column], edges[target_column]], ignore_index=True
).unique()
parent: dict[str, str] = {node: node for node in all_nodes}

def find(x: str) -> str:
while parent[x] != x:
parent[x] = parent[parent[x]] # path compression
x = parent[x]
return x

def union(a: str, b: str) -> None:
ra, rb = find(a), find(b)
if ra != rb:
parent[ra] = rb

# Union each edge
for src, tgt in zip(edges[source_column], edges[target_column], strict=True):
union(src, tgt)

# Group by root
groups: dict[str, set[str]] = {}
for node in parent:
root = find(node)
groups.setdefault(root, set()).add(node)

return sorted(groups.values(), key=len, reverse=True)


def largest_connected_component(
    relationships: pd.DataFrame,
    source_column: str = "source",
    target_column: str = "target",
) -> set[str]:
    """Return the node titles belonging to the largest connected component.

    Parameters
    ----------
    relationships : pd.DataFrame
        Edge list with at least source and target columns.
    source_column : str
        Name of the source node column.
    target_column : str
        Name of the target node column.

    Returns
    -------
    set[str]
        The set of node titles in the largest connected component, or an
        empty set when the edge list yields no components.
    """
    all_components = connected_components(
        relationships,
        source_column=source_column,
        target_column=target_column,
    )
    # connected_components sorts largest-first, so the answer is the head.
    return all_components[0] if all_components else set()
101 changes: 101 additions & 0 deletions packages/graphrag/graphrag/graphs/edge_weights.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Edge weight calculation utilities (PMI, RRF)."""

import numpy as np
import pandas as pd


def calculate_pmi_edge_weights(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    node_name_col: str = "title",
    node_freq_col: str = "frequency",
    edge_weight_col: str = "weight",
    edge_source_col: str = "source",
    edge_target_col: str = "target",
) -> pd.DataFrame:
    """Calculate pointwise mutual information (PMI) edge weights.

    Uses a variant of PMI that accounts for bias towards low-frequency events.
    pmi(x,y) = p(x,y) * log2(p(x,y) / (p(x)*p(y)))
    p(x,y) = edge_weight(x,y) / total_edge_weights
    p(x) = freq_occurrence(x) / total_freq_occurrences.

    Parameters
    ----------
    nodes_df : pd.DataFrame
        Node table with name and frequency columns. Not modified.
    edges_df : pd.DataFrame
        Edge list with source, target and weight columns. Not modified.
    node_name_col : str
        Node name column in nodes_df (join key for the edge endpoints).
    node_freq_col : str
        Node frequency column in nodes_df.
    edge_weight_col : str
        Edge weight column; replaced with the PMI weight in the result.
    edge_source_col : str
        Source node column in edges_df.
    edge_target_col : str
        Target node column in edges_df.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same columns as edges_df and
        edge_weight_col replaced by the PMI weight.
    """
    # Work on copies: the previous implementation added helper columns to
    # the caller's edges_df and assigned into a non-copied node slice
    # (SettingWithCopyWarning).
    node_props = nodes_df[[node_name_col, node_freq_col]].copy()
    edges_df = edges_df.copy()

    total_edge_weights = edges_df[edge_weight_col].sum()
    total_freq_occurrences = node_props[node_freq_col].sum()
    node_props["prop_occurrence"] = (
        node_props[node_freq_col] / total_freq_occurrences
    )
    node_props = node_props[[node_name_col, "prop_occurrence"]]

    edges_df["prop_weight"] = edges_df[edge_weight_col] / total_edge_weights
    # Attach p(source) and p(target) via two left joins on the node table;
    # left joins preserve the edge row order.
    edges_df = (
        edges_df.merge(
            node_props,
            left_on=edge_source_col,
            right_on=node_name_col,
            how="left",
        )
        .drop(columns=[node_name_col])
        .rename(columns={"prop_occurrence": "source_prop"})
    )
    edges_df = (
        edges_df.merge(
            node_props,
            left_on=edge_target_col,
            right_on=node_name_col,
            how="left",
        )
        .drop(columns=[node_name_col])
        .rename(columns={"prop_occurrence": "target_prop"})
    )
    edges_df[edge_weight_col] = edges_df["prop_weight"] * np.log2(
        edges_df["prop_weight"] / (edges_df["source_prop"] * edges_df["target_prop"])
    )

    return edges_df.drop(columns=["prop_weight", "source_prop", "target_prop"])


def calculate_rrf_edge_weights(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    node_name_col: str = "title",
    node_freq_col: str = "freq",
    edge_weight_col: str = "weight",
    edge_source_col: str = "source",
    edge_target_col: str = "target",
    rrf_smoothing_factor: int = 60,
) -> pd.DataFrame:
    """Calculate reciprocal rank fusion (RRF) edge weights.

    Fuses two rankings of each edge — by PMI weight and by raw weight:
    rrf = 1/(k + rank_pmi) + 1/(k + rank_raw), with k the smoothing factor.

    NOTE(review): node_freq_col defaults to "freq" here but "frequency" in
    calculate_pmi_edge_weights — kept for backward compatibility; confirm
    which default callers rely on.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same columns as edges_df and
        edge_weight_col replaced by the fused RRF weight.
    """
    # Rank the RAW weights before the PMI pass overwrites the weight
    # column. (Previously both ranks were computed from the already
    # PMI-transformed column, so they were identical and the fusion was
    # a no-op.) to_numpy() sidesteps index alignment after the merges.
    raw_weight_rank = (
        edges_df[edge_weight_col].rank(method="min", ascending=False).to_numpy()
    )

    edges_df = calculate_pmi_edge_weights(
        nodes_df,
        edges_df,
        node_name_col,
        node_freq_col,
        edge_weight_col,
        edge_source_col,
        edge_target_col,
    )

    pmi_rank = (
        edges_df[edge_weight_col].rank(method="min", ascending=False).to_numpy()
    )
    # Vectorized fusion; row order is preserved by the left merges inside
    # calculate_pmi_edge_weights, so positional alignment is safe.
    edges_df[edge_weight_col] = 1.0 / (rrf_smoothing_factor + pmi_rank) + 1.0 / (
        rrf_smoothing_factor + raw_weight_rank
    )
    return edges_df
54 changes: 54 additions & 0 deletions packages/graphrag/graphrag/graphs/hierarchical_leiden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Hierarchical Leiden clustering on edge lists."""

from typing import Any

import graspologic_native as gn


def hierarchical_leiden(
    edges: list[tuple[str, str, float]],
    max_cluster_size: int = 10,
    random_seed: int | None = 0xDEADBEEF,
) -> list[gn.HierarchicalCluster]:
    """Run hierarchical Leiden clustering over a weighted edge list.

    Thin wrapper around the native graspologic implementation with the
    project's fixed defaults (modularity objective, resolution 1.0,
    randomness 0.001, a single iteration, no starting communities).

    Parameters
    ----------
    edges : list[tuple[str, str, float]]
        Weighted edges as (source, target, weight) triples.
    max_cluster_size : int
        Maximum cluster size passed through to the native implementation.
    random_seed : int | None
        Seed forwarded for reproducible clustering.

    Returns
    -------
    list[gn.HierarchicalCluster]
        The native hierarchical clustering entries.
    """
    return gn.hierarchical_leiden(
        edges=edges,
        starting_communities=None,
        resolution=1.0,
        randomness=0.001,
        iterations=1,
        use_modularity=True,
        max_cluster_size=max_cluster_size,
        seed=random_seed,
    )


def first_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the initial leiden clustering as a dict of node id to community id.

    Returns
    -------
    dict[Any, int]
        The initial leiden algorithm clustering results as a dictionary
        of node id to community id.
    """
    results: dict[Any, int] = {}
    # Only level-0 entries belong to the first clustering pass.
    for entry in hcs:
        if entry.level == 0:
            results[entry.node] = entry.cluster
    return results


def final_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the final leiden clustering as a dict of node id to community id.

    Returns
    -------
    dict[Any, int]
        The last leiden algorithm clustering results as a dictionary
        of node id to community id.
    """
    results: dict[Any, int] = {}
    # Keep only entries the native algorithm flagged as final clusters.
    for entry in hcs:
        if entry.is_final_cluster:
            results[entry.node] = entry.cluster
    return results
Loading