From 6dc5482f9510f7cd3c8a55e9e37ba6b0f6fcfa49 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 13 Feb 2026 19:44:28 -0300 Subject: [PATCH 1/7] add manual release instructions --- DEVELOPING.md | 2 + RELEASE.md | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 RELEASE.md diff --git a/DEVELOPING.md b/DEVELOPING.md index f4381c0cd6..63bfa9cf90 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -66,6 +66,8 @@ We use [semversioner](https://github.com/raulgomis/semversioner) to automate and uv run semversioner add-change -t patch -d "." ``` +For the full end-to-end release process (version bumping, publishing to PyPI, etc.), see [RELEASE.md](RELEASE.md). + # Azurite Some unit and smoke tests use Azurite to emulate Azure resources. This can be started by running: diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000000..c6432100d9 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,134 @@ +# Release Process + +This document describes the end-to-end process for releasing GraphRAG packages. + +**Note**: The CI publish workflow (python-publish.yml) is currently non-functional. Packages must be published manually as described below. + +## Prerequisites + +- Write access to the `microsoft/graphrag` repository. +- Maintainer or owner role on **all** GraphRAG packages on PyPI. +- `uv` installed locally. +- Dependencies synced: `uv sync --all-packages`. + +## 1. Prepare the release + +From `main` (or a clean branch off `main`), run the release task: + +```sh +uv run poe release +``` + +This does the following automatically: + +1. Runs `semversioner release` — consumes all pending change files and bumps the + version. +2. Regenerates `CHANGELOG.md`. +3. Updates `project.version` in every package's `pyproject.toml`. +4. Updates cross-package dependency version pins (e.g. `graphrag-common==X.Y.Z` + in all packages that depend on it). +5. Runs `uv sync --all-packages` to update the lockfile. + +## 2. 
Open and merge the release PR
+
+1. Commit all changes and push to a branch named `release/<version>` (e.g.
+   `release/3.1.0`).
+2. Open a PR targeting `main`.
+3. CI checks (semver, linting, tests) will run automatically.
+4. Once approved, merge to `main`.
+
+## 3. Create a GitHub release
+
+1. Go to https://github.com/microsoft/graphrag/releases/new.
+2. Create a new tag matching the version (e.g. `v3.1.0`).
+3. Target: `main`.
+4. Title: the version number (e.g. `v3.1.0`).
+5. Use the changelog entry for the release notes, or click "Generate release
+   notes".
+6. Publish the release.
+
+## 4. Build the packages
+
+Pull the latest `main` to get a clean state, then build:
+
+```sh
+git checkout main && git pull
+uv sync --all-packages
+uv run poe build
+```
+
+All wheels and sdists will be in the `dist/` directory.
+
+## 5. Publish to PyPI
+
+Packages must be published in dependency order. The `graphrag` package depends on
+all others, so it goes last.
+
+### Generate PyPI tokens
+
+For each package, go to https://pypi.org/manage/account/ and create a
+project-scoped API token under **API tokens > Add API token**. Select the
+specific project as the scope. Copy the token (starts with `pypi-...`) — it is
+only shown once.
+
+### Publish each package
+
+For each package, export its token as `<token>` and publish. Replace `<version>` with the
+actual version (e.g. `3.1.0`).
+
+```sh
+# 1. graphrag-common (no internal dependencies)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_common-<version>*
+
+# 2. graphrag-storage (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_storage-<version>*
+
+# 3. graphrag-chunking (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_chunking-<version>*
+
+# 4. graphrag-vectors (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_vectors-<version>*
+
+# 5. graphrag-input (depends on common, storage)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_input-<version>*
+
+# 6. 
graphrag-cache (depends on common, storage)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_cache-<version>*
+
+# 7. graphrag-llm (depends on cache, common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_llm-<version>*
+
+# 8. graphrag (depends on ALL of the above — publish last)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag-<version>*
+```
+
+### Verify
+
+After publishing, confirm the new versions are live:
+
+```sh
+pip index versions graphrag
+```
+
+Or visit https://pypi.org/project/graphrag/ and check each sub-package page.
+
+## Package dependency graph (for reference)
+
+```
+graphrag-common (no internal deps)
+├── graphrag-storage (common)
+├── graphrag-chunking (common)
+├── graphrag-vectors (common)
+├── graphrag-input (common, storage)
+├── graphrag-cache (common, storage)
+├── graphrag-llm (cache, common)
+└── graphrag (all of the above)
+```

From cabe5412a9e256f9beb0e7b52e8ae3f4024613d2 Mon Sep 17 00:00:00 2001
From: Dayenne Souza
Date: Thu, 19 Feb 2026 11:46:55 -0300
Subject: [PATCH 2/7] create streaming

---
 .../patch-20260219144634370703.json           |   4 +
 .../index/operations/cluster_graph.py         |  26 +-
 .../index/workflows/create_communities.py     | 115 +++--
 tests/unit/indexing/test_cluster_graph.py     | 295 +++++++++++
 .../unit/indexing/test_create_communities.py  | 488 ++++++++++++++++++
 5 files changed, 879 insertions(+), 49 deletions(-)
 create mode 100644 .semversioner/next-release/patch-20260219144634370703.json
 create mode 100644 tests/unit/indexing/test_cluster_graph.py
 create mode 100644 tests/unit/indexing/test_create_communities.py

diff --git a/.semversioner/next-release/patch-20260219144634370703.json b/.semversioner/next-release/patch-20260219144634370703.json
new file mode 100644
index 0000000000..01e3e75559
--- /dev/null
+++ b/.semversioner/next-release/patch-20260219144634370703.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "create_communities streaming"
+}
diff --git 
a/packages/graphrag/graphrag/index/operations/cluster_graph.py b/packages/graphrag/graphrag/index/operations/cluster_graph.py index 745d6b5148..d14db22aa7 100644 --- a/packages/graphrag/graphrag/index/operations/cluster_graph.py +++ b/packages/graphrag/graphrag/index/operations/cluster_graph.py @@ -4,6 +4,7 @@ """A module containing cluster_graph method definition.""" import logging +from collections import defaultdict import pandas as pd @@ -34,12 +35,9 @@ def cluster_graph( clusters: dict[int, dict[int, list[str]]] = {} for level in levels: - result = {} + result: dict[int, list[str]] = defaultdict(list) clusters[level] = result - for node_id, raw_community_id in node_id_to_community_map[level].items(): - community_id = raw_community_id - if community_id not in result: - result[community_id] = [] + for node_id, community_id in node_id_to_community_map[level].items(): result[community_id].append(node_id) results: Communities = [] @@ -64,15 +62,25 @@ def _compute_leiden_communities( # so we replicate that by normalizing direction then keeping last. 
lo = edge_df[["source", "target"]].min(axis=1) hi = edge_df[["source", "target"]].max(axis=1) - edge_df = edge_df.assign(source=lo, target=hi) - edge_df = edge_df.drop_duplicates(subset=["source", "target"], keep="last") + edge_df["source"] = lo + edge_df["target"] = hi + edge_df.drop_duplicates(subset=["source", "target"], keep="last", inplace=True) if use_lcc: edge_df = stable_lcc(edge_df) + weights = ( + edge_df["weight"].astype(float) + if "weight" in edge_df.columns + else pd.Series(1.0, index=edge_df.index) + ) edge_list: list[tuple[str, str, float]] = sorted( - (str(row["source"]), str(row["target"]), float(row.get("weight", 1.0))) - for _, row in edge_df.iterrows() + zip( + edge_df["source"].astype(str), + edge_df["target"].astype(str), + weights, + strict=True, + ) ) community_mapping = hierarchical_leiden( diff --git a/packages/graphrag/graphrag/index/workflows/create_communities.py b/packages/graphrag/graphrag/index/workflows/create_communities.py index d80b78b1aa..57a7005fbf 100644 --- a/packages/graphrag/graphrag/index/workflows/create_communities.py +++ b/packages/graphrag/graphrag/index/workflows/create_communities.py @@ -5,7 +5,7 @@ import logging from datetime import datetime, timezone -from typing import cast +from typing import Any, cast from uuid import uuid4 import numpy as np @@ -28,34 +28,65 @@ async def run_workflow( """All the steps to transform final communities.""" logger.info("Workflow started: create_communities") reader = DataReader(context.output_table_provider) - entities = await reader.entities() relationships = await reader.relationships() + + title_to_entity_id: dict[str, str] = {} + async with context.output_table_provider.open("entities") as entities_table: + async for row in entities_table: + title_to_entity_id[row["title"]] = row["id"] + max_cluster_size = config.cluster_graph.max_cluster_size use_lcc = config.cluster_graph.use_lcc seed = config.cluster_graph.seed output = create_communities( - entities, + title_to_entity_id, 
relationships, max_cluster_size=max_cluster_size, use_lcc=use_lcc, seed=seed, ) - await context.output_table_provider.write_dataframe("communities", output) + rows = output.to_dict("records") + sample_size = min(5, len(rows)) + sample_rows = rows[:sample_size] + + async with context.output_table_provider.open("communities") as table: + for row in rows: + await table.write(cast("dict[str, Any]", row)) logger.info("Workflow completed: create_communities") - return WorkflowFunctionOutput(result=output) + return WorkflowFunctionOutput(result=sample_rows) def create_communities( - entities: pd.DataFrame, + title_to_entity_id: dict[str, str], relationships: pd.DataFrame, max_cluster_size: int, use_lcc: bool, seed: int | None = None, ) -> pd.DataFrame: - """All the steps to transform final communities.""" + """Build community DataFrame from clustered relationships. + + Args + ---- + title_to_entity_id: dict[str, str] + Mapping of entity title to entity id. + relationships: pd.DataFrame + Relationships DataFrame with source, target, weight, + text_unit_ids columns. + max_cluster_size: int + Maximum cluster size for hierarchical Leiden. + use_lcc: bool + Whether to restrict to the largest connected component. + seed: int | None + Random seed for deterministic clustering. + + Returns + ------- + pd.DataFrame + Communities DataFrame with COMMUNITIES_FINAL_COLUMNS schema. 
+ """ clusters = cluster_graph( relationships, max_cluster_size, @@ -69,47 +100,51 @@ def create_communities( communities["community"] = communities["community"].astype(int) # aggregate entity ids for each community - entity_ids = communities.merge(entities, on="title", how="inner") + entity_map = communities[["community", "title"]].copy() + entity_map["entity_id"] = entity_map["title"].map(title_to_entity_id) entity_ids = ( - entity_ids.groupby("community").agg(entity_ids=("id", list)).reset_index() + entity_map + .dropna(subset=["entity_id"]) + .groupby("community") + .agg(entity_ids=("entity_id", list)) + .reset_index() ) - # aggregate relationships ids for each community - # these are limited to only those where the source and target are in the same community - max_level = communities["level"].max() - all_grouped = pd.DataFrame( - columns=["community", "level", "relationship_ids", "text_unit_ids"] # type: ignore - ) - for level in range(max_level + 1): - communities_at_level = communities.loc[communities["level"] == level] - sources = relationships.merge( - communities_at_level, left_on="source", right_on="title", how="inner" + # aggregate relationship ids per community, limited to + # intra-community edges (source and target in the same community). + # Process one hierarchy level at a time to keep intermediate + # DataFrames small, then concat the grouped results once at the end. 
+ level_results = [] + for level in communities["level"].unique(): + level_comms = communities[communities["level"] == level] + with_source = relationships.merge( + level_comms, left_on="source", right_on="title", how="inner" ) - targets = sources.merge( - communities_at_level, left_on="target", right_on="title", how="inner" + with_both = with_source.merge( + level_comms, left_on="target", right_on="title", how="inner" ) - matched = targets.loc[targets["community_x"] == targets["community_y"]] - text_units = matched.explode("text_unit_ids") + intra = with_both[with_both["community_x"] == with_both["community_y"]] + if intra.empty: + continue grouped = ( - text_units - .groupby(["community_x", "level_x", "parent_x"]) - .agg(relationship_ids=("id", list), text_unit_ids=("text_unit_ids", list)) + intra + .explode("text_unit_ids") + .groupby(["community_x", "parent_x"]) + .agg( + relationship_ids=("id", list), + text_unit_ids=("text_unit_ids", list), + ) .reset_index() ) - grouped.rename( - columns={ - "community_x": "community", - "level_x": "level", - "parent_x": "parent", - }, - inplace=True, - ) - all_grouped = pd.concat([ - all_grouped, - grouped.loc[ - :, ["community", "level", "parent", "relationship_ids", "text_unit_ids"] - ], - ]) + grouped["level"] = level + level_results.append(grouped) + + all_grouped = pd.concat(level_results, ignore_index=True).rename( + columns={ + "community_x": "community", + "parent_x": "parent", + } + ) # deduplicate the lists all_grouped["relationship_ids"] = all_grouped["relationship_ids"].apply( diff --git a/tests/unit/indexing/test_cluster_graph.py b/tests/unit/indexing/test_cluster_graph.py new file mode 100644 index 0000000000..f9f7344530 --- /dev/null +++ b/tests/unit/indexing/test_cluster_graph.py @@ -0,0 +1,295 @@ +# Copyright (C) 2026 Microsoft + +"""Tests for the cluster_graph operation. 
+ +These tests pin down the behavior of cluster_graph and its internal +_compute_leiden_communities function so that refactoring (vectorizing +iterrows, reducing copies, etc.) can be verified against known output. +""" + +import pandas as pd +import pytest +from graphrag.index.operations.cluster_graph import ( + Communities, + cluster_graph, +) + + +def _make_edges( + rows: list[tuple[str, str, float]], +) -> pd.DataFrame: + """Build a minimal relationships DataFrame from (source, target, weight).""" + return pd.DataFrame([{"source": s, "target": t, "weight": w} for s, t, w in rows]) + + +def _node_sets(clusters: Communities) -> list[set[str]]: + """Extract sorted-by-level list of node sets from cluster output.""" + return [set(nodes) for _, _, _, nodes in clusters] + + +# ------------------------------------------------------------------- +# Basic clustering +# ------------------------------------------------------------------- + + +class TestClusterGraphBasic: + """Verify basic clustering on small synthetic graphs.""" + + def test_single_triangle(self): + """A single triangle should produce one community at level 0.""" + edges = _make_edges([("X", "Y", 1.0), ("X", "Z", 1.0), ("Y", "Z", 1.0)]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + assert len(clusters) == 1 + level, cid, parent, nodes = clusters[0] + assert level == 0 + assert parent == -1 + assert set(nodes) == {"X", "Y", "Z"} + + def test_two_disconnected_cliques(self): + """Two disconnected triangles should produce two communities.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + assert len(clusters) == 2 + node_sets = _node_sets(clusters) + assert {"A", "B", "C"} in node_sets + assert {"D", "E", "F"} in node_sets + for level, _, parent, _ in clusters: + assert level == 0 + assert parent == 
-1 + + def test_lcc_filters_to_largest_component(self): + """With use_lcc=True, only the largest connected component is kept.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=True, seed=42) + + assert len(clusters) == 1 + all_nodes = set(clusters[0][3]) + assert len(all_nodes) == 3 + + +# ------------------------------------------------------------------- +# Edge normalization +# ------------------------------------------------------------------- + + +class TestEdgeNormalization: + """Verify that direction normalization and deduplication work.""" + + def test_reversed_edges_produce_same_result(self): + """Reversing all edge directions should yield identical clusters.""" + forward = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + backward = _make_edges([ + ("B", "A", 1.0), + ("C", "A", 1.0), + ("C", "B", 1.0), + ("E", "D", 1.0), + ("F", "D", 1.0), + ("F", "E", 1.0), + ]) + clusters_fwd = cluster_graph( + forward, max_cluster_size=10, use_lcc=False, seed=42 + ) + clusters_bwd = cluster_graph( + backward, max_cluster_size=10, use_lcc=False, seed=42 + ) + + assert _node_sets(clusters_fwd) == _node_sets(clusters_bwd) + + def test_duplicate_edges_are_deduped(self): + """A→B and B→A should be treated as one edge after normalization.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("B", "A", 2.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + assert len(clusters) == 1 + assert set(clusters[0][3]) == {"A", "B", "C"} + + def test_missing_weight_defaults_to_one(self): + """Edges without a weight column should default to weight 1.0.""" + edges = pd.DataFrame({ + "source": ["A", "A", "B"], + "target": ["B", "C", "C"], + }) + clusters = cluster_graph(edges, 
max_cluster_size=10, use_lcc=False, seed=42) + + assert len(clusters) == 1 + assert set(clusters[0][3]) == {"A", "B", "C"} + + +# ------------------------------------------------------------------- +# Determinism +# ------------------------------------------------------------------- + + +class TestDeterminism: + """Verify that seeding produces reproducible results.""" + + def test_same_seed_same_result(self): + """Identical seed should yield identical output.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + c1 = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=123) + c2 = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=123) + + assert c1 == c2 + + def test_does_not_mutate_input(self): + """cluster_graph should not modify the input DataFrame.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ]) + original = edges.copy() + cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + pd.testing.assert_frame_equal(edges, original) + + +# ------------------------------------------------------------------- +# Output structure +# ------------------------------------------------------------------- + + +class TestOutputStructure: + """Verify the shape and types of the Communities output.""" + + def test_output_tuple_structure(self): + """Each entry should be (level, community_id, parent, node_list).""" + edges = _make_edges([("A", "B", 1.0), ("A", "C", 1.0), ("B", "C", 1.0)]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + for entry in clusters: + assert len(entry) == 4 + level, cid, parent, nodes = entry + assert isinstance(level, int) + assert isinstance(cid, int) + assert isinstance(parent, int) + assert isinstance(nodes, list) + assert all(isinstance(n, str) for n in nodes) + + def test_level_zero_has_parent_minus_one(self): + """All level-0 clusters should have 
parent == -1.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + for level, _, parent, _ in clusters: + if level == 0: + assert parent == -1 + + def test_all_nodes_covered_at_each_level(self): + """At any given level, the union of all community nodes should + equal exactly the set of all nodes in the graph for that level.""" + edges = _make_edges([ + ("A", "B", 1.0), + ("A", "C", 1.0), + ("B", "C", 1.0), + ("D", "E", 1.0), + ("D", "F", 1.0), + ("E", "F", 1.0), + ]) + clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) + + levels: dict[int, set[str]] = {} + for level, _, _, nodes in clusters: + levels.setdefault(level, set()).update(nodes) + + all_nodes = {"A", "B", "C", "D", "E", "F"} + for level, covered_nodes in levels.items(): + assert covered_nodes == all_nodes, ( + f"Level {level}: expected {all_nodes}, got {covered_nodes}" + ) + + +# ------------------------------------------------------------------- +# Real test data (golden file regression) +# ------------------------------------------------------------------- + + +class TestClusterGraphRealData: + """Regression tests using the shared test fixture data.""" + + @pytest.fixture + def relationships(self) -> pd.DataFrame: + """Load the test relationships fixture.""" + return pd.read_parquet("tests/verbs/data/relationships.parquet") + + def test_cluster_count(self, relationships: pd.DataFrame): + """Pin the expected number of clusters from the fixture data.""" + clusters = cluster_graph( + relationships, + max_cluster_size=10, + use_lcc=True, + seed=0xDEADBEEF, + ) + assert len(clusters) == 122 + + def test_level_distribution(self, relationships: pd.DataFrame): + """Pin the expected number of clusters per level.""" + clusters = cluster_graph( + relationships, + max_cluster_size=10, + use_lcc=True, + 
seed=0xDEADBEEF, + ) + from collections import Counter + + level_counts = Counter(c[0] for c in clusters) + assert level_counts == {0: 23, 1: 65, 2: 32, 3: 2} + + def test_level_zero_nodes_sample(self, relationships: pd.DataFrame): + """Spot-check a few known nodes in level-0 clusters.""" + clusters = cluster_graph( + relationships, + max_cluster_size=10, + use_lcc=True, + seed=0xDEADBEEF, + ) + level_0 = [c for c in clusters if c[0] == 0] + all_level_0_nodes = set() + for _, _, _, nodes in level_0: + all_level_0_nodes.update(nodes) + + assert "SCROOGE" in all_level_0_nodes + assert "ABRAHAM" in all_level_0_nodes + assert "JACOB MARLEY" in all_level_0_nodes diff --git a/tests/unit/indexing/test_create_communities.py b/tests/unit/indexing/test_create_communities.py new file mode 100644 index 0000000000..1b77e079e8 --- /dev/null +++ b/tests/unit/indexing/test_create_communities.py @@ -0,0 +1,488 @@ +# Copyright (C) 2026 Microsoft + +"""Tests for the create_communities pure function. + +These tests pin down the behavior of the create_communities function +independently of the workflow runner, so that refactoring (vectorizing +the per-level loop, streaming entity reads, streaming writes, etc.) +can be verified against known output. +""" + +import uuid + +import pandas as pd +import pytest +from graphrag.data_model.schemas import COMMUNITIES_FINAL_COLUMNS +from graphrag.index.workflows.create_communities import create_communities + + +def _make_title_to_entity_id( + rows: list[tuple[str, str]], +) -> dict[str, str]: + """Build a title-to-entity-id mapping from (id, title) pairs.""" + return {title: eid for eid, title in rows} + + +def _make_relationships( + rows: list[tuple[str, str, str, float, list[str]]], +) -> pd.DataFrame: + """Build a minimal relationships DataFrame. + + Each row is (id, source, target, weight, text_unit_ids). 
+ """ + return pd.DataFrame([ + { + "id": rid, + "source": src, + "target": tgt, + "weight": w, + "text_unit_ids": tuids, + "human_readable_id": i, + } + for i, (rid, src, tgt, w, tuids) in enumerate(rows) + ]) + + +@pytest.fixture +def two_triangles(): + """Two disconnected triangles: {A,B,C} and {D,E,F}.""" + title_to_entity_id = _make_title_to_entity_id([ + ("e1", "A"), + ("e2", "B"), + ("e3", "C"), + ("e4", "D"), + ("e5", "E"), + ("e6", "F"), + ]) + relationships = _make_relationships([ + ("r1", "A", "B", 1.0, ["t1"]), + ("r2", "A", "C", 1.0, ["t1", "t2"]), + ("r3", "B", "C", 1.0, ["t2"]), + ("r4", "D", "E", 1.0, ["t3"]), + ("r5", "D", "F", 1.0, ["t3", "t4"]), + ("r6", "E", "F", 1.0, ["t4"]), + ]) + return title_to_entity_id, relationships + + +# ------------------------------------------------------------------- +# Column schema +# ------------------------------------------------------------------- + + +class TestOutputSchema: + """Verify the output DataFrame has the expected column schema.""" + + def test_has_all_final_columns(self, two_triangles): + """Output must have exactly the COMMUNITIES_FINAL_COLUMNS.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + assert list(result.columns) == COMMUNITIES_FINAL_COLUMNS + + def test_column_order_matches_schema(self, two_triangles): + """Column order must match the schema constant exactly.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for i, col_name in enumerate(COMMUNITIES_FINAL_COLUMNS): + assert result.columns[i] == col_name + + +# ------------------------------------------------------------------- +# Metadata fields +# ------------------------------------------------------------------- + + +class TestMetadataFields: + """Verify computed metadata 
fields like id, title, size, period.""" + + def test_uuid_ids(self, two_triangles): + """Each community id should be a valid UUID4.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + parsed = uuid.UUID(row["id"]) + assert parsed.version == 4 + + def test_title_format(self, two_triangles): + """Title should be 'Community N' where N is the community id.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + assert row["title"] == f"Community {row['community']}" + + def test_human_readable_id_equals_community(self, two_triangles): + """human_readable_id should equal the community integer id.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + assert (result["human_readable_id"] == result["community"]).all() + + def test_size_equals_entity_count(self, two_triangles): + """size should equal the length of entity_ids.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + assert row["size"] == len(row["entity_ids"]) + + def test_period_is_iso_date(self, two_triangles): + """period should be a valid ISO date string.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + from datetime import date + + for _, row in result.iterrows(): + date.fromisoformat(row["period"]) + + +# ------------------------------------------------------------------- 
+# Entity aggregation +# ------------------------------------------------------------------- + + +class TestEntityAggregation: + """Verify that entity_ids are correctly aggregated per community.""" + + def test_entity_ids_per_community(self, two_triangles): + """Each community should contain exactly the entities matching + its cluster nodes.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + comm_0 = result[result["community"] == 0].iloc[0] + comm_1 = result[result["community"] == 1].iloc[0] + + assert sorted(comm_0["entity_ids"]) == ["e1", "e2", "e3"] + assert sorted(comm_1["entity_ids"]) == ["e4", "e5", "e6"] + + def test_entity_ids_are_lists(self, two_triangles): + """entity_ids should be Python lists, not numpy arrays.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + assert isinstance(row["entity_ids"], list) + + +# ------------------------------------------------------------------- +# Relationship and text_unit aggregation +# ------------------------------------------------------------------- + + +class TestRelationshipAggregation: + """Verify that relationship_ids and text_unit_ids are correctly + aggregated (intra-community only) and deduplicated.""" + + def test_relationship_ids_per_community(self, two_triangles): + """Each community should only include relationships where both + endpoints are in the same community.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + comm_0 = result[result["community"] == 0].iloc[0] + comm_1 = result[result["community"] == 1].iloc[0] + + assert sorted(comm_0["relationship_ids"]) == ["r1", "r2", "r3"] + 
assert sorted(comm_1["relationship_ids"]) == ["r4", "r5", "r6"] + + def test_text_unit_ids_per_community(self, two_triangles): + """text_unit_ids should be the deduplicated union of text units + from the community's intra-community relationships.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + comm_0 = result[result["community"] == 0].iloc[0] + comm_1 = result[result["community"] == 1].iloc[0] + + assert sorted(comm_0["text_unit_ids"]) == ["t1", "t2"] + assert sorted(comm_1["text_unit_ids"]) == ["t3", "t4"] + + def test_lists_are_sorted_and_deduplicated(self, two_triangles): + """relationship_ids and text_unit_ids should be sorted with + no duplicates.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + assert row["relationship_ids"] == sorted(set(row["relationship_ids"])) + assert row["text_unit_ids"] == sorted(set(row["text_unit_ids"])) + + def test_cross_community_relationships_excluded(self): + """A relationship spanning two communities must not appear in + either community's relationship_ids.""" + title_to_entity_id = _make_title_to_entity_id([ + ("e1", "A"), + ("e2", "B"), + ("e3", "C"), + ("e4", "D"), + ("e5", "E"), + ("e6", "F"), + ]) + relationships = _make_relationships([ + ("r1", "A", "B", 1.0, ["t1"]), + ("r2", "A", "C", 1.0, ["t1"]), + ("r3", "B", "C", 1.0, ["t1"]), + ("r_cross", "C", "D", 0.1, ["t_cross"]), + ("r4", "D", "E", 1.0, ["t2"]), + ("r5", "D", "F", 1.0, ["t2"]), + ("r6", "E", "F", 1.0, ["t2"]), + ]) + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + all_rel_ids = [] + for _, row in result.iterrows(): + all_rel_ids.extend(row["relationship_ids"]) + assert "r_cross" not in 
all_rel_ids + assert "t_cross" not in [ + tid for _, row in result.iterrows() for tid in row["text_unit_ids"] + ] + + +# ------------------------------------------------------------------- +# Parent / children tree +# ------------------------------------------------------------------- + + +class TestParentChildTree: + """Verify the parent-child tree structure is consistent.""" + + def test_level_zero_parent_is_minus_one(self, two_triangles): + """All level-0 communities should have parent == -1.""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + lvl0 = result[result["level"] == 0] + assert (lvl0["parent"] == -1).all() + + def test_leaf_communities_have_empty_children(self, two_triangles): + """Communities that are nobody's parent should have children=[].""" + title_to_entity_id, relationships = two_triangles + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + for _, row in result.iterrows(): + children = row["children"] + if isinstance(children, list) and len(children) == 0: + child_rows = result[result["parent"] == row["community"]] + assert len(child_rows) == 0 + + def test_parent_child_bidirectional_consistency_real_data(self): + """For real test data: if community X lists Y as child, + then Y's parent must be X.""" + entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") + title_to_entity_id = dict(zip(entities_df["title"], entities_df["id"])) + relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") + result = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=True, + seed=0xDEADBEEF, + ) + for _, row in result.iterrows(): + children = row["children"] + if hasattr(children, "__len__") and len(children) > 0: + for child_id in children: + child_row = result[result["community"] == child_id] 
+ assert len(child_row) == 1, ( + f"Child {child_id} not found or duplicated" + ) + assert child_row.iloc[0]["parent"] == row["community"] + + +# ------------------------------------------------------------------- +# LCC filtering +# ------------------------------------------------------------------- + + +class TestLccFiltering: + """Verify LCC filtering interaction with create_communities.""" + + def test_lcc_reduces_community_count(self): + """With use_lcc=True and two disconnected components, only the + larger component's communities should appear.""" + title_to_entity_id = _make_title_to_entity_id([ + ("e1", "A"), + ("e2", "B"), + ("e3", "C"), + ("e4", "D"), + ("e5", "E"), + ("e6", "F"), + ]) + relationships = _make_relationships([ + ("r1", "A", "B", 1.0, ["t1"]), + ("r2", "A", "C", 1.0, ["t1"]), + ("r3", "B", "C", 1.0, ["t1"]), + ("r4", "D", "E", 1.0, ["t2"]), + ("r5", "D", "F", 1.0, ["t2"]), + ("r6", "E", "F", 1.0, ["t2"]), + ]) + result_no_lcc = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=False, + seed=42, + ) + result_lcc = create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=True, + seed=42, + ) + assert len(result_lcc) < len(result_no_lcc) + assert len(result_lcc) == 1 + + +# ------------------------------------------------------------------- +# Golden file regression (real test data) +# ------------------------------------------------------------------- + + +class TestRealDataRegression: + """Regression tests using the shared test fixture data. + + These pin exact values so any behavioral change during refactoring + is caught immediately. 
+ """ + + @pytest.fixture + def real_result(self) -> pd.DataFrame: + """Run create_communities on the test fixture data.""" + entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") + title_to_entity_id = dict(zip(entities_df["title"], entities_df["id"])) + relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") + return create_communities( + title_to_entity_id, + relationships, + max_cluster_size=10, + use_lcc=True, + seed=0xDEADBEEF, + ) + + def test_row_count(self, real_result: pd.DataFrame): + """Pin the expected number of communities.""" + assert len(real_result) == 122 + + def test_level_distribution(self, real_result: pd.DataFrame): + """Pin the expected number of communities per level.""" + from collections import Counter + + counts = Counter(real_result["level"].tolist()) + assert counts == {0: 23, 1: 65, 2: 32, 3: 2} + + def test_values_match_golden_file(self, real_result: pd.DataFrame): + """The output should match the golden Parquet file for all + columns except id (UUID) and period (date-dependent).""" + expected = pd.read_parquet("tests/verbs/data/communities.parquet") + + assert len(real_result) == len(expected) + + skip_columns = {"id", "period", "children"} + for col in COMMUNITIES_FINAL_COLUMNS: + if col in skip_columns: + continue + pd.testing.assert_series_equal( + real_result[col], + expected[col], + check_dtype=False, + check_index=False, + check_names=False, + obj=f"Column '{col}'", + ) + + # children requires special handling: the golden file stores + # numpy arrays, the function may return lists or arrays + for i in range(len(real_result)): + actual_children = list(real_result.iloc[i]["children"]) + expected_children = list(expected.iloc[i]["children"]) + assert actual_children == expected_children, ( + f"Row {i} children mismatch: {actual_children} != {expected_children}" + ) + + def test_communities_with_children(self, real_result: pd.DataFrame): + """Pin the expected number of communities that have 
children.""" + has_children = real_result["children"].apply( + lambda x: hasattr(x, "__len__") and len(x) > 0 + ) + assert has_children.sum() == 24 From 76a72f6d7683519a2c60df59a319e8d0b0bf982c Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Feb 2026 11:53:27 -0300 Subject: [PATCH 3/7] fix deleted file --- RELEASE.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index e69de29bb2..0000000000 From 82c511196fca8612174d71c4c260f1a2f67adaa3 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Feb 2026 11:55:23 -0300 Subject: [PATCH 4/7] addd file --- RELEASE.md | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000000..b078ca3f16 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,187 @@ +# Release Process + +This document describes the end-to-end process for releasing GraphRAG packages. + +**Note**: The CI publish workflow (python-publish.yml) is currently non-functional. +Packages must be published manually as described below. + +## Prerequisites + +- Write access to the `microsoft/graphrag` repository. +- Maintainer or owner role on **all** GraphRAG packages on PyPI. +- A project-scoped PyPI API token (see [PyPI token setup](#generate-pypi-tokens)). +- `uv` installed locally. +- Dependencies synced: `uv sync --all-packages`. + +## 1. Prepare the release + +Pull the latest changes on `main` and run the release task: + +```sh +git checkout main +git pull +uv run poe release +``` + +This runs the following steps automatically: + +1. `semversioner release` -- consumes all pending change files and bumps the + version. +2. Regenerates `CHANGELOG.md`. +3. Updates `project.version` in every package's `pyproject.toml`. +4. Updates cross-package dependency version pins (e.g. 
`graphrag-common==X.Y.Z` + in all packages that depend on it). +5. Runs `uv sync --all-packages` to update the lockfile. + +### Cutting a release on Windows + +`uv run poe release` does not work on Windows unless you are using WSL. Poe +defaults to `cmd.exe` and there is no straightforward way to force it to use +PowerShell. Run each step manually in PowerShell instead: + +```powershell +uv run semversioner release +uv run semversioner changelog > CHANGELOG.md + +$version = uv run semversioner current-version +if (-not $version) { Write-Error "Failed to get version"; exit 1 } + +uv run update-toml update --file packages/graphrag/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-common/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-chunking/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-input/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-storage/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-cache/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-vectors/pyproject.toml --path project.version --value $version +uv run update-toml update --file packages/graphrag-llm/pyproject.toml --path project.version --value $version + +uv run python -m scripts.update_workspace_dependency_versions +uv sync --all-packages +``` + +## 2. Open and merge the release PR + +Check `CHANGELOG.md` or any package's `pyproject.toml` to find the new version, +then move the changes to a release branch: + +```sh +git switch -c release/vVERSION +git add . +git commit -m "Release vVERSION" +git tag -a vVERSION -m "Release vVERSION" +git push origin release/vVERSION -u +``` + +Open a PR targeting `main`. 
CI checks (semver, linting, tests) will run
+automatically. Once approved, merge to `main`.
+
+## 3. Publish to PyPI
+
+Once the PR is merged, switch back to `main`, build, and publish.
+
+You will need a PyPI API token set as the `UV_PUBLISH_TOKEN` environment
+variable. See the
+[uv docs on publishing](https://docs.astral.sh/uv/guides/package/#building-and-publishing-a-package)
+for details.
+
+### Generate PyPI tokens
+
+For each package, go to <https://pypi.org/manage/account/> and create a
+project-scoped API token under **API tokens > Add API token**. Select the
+specific project as the scope. Copy the token (starts with `pypi-...`) -- it is
+only shown once.
+
+If you want to publish all packages with one token and a single `uv publish`
+call, create an **account-scoped** token instead of a project-scoped one.
+
+### Build the packages
+
+```sh
+git checkout main
+git pull
+uv sync --all-packages
+uv run poe build
+```
+
+All wheels and source distributions will be placed in the `dist/` directory.
+
+### Publish all packages at once
+
+This requires an **account-scoped** PyPI token:
+
+```sh
+export UV_PUBLISH_TOKEN="pypi-..."
+uv publish
+```
+
+### Publishing packages individually
+
+If you need to publish packages one at a time (e.g. with separate per-package
+tokens), publish them in dependency order. The `graphrag` meta-package depends on
+all others, so it goes last.
+
+```sh
+# 1. graphrag-common (no internal dependencies)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_common-*
+
+# 2. graphrag-storage (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_storage-*
+
+# 3. graphrag-chunking (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_chunking-*
+
+# 4. graphrag-vectors (depends on common)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_vectors-*
+
+# 5. graphrag-input (depends on common, storage)
+export UV_PUBLISH_TOKEN="pypi-<token>"
+uv publish dist/graphrag_input-*
+
+# 6. 
graphrag-cache (depends on common, storage) +export UV_PUBLISH_TOKEN="pypi-" +uv publish dist/graphrag_cache-* + +# 7. graphrag-llm (depends on cache, common) +export UV_PUBLISH_TOKEN="pypi-" +uv publish dist/graphrag_llm-* + +# 8. graphrag (depends on ALL of the above -- publish last) +export UV_PUBLISH_TOKEN="pypi-" +uv publish dist/graphrag-* +``` + +### Verify + +After publishing, confirm the new versions are live: + +```sh +pip index versions graphrag +``` + +Or visit and check each sub-package page. + +## 4. Create a GitHub release + +1. Go to . +2. Select the tag you pushed earlier (e.g. `v3.1.0`). Target: `main`. +3. Title: the version number (e.g. `v3.1.0`). +4. Use a previous release as a template for the release notes, or click + "Generate release notes". +5. Publish the release. + +## Package dependency graph (for reference) + +``` +graphrag-common (no internal deps) +├── graphrag-storage (common) +├── graphrag-chunking (common) +├── graphrag-vectors (common) +├── graphrag-input (common, storage) +├── graphrag-cache (common, storage) +├── graphrag-llm (cache, common) +└── graphrag (all of the above) +``` \ No newline at end of file From 360c9c5d77eb6c97b6f0623d434d42fd57c1d1c0 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Feb 2026 12:01:54 -0300 Subject: [PATCH 5/7] fix check --- tests/unit/indexing/test_cluster_graph.py | 2 +- tests/unit/indexing/test_create_communities.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/unit/indexing/test_cluster_graph.py b/tests/unit/indexing/test_cluster_graph.py index f9f7344530..cf717bc35a 100644 --- a/tests/unit/indexing/test_cluster_graph.py +++ b/tests/unit/indexing/test_cluster_graph.py @@ -41,7 +41,7 @@ def test_single_triangle(self): clusters = cluster_graph(edges, max_cluster_size=10, use_lcc=False, seed=42) assert len(clusters) == 1 - level, cid, parent, nodes = clusters[0] + level, _cid, parent, nodes = clusters[0] assert level == 0 assert parent == -1 assert 
set(nodes) == {"X", "Y", "Z"} diff --git a/tests/unit/indexing/test_create_communities.py b/tests/unit/indexing/test_create_communities.py index 1b77e079e8..d20b7ea542 100644 --- a/tests/unit/indexing/test_create_communities.py +++ b/tests/unit/indexing/test_create_communities.py @@ -349,7 +349,9 @@ def test_parent_child_bidirectional_consistency_real_data(self): """For real test data: if community X lists Y as child, then Y's parent must be X.""" entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") - title_to_entity_id = dict(zip(entities_df["title"], entities_df["id"])) + title_to_entity_id = dict( + zip(entities_df["title"], entities_df["id"], strict=False) + ) relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") result = create_communities( title_to_entity_id, @@ -430,7 +432,9 @@ class TestRealDataRegression: def real_result(self) -> pd.DataFrame: """Run create_communities on the test fixture data.""" entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") - title_to_entity_id = dict(zip(entities_df["title"], entities_df["id"])) + title_to_entity_id = dict( + zip(entities_df["title"], entities_df["id"], strict=False) + ) relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") return create_communities( title_to_entity_id, From 17fbdc775499422d2e63b002524a8b043f856b07 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Feb 2026 13:54:03 -0300 Subject: [PATCH 6/7] add consistency --- .../index/workflows/create_communities.py | 66 ++++--- .../unit/indexing/test_create_communities.py | 171 +++++++++++++----- 2 files changed, 171 insertions(+), 66 deletions(-) diff --git a/packages/graphrag/graphrag/index/workflows/create_communities.py b/packages/graphrag/graphrag/index/workflows/create_communities.py index 57a7005fbf..33d506fd46 100644 --- a/packages/graphrag/graphrag/index/workflows/create_communities.py +++ b/packages/graphrag/graphrag/index/workflows/create_communities.py @@ -10,6 +10,7 @@ 
import numpy as np import pandas as pd +from graphrag_storage.tables.table import Table from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.data_model.data_reader import DataReader @@ -39,37 +40,34 @@ async def run_workflow( use_lcc = config.cluster_graph.use_lcc seed = config.cluster_graph.seed - output = create_communities( - title_to_entity_id, - relationships, - max_cluster_size=max_cluster_size, - use_lcc=use_lcc, - seed=seed, - ) - - rows = output.to_dict("records") - sample_size = min(5, len(rows)) - sample_rows = rows[:sample_size] - - async with context.output_table_provider.open("communities") as table: - for row in rows: - await table.write(cast("dict[str, Any]", row)) + async with context.output_table_provider.open("communities") as communities_table: + sample_rows = await create_communities( + communities_table, + title_to_entity_id, + relationships, + max_cluster_size=max_cluster_size, + use_lcc=use_lcc, + seed=seed, + ) logger.info("Workflow completed: create_communities") return WorkflowFunctionOutput(result=sample_rows) -def create_communities( +async def create_communities( + communities_table: Table, title_to_entity_id: dict[str, str], relationships: pd.DataFrame, max_cluster_size: int, use_lcc: bool, seed: int | None = None, -) -> pd.DataFrame: - """Build community DataFrame from clustered relationships. +) -> list[dict[str, Any]]: + """Build communities from clustered relationships and stream rows to the table. Args ---- + communities_table: Table + Output table to write community rows to. title_to_entity_id: dict[str, str] Mapping of entity title to entity id. relationships: pd.DataFrame @@ -84,8 +82,8 @@ def create_communities( Returns ------- - pd.DataFrame - Communities DataFrame with COMMUNITIES_FINAL_COLUMNS schema. + list[dict[str, Any]] + Sample of up to 5 community rows for logging. 
""" clusters = cluster_graph( relationships, @@ -181,7 +179,27 @@ def create_communities( final_communities["period"] = datetime.now(timezone.utc).date().isoformat() final_communities["size"] = final_communities.loc[:, "entity_ids"].apply(len) - return final_communities.loc[ - :, - COMMUNITIES_FINAL_COLUMNS, - ] + output = final_communities.loc[:, COMMUNITIES_FINAL_COLUMNS] + rows = output.to_dict("records") + sample_rows: list[dict[str, Any]] = [] + for row in rows: + row = _sanitize_row(row) + await communities_table.write(row) + if len(sample_rows) < 5: + sample_rows.append(row) + return sample_rows + + +def _sanitize_row(row: dict[str, Any]) -> dict[str, Any]: + """Convert numpy types to native Python types for table serialization.""" + sanitized = {} + for key, value in row.items(): + if isinstance(value, np.ndarray): + sanitized[key] = value.tolist() + elif isinstance(value, np.integer): + sanitized[key] = int(value) + elif isinstance(value, np.floating): + sanitized[key] = float(value) + else: + sanitized[key] = value + return sanitized diff --git a/tests/unit/indexing/test_create_communities.py b/tests/unit/indexing/test_create_communities.py index d20b7ea542..904f9c93bb 100644 --- a/tests/unit/indexing/test_create_communities.py +++ b/tests/unit/indexing/test_create_communities.py @@ -9,11 +9,39 @@ """ import uuid +from typing import Any +import numpy as np import pandas as pd import pytest from graphrag.data_model.schemas import COMMUNITIES_FINAL_COLUMNS -from graphrag.index.workflows.create_communities import create_communities +from graphrag.index.workflows.create_communities import ( + _sanitize_row, + create_communities, +) +from graphrag_storage.tables.csv_table import CSVTable + + +class FakeTable(CSVTable): + """In-memory table that collects written rows for test assertions.""" + + def __init__(self) -> None: + self.rows: list[dict[str, Any]] = [] + + async def write(self, row: dict[str, Any]) -> None: + """Append a row to the in-memory store.""" + 
self.rows.append(row) + + +async def _run_create_communities( + title_to_entity_id: dict[str, str], + relationships: pd.DataFrame, + **kwargs: Any, +) -> pd.DataFrame: + """Helper that runs create_communities with a FakeTable and returns all rows as a DataFrame.""" + table = FakeTable() + await create_communities(table, title_to_entity_id, relationships, **kwargs) + return pd.DataFrame(table.rows) def _make_title_to_entity_id( @@ -73,10 +101,10 @@ def two_triangles(): class TestOutputSchema: """Verify the output DataFrame has the expected column schema.""" - def test_has_all_final_columns(self, two_triangles): + async def test_has_all_final_columns(self, two_triangles): """Output must have exactly the COMMUNITIES_FINAL_COLUMNS.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -85,10 +113,10 @@ def test_has_all_final_columns(self, two_triangles): ) assert list(result.columns) == COMMUNITIES_FINAL_COLUMNS - def test_column_order_matches_schema(self, two_triangles): + async def test_column_order_matches_schema(self, two_triangles): """Column order must match the schema constant exactly.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -107,10 +135,10 @@ def test_column_order_matches_schema(self, two_triangles): class TestMetadataFields: """Verify computed metadata fields like id, title, size, period.""" - def test_uuid_ids(self, two_triangles): + async def test_uuid_ids(self, two_triangles): """Each community id should be a valid UUID4.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -121,10 +149,10 @@ def test_uuid_ids(self, two_triangles): parsed = 
uuid.UUID(row["id"]) assert parsed.version == 4 - def test_title_format(self, two_triangles): + async def test_title_format(self, two_triangles): """Title should be 'Community N' where N is the community id.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -134,10 +162,10 @@ def test_title_format(self, two_triangles): for _, row in result.iterrows(): assert row["title"] == f"Community {row['community']}" - def test_human_readable_id_equals_community(self, two_triangles): + async def test_human_readable_id_equals_community(self, two_triangles): """human_readable_id should equal the community integer id.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -146,10 +174,10 @@ def test_human_readable_id_equals_community(self, two_triangles): ) assert (result["human_readable_id"] == result["community"]).all() - def test_size_equals_entity_count(self, two_triangles): + async def test_size_equals_entity_count(self, two_triangles): """size should equal the length of entity_ids.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -159,10 +187,10 @@ def test_size_equals_entity_count(self, two_triangles): for _, row in result.iterrows(): assert row["size"] == len(row["entity_ids"]) - def test_period_is_iso_date(self, two_triangles): + async def test_period_is_iso_date(self, two_triangles): """period should be a valid ISO date string.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -183,11 +211,11 @@ def test_period_is_iso_date(self, two_triangles): 
class TestEntityAggregation: """Verify that entity_ids are correctly aggregated per community.""" - def test_entity_ids_per_community(self, two_triangles): + async def test_entity_ids_per_community(self, two_triangles): """Each community should contain exactly the entities matching its cluster nodes.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -200,10 +228,10 @@ def test_entity_ids_per_community(self, two_triangles): assert sorted(comm_0["entity_ids"]) == ["e1", "e2", "e3"] assert sorted(comm_1["entity_ids"]) == ["e4", "e5", "e6"] - def test_entity_ids_are_lists(self, two_triangles): + async def test_entity_ids_are_lists(self, two_triangles): """entity_ids should be Python lists, not numpy arrays.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -223,11 +251,11 @@ class TestRelationshipAggregation: """Verify that relationship_ids and text_unit_ids are correctly aggregated (intra-community only) and deduplicated.""" - def test_relationship_ids_per_community(self, two_triangles): + async def test_relationship_ids_per_community(self, two_triangles): """Each community should only include relationships where both endpoints are in the same community.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -240,11 +268,11 @@ def test_relationship_ids_per_community(self, two_triangles): assert sorted(comm_0["relationship_ids"]) == ["r1", "r2", "r3"] assert sorted(comm_1["relationship_ids"]) == ["r4", "r5", "r6"] - def test_text_unit_ids_per_community(self, two_triangles): + async def test_text_unit_ids_per_community(self, two_triangles): """text_unit_ids should be the 
deduplicated union of text units from the community's intra-community relationships.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -257,11 +285,11 @@ def test_text_unit_ids_per_community(self, two_triangles): assert sorted(comm_0["text_unit_ids"]) == ["t1", "t2"] assert sorted(comm_1["text_unit_ids"]) == ["t3", "t4"] - def test_lists_are_sorted_and_deduplicated(self, two_triangles): + async def test_lists_are_sorted_and_deduplicated(self, two_triangles): """relationship_ids and text_unit_ids should be sorted with no duplicates.""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -272,7 +300,7 @@ def test_lists_are_sorted_and_deduplicated(self, two_triangles): assert row["relationship_ids"] == sorted(set(row["relationship_ids"])) assert row["text_unit_ids"] == sorted(set(row["text_unit_ids"])) - def test_cross_community_relationships_excluded(self): + async def test_cross_community_relationships_excluded(self): """A relationship spanning two communities must not appear in either community's relationship_ids.""" title_to_entity_id = _make_title_to_entity_id([ @@ -292,7 +320,7 @@ def test_cross_community_relationships_excluded(self): ("r5", "D", "F", 1.0, ["t2"]), ("r6", "E", "F", 1.0, ["t2"]), ]) - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -316,10 +344,10 @@ def test_cross_community_relationships_excluded(self): class TestParentChildTree: """Verify the parent-child tree structure is consistent.""" - def test_level_zero_parent_is_minus_one(self, two_triangles): + async def test_level_zero_parent_is_minus_one(self, two_triangles): """All level-0 communities should have parent == -1.""" title_to_entity_id, 
relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -329,10 +357,10 @@ def test_level_zero_parent_is_minus_one(self, two_triangles): lvl0 = result[result["level"] == 0] assert (lvl0["parent"] == -1).all() - def test_leaf_communities_have_empty_children(self, two_triangles): + async def test_leaf_communities_have_empty_children(self, two_triangles): """Communities that are nobody's parent should have children=[].""" title_to_entity_id, relationships = two_triangles - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -345,7 +373,7 @@ def test_leaf_communities_have_empty_children(self, two_triangles): child_rows = result[result["parent"] == row["community"]] assert len(child_rows) == 0 - def test_parent_child_bidirectional_consistency_real_data(self): + async def test_parent_child_bidirectional_consistency_real_data(self): """For real test data: if community X lists Y as child, then Y's parent must be X.""" entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") @@ -353,7 +381,7 @@ def test_parent_child_bidirectional_consistency_real_data(self): zip(entities_df["title"], entities_df["id"], strict=False) ) relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") - result = create_communities( + result = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -379,7 +407,7 @@ def test_parent_child_bidirectional_consistency_real_data(self): class TestLccFiltering: """Verify LCC filtering interaction with create_communities.""" - def test_lcc_reduces_community_count(self): + async def test_lcc_reduces_community_count(self): """With use_lcc=True and two disconnected components, only the larger component's communities should appear.""" title_to_entity_id = _make_title_to_entity_id([ @@ -398,14 +426,14 @@ def 
test_lcc_reduces_community_count(self): ("r5", "D", "F", 1.0, ["t2"]), ("r6", "E", "F", 1.0, ["t2"]), ]) - result_no_lcc = create_communities( + result_no_lcc = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, use_lcc=False, seed=42, ) - result_lcc = create_communities( + result_lcc = await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -429,14 +457,14 @@ class TestRealDataRegression: """ @pytest.fixture - def real_result(self) -> pd.DataFrame: + async def real_result(self) -> pd.DataFrame: """Run create_communities on the test fixture data.""" entities_df = pd.read_parquet("tests/verbs/data/entities.parquet") title_to_entity_id = dict( zip(entities_df["title"], entities_df["id"], strict=False) ) relationships = pd.read_parquet("tests/verbs/data/relationships.parquet") - return create_communities( + return await _run_create_communities( title_to_entity_id, relationships, max_cluster_size=10, @@ -444,18 +472,18 @@ def real_result(self) -> pd.DataFrame: seed=0xDEADBEEF, ) - def test_row_count(self, real_result: pd.DataFrame): + async def test_row_count(self, real_result: pd.DataFrame): """Pin the expected number of communities.""" assert len(real_result) == 122 - def test_level_distribution(self, real_result: pd.DataFrame): + async def test_level_distribution(self, real_result: pd.DataFrame): """Pin the expected number of communities per level.""" from collections import Counter counts = Counter(real_result["level"].tolist()) assert counts == {0: 23, 1: 65, 2: 32, 3: 2} - def test_values_match_golden_file(self, real_result: pd.DataFrame): + async def test_values_match_golden_file(self, real_result: pd.DataFrame): """The output should match the golden Parquet file for all columns except id (UUID) and period (date-dependent).""" expected = pd.read_parquet("tests/verbs/data/communities.parquet") @@ -484,9 +512,68 @@ def test_values_match_golden_file(self, real_result: pd.DataFrame): f"Row {i} 
children mismatch: {actual_children} != {expected_children}" ) - def test_communities_with_children(self, real_result: pd.DataFrame): + async def test_communities_with_children(self, real_result: pd.DataFrame): """Pin the expected number of communities that have children.""" has_children = real_result["children"].apply( lambda x: hasattr(x, "__len__") and len(x) > 0 ) assert has_children.sum() == 24 + + +# ------------------------------------------------------------------- +# Row sanitization +# ------------------------------------------------------------------- + + +class TestSanitizeRow: + """Verify numpy types are converted to native Python types.""" + + def test_ndarray_to_list(self): + """np.ndarray values should become plain lists.""" + row = {"children": np.array([1, 2, 3])} + result = _sanitize_row(row) + assert result["children"] == [1, 2, 3] + assert isinstance(result["children"], list) + + def test_empty_ndarray_to_empty_list(self): + """An empty np.ndarray should become an empty list.""" + row = {"children": np.array([])} + assert _sanitize_row(row)["children"] == [] + + def test_np_integer_to_int(self): + """np.integer values should become native int.""" + row = {"community": np.int64(42)} + result = _sanitize_row(row) + assert result["community"] == 42 + assert type(result["community"]) is int + + def test_np_floating_to_float(self): + """np.floating values should become native float.""" + row = {"weight": np.float64(3.14)} + result = _sanitize_row(row) + assert result["weight"] == pytest.approx(3.14) + assert type(result["weight"]) is float + + def test_native_types_pass_through(self): + """Native Python types should pass through unchanged.""" + row = {"id": "abc", "size": 5, "tags": ["a", "b"]} + assert _sanitize_row(row) == row + + def test_mixed_row(self): + """A row with a mix of numpy and native types.""" + row = { + "community": np.int64(7), + "children": np.array([1, 2]), + "title": "Community 7", + "weight": np.float64(0.5), + } + result = 
_sanitize_row(row) + assert result == { + "community": 7, + "children": [1, 2], + "title": "Community 7", + "weight": pytest.approx(0.5), + } + assert type(result["community"]) is int + assert type(result["children"]) is list + assert type(result["weight"]) is float From b18ddbcb6131cba14b9310097e553c42cb55c713 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Feb 2026 14:52:39 -0300 Subject: [PATCH 7/7] fix logic --- .../index/workflows/create_communities.py | 22 +++++---- .../unit/indexing/test_create_communities.py | 49 +++++++++++++++++-- 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/packages/graphrag/graphrag/index/workflows/create_communities.py b/packages/graphrag/graphrag/index/workflows/create_communities.py index 33d506fd46..45476f9416 100644 --- a/packages/graphrag/graphrag/index/workflows/create_communities.py +++ b/packages/graphrag/graphrag/index/workflows/create_communities.py @@ -31,19 +31,17 @@ async def run_workflow( reader = DataReader(context.output_table_provider) relationships = await reader.relationships() - title_to_entity_id: dict[str, str] = {} - async with context.output_table_provider.open("entities") as entities_table: - async for row in entities_table: - title_to_entity_id[row["title"]] = row["id"] - max_cluster_size = config.cluster_graph.max_cluster_size use_lcc = config.cluster_graph.use_lcc seed = config.cluster_graph.seed - async with context.output_table_provider.open("communities") as communities_table: + async with ( + context.output_table_provider.open("entities") as entities_table, + context.output_table_provider.open("communities") as communities_table, + ): sample_rows = await create_communities( communities_table, - title_to_entity_id, + entities_table, relationships, max_cluster_size=max_cluster_size, use_lcc=use_lcc, @@ -56,7 +54,7 @@ async def run_workflow( async def create_communities( communities_table: Table, - title_to_entity_id: dict[str, str], + entities_table: Table, relationships: 
pd.DataFrame, max_cluster_size: int, use_lcc: bool, @@ -68,8 +66,8 @@ async def create_communities( ---- communities_table: Table Output table to write community rows to. - title_to_entity_id: dict[str, str] - Mapping of entity title to entity id. + entities_table: Table + Table containing entity rows. relationships: pd.DataFrame Relationships DataFrame with source, target, weight, text_unit_ids columns. @@ -92,6 +90,10 @@ async def create_communities( seed=seed, ) + title_to_entity_id: dict[str, str] = {} + async for row in entities_table: + title_to_entity_id[row["title"]] = row["id"] + communities = pd.DataFrame( clusters, columns=pd.Index(["level", "community", "parent", "title"]) ).explode("title") diff --git a/tests/unit/indexing/test_create_communities.py b/tests/unit/indexing/test_create_communities.py index 904f9c93bb..e3f225bf0b 100644 --- a/tests/unit/indexing/test_create_communities.py +++ b/tests/unit/indexing/test_create_communities.py @@ -20,6 +20,7 @@ create_communities, ) from graphrag_storage.tables.csv_table import CSVTable +from graphrag_storage.tables.table import Table class FakeTable(CSVTable): @@ -33,15 +34,55 @@ async def write(self, row: dict[str, Any]) -> None: self.rows.append(row) +class FakeEntitiesTable(Table): + """In-memory read-only table that supports async iteration.""" + + def __init__(self, rows: list[dict[str, Any]]) -> None: + self._rows = rows + self._index = 0 + + def __aiter__(self): + """Return an async iterator over the rows.""" + self._index = 0 + return self + + async def __anext__(self) -> dict[str, Any]: + """Yield the next row or stop.""" + if self._index >= len(self._rows): + raise StopAsyncIteration + row = self._rows[self._index] + self._index += 1 + return row + + async def length(self) -> int: + """Return number of rows.""" + return len(self._rows) + + async def has(self, row_id: str) -> bool: + """Check if a row with the given ID exists.""" + return any(r.get("id") == row_id for r in self._rows) + + async def 
write(self, row: dict[str, Any]) -> None: + """Not supported for read-only table.""" + raise NotImplementedError + + async def close(self) -> None: + """No-op.""" + + async def _run_create_communities( title_to_entity_id: dict[str, str], relationships: pd.DataFrame, **kwargs: Any, ) -> pd.DataFrame: - """Helper that runs create_communities with a FakeTable and returns all rows as a DataFrame.""" - table = FakeTable() - await create_communities(table, title_to_entity_id, relationships, **kwargs) - return pd.DataFrame(table.rows) + """Helper that runs create_communities with fake tables and returns all rows as a DataFrame.""" + communities_table = FakeTable() + entity_rows = [ + {"id": eid, "title": title} for title, eid in title_to_entity_id.items() + ] + entities_table = FakeEntitiesTable(entity_rows) + await create_communities(communities_table, entities_table, relationships, **kwargs) + return pd.DataFrame(communities_table.rows) def _make_title_to_entity_id(