Implement graph construct for cypher endpoints

FlorentinD · FlorentinD · commit 657e76c95ead · 2025-11-21T13:25:12.000+01:00
diff --git a/graphdatascience/procedure_surface/api/catalog/catalog_endpoints.py b/graphdatascience/procedure_surface/api/catalog/catalog_endpoints.py
@@ -4,6 +4,8 @@
 from types import TracebackType
 from typing import NamedTuple, Type
 
+from pandas import DataFrame
+
 from graphdatascience.procedure_surface.api.base_result import BaseResult
 from graphdatascience.procedure_surface.api.catalog.graph_api import GraphV2
 from graphdatascience.procedure_surface.api.catalog.graph_info import GraphInfo, GraphInfoWithDegrees
@@ -14,31 +16,76 @@
 
 
 class CatalogEndpoints(ABC):
+    @abstractmethod
+    def construct(
+        self,
+        graph_name: str,
+        nodes: DataFrame | list[DataFrame],
+        relationships: DataFrame | list[DataFrame] | None = None,
+        concurrency: int = 4,
+        undirected_relationship_types: list[str] | None = None,
+    ) -> GraphV2:
+        """Construct a graph from a list of node and relationship dataframes.
+
+        Parameters
+        ----------
+        graph_name
+            Name of the graph to construct
+        nodes
+            Node dataframes. A dataframe should follow the schema:
+
+            - `nodeId` to uniquely identify the node across all dataframes
+            - `labels` to specify the labels of the node as a list of strings (optional)
+            - other columns are treated as node properties
+        relationships
+            Relationship dataframes. A dataframe should follow the schema:
+
+            - `sourceNodeId` to identify the start node of the relationship
+            - `targetNodeId` to identify the end node of the relationship
+            - `relationshipType` to specify the type of the relationship (optional)
+            - other columns are treated as relationship properties
+        concurrency
+            Number of concurrent threads to use.
+        undirected_relationship_types
+            List of relationship types to treat as undirected.
+
+        Returns
+        -------
+        GraphV2
+            Constructed graph object.
+        """
+
     @abstractmethod
     def list(self, G: GraphV2 | str | None = None) -> list[GraphInfoWithDegrees]:
         """List graphs in the graph catalog.
 
-        Args:
-            G (GraphV2 | str | None, optional): GraphV2 object or name to filter results.
-               If None, list all graphs. Defaults to None.
+        Parameters
+        ----------
+        G
+            GraphV2 object or name to filter results. If None, list all graphs.
 
-        Returns:
-            list[GraphListResult]: List of graph metadata objects containing information like
-                                 graph name, node count, relationship count, etc.
+        Returns
+        -------
+        list[GraphInfoWithDegrees]
+            List of graph metadata objects containing information like node count.
         """
         pass
 
     @abstractmethod
     def drop(self, G: GraphV2 | str, fail_if_missing: bool = True) -> GraphInfo | None:
         """Drop a graph from the graph catalog.
 
-        Args:
-            G (GraphV2 | str): GraphV2 object or name to drop.
-            fail_if_missing (bool): Whether to fail if the graph is missing. Defaults to True.
+        Parameters
+        ----------
+        G
+            Graph to drop, given by name or as a graph object.
+        fail_if_missing
+            Whether to fail if the graph is missing.
 
-        Returns:
-              GraphListResult: GraphV2 metadata object containing information like
-                               graph name, node count, relationship count, etc.
+        Returns
+        -------
+        GraphInfo | None
+            GraphV2 metadata object containing information like node count.
         """
 
     @abstractmethod
@@ -68,9 +115,10 @@ def filter(
         job_id
             Identifier for the computation.
 
-        Returns:
-            GraphWithFilterResult: tuple of the filtered graph object and the information like
-                                graph name, node count, relationship count, etc.
+        Returns
+        -------
+        GraphWithFilterResult
+            Tuple of the filtered graph object and information like graph name, node count, relationship count, etc.
         """
         pass
 
diff --git a/graphdatascience/procedure_surface/arrow/catalog/catalog_arrow_endpoints.py b/graphdatascience/procedure_surface/arrow/catalog/catalog_arrow_endpoints.py
@@ -5,6 +5,8 @@
 from typing import Any, NamedTuple, Type
 from uuid import uuid4
 
+from pandas import DataFrame
+
 from graphdatascience.arrow_client.authenticated_flight_client import AuthenticatedArrowClient
 from graphdatascience.arrow_client.v2.job_client import JobClient
 from graphdatascience.arrow_client.v2.remote_write_back_client import RemoteWriteBackClient
@@ -52,15 +54,6 @@ def __init__(
             protocol_version = ProtocolVersionResolver(query_runner).resolve()
             self._project_protocol = ProjectProtocol.select(protocol_version)
 
-    def list(self, G: GraphV2 | str | None = None) -> list[GraphInfoWithDegrees]:
-        graph_name: str | None = None
-        if isinstance(G, GraphV2):
-            graph_name = G.name()
-        elif isinstance(G, str):
-            graph_name = G
-
-        return self._graph_backend.list(graph_name)
-
     def project(
         self,
         graph_name: str,
@@ -137,6 +130,16 @@ def project(
 
         return GraphWithProjectResult(get_graph(graph_name, self._arrow_client), job_result)
 
+    def construct(
+        self,
+        graph_name: str,
+        nodes: DataFrame | list[DataFrame],
+        relationships: DataFrame | list[DataFrame] | None = None,
+        concurrency: int = 4,
+        undirected_relationship_types: list[str] | None = None,
+    ) -> GraphV2:
+        raise NotImplementedError("Graph construction is not yet supported via V2 endpoints.")
+
     def drop(self, G: GraphV2 | str, fail_if_missing: bool = True) -> GraphInfo | None:
         graph_name = G.name() if isinstance(G, GraphV2) else G
 
@@ -212,6 +215,15 @@ def generate(
             GraphGenerationStats(**JobClient.get_summary(self._arrow_client, job_id)),
         )
 
+    def list(self, G: GraphV2 | str | None = None) -> list[GraphInfoWithDegrees]:
+        graph_name: str | None = None
+        if isinstance(G, GraphV2):
+            graph_name = G.name()
+        elif isinstance(G, str):
+            graph_name = G
+
+        return self._graph_backend.list(graph_name)
+
     @property
     def sample(self) -> GraphSamplingEndpoints:
         return GraphSamplingArrowEndpoints(self._arrow_client, show_progress=self._show_progress)
diff --git a/graphdatascience/procedure_surface/cypher/catalog/node_properties_cypher_endpoints.py b/graphdatascience/procedure_surface/cypher/catalog/node_properties_cypher_endpoints.py
@@ -10,6 +10,7 @@
     NodePropertySpec,
 )
 from graphdatascience.procedure_surface.api.default_values import ALL_LABELS
+from graphdatascience.procedure_surface.cypher.catalog.utils import require_database
 from graphdatascience.procedure_surface.utils.config_converter import ConfigConverter
 from graphdatascience.procedure_surface.utils.result_utils import join_db_node_properties, transpose_property_columns
 from graphdatascience.query_runner.query_runner import QueryRunner
@@ -35,9 +36,7 @@ def stream(
         db_node_properties: list[str] | None = None,
     ) -> DataFrame:
         if self._gds_arrow_client is not None:
-            database = self._query_runner.database()
-            if database is None:
-                raise ValueError("The database is not set")
+            database = require_database(self._query_runner)
 
             result = self._gds_arrow_client.get_node_properties(
                 G.name(), database, node_properties, node_labels, list_node_labels or False, concurrency
diff --git a/graphdatascience/procedure_surface/cypher/catalog/relationship_cypher_endpoints.py b/graphdatascience/procedure_surface/cypher/catalog/relationship_cypher_endpoints.py
@@ -14,6 +14,7 @@
     RelationshipsWriteResult,
 )
 from graphdatascience.procedure_surface.api.default_values import ALL_TYPES
+from graphdatascience.procedure_surface.cypher.catalog.utils import require_database
 from graphdatascience.procedure_surface.utils.config_converter import ConfigConverter
 
 
@@ -36,9 +37,7 @@ def stream(
         effective_rel_types = relationship_types if relationship_types is not None else ["*"]
 
         if self._gds_arrow_client is not None:
-            database = self._query_runner.database()
-            if database is None:
-                raise ValueError("The database is not set")
+            database = require_database(self._query_runner)
 
             if relationship_properties:
                 return self._gds_arrow_client.get_relationship_properties(
diff --git a/graphdatascience/procedure_surface/cypher/catalog/utils.py b/graphdatascience/procedure_surface/cypher/catalog/utils.py
@@ -0,0 +1,12 @@
+from graphdatascience.query_runner.query_runner import QueryRunner
+
+
+def require_database(query_runner: QueryRunner) -> str:
+    database = query_runner.database()
+    if database is None:
+        raise ValueError(
+            "For this call you must have explicitly specified a valid Neo4j database to target, "
+            "using `gds.set_database`."
+        )
+
+    return database
diff --git a/graphdatascience/procedure_surface/cypher/catalog_cypher_endpoints.py b/graphdatascience/procedure_surface/cypher/catalog_cypher_endpoints.py
@@ -4,6 +4,9 @@
 from types import TracebackType
 from typing import Any, NamedTuple, Type
 
+from pandas import DataFrame
+
+from graphdatascience.arrow_client.v1.gds_arrow_client import GdsArrowClient
 from graphdatascience.procedure_surface.api.catalog.catalog_endpoints import (
     CatalogEndpoints,
     GraphFilterResult,
@@ -15,7 +18,11 @@
 from graphdatascience.procedure_surface.api.catalog.graph_api import GraphV2
 from graphdatascience.procedure_surface.api.catalog.graph_info import GraphInfo, GraphInfoWithDegrees
 from graphdatascience.procedure_surface.api.catalog.graph_sampling_endpoints import GraphSamplingEndpoints
-from graphdatascience.procedure_surface.cypher.catalog.graph_backend_cypher import get_graph
+from graphdatascience.procedure_surface.cypher.catalog.graph_backend_cypher import CypherGraphBackend, get_graph
+from graphdatascience.procedure_surface.cypher.catalog.utils import require_database
+from graphdatascience.query_runner.arrow_graph_constructor import ArrowGraphConstructor
+from graphdatascience.query_runner.cypher_graph_constructor import CypherGraphConstructor
+from graphdatascience.query_runner.graph_constructor import GraphConstructor
 
 from ...call_parameters import CallParameters
 from ...query_runner.query_runner import QueryRunner
@@ -28,8 +35,46 @@
 
 
 class CatalogCypherEndpoints(CatalogEndpoints):
-    def __init__(self, query_runner: QueryRunner):
+    def __init__(self, query_runner: QueryRunner, arrow_client: GdsArrowClient | None = None):
         self._query_runner = query_runner
+        self._arrow_client = arrow_client
+
+    def construct(
+        self,
+        graph_name: str,
+        nodes: DataFrame | list[DataFrame],
+        relationships: DataFrame | list[DataFrame] | None = None,
+        concurrency: int | None = None,
+        undirected_relationship_types: list[str] | None = None,
+    ) -> GraphV2:
+        if isinstance(nodes, DataFrame):
+            nodes = [nodes]
+        if relationships is None:
+            relationships = []
+        elif isinstance(relationships, DataFrame):
+            relationships = [relationships]
+
+        graph_constructor: GraphConstructor
+        if self._arrow_client is not None:
+            database = require_database(self._query_runner)
+
+            graph_constructor = ArrowGraphConstructor(
+                database=database,
+                graph_name=graph_name,
+                flight_client=self._arrow_client,
+                concurrency=concurrency,
+                undirected_relationship_types=undirected_relationship_types,
+            )
+        else:
+            graph_constructor = CypherGraphConstructor(
+                query_runner=self._query_runner,
+                graph_name=graph_name,
+                concurrency=concurrency,
+                undirected_relationship_types=undirected_relationship_types,
+            )
+
+        graph_constructor.run(node_dfs=nodes, relationship_dfs=relationships)
+        return GraphV2(name=graph_name, backend=CypherGraphBackend(graph_name, self._query_runner))
 
     def list(self, G: GraphV2 | str | None = None) -> list[GraphInfoWithDegrees]:
         graph_name = G if isinstance(G, str) else G.name() if G is not None else None
diff --git a/graphdatascience/query_runner/arrow_graph_constructor.py b/graphdatascience/query_runner/arrow_graph_constructor.py
@@ -22,8 +22,8 @@ def __init__(
         database: str,
         graph_name: str,
         flight_client: GdsArrowClient,
-        concurrency: int,
-        undirected_relationship_types: list[str] | None,
+        concurrency: int | None = None,
+        undirected_relationship_types: list[str] | None = None,
         chunk_size: int = 10_000,
     ):
         self._database = database
diff --git a/graphdatascience/query_runner/cypher_graph_constructor.py b/graphdatascience/query_runner/cypher_graph_constructor.py
@@ -58,14 +58,13 @@ def __init__(
         self,
         query_runner: QueryRunner,
         graph_name: str,
-        concurrency: int,
-        undirected_relationship_types: list[str] | None,
-        server_version: ServerVersion,
+        concurrency: int | None = None,
+        undirected_relationship_types: list[str] | None = None,
     ):
         self._query_runner = query_runner
         self._concurrency = concurrency
         self._graph_name = graph_name
-        self._server_version = server_version
+        self._server_version = query_runner.server_version()
         self._undirected_relationship_types = undirected_relationship_types
 
     def run(self, node_dfs: list[DataFrame], relationship_dfs: list[DataFrame]) -> None:
@@ -81,9 +80,9 @@ def run(self, node_dfs: list[DataFrame], relationship_dfs: list[DataFrame]) -> N
             self.CypherProjectionRunner(
                 self._query_runner,
                 self._graph_name,
+                self._server_version,
                 self._concurrency,
                 self._undirected_relationship_types,
-                self._server_version,
             ).run(node_dfs, relationship_dfs)
         else:
             assert not self._undirected_relationship_types, "This should have been raised earlier."
@@ -130,9 +129,9 @@ def __init__(
             self,
             query_runner: QueryRunner,
             graph_name: str,
-            concurrency: int,
-            undirected_relationship_types: list[str] | None,
             server_version: ServerVersion,
+            concurrency: int | None = None,
+            undirected_relationship_types: list[str] | None = None,
         ):
             self._query_runner = query_runner
             self._concurrency = concurrency
@@ -359,9 +358,9 @@ def rels_config_part(self, rel_cols: list[EntityColumnSchema], rel_properties_ke
             return rels_config_fields
 
     class LegacyCypherProjectionRunner:
-        def __init__(self, query_runner: QueryRunner, graph_name: str, concurrency: int):
+        def __init__(self, query_runner: QueryRunner, graph_name: str, concurrency: int | None = None):
             self._query_runner = query_runner
-            self._concurrency = concurrency
+            self._concurrency = concurrency if concurrency is not None else 4
             self._graph_name = graph_name
 
         def run(self, node_df: DataFrame, relationship_df: DataFrame) -> None:
diff --git a/graphdatascience/query_runner/neo4j_query_runner.py b/graphdatascience/query_runner/neo4j_query_runner.py
@@ -375,9 +375,7 @@ def __del__(self) -> None:
     def create_graph_constructor(
         self, graph_name: str, concurrency: int, undirected_relationship_types: list[str] | None
     ) -> GraphConstructor:
-        return CypherGraphConstructor(
-            self, graph_name, concurrency, undirected_relationship_types, self.server_version()
-        )
+        return CypherGraphConstructor(self, graph_name, concurrency, undirected_relationship_types)
 
     def set_show_progress(self, show_progress: bool) -> None:
         self._show_progress = show_progress
diff --git a/graphdatascience/tests/integrationV2/procedure_surface/cypher/test_catalog_cypher_endpoints.py b/graphdatascience/tests/integrationV2/procedure_surface/cypher/test_catalog_cypher_endpoints.py
diff --git a/graphdatascience/tests/unit/conftest.py b/graphdatascience/tests/unit/conftest.py