From 69e194357a394412fa65b734b409d039540b2cd1 Mon Sep 17 00:00:00 2001
From: beinan <beinan@uber.com>
Date: Sun, 18 Jan 2026 22:53:52 +0000
Subject: [PATCH] fix: avoid dataset enumeration on GCS/S3 for query execution

Previously, every query would enumerate all datasets on cloud storage
and load all of them, causing ~10s latency with 20+ datasets on GCS.

Now run() asks the parsed query which tables it actually references
(via node_labels() and relationship_types()) and passes only those names
to load_tables(). When names are given, load_tables() skips
list_datasets() and computes each dataset path directly from the root
path, without enumeration.

Fixes #87

Co-Authored-By: Claude <noreply@anthropic.com>
---
 python/python/knowledge_graph/service.py | 13 +++++++++++--
 python/python/knowledge_graph/store.py   | 18 ++++++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/python/knowledge_graph/service.py b/python/python/knowledge_graph/service.py
index d9b932ed..a0e53f63 100644
--- a/python/python/knowledge_graph/service.py
+++ b/python/python/knowledge_graph/service.py
@@ -116,9 +116,18 @@ def run(
         *,
         datasets: Optional[Mapping[str, pa.Table]] = None,
     ) -> pa.Table:
-        """Execute a Cypher statement against Lance datasets."""
+        """Execute a Cypher statement against Lance datasets.
+
+        Only loads the datasets referenced in the query, avoiding expensive
+        enumeration of all datasets on cloud storage.
+        """
         query = CypherQuery(statement).with_config(self._config)
-        base_tables: MutableMapping[str, "pa.Table"] = dict(self._store.load_tables())
+
+        # Only load tables that are actually referenced in the query
+        referenced_tables = set(query.node_labels()) | set(query.relationship_types())
+        base_tables: MutableMapping[str, "pa.Table"] = dict(
+            self._store.load_tables(referenced_tables)
+        )
         if datasets:
             base_tables.update(datasets)
         return query.execute(base_tables)
diff --git a/python/python/knowledge_graph/store.py b/python/python/knowledge_graph/store.py
index e2ccaf06..dd582f02 100644
--- a/python/python/knowledge_graph/store.py
+++ b/python/python/knowledge_graph/store.py
@@ -138,16 +138,26 @@ def load_tables(
         self,
         names: Optional[Iterable[str]] = None,
     ) -> Mapping[str, "pa.Table"]:
-        """Load Lance datasets as PyArrow tables."""
+        """Load Lance datasets as PyArrow tables.
+
+        When specific names are provided, this method computes paths directly
+        without enumerating all datasets - significantly faster on cloud storage.
+        """
         lance = self._get_lance()
 
         self.ensure_layout()
-        available = self.list_datasets()
-        requested = list(names) if names is not None else list(available.keys())
+
+        # Only enumerate datasets when no specific names are requested
+        if names is not None:
+            requested = list(names)
+        else:
+            available = self.list_datasets()
+            requested = list(available.keys())
 
         tables: Dict[str, "pa.Table"] = {}
         for name in requested:
-            path = available.get(name, self._dataset_path(name))
+            # Compute path directly - no need to look up from enumeration
+            path = self._dataset_path(name)
             if not self._path_exists(path):
                 raise FileNotFoundError(f"Dataset '{name}' not found at {path}")
             dataset = lance.dataset(