From 69e194357a394412fa65b734b409d039540b2cd1 Mon Sep 17 00:00:00 2001 From: beinan Date: Sun, 18 Jan 2026 22:53:52 +0000 Subject: [PATCH] fix: avoid dataset enumeration on GCS/S3 for query execution Previously, every query would enumerate all datasets on cloud storage and load all of them, causing ~10s latency with 20+ datasets on GCS. Now the query parser extracts which tables are actually referenced (via node_labels() and relationship_types()), and only those specific datasets are loaded. Paths are computed directly from the root path without enumeration. Fixes #87 Co-Authored-By: Claude --- python/python/knowledge_graph/service.py | 13 +++++++++++-- python/python/knowledge_graph/store.py | 18 ++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/python/python/knowledge_graph/service.py b/python/python/knowledge_graph/service.py index d9b932ed..a0e53f63 100644 --- a/python/python/knowledge_graph/service.py +++ b/python/python/knowledge_graph/service.py @@ -116,9 +116,18 @@ def run( *, datasets: Optional[Mapping[str, pa.Table]] = None, ) -> pa.Table: - """Execute a Cypher statement against Lance datasets.""" + """Execute a Cypher statement against Lance datasets. + + Only loads the datasets referenced in the query, avoiding expensive + enumeration of all datasets on cloud storage. + """ query = CypherQuery(statement).with_config(self._config) - base_tables: MutableMapping[str, "pa.Table"] = dict(self._store.load_tables()) + + # Only load tables that are actually referenced in the query + referenced_tables = set(query.node_labels()) | set(query.relationship_types()) + base_tables: MutableMapping[str, "pa.Table"] = dict( + self._store.load_tables(referenced_tables) + ) if datasets: base_tables.update(datasets) return query.execute(base_tables) diff --git a/python/python/knowledge_graph/store.py b/python/python/knowledge_graph/store.py index e2ccaf06..dd582f02 100644 --- a/python/python/knowledge_graph/store.py +++ b/python/python/knowledge_graph/store.py @@ -138,16 +138,26 @@ def load_tables( self, names: Optional[Iterable[str]] = None, ) -> Mapping[str, "pa.Table"]: - """Load Lance datasets as PyArrow tables.""" + """Load Lance datasets as PyArrow tables. + + When specific names are provided, this method computes paths directly + without enumerating all datasets - significantly faster on cloud storage. + """ lance = self._get_lance() self.ensure_layout() - available = self.list_datasets() - requested = list(names) if names is not None else list(available.keys()) + + # Only enumerate datasets when no specific names are requested + if names is not None: + requested = list(names) + else: + available = self.list_datasets() + requested = list(available.keys()) tables: Dict[str, "pa.Table"] = {} for name in requested: - path = available.get(name, self._dataset_path(name)) + # Compute path directly - no need to look up from enumeration + path = self._dataset_path(name) if not self._path_exists(path): raise FileNotFoundError(f"Dataset '{name}' not found at {path}") dataset = lance.dataset(