diff --git a/python/python/knowledge_graph/service.py b/python/python/knowledge_graph/service.py index d9b932ed..a0e53f63 100644 --- a/python/python/knowledge_graph/service.py +++ b/python/python/knowledge_graph/service.py @@ -116,9 +116,18 @@ def run( *, datasets: Optional[Mapping[str, pa.Table]] = None, ) -> pa.Table: - """Execute a Cypher statement against Lance datasets.""" + """Execute a Cypher statement against Lance datasets. + + Only loads the datasets referenced in the query, avoiding expensive + enumeration of all datasets on cloud storage. + """ query = CypherQuery(statement).with_config(self._config) - base_tables: MutableMapping[str, "pa.Table"] = dict(self._store.load_tables()) + + # Only load tables that are actually referenced in the query + referenced_tables = set(query.node_labels()) | set(query.relationship_types()) + base_tables: MutableMapping[str, "pa.Table"] = dict( + self._store.load_tables(referenced_tables) + ) if datasets: base_tables.update(datasets) return query.execute(base_tables) diff --git a/python/python/knowledge_graph/store.py b/python/python/knowledge_graph/store.py index e2ccaf06..dd582f02 100644 --- a/python/python/knowledge_graph/store.py +++ b/python/python/knowledge_graph/store.py @@ -138,16 +138,26 @@ def load_tables( self, names: Optional[Iterable[str]] = None, ) -> Mapping[str, "pa.Table"]: - """Load Lance datasets as PyArrow tables.""" + """Load Lance datasets as PyArrow tables. + + When specific names are provided, this method computes paths directly + without enumerating all datasets - significantly faster on cloud storage. + """ lance = self._get_lance() self.ensure_layout() - available = self.list_datasets() - requested = list(names) if names is not None else list(available.keys()) + + # Only enumerate datasets when no specific names are requested + if names is not None: + requested = list(names) + else: + available = self.list_datasets() + requested = list(available.keys()) tables: Dict[str, "pa.Table"] = {} for name in requested: - path = available.get(name, self._dataset_path(name)) + # Compute path directly - no need to look up from enumeration + path = self._dataset_path(name) if not self._path_exists(path): raise FileNotFoundError(f"Dataset '{name}' not found at {path}") dataset = lance.dataset(