Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions python/python/knowledge_graph/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,18 @@ def run(
*,
datasets: Optional[Mapping[str, pa.Table]] = None,
) -> pa.Table:
"""Execute a Cypher statement against Lance datasets."""
"""Execute a Cypher statement against Lance datasets.

Only loads the datasets referenced in the query, avoiding expensive
enumeration of all datasets on cloud storage.
"""
query = CypherQuery(statement).with_config(self._config)
base_tables: MutableMapping[str, "pa.Table"] = dict(self._store.load_tables())

# Only load tables that are actually referenced in the query
referenced_tables = set(query.node_labels()) | set(query.relationship_types())
base_tables: MutableMapping[str, "pa.Table"] = dict(
self._store.load_tables(referenced_tables)
)
if datasets:
base_tables.update(datasets)
return query.execute(base_tables)
Expand Down
18 changes: 14 additions & 4 deletions python/python/knowledge_graph/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,16 +138,26 @@ def load_tables(
self,
names: Optional[Iterable[str]] = None,
) -> Mapping[str, "pa.Table"]:
"""Load Lance datasets as PyArrow tables."""
"""Load Lance datasets as PyArrow tables.

When specific names are provided, this method computes paths directly
without enumerating all datasets - significantly faster on cloud storage.
"""
lance = self._get_lance()

self.ensure_layout()
available = self.list_datasets()
requested = list(names) if names is not None else list(available.keys())

# Only enumerate datasets when no specific names are requested
if names is not None:
requested = list(names)
else:
available = self.list_datasets()
requested = list(available.keys())

tables: Dict[str, "pa.Table"] = {}
for name in requested:
path = available.get(name, self._dataset_path(name))
# Compute path directly - no need to look up from enumeration
path = self._dataset_path(name)
if not self._path_exists(path):
raise FileNotFoundError(f"Dataset '{name}' not found at {path}")
dataset = lance.dataset(
Expand Down
Loading