From 76b9c2d618ca6e073c91d0131a7f6a4ae3aec9b7 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 13:45:26 -0700 Subject: [PATCH 01/10] docs: add Phase 2 CSR native expand operator design Design spec for issue #159 Phase 2: wire the Phase 1 CsrIndex into a native single-hop Expand via custom DataFusion ExecutionPlan (CsrExpandExec topology + LanceTakeExec materialization), dense-ROWID id model, with fallback to the DataFusion join path for unsupported shapes. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...06-22-csr-native-expand-operator-design.md | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-22-csr-native-expand-operator-design.md diff --git a/docs/superpowers/specs/2026-06-22-csr-native-expand-operator-design.md b/docs/superpowers/specs/2026-06-22-csr-native-expand-operator-design.md new file mode 100644 index 00000000..d4bc2678 --- /dev/null +++ b/docs/superpowers/specs/2026-06-22-csr-native-expand-operator-design.md @@ -0,0 +1,179 @@ +# Phase 2: CSR-backed native single-hop Expand operator + +Issue: [lance-format/lance-graph#159](https://github.com/lance-format/lance-graph/issues/159) +Status: Approved design (2026-06-22) +Builds on: Phase 1 — `CsrIndex` (PR #160, commit `c7e4f18`) + +## Problem + +Today every Cypher traversal is lowered to DataFusion SQL joins. A single-hop +`(a)-[:KNOWS]->(b)` becomes a relationship scan plus two inner joins. Phase 1 +added an in-memory `CsrIndex` (dense `u64` adjacency with `neighbors()`, `bfs()`, +`shortest_path()`, Arrow (de)serialization) but nothing consumes it — +`LanceNativePlanner` is a placeholder that returns `EmptyRelation`, and +`ExecutionStrategy::LanceNative` errors with "not yet implemented". + +Phase 2 wires CSR into a real native execution path for **single-hop `Expand`**, +replacing the join with a direct neighbor lookup followed by a `take()`. + +## Foundational decisions (locked) + +1. 
**Execution model — custom DataFusion `ExecutionPlan` (DuckPGQ-style).** + A logical extension node lowers to a streaming physical operator that does + neighbor lookups at execution time and composes with the rest of the + DataFusion pipeline. (Alternatives considered: DataFusion-materialized + MemTable; pure-native bypassing DataFusion. Rejected — less faithful / less + composable.) + +2. **ID mapping — dense ROWID model.** The CSR vertex id *is* the node's row id. + `csr.neighbors(src_rowid) -> dst_rowid`s; target properties come from + `take(dst_rowids)`. This mirrors how every Lance index works (key → row ids → + `take()` to materialize) and reuses Lance's addressing instead of inventing a + dictionary. For Phase 2 (in-memory, single fragment) "row id == dense row + offset"; this generalizes to **Lance stable row ids** in Phase 4. + +3. **Output contract — properties materialized via `take()`.** `Expand` does not + fall back when target properties are projected. The neighbor row ids feed a + `take()` that materializes the referenced target columns, so `RETURN b.name` + runs fully native. + +4. **`take()` placement — a separate `LanceTakeExec` operator.** + `CsrExpandExec` does *topology only* (row id → neighbor row ids); + `LanceTakeExec` does *materialization only* via a `RowMaterializer` + abstraction. Single-purpose operators, reused by Phase 3 (multi-hop) and + Phase 5 (hybrid vector). Mirrors Lance's own scan+take shape. 
+ +## Architecture & modules + +Promote `crates/lance-graph/src/lance_native_planner.rs` to a module directory: + +| File | Responsibility | +|---|---| +| `lance_native_planner/mod.rs` | `LanceNativePlanner` — native-vs-fallback decision; lowering that overrides only `Expand` | +| `lance_native_planner/csr_expand.rs` | `CsrExpandNode` (logical extension) + `CsrExpandExec` (physical): row id → neighbor row ids | +| `lance_native_planner/take.rs` | `LanceTakeNode` + `LanceTakeExec` + `RowMaterializer` trait + `InMemoryMaterializer` | +| `lance_native_planner/extension_planner.rs` | `ExtensionPlanner` mapping logical nodes → physical execs (builds CSR + materializer at plan time); `CsrQueryPlanner` wrapping `DefaultPhysicalPlanner` | + +## Data flow + +For `MATCH (a:Person)-[:KNOWS]->(b:Person) WHERE b.age > 30 RETURN a.name, b.name`: + +``` +TableScan(person AS a) a__id(=rowid), a__name, a__age [reuse existing scan_ops] + └─ CsrExpandExec + b__id (one row per neighbor; a__* carried through, b__id = neighbor row id) + └─ LanceTakeExec + b__name, b__age (take all other target cols by offset) + └─ Filter(b__age>30) [DataFusion native physical op] + └─ Project(...) [DataFusion native physical op] +``` + +`CsrExpandExec` and `LanceTakeExec` produce a normal `RecordBatch` stream with a +correct qualified schema (`{var}__{col}`), so all operators above (Filter, +Project, Sort, Limit, Distinct, Offset) are ordinary DataFusion physical +operators. + +## The planner: reuse + override + +`LanceNativePlanner` holds a `DataFusionPlanner` and **overrides only `Expand` +lowering**; every other `LogicalOperator` delegates to the existing crate-internal +builders (`pub(crate)`), so scans/projects/filters/limits behave identically to +the DataFusion path. + +- **`Expand` →** `CsrExpandNode` (appends `b__{id_field}` — the neighbor + row id, which under the dense model equals the target's id value) wrapped by + `LanceTakeNode` (appends **all remaining target node columns**, qualified + `b__{col}`). 
+- **Materialize all target columns, not just referenced ones.** This is exactly + what the DataFusion target scan does (`build_qualified_target_scan` projects + every target field), so the native output schema matches the join path's column + set and every `b__col` is available to downstream Project/Filter/Sort. It also + removes any need to walk expressions collecting `b.*` references. Projection + pushdown (materializing only needed columns) is a later optimization. +- Source row-id column = `a__{id_field}` (dense-ROWID assumption; becomes + `a___rowid` in Phase 4). +- The neighbor column emitted by `CsrExpandExec` is named `b__{id_field}` + (e.g. `b__id`), so `RETURN b.id` is served directly and `LanceTakeExec` reuses + that column as the row-id input. `LanceTakeExec` therefore materializes the + target columns *other than* the id field. +- Output column naming stays `{var}__{col}` to match the DataFusion path exactly. + +### Native-vs-fallback rule + +A query is served natively **iff** its plan contains exactly one single-hop +`Expand` with a single relationship type and direction Outgoing or Incoming. +**Otherwise the entire query delegates to `DataFusionPlanner`.** Falls back for: +`VariableLengthExpand`, more than one `Expand` (multi-hop), `Expand` with more +than one relationship type, `Undirected` direction, `Join`, `Unwind`. + +Consequence: `ExecutionStrategy::LanceNative` is always correct on valid Cypher — +it never errors, it just uses joins when CSR cannot serve the shape. + +## CSR construction & RowMaterializer + +- `ExtensionPlanner::plan_extension` (async) looks up the edge table and target + node table from `SessionState`, collects them, and builds: + - `Arc<CsrIndex>` via the Phase 1 builder. **Add + `CsrIndexBuilder::add_edges_from_batch_with_columns(batch, src_col, dst_col)`** + so it uses the real `RelationshipMapping` field names + (`source_id_field`/`target_id_field`) instead of the hardcoded + `src_id`/`dst_id`. 
Outgoing builds `(src→dst)`; Incoming reverses to `(dst→src)`. + - `InMemoryMaterializer { batch }` over the collected target node table. +- `RowMaterializer` trait: + ```rust + trait RowMaterializer: Send + Sync { + fn schema(&self) -> SchemaRef; + fn take(&self, row_ids: &UInt64Array, columns: &[String]) -> Result<RecordBatch>; + } + ``` + In-memory impl = `arrow::compute::take` by offset — O(1) random access, the + concrete reason CSR beats the hash join. The take node materializes **all** + target columns except the id field. Phase 4 adds a `LanceDatasetMaterializer` + backed by `LanceDataset::take`. +- CSR is built once per physical planning (rebuild per query is acceptable for + Phase 2; Phase 4 persists CSR as a Lance dataset). +- `num_vertices` is inferred from the edge data (Phase 1 default = max id + 1); + source ids beyond range yield empty neighbors (no error), matching Phase 1. + +## query.rs wiring + +- Build the native `SessionContext` via + `SessionStateBuilder::new().with_default_features().with_query_planner(Arc::new(CsrQueryPlanner)).build()`. + `CsrQueryPlanner` wraps `DefaultPhysicalPlanner` with the `ExtensionPlanner`. +- `create_logical_plans` gains a planner choice (DataFusion vs LanceNative). +- The **in-memory `execute(datasets, Some(LanceNative))` path is fully wired and + tested.** It builds the native context, plans with `LanceNativePlanner`, and + executes. +- The **namespace native path stays `UnsupportedFeature` for Phase 2** — it needs + the Lance-dataset materializer, which is Phase 4 (persistence). + +## Error handling + +Reuse `GraphError` with `snafu::Location`. Missing or wrong-typed edge columns at +CSR-build time and take failures surface as `PlanError` / `ExecutionError`. Source +row ids beyond CSR range produce empty neighbors rather than an error. 
+ +## Testing + +- **Operator unit tests** + - `CsrExpandExec`: outgoing, incoming, vertex with no neighbors, source id out + of range, input split across multiple batches, carry-through of source columns. + - `LanceTakeExec`: take correctness by offset, column subset selection, empty + input, row-id column dropped from output. + - `CsrIndexBuilder::add_edges_from_batch_with_columns`: custom column names, + reversed (incoming) build. +- **Planner tests** + - Supported shape lowers to a plan containing `CsrExpandExec` + `LanceTakeExec`. + - Each unsupported shape (var-length, two-hop, undirected, multi-type) falls + back to a join plan (`Join` present, no extension nodes). +- **End-to-end parity** (`execute(datasets, …)` LanceNative vs DataFusion return + identical results) + - `MATCH (a:Person)-[:KNOWS]->(b:Person) RETURN a.name, b.name` + - same with `WHERE b.age > 30` + - same with `LIMIT` + - incoming direction `(a)<-[:KNOWS]-(b)` + +## Out of scope (later phases) + +- Multi-hop / `VariableLengthExpand`, BFS/DFS/shortest-path operators (Phase 3). +- Persisting CSR as Lance datasets, incremental updates, Lance stable row ids, + `LanceDatasetMaterializer`, namespace native path (Phase 4). +- Hybrid CSR + vector search (Phase 5). From 87ee0d302ff833af3025470b7d13f7cdd06e82e7 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 14:38:59 -0700 Subject: [PATCH 02/10] docs: add Phase 2 CSR native expand implementation plan 7-task TDD plan implementing the approved design: generalize CSR builder, CsrExpandNode/Exec, LanceTakeNode/Exec + RowMaterializer, CsrExtensionPlanner/ CsrQueryPlanner, LanceNativePlanner lowering with fallback, and query.rs wiring of the LanceNative execution strategy with end-to-end parity tests. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../2026-06-22-csr-native-expand-operator.md | 2112 +++++++++++++++++ 1 file changed, 2112 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-22-csr-native-expand-operator.md diff --git a/docs/superpowers/plans/2026-06-22-csr-native-expand-operator.md b/docs/superpowers/plans/2026-06-22-csr-native-expand-operator.md new file mode 100644 index 00000000..0b95e068 --- /dev/null +++ b/docs/superpowers/plans/2026-06-22-csr-native-expand-operator.md @@ -0,0 +1,2112 @@ +# Phase 2: CSR-backed Native Single-Hop Expand — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Execute single-hop Cypher `Expand` natively via the Phase 1 `CsrIndex` — a custom DataFusion `ExecutionPlan` does neighbor lookups (`CsrExpandExec`) and a `take()` materializes target columns (`LanceTakeExec`), instead of relationship-scan + two joins. + +**Architecture:** `LanceNativePlanner` wraps `DataFusionPlanner` and overrides only `Expand` lowering, emitting two logical extension nodes (`CsrExpandNode`, `LanceTakeNode`) over the existing source scan; anything it can't serve delegates to the join path. A `CsrQueryPlanner` registered on the `SessionContext` turns those nodes into physical operators, building the CSR and an in-memory row materializer at physical-planning time. Dense-ROWID id model: CSR vertex id == node row id; target props via `take()`. + +**Tech Stack:** Rust, DataFusion 50.3 (`UserDefinedLogicalNodeCore`, `ExtensionPlanner`, `QueryPlanner`), Arrow 56.2 (`arrow::compute::{take, cast}`), async-trait 0.1, tokio (tests). 
+ +**Spec:** `docs/superpowers/specs/2026-06-22-csr-native-expand-operator-design.md` + +--- + +## File Structure + +| File | Action | Responsibility | +|---|---|---| +| `crates/lance-graph/src/csr_index.rs` | Modify | Add `add_edges_from_batch_with_columns` | +| `crates/lance-graph/src/datafusion_planner/mod.rs` | Modify | `mod expression;` → `pub(crate) mod expression;` | +| `crates/lance-graph/src/lance_native_planner.rs` | Delete | Replaced by directory module | +| `crates/lance-graph/src/lance_native_planner/mod.rs` | Create | `LanceNativePlanner`: native-vs-fallback + lowering; re-exports | +| `crates/lance-graph/src/lance_native_planner/direction.rs` | Create | `NativeDirection` enum | +| `crates/lance-graph/src/lance_native_planner/take.rs` | Create | `RowMaterializer`, `InMemoryMaterializer`, `take_batch`, `LanceTakeNode`, `LanceTakeExec` | +| `crates/lance-graph/src/lance_native_planner/csr_expand.rs` | Create | `expand_batch`, `CsrExpandNode`, `CsrExpandExec` | +| `crates/lance-graph/src/lance_native_planner/extension_planner.rs` | Create | `CsrExtensionPlanner`, `CsrQueryPlanner` | +| `crates/lance-graph/src/query.rs` | Modify | Wire `ExecutionStrategy::LanceNative` (in-memory datasets path) | +| `crates/lance-graph/tests/test_lance_native_expand.rs` | Create | End-to-end parity tests | + +Run all crate tests with: `cargo test -p lance-graph` +Run a single test with: `cargo test -p lance-graph <test_name> -- --exact` (or `cargo test -p lance-graph --test test_lance_native_expand <test_name>` for integration tests). + +--- + +## Task 1: Generalize CSR builder with custom edge column names + +**Files:** +- Modify: `crates/lance-graph/src/csr_index.rs:240-274` (the `add_edges_from_batch` method) and its test module. 
+ +- [ ] **Step 1: Write the failing test** + +Add to the `tests` module in `crates/lance-graph/src/csr_index.rs` (before the closing `}`): + +```rust + #[test] + fn test_build_from_record_batch_custom_columns() { + let schema = Arc::new(Schema::new(vec![ + Field::new("src_person_id", DataType::UInt64, false), + Field::new("dst_person_id", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0, 0, 1])), + Arc::new(UInt64Array::from(vec![1, 2, 2])), + ], + ) + .unwrap(); + + // Forward (outgoing): src -> dst + let idx = CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch, "src_person_id", "dst_person_id") + .unwrap() + .build(); + assert_eq!(idx.neighbors(0), &[1, 2]); + assert_eq!(idx.neighbors(1), &[2]); + + // Reversed (incoming): swap the column args -> dst -> src + let rev = CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch, "dst_person_id", "src_person_id") + .unwrap() + .build(); + assert_eq!(rev.neighbors(2), &[0, 1]); + assert_eq!(rev.neighbors(1), &[0]); + } +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cargo test -p lance-graph test_build_from_record_batch_custom_columns` +Expected: FAIL — `no method named add_edges_from_batch_with_columns`. + +- [ ] **Step 3: Implement** + +In `crates/lance-graph/src/csr_index.rs`, replace the existing `add_edges_from_batch` method (lines ~239-274) with a thin wrapper plus the generalized method: + +```rust + /// Add edges from an Arrow RecordBatch with `src_id` and `dst_id` columns. + pub fn add_edges_from_batch(self, batch: &RecordBatch) -> Result { + self.add_edges_from_batch_with_columns(batch, "src_id", "dst_id") + } + + /// Add edges from an Arrow RecordBatch, reading source vertex ids from + /// `src_col` and destination vertex ids from `dst_col`. + /// + /// Both columns must be `UInt64`. To build a reversed (incoming/CSC) index, + /// pass the destination column name as `src_col` and vice versa. 
+ pub fn add_edges_from_batch_with_columns( + mut self, + batch: &RecordBatch, + src_col: &str, + dst_col: &str, + ) -> Result { + let src_array = batch + .column_by_name(src_col) + .ok_or_else(|| GraphError::PlanError { + message: format!("Edge batch missing '{}' column", src_col), + location: snafu::Location::new(file!(), line!(), column!()), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| GraphError::PlanError { + message: format!("'{}' column must be UInt64", src_col), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let dst_array = batch + .column_by_name(dst_col) + .ok_or_else(|| GraphError::PlanError { + message: format!("Edge batch missing '{}' column", dst_col), + location: snafu::Location::new(file!(), line!(), column!()), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| GraphError::PlanError { + message: format!("'{}' column must be UInt64", dst_col), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + for i in 0..batch.num_rows() { + self.edges.push((src_array.value(i), dst_array.value(i))); + } + + Ok(self) + } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cargo test -p lance-graph csr_index` +Expected: PASS (including the existing `test_build_from_record_batch`, which now routes through the wrapper). + +- [ ] **Step 5: Commit** + +```bash +git add crates/lance-graph/src/csr_index.rs +git commit -m "feat(csr): add_edges_from_batch_with_columns for custom edge column names" +``` + +--- + +## Task 2: Module skeleton + `NativeDirection` + +Converts the placeholder file into a directory module so later tasks have a home. Keeps the existing `LanceNativePlanner` placeholder behavior compiling (it is rewritten in Task 6). 
+ +**Files:** +- Delete: `crates/lance-graph/src/lance_native_planner.rs` +- Create: `crates/lance-graph/src/lance_native_planner/mod.rs` +- Create: `crates/lance-graph/src/lance_native_planner/direction.rs` + +- [ ] **Step 1: Create the direction enum** + +Create `crates/lance-graph/src/lance_native_planner/direction.rs`: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Traversal direction for the native CSR expand operators. + +/// Direction a single-hop expand traverses. `Undirected` is intentionally +/// absent — undirected expands fall back to the DataFusion join planner. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NativeDirection { + /// Follow edges source -> destination (CSR). + Outgoing, + /// Follow edges destination -> source (CSC / reversed). + Incoming, +} +``` + +- [ ] **Step 2: Move the placeholder into the new mod.rs** + +Delete `crates/lance-graph/src/lance_native_planner.rs` and create +`crates/lance-graph/src/lance_native_planner/mod.rs` with the **exact previous +contents** of the placeholder file, with one added line at the top of the module +body to declare the submodule. The full file: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Lance Native physical planner (placeholder) +//! +//! Rewritten in Task 6 to lower single-hop `Expand` onto CSR-backed +//! extension nodes. For now it keeps the original placeholder behavior so the +//! crate compiles between tasks. 
+ +mod direction; + +pub use direction::NativeDirection; + +use crate::config::GraphConfig; +use crate::datafusion_planner::GraphPhysicalPlanner; +use crate::error::Result; +use crate::logical_plan::LogicalOperator; +use datafusion::common::DFSchema; +use datafusion::logical_expr::{EmptyRelation, LogicalPlan}; +use std::sync::Arc; + +/// Placeholder Lance-native planner +pub struct LanceNativePlanner { + #[allow(dead_code)] + config: GraphConfig, +} + +impl LanceNativePlanner { + pub fn new(config: GraphConfig) -> Self { + Self { config } + } +} + +impl GraphPhysicalPlanner for LanceNativePlanner { + fn plan(&self, _logical_plan: &LogicalOperator) -> Result { + let schema = Arc::new(DFSchema::empty()); + Ok(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema, + })) + } +} +``` + +(The placeholder's `#[cfg(test)] mod tests { ... }` is dropped; Task 6 adds real tests.) + +- [ ] **Step 3: Build to verify the module restructure compiles** + +Run: `cargo build -p lance-graph` +Expected: builds (a warning about unused `NativeDirection` re-export is acceptable; it is consumed in Task 3). + +- [ ] **Step 4: Commit** + +```bash +git add -A crates/lance-graph/src/lance_native_planner.rs crates/lance-graph/src/lance_native_planner/ +git commit -m "refactor(native): promote lance_native_planner to module dir; add NativeDirection" +``` + +--- + +## Task 3: `expand_batch` + `CsrExpandNode` + `CsrExpandExec` + +**Files:** +- Create: `crates/lance-graph/src/lance_native_planner/csr_expand.rs` +- Modify: `crates/lance-graph/src/lance_native_planner/mod.rs` (add `mod csr_expand;`) + +- [ ] **Step 1: Write the failing test for the pure expansion function** + +Create `crates/lance-graph/src/lance_native_planner/csr_expand.rs` with the test module first (the rest is added in Step 3): + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! 
Native single-hop expand: logical node + physical operator + core function. +//! +//! `CsrExpandExec` does topology only — for each input row it looks up the +//! source vertex's neighbors in the CSR index and emits one output row per +//! neighbor, carrying through all input columns and appending the neighbor row +//! id as a new column. Target property materialization is handled separately by +//! `LanceTakeExec`. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::compute::{cast, take}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt32Array, UInt64Array}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::common::{DFSchemaRef, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, +}; +use futures::StreamExt; + +use crate::csr_index::CsrIndex; +use crate::error::{GraphError, Result}; +use super::direction::NativeDirection; + +/// Expand one input batch: for every input row, append one output row per +/// neighbor of that row's source vertex. +/// +/// `source_id_idx` is the column index of the source vertex id within `input`. +/// `neighbor_field` is the appended column (its data type is the target id +/// field's type; neighbor ids are cast into it). `out_schema` must equal +/// `input.schema()` fields followed by `neighbor_field`. 
+pub(crate) fn expand_batch( + input: &RecordBatch, + source_id_idx: usize, + csr: &CsrIndex, + neighbor_field: &Field, + out_schema: &SchemaRef, +) -> Result { + let map_err = |e: arrow_schema::ArrowError, what: &str| GraphError::ExecutionError { + message: format!("CsrExpand {}: {}", what, e), + location: snafu::Location::new(file!(), line!(), column!()), + }; + + // Source ids may be any integer type; normalize to u64. + let src_u64 = cast(input.column(source_id_idx), &DataType::UInt64) + .map_err(|e| map_err(e, "cast source id to u64"))?; + let src = src_u64 + .as_any() + .downcast_ref::() + .expect("cast to UInt64 yields UInt64Array"); + + let mut parent_idx: Vec = Vec::new(); + let mut neighbors: Vec = Vec::new(); + for row in 0..input.num_rows() { + if src.is_null(row) { + continue; + } + for &n in csr.neighbors(src.value(row)) { + parent_idx.push(row as u32); + neighbors.push(n); + } + } + + let take_idx = UInt32Array::from(parent_idx); + let mut cols: Vec = Vec::with_capacity(input.num_columns() + 1); + for c in input.columns() { + cols.push(take(c, &take_idx, None).map_err(|e| map_err(e, "take carried column"))?); + } + let neigh_u64 = Arc::new(UInt64Array::from(neighbors)) as ArrayRef; + let neigh_col = cast(&neigh_u64, neighbor_field.data_type()) + .map_err(|e| map_err(e, "cast neighbor id"))?; + cols.push(neigh_col); + + RecordBatch::try_new(out_schema.clone(), cols).map_err(|e| GraphError::ExecutionError { + message: format!("CsrExpand build output batch: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::csr_index::CsrIndexBuilder; + + fn input_batch() -> RecordBatch { + // a__id = [0,1,2,3], a__name = ["n0","n1","n2","n3"] + let schema = Arc::new(Schema::new(vec![ + Field::new("a__id", DataType::UInt64, false), + Field::new("a__name", DataType::Utf8, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1, 2, 3])), + 
Arc::new(arrow_array::StringArray::from(vec!["n0", "n1", "n2", "n3"])), + ], + ) + .unwrap() + } + + fn out_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("a__id", DataType::UInt64, false), + Field::new("a__name", DataType::Utf8, false), + Field::new("b__id", DataType::UInt64, true), + ])) + } + + #[test] + fn test_expand_batch_outgoing() { + // 0->1, 0->2, 1->2, 3-> (none) + let csr = CsrIndexBuilder::new() + .with_num_vertices(4) + .add_edge(0, 1) + .add_edge(0, 2) + .add_edge(1, 2) + .build(); + let neighbor_field = Field::new("b__id", DataType::UInt64, true); + let out = expand_batch(&input_batch(), 0, &csr, &neighbor_field, &out_schema()).unwrap(); + + assert_eq!(out.num_rows(), 3); + let a_id = out.column(0).as_any().downcast_ref::().unwrap(); + let b_id = out.column(2).as_any().downcast_ref::().unwrap(); + let a_name = out + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + // Rows: (0,n0,1) (0,n0,2) (1,n1,2) + assert_eq!(a_id.values(), &[0, 0, 1]); + assert_eq!(b_id.values(), &[1, 2, 2]); + assert_eq!(a_name.value(0), "n0"); + assert_eq!(a_name.value(2), "n1"); + } + + #[test] + fn test_expand_batch_no_neighbors_and_out_of_range() { + let csr = CsrIndexBuilder::new().with_num_vertices(2).build(); // no edges + let neighbor_field = Field::new("b__id", DataType::UInt64, true); + let out = expand_batch(&input_batch(), 0, &csr, &neighbor_field, &out_schema()).unwrap(); + assert_eq!(out.num_rows(), 0); + } + + #[test] + fn test_expand_batch_casts_source_id_from_int64() { + // Source id column is Int64 (not UInt64): must still work. 
+ let schema = Arc::new(Schema::new(vec![Field::new("a__id", DataType::Int64, false)])); + let input = RecordBatch::try_new( + schema, + vec![Arc::new(arrow_array::Int64Array::from(vec![0i64, 1]))], + ) + .unwrap(); + let out_schema = Arc::new(Schema::new(vec![ + Field::new("a__id", DataType::Int64, false), + Field::new("b__id", DataType::UInt64, true), + ])); + let csr = CsrIndexBuilder::new() + .with_num_vertices(2) + .add_edge(0, 1) + .build(); + let neighbor_field = Field::new("b__id", DataType::UInt64, true); + let out = expand_batch(&input, 0, &csr, &neighbor_field, &out_schema).unwrap(); + assert_eq!(out.num_rows(), 1); + let b_id = out.column(1).as_any().downcast_ref::().unwrap(); + assert_eq!(b_id.values(), &[1]); + } +} +``` + +Add `mod csr_expand;` to `crates/lance-graph/src/lance_native_planner/mod.rs` (after `mod direction;`). + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cargo test -p lance-graph expand_batch` +Expected: FAIL to compile if `expand_batch` body absent — but here it is present, so this step verifies the tests pass for the pure function. If they pass, proceed; the node/exec below add no new tests. + +Run: `cargo test -p lance-graph expand_batch` +Expected: PASS (3 tests). + +- [ ] **Step 3: Add the logical node and physical operator** + +Append to `crates/lance-graph/src/lance_native_planner/csr_expand.rs` (after `expand_batch`, before `#[cfg(test)]`): + +```rust +/// Logical extension node for a single-hop CSR expand. +/// +/// Holds only hashable metadata; the physical operator (and its `CsrIndex`) is +/// constructed by `CsrExtensionPlanner` at physical-planning time. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct CsrExpandNode { + /// Source subplan (a node scan, optionally with a source-only filter). + pub input: LogicalPlan, + /// Relationship type (lowercased table name to look up the edge table). + pub rel_type: String, + /// Edge table column holding source vertex ids. 
+ pub src_field: String, + /// Edge table column holding destination vertex ids. + pub dst_field: String, + /// Traversal direction. + pub direction: NativeDirection, + /// Qualified column in `input` carrying the source vertex id (e.g. `a__id`). + pub source_id_column: String, + /// Qualified output column for the neighbor row id (e.g. `b__id`). + pub neighbor_column: String, + /// Arrow data type of the neighbor column (target id field's type). + pub neighbor_data_type: DataType, + /// Output schema = input schema + neighbor column. + pub schema: DFSchemaRef, +} + +impl PartialOrd for CsrExpandNode { + fn partial_cmp(&self, other: &Self) -> Option { + // Order by stable, comparable fields only. + ( + &self.rel_type, + &self.src_field, + &self.dst_field, + &self.source_id_column, + &self.neighbor_column, + ) + .partial_cmp(&( + &other.rel_type, + &other.src_field, + &other.dst_field, + &other.source_id_column, + &other.neighbor_column, + )) + } +} + +impl UserDefinedLogicalNodeCore for CsrExpandNode { + fn name(&self) -> &str { + "CsrExpand" + } + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + fn expressions(&self) -> Vec { + vec![] + } + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "CsrExpand: rel={}, dir={:?}, src={}, neighbor={}", + self.rel_type, self.direction, self.source_id_column, self.neighbor_column + ) + } + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> DFResult { + Ok(Self { + input: inputs.remove(0), + ..self.clone() + }) + } +} + +/// Physical operator for `CsrExpandNode`. 
+#[derive(Debug)] +pub struct CsrExpandExec { + input: Arc, + csr: Arc, + source_id_idx: usize, + neighbor_field: Field, + out_schema: SchemaRef, + props: PlanProperties, +} + +impl CsrExpandExec { + pub fn new( + input: Arc, + csr: Arc, + source_id_idx: usize, + neighbor_field: Field, + out_schema: SchemaRef, + ) -> Self { + let props = PlanProperties::new( + EquivalenceProperties::new(out_schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + Self { + input, + csr, + source_id_idx, + neighbor_field, + out_schema, + props, + } + } +} + +impl DisplayAs for CsrExpandExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CsrExpandExec: neighbor={}", self.neighbor_field.name()) + } +} + +impl ExecutionPlan for CsrExpandExec { + fn name(&self) -> &str { + "CsrExpandExec" + } + fn as_any(&self) -> &dyn Any { + self + } + fn properties(&self) -> &PlanProperties { + &self.props + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + Ok(Arc::new(CsrExpandExec::new( + children[0].clone(), + self.csr.clone(), + self.source_id_idx, + self.neighbor_field.clone(), + self.out_schema.clone(), + ))) + } + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input = self.input.execute(partition, context)?; + let csr = self.csr.clone(); + let idx = self.source_id_idx; + let field = self.neighbor_field.clone(); + let out_schema = self.out_schema.clone(); + let out_schema_for_stream = out_schema.clone(); + let stream = input.map(move |rb| { + let rb = rb?; + expand_batch(&rb, idx, &csr, &field, &out_schema) + .map_err(|e| datafusion::error::DataFusionError::Execution(e.to_string())) + }); + Ok(Box::pin(RecordBatchStreamAdapter::new( + out_schema_for_stream, + stream, + ))) + } +} +``` + +- [ ] **Step 4: Build and run tests** + +Run: `cargo test -p lance-graph 
csr_expand` +Expected: PASS (3 tests; node/exec compile). + +- [ ] **Step 5: Commit** + +```bash +git add crates/lance-graph/src/lance_native_planner/ +git commit -m "feat(native): CsrExpandNode/Exec and expand_batch core" +``` + +--- + +## Task 4: `RowMaterializer` + `take_batch` + `LanceTakeNode` + `LanceTakeExec` + +**Files:** +- Create: `crates/lance-graph/src/lance_native_planner/take.rs` +- Modify: `crates/lance-graph/src/lance_native_planner/mod.rs` (add `mod take;`) + +- [ ] **Step 1: Write the failing test for the materializer + take function** + +Create `crates/lance-graph/src/lance_native_planner/take.rs`: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Native materialization: take target node columns by row id. +//! +//! `CsrExpandExec` produces target *row ids*; `LanceTakeExec` turns those into +//! target *properties* via a `RowMaterializer`. Under the dense-ROWID model the +//! in-memory materializer is a direct `arrow::compute::take` by offset — the +//! concrete reason CSR beats a hash join. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::compute::{cast, take}; +use arrow_array::{ArrayRef, RecordBatch, UInt64Array}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::common::{DFSchemaRef, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, +}; +use futures::StreamExt; + +use crate::error::{GraphError, Result}; + +/// Materializes rows of a target node table by row id. 
+pub trait RowMaterializer: Send + Sync + fmt::Debug { + /// Take `columns` (raw, unqualified names) for the given `row_ids`. + /// The returned batch has one row per element of `row_ids`, columns in the + /// requested order, named by their raw names. + fn take(&self, row_ids: &UInt64Array, columns: &[String]) -> Result; +} + +/// In-memory materializer over a fully-collected target node batch. Row id == +/// offset into the batch (dense-ROWID model). +#[derive(Debug)] +pub struct InMemoryMaterializer { + batch: RecordBatch, +} + +impl InMemoryMaterializer { + pub fn new(batch: RecordBatch) -> Self { + Self { batch } + } +} + +impl RowMaterializer for InMemoryMaterializer { + fn take(&self, row_ids: &UInt64Array, columns: &[String]) -> Result { + let mut fields: Vec = Vec::with_capacity(columns.len()); + let mut arrays: Vec = Vec::with_capacity(columns.len()); + for name in columns { + let col = self + .batch + .column_by_name(name) + .ok_or_else(|| GraphError::ExecutionError { + message: format!("take: target column '{}' not found", name), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let taken = take(col, row_ids, None).map_err(|e| GraphError::ExecutionError { + message: format!("take: failed on column '{}': {}", name, e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + fields.push(Field::new(name, col.data_type().clone(), true)); + arrays.push(taken); + } + RecordBatch::try_new(Arc::new(Schema::new(fields)), arrays).map_err(|e| { + GraphError::ExecutionError { + message: format!("take: build batch: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + } + }) + } +} + +/// Append materialized target columns to one input batch. +/// +/// `row_id_idx` is the index of the row-id column in `input`. `take_cols` are +/// the raw target column names to materialize, in the same order as the +/// appended fields of `out_schema`. 
`out_schema` = `input.schema()` followed by +/// the qualified materialized columns. +pub(crate) fn take_batch( + input: &RecordBatch, + row_id_idx: usize, + materializer: &dyn RowMaterializer, + take_cols: &[String], + out_schema: &SchemaRef, +) -> Result { + let ids_u64 = cast(input.column(row_id_idx), &DataType::UInt64).map_err(|e| { + GraphError::ExecutionError { + message: format!("take: cast row id to u64: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?; + let ids = ids_u64 + .as_any() + .downcast_ref::() + .expect("cast to UInt64 yields UInt64Array"); + + let materialized = materializer.take(ids, take_cols)?; + + let mut cols: Vec = input.columns().to_vec(); + cols.extend(materialized.columns().iter().cloned()); + + RecordBatch::try_new(out_schema.clone(), cols).map_err(|e| GraphError::ExecutionError { + message: format!("take: build output batch: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::StringArray; + + fn target_batch() -> RecordBatch { + // person table: id, name, age (raw, lowercased column names) + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt64, false), + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1, 2])), + Arc::new(StringArray::from(vec!["alice", "bob", "carol"])), + Arc::new(arrow_array::Int64Array::from(vec![30i64, 40, 50])), + ], + ) + .unwrap() + } + + #[test] + fn test_in_memory_materializer_take_subset() { + let m = InMemoryMaterializer::new(target_batch()); + let ids = UInt64Array::from(vec![2u64, 0]); + let out = m.take(&ids, &["name".to_string()]).unwrap(); + assert_eq!(out.num_columns(), 1); + let names = out.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(names.value(0), "carol"); + assert_eq!(names.value(1), "alice"); + } + + #[test] 
+ fn test_in_memory_materializer_missing_column_errors() { + let m = InMemoryMaterializer::new(target_batch()); + let ids = UInt64Array::from(vec![0u64]); + assert!(m.take(&ids, &["nonexistent".to_string()]).is_err()); + } + + #[test] + fn test_take_batch_appends_qualified_columns() { + // input: a__name, b__id (b__id is the neighbor row id) + let in_schema = Arc::new(Schema::new(vec![ + Field::new("a__name", DataType::Utf8, false), + Field::new("b__id", DataType::UInt64, true), + ])); + let input = RecordBatch::try_new( + in_schema, + vec![ + Arc::new(StringArray::from(vec!["x", "y"])), + Arc::new(UInt64Array::from(vec![1u64, 2])), + ], + ) + .unwrap(); + // out: a__name, b__id, b__name, b__age + let out_schema = Arc::new(Schema::new(vec![ + Field::new("a__name", DataType::Utf8, false), + Field::new("b__id", DataType::UInt64, true), + Field::new("b__name", DataType::Utf8, true), + Field::new("b__age", DataType::Int64, true), + ])); + let m = InMemoryMaterializer::new(target_batch()); + let out = take_batch( + &input, + 1, + &m, + &["name".to_string(), "age".to_string()], + &out_schema, + ) + .unwrap(); + assert_eq!(out.num_rows(), 2); + let b_name = out.column(2).as_any().downcast_ref::().unwrap(); + let b_age = out + .column(3) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_name.value(0), "bob"); // row id 1 + assert_eq!(b_name.value(1), "carol"); // row id 2 + assert_eq!(b_age.values(), &[40, 50]); + } +} +``` + +Add `mod take;` to `crates/lance-graph/src/lance_native_planner/mod.rs`. + +- [ ] **Step 2: Run tests to verify they pass** + +Run: `cargo test -p lance-graph -- take::tests` +Expected: PASS (3 tests). + +- [ ] **Step 3: Add the logical node and physical operator** + +Append to `crates/lance-graph/src/lance_native_planner/take.rs` (after `take_batch`, before `#[cfg(test)]`): + +```rust +/// Logical extension node for materializing target columns via take(). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct LanceTakeNode { + /// Input subplan (a `CsrExpandNode`). + pub input: LogicalPlan, + /// Lowercased target node table name (to collect rows from). + pub target_table: String, + /// Qualified column in `input` holding the row ids (e.g. `b__id`). + pub row_id_column: String, + /// Raw (unqualified, lowercased) target columns to materialize, in output order. + pub take_cols: Vec<String>, + /// Output schema = input schema + qualified materialized columns. + pub schema: DFSchemaRef, +} + +impl PartialOrd for LanceTakeNode { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + // Order by the table name, row-id column and take list only; `input` + // and `schema` do not take part in the comparison. + (&self.target_table, &self.row_id_column, &self.take_cols).partial_cmp(&( + &other.target_table, + &other.row_id_column, + &other.take_cols, + )) + } +} + +impl UserDefinedLogicalNodeCore for LanceTakeNode { + fn name(&self) -> &str { + "LanceTake" + } + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + fn expressions(&self) -> Vec<Expr> { + // The node carries no expressions of its own. + vec![] + } + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "LanceTake: table={}, row_id={}, cols={:?}", + self.target_table, self.row_id_column, self.take_cols + ) + } + fn with_exprs_and_inputs( + &self, + _exprs: Vec<Expr>, + mut inputs: Vec<LogicalPlan>, + ) -> DFResult<Self> { + // Swap in the first new input and keep every other field as-is. + Ok(Self { + input: inputs.remove(0), + ..self.clone() + }) + } +} + +/// Physical operator for `LanceTakeNode`. 
+#[derive(Debug)] +pub struct LanceTakeExec { + input: Arc, + materializer: Arc, + row_id_idx: usize, + take_cols: Vec, + out_schema: SchemaRef, + props: PlanProperties, +} + +impl LanceTakeExec { + pub fn new( + input: Arc, + materializer: Arc, + row_id_idx: usize, + take_cols: Vec, + out_schema: SchemaRef, + ) -> Self { + let props = PlanProperties::new( + EquivalenceProperties::new(out_schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + Self { + input, + materializer, + row_id_idx, + take_cols, + out_schema, + props, + } + } +} + +impl DisplayAs for LanceTakeExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "LanceTakeExec: cols={:?}", self.take_cols) + } +} + +impl ExecutionPlan for LanceTakeExec { + fn name(&self) -> &str { + "LanceTakeExec" + } + fn as_any(&self) -> &dyn Any { + self + } + fn properties(&self) -> &PlanProperties { + &self.props + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + Ok(Arc::new(LanceTakeExec::new( + children[0].clone(), + self.materializer.clone(), + self.row_id_idx, + self.take_cols.clone(), + self.out_schema.clone(), + ))) + } + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input = self.input.execute(partition, context)?; + let materializer = self.materializer.clone(); + let row_id_idx = self.row_id_idx; + let take_cols = self.take_cols.clone(); + let out_schema = self.out_schema.clone(); + let out_schema_for_stream = out_schema.clone(); + let stream = input.map(move |rb| { + let rb = rb?; + take_batch(&rb, row_id_idx, materializer.as_ref(), &take_cols, &out_schema) + .map_err(|e| datafusion::error::DataFusionError::Execution(e.to_string())) + }); + Ok(Box::pin(RecordBatchStreamAdapter::new( + out_schema_for_stream, + stream, + ))) + } +} +``` + +- [ ] **Step 4: Build and run tests** + +Run: 
`cargo test -p lance-graph -- take::tests` +Expected: PASS (3 tests; node/exec compile). + +- [ ] **Step 5: Commit** + +```bash +git add crates/lance-graph/src/lance_native_planner/ +git commit -m "feat(native): RowMaterializer, LanceTakeNode/Exec, take_batch core" +``` + +--- + +## Task 5: `CsrExtensionPlanner` + `CsrQueryPlanner` + +Turns the two logical nodes into physical operators, building the `CsrIndex` and `InMemoryMaterializer` from tables registered on the session. + +**Files:** +- Create: `crates/lance-graph/src/lance_native_planner/extension_planner.rs` +- Modify: `crates/lance-graph/src/lance_native_planner/mod.rs` (add `mod extension_planner; pub use extension_planner::CsrQueryPlanner;`) + +- [ ] **Step 1: Create the extension planner and query planner** + +Create `crates/lance-graph/src/lance_native_planner/extension_planner.rs`: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Physical planning for the native CSR extension nodes. +//! +//! `CsrQueryPlanner` is registered on the execution `SessionContext`. It runs +//! the `DefaultPhysicalPlanner` with `CsrExtensionPlanner`, which builds the +//! `CsrIndex` (from the edge table) and the `InMemoryMaterializer` (from the +//! target node table) at physical-planning time. 
+ +use std::sync::Arc; + +use arrow::compute::concat_batches; +use arrow_schema::Field; +use async_trait::async_trait; +use datafusion::common::Result as DFResult; +use datafusion::error::DataFusionError; +use datafusion::execution::context::{QueryPlanner, SessionContext, SessionState}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNode}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{ + DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner, +}; + +use super::csr_expand::{CsrExpandExec, CsrExpandNode}; +use super::direction::NativeDirection; +use super::take::{CsrExtensionMaterializer, InMemoryMaterializer, LanceTakeExec, LanceTakeNode}; +use crate::csr_index::CsrIndexBuilder; + +/// Collect a registered table to a single `RecordBatch`. +async fn collect_table( + session_state: &SessionState, + table: &str, +) -> DFResult { + let ctx = SessionContext::new_with_state(session_state.clone()); + let df = ctx.table(table).await?; + let schema = df.schema().inner().clone(); + let batches = df.collect().await?; + concat_batches(&schema, &batches).map_err(|e| DataFusionError::Execution(e.to_string())) +} + +/// Extension planner that lowers `CsrExpandNode` and `LanceTakeNode`. +#[derive(Debug)] +pub struct CsrExtensionPlanner; + +#[async_trait] +impl ExtensionPlanner for CsrExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + session_state: &SessionState, + ) -> DFResult>> { + if let Some(expand) = node.as_any().downcast_ref::() { + let input = physical_inputs[0].clone(); + + // Build CSR from the edge table (reverse columns for Incoming). 
+ let edges = collect_table(session_state, &expand.rel_type).await?; + let (src_col, dst_col) = match expand.direction { + NativeDirection::Outgoing => (&expand.src_field, &expand.dst_field), + NativeDirection::Incoming => (&expand.dst_field, &expand.src_field), + }; + let csr = CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&edges, src_col, dst_col) + .map_err(|e| DataFusionError::Execution(e.to_string()))? + .build(); + + let in_schema = input.schema(); + let source_id_idx = in_schema + .index_of(&expand.source_id_column) + .map_err(|e| DataFusionError::Execution(format!( + "CsrExpand: source id column '{}' not found in input: {}", + expand.source_id_column, e + )))?; + let neighbor_field = Field::new( + &expand.neighbor_column, + expand.neighbor_data_type.clone(), + true, + ); + let out_schema = expand.schema.inner().clone(); + + return Ok(Some(Arc::new(CsrExpandExec::new( + input, + Arc::new(csr), + source_id_idx, + neighbor_field, + out_schema, + )))); + } + + if let Some(take) = node.as_any().downcast_ref::() { + let input = physical_inputs[0].clone(); + + let target = collect_table(session_state, &take.target_table).await?; + let materializer = Arc::new(InMemoryMaterializer::new(target)); + + let in_schema = input.schema(); + let row_id_idx = in_schema + .index_of(&take.row_id_column) + .map_err(|e| DataFusionError::Execution(format!( + "LanceTake: row id column '{}' not found in input: {}", + take.row_id_column, e + )))?; + let out_schema = take.schema.inner().clone(); + + return Ok(Some(Arc::new(LanceTakeExec::new( + input, + materializer as Arc, + row_id_idx, + take.take_cols.clone(), + out_schema, + )))); + } + + Ok(None) + } +} + +/// Query planner that installs `CsrExtensionPlanner`. 
+#[derive(Debug, Default)] +pub struct CsrQueryPlanner; + +impl CsrQueryPlanner { + pub fn new() -> Self { + Self + } +} + +#[async_trait] +impl QueryPlanner for CsrQueryPlanner { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> DFResult<Arc<dyn ExecutionPlan>> { + let planner = + DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(CsrExtensionPlanner)]); + planner + .create_physical_plan(logical_plan, session_state) + .await + } +} + +// Silence unused import in some build configs. +#[allow(unused_imports)] +use TaskContext as _TaskContext; +``` + +> Note: `CsrExtensionMaterializer` referenced above is a type alias for +> `dyn RowMaterializer` used to keep the trait-object cast explicit. Add to +> `take.rs` (top level, after the `RowMaterializer` trait): +> ```rust +> /// Convenience alias for the boxed materializer trait object. +> pub type CsrExtensionMaterializer = dyn RowMaterializer; +> ``` +> and change `LanceTakeExec::new` / field type from `Arc<dyn RowMaterializer>` to +> accept `Arc<CsrExtensionMaterializer>` (the alias is `dyn RowMaterializer`, so +> `Arc<CsrExtensionMaterializer>` == `Arc<dyn RowMaterializer>`; no signature +> change needed — pass `materializer` directly without the `as` cast if simpler). + +- [ ] **Step 2: Update mod.rs** + +Add to `crates/lance-graph/src/lance_native_planner/mod.rs`: + +```rust +mod extension_planner; + +pub use extension_planner::CsrQueryPlanner; +``` + +- [ ] **Step 3: Build** + +Run: `cargo build -p lance-graph` +Expected: builds. If the `CsrExtensionMaterializer` alias causes friction, delete the alias and the `as Arc<CsrExtensionMaterializer>` cast and pass `materializer` directly (it is already `Arc<InMemoryMaterializer>`; coercion to `Arc<dyn RowMaterializer>` is automatic at the call site because `LanceTakeExec::new` takes `Arc<dyn RowMaterializer>`).
+ +- [ ] **Step 4: Commit** + +```bash +git add crates/lance-graph/src/lance_native_planner/ +git commit -m "feat(native): CsrExtensionPlanner + CsrQueryPlanner physical planning" +``` + +--- + +## Task 6: `LanceNativePlanner` lowering + fallback + +Rewrites `mod.rs` to lower a supported single-hop `Expand` onto the extension nodes and delegate everything else to `DataFusionPlanner`. + +**Files:** +- Modify: `crates/lance-graph/src/lance_native_planner/mod.rs` +- Modify: `crates/lance-graph/src/datafusion_planner/mod.rs:20` (`mod expression;` → `pub(crate) mod expression;`) + +- [ ] **Step 1: Expose the expression helpers** + +In `crates/lance-graph/src/datafusion_planner/mod.rs`, change: + +```rust +mod expression; +``` +to: +```rust +pub(crate) mod expression; +``` + +- [ ] **Step 2: Write the failing planner tests** + +Replace the body of `crates/lance-graph/src/lance_native_planner/mod.rs` (keep the +`mod`/`pub use` declarations from Tasks 2–5) so the file is: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Lance Native physical planner. +//! +//! Lowers a supported single-hop `Expand` onto CSR-backed extension nodes +//! (`CsrExpandNode` + `LanceTakeNode`). Any plan it cannot serve natively is +//! delegated wholesale to `DataFusionPlanner`, so `LanceNative` execution is +//! always correct on valid Cypher — it simply uses joins when CSR cannot help. 
+ +mod csr_expand; +mod direction; +mod extension_planner; +mod take; + +pub use direction::NativeDirection; +pub use extension_planner::CsrQueryPlanner; + +use std::sync::Arc; + +use datafusion::common::DFSchema; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, LogicalPlanBuilder}; +use arrow_schema::{Field, Schema}; + +use crate::ast::RelationshipDirection; +use crate::case_insensitive::qualify_column; +use crate::config::GraphConfig; +use crate::datafusion_planner::expression::{ + to_df_boolean_expr, to_df_value_expr, +}; +use crate::datafusion_planner::{ + analysis, DataFusionPlanner, GraphPhysicalPlanner, PlanningContext, +}; +use crate::error::{GraphError, Result}; +use crate::logical_plan::{LogicalOperator, ProjectionItem, SortItem}; + +// `NativeDirection` is already in scope through the `pub use` above; importing +// it a second time here would be a duplicate-name error (E0252). +use csr_expand::CsrExpandNode; +use take::LanceTakeNode; + +/// Lance-native planner: CSR single-hop expand with DataFusion fallback. +pub struct LanceNativePlanner { + config: GraphConfig, + df: DataFusionPlanner, +} + +impl LanceNativePlanner { + pub fn new(config: GraphConfig) -> Self { + Self { + df: DataFusionPlanner::new(config.clone()), + config, + } + } + + pub fn with_catalog( + config: GraphConfig, + catalog: Arc, + ) -> Self { + Self { + df: DataFusionPlanner::with_catalog(config.clone(), catalog), + config, + } + } +} + +impl GraphPhysicalPlanner for LanceNativePlanner { + fn plan(&self, logical_plan: &LogicalOperator) -> Result { + if !can_plan_natively(logical_plan) { + return self.df.plan(logical_plan); + } + let analysis = analysis::analyze(logical_plan)?; + let mut ctx = PlanningContext::new(&analysis); + self.build_native(&mut ctx, logical_plan) + } +} + +impl LanceNativePlanner { + /// Build the native plan for a supported tree. Unary operators above the + /// single expand are rebuilt on the native child; the expand itself lowers + /// to `CsrExpandNode` + `LanceTakeNode`. 
+ fn build_native( + &self, + ctx: &mut PlanningContext, + op: &LogicalOperator, + ) -> Result { + match op { + LogicalOperator::ScanByLabel { + variable, + label, + properties, + } => self.df.build_scan(ctx, variable, label, properties), + + LogicalOperator::Filter { input, predicate } => { + let child = self.build_native(ctx, input)?; + let expr = to_df_boolean_expr(predicate); + LogicalPlanBuilder::from(child) + .filter(expr) + .map_err(|e| self.plan_err("filter", e))? + .build() + .map_err(|e| self.plan_err("filter build", e)) + } + + LogicalOperator::Project { input, projections } => { + let child = self.build_native(ctx, input)?; + self.build_project_on(child, projections) + } + + LogicalOperator::Sort { input, sort_items } => { + let child = self.build_native(ctx, input)?; + self.build_sort_on(child, sort_items) + } + + LogicalOperator::Limit { input, count } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .limit(0, Some(*count as usize)) + .map_err(|e| self.plan_err("limit", e))? + .build() + .map_err(|e| self.plan_err("limit build", e)) + } + + LogicalOperator::Offset { input, offset } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .limit(*offset as usize, None) + .map_err(|e| self.plan_err("offset", e))? + .build() + .map_err(|e| self.plan_err("offset build", e)) + } + + LogicalOperator::Distinct { input } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .distinct() + .map_err(|e| self.plan_err("distinct", e))? + .build() + .map_err(|e| self.plan_err("distinct build", e)) + } + + LogicalOperator::Expand { + input, + source_variable, + target_variable, + target_label, + relationship_types, + direction, + .. + } => self.build_expand_native( + ctx, + input, + source_variable, + target_variable, + target_label, + relationship_types, + direction, + ), + + // Unsupported here would have been rejected by can_plan_natively. 
+ other => Err(GraphError::PlanError { + message: format!("native planner reached unsupported operator: {:?}", other), + location: snafu::Location::new(file!(), line!(), column!()), + }), + } + } + + #[allow(clippy::too_many_arguments)] + fn build_expand_native( + &self, + ctx: &mut PlanningContext, + input: &LogicalOperator, + source_variable: &str, + target_variable: &str, + target_label: &str, + relationship_types: &[String], + direction: &RelationshipDirection, + ) -> Result { + let source_plan = self.build_native(ctx, input)?; + + let rel_type = &relationship_types[0]; + let rel_map = self.config.get_relationship_mapping(rel_type).ok_or_else(|| { + GraphError::ConfigError { + message: format!("No relationship mapping for '{}'", rel_type), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?; + + let src_label = ctx + .analysis + .var_to_label + .get(source_variable) + .ok_or_else(|| GraphError::PlanError { + message: format!("No label for source variable '{}'", source_variable), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let src_node = self.config.get_node_mapping(src_label).ok_or_else(|| { + GraphError::ConfigError { + message: format!("No node mapping for label '{}'", src_label), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?; + let tgt_node = self.config.get_node_mapping(target_label).ok_or_else(|| { + GraphError::ConfigError { + message: format!("No node mapping for label '{}'", target_label), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?; + + let catalog = self.df.catalog_ref().ok_or_else(|| GraphError::ConfigError { + message: "LanceNativePlanner requires a catalog for native expand".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let tgt_source = catalog.node_source(target_label).ok_or_else(|| { + GraphError::ConfigError { + message: format!("No table source for target label '{}'", target_label), + location: 
snafu::Location::new(file!(), line!(), column!()), + } + })?; + let tgt_arrow = tgt_source.schema(); + + let direction = match direction { + RelationshipDirection::Outgoing => NativeDirection::Outgoing, + RelationshipDirection::Incoming => NativeDirection::Incoming, + RelationshipDirection::Undirected => { + return Err(GraphError::PlanError { + message: "undirected expand is not natively supported".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + }); + } + }; + + let source_id_column = qualify_column(source_variable, &src_node.id_field); + let neighbor_column = qualify_column(target_variable, &tgt_node.id_field); + let neighbor_data_type = tgt_arrow + .field_with_name(&tgt_node.id_field.to_lowercase()) + .map_err(|e| GraphError::ConfigError { + message: format!( + "target id field '{}' not found in '{}': {}", + tgt_node.id_field, target_label, e + ), + location: snafu::Location::new(file!(), line!(), column!()), + })? + .data_type() + .clone(); + + // CsrExpandNode output schema = source schema + neighbor column. + let src_arrow = source_plan.schema().inner(); + let mut expand_fields: Vec = + src_arrow.fields().iter().map(|f| f.as_ref().clone()).collect(); + expand_fields.push(Field::new( + &neighbor_column, + neighbor_data_type.clone(), + true, + )); + let expand_arrow = Schema::new(expand_fields); + let expand_schema = Arc::new( + DFSchema::try_from(expand_arrow).map_err(|e| self.plan_err("expand schema", e))?, + ); + + let expand_node = CsrExpandNode { + input: source_plan, + rel_type: rel_type.to_lowercase(), + src_field: rel_map.source_id_field.to_lowercase(), + dst_field: rel_map.target_id_field.to_lowercase(), + direction, + source_id_column, + neighbor_column: neighbor_column.clone(), + neighbor_data_type, + schema: expand_schema.clone(), + }; + let expand_plan = LogicalPlan::Extension(Extension { + node: Arc::new(expand_node), + }); + + // LanceTakeNode: materialize all target columns except the id field. 
+ let id_lower = tgt_node.id_field.to_lowercase(); + let take_cols: Vec = tgt_arrow + .fields() + .iter() + .map(|f| f.name().to_lowercase()) + .filter(|n| n != &id_lower) + .collect(); + + let mut take_fields: Vec = expand_arrow_fields(&expand_plan); + for raw in &take_cols { + let f = tgt_arrow + .field_with_name(raw) + .map_err(|e| self.plan_err("target field", e))?; + take_fields.push(Field::new( + qualify_column(target_variable, raw), + f.data_type().clone(), + true, + )); + } + let take_arrow = Schema::new(take_fields); + let take_schema = + Arc::new(DFSchema::try_from(take_arrow).map_err(|e| self.plan_err("take schema", e))?); + + let take_node = LanceTakeNode { + input: expand_plan, + target_table: target_label.to_lowercase(), + row_id_column: neighbor_column, + take_cols, + schema: take_schema, + }; + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(take_node), + })) + } + + fn build_project_on( + &self, + input: LogicalPlan, + projections: &[ProjectionItem], + ) -> Result { + // Delegate to DataFusionPlanner's project-on-plan helpers (handle + // aggregates + Cypher dot-notation aliasing identically to the join path). + let has_agg = projections.iter().any(|p| { + crate::datafusion_planner::expression::contains_aggregate(&p.expression) + }); + if has_agg { + self.df.build_project_with_aggregates(input, projections) + } else { + self.df.build_simple_project(input, projections) + } + } + + fn build_sort_on(&self, input: LogicalPlan, sort_items: &[SortItem]) -> Result { + use datafusion::logical_expr::SortExpr; + let sort_exprs: Vec = sort_items + .iter() + .map(|item| { + let expr = to_df_value_expr(&item.expression); + let asc = matches!(item.direction, crate::ast::SortDirection::Ascending); + SortExpr { + expr, + asc, + nulls_first: true, + } + }) + .collect(); + LogicalPlanBuilder::from(input) + .sort(sort_exprs) + .map_err(|e| self.plan_err("sort", e))? 
+ .build() + .map_err(|e| self.plan_err("sort build", e)) + } + + fn plan_err(&self, what: &str, e: E) -> GraphError { + GraphError::PlanError { + message: format!("native {}: {}", what, e), + location: snafu::Location::new(file!(), line!(), column!()), + } + } +} + +/// Extract the arrow fields of a `CsrExpandNode` extension plan as owned `Field`s. +fn expand_arrow_fields(plan: &LogicalPlan) -> Vec { + plan.schema() + .inner() + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect() +} + +/// True iff the plan is a single-hop expand the native planner can serve. +fn can_plan_natively(op: &LogicalOperator) -> bool { + let mut expands = 0usize; + if !walk_supported(op, &mut expands) { + return false; + } + expands == 1 +} + +fn walk_supported(op: &LogicalOperator, expands: &mut usize) -> bool { + match op { + LogicalOperator::ScanByLabel { .. } => true, + LogicalOperator::Filter { input, .. } + | LogicalOperator::Project { input, .. } + | LogicalOperator::Sort { input, .. } + | LogicalOperator::Limit { input, .. } + | LogicalOperator::Offset { input, .. } + | LogicalOperator::Distinct { input } => walk_supported(input, expands), + LogicalOperator::Expand { + input, + relationship_types, + direction, + .. + } => { + *expands += 1; + if relationship_types.len() != 1 { + return false; + } + if matches!(direction, RelationshipDirection::Undirected) { + return false; + } + walk_supported(input, expands) + } + // Not supported natively in Phase 2. + LogicalOperator::VariableLengthExpand { .. } + | LogicalOperator::Join { .. } + | LogicalOperator::Unwind { .. 
} => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::datafusion_planner::test_fixtures::{make_catalog, person_knows_config, person_scan}; + use crate::logical_plan::{LogicalOperator, ProjectionItem}; + use crate::ast::{PropertyRef, ValueExpression}; + + fn knows_expand(direction: RelationshipDirection) -> LogicalOperator { + LogicalOperator::Expand { + input: Box::new(person_scan("a")), + source_variable: "a".to_string(), + target_variable: "b".to_string(), + target_label: "Person".to_string(), + relationship_types: vec!["KNOWS".to_string()], + direction, + relationship_variable: None, + properties: Default::default(), + target_properties: Default::default(), + } + } + + #[test] + fn test_can_plan_natively_single_hop() { + let plan = LogicalOperator::Project { + input: Box::new(knows_expand(RelationshipDirection::Outgoing)), + projections: vec![ProjectionItem { + expression: ValueExpression::Property(PropertyRef::new("b", "name")), + alias: None, + }], + }; + assert!(can_plan_natively(&plan)); + } + + #[test] + fn test_cannot_plan_undirected_or_multitype() { + assert!(!can_plan_natively(&knows_expand(RelationshipDirection::Undirected))); + let mut multi = knows_expand(RelationshipDirection::Outgoing); + if let LogicalOperator::Expand { + relationship_types, .. 
+ } = &mut multi + { + relationship_types.push("LIKES".to_string()); + } + assert!(!can_plan_natively(&multi)); + } + + #[test] + fn test_cannot_plan_zero_expands() { + assert!(!can_plan_natively(&person_scan("a"))); + } + + #[test] + fn test_native_plan_contains_extension_nodes() { + let plan = LogicalOperator::Project { + input: Box::new(knows_expand(RelationshipDirection::Outgoing)), + projections: vec![ProjectionItem { + expression: ValueExpression::Property(PropertyRef::new("b", "name")), + alias: None, + }], + }; + let planner = LanceNativePlanner::with_catalog(person_knows_config(), make_catalog()); + let df_plan = planner.plan(&plan).unwrap(); + let s = format!("{:?}", df_plan); + assert!(s.contains("CsrExpand"), "missing CsrExpand: {}", s); + assert!(s.contains("LanceTake"), "missing LanceTake: {}", s); + } + + #[test] + fn test_unsupported_falls_back_to_join() { + // Variable-length expand must fall back to the DataFusion join path. + let vlexpand = LogicalOperator::VariableLengthExpand { + input: Box::new(person_scan("a")), + source_variable: "a".into(), + target_variable: "b".into(), + relationship_types: vec!["KNOWS".into()], + direction: RelationshipDirection::Outgoing, + relationship_variable: None, + min_length: Some(1), + max_length: Some(2), + target_properties: Default::default(), + }; + let planner = LanceNativePlanner::with_catalog(person_knows_config(), make_catalog()); + let df_plan = planner.plan(&vlexpand).unwrap(); + let s = format!("{:?}", df_plan); + assert!(!s.contains("CsrExpand"), "should not be native: {}", s); + } +} +``` + +This task also requires two `pub(crate)` accessors on `DataFusionPlanner`. Add them in `crates/lance-graph/src/datafusion_planner/mod.rs` inside `impl DataFusionPlanner`: + +```rust + /// Access the catalog, if any (used by the native planner). 
+ pub(crate) fn catalog_ref( + &self, + ) -> Option<&Arc> { + self.catalog.as_ref() + } +``` + +(`build_simple_project`, `build_project_with_aggregates`, and `build_scan` are already `pub(crate)`. `contains_aggregate` lives in `expression`, now `pub(crate)`.) + +- [ ] **Step 3: Run tests to verify they fail, then pass** + +Run: `cargo test -p lance-graph -- lance_native_planner::tests` +Expected: compiles and PASSES (5 tests). If `build_project_with_aggregates` is not `pub(crate)`, change its visibility in `crates/lance-graph/src/datafusion_planner/builder/aggregate_ops.rs` from `fn` to `pub(crate) fn` (verify; `build_simple_project` is already `pub(crate)`). + +- [ ] **Step 4: Commit** + +```bash +git add crates/lance-graph/src/lance_native_planner/ crates/lance-graph/src/datafusion_planner/ +git commit -m "feat(native): LanceNativePlanner single-hop lowering with DataFusion fallback" +``` + +--- + +## Task 7: Wire `ExecutionStrategy::LanceNative` and end-to-end parity tests + +**Files:** +- Modify: `crates/lance-graph/src/query.rs` (the two `LanceNative` arms; native context + planner) +- Create: `crates/lance-graph/tests/test_lance_native_expand.rs` + +- [ ] **Step 1: Write the failing end-to-end test** + +Create `crates/lance-graph/tests/test_lance_native_expand.rs`: + +```rust +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! End-to-end parity tests: native CSR expand vs DataFusion join path. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Int64Array, RecordBatch, StringArray, UInt64Array}; +use arrow_schema::{DataType, Field, Schema}; +use lance_graph::config::GraphConfig; +use lance_graph::query::{CypherQuery, ExecutionStrategy}; + +fn person_batch() -> RecordBatch { + // Dense ids 0..4 (row id == id_field value). 
+ let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt64, false), + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1, 2, 3])), + Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])), + Arc::new(Int64Array::from(vec![30i64, 40, 25, 50])), + ], + ) + .unwrap() +} + +fn knows_batch() -> RecordBatch { + // 0->1, 0->2, 1->3, 2->3 + let schema = Arc::new(Schema::new(vec![ + Field::new("src_id", DataType::UInt64, false), + Field::new("dst_id", DataType::UInt64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 0, 1, 2])), + Arc::new(UInt64Array::from(vec![1u64, 2, 3, 3])), + ], + ) + .unwrap() +} + +fn config() -> GraphConfig { + GraphConfig::builder() + .with_node_label("Person", "id") + .with_relationship("KNOWS", "src_id", "dst_id") + .build() + .unwrap() +} + +fn datasets() -> HashMap<String, RecordBatch> { + let mut d = HashMap::new(); + d.insert("Person".to_string(), person_batch()); + d.insert("KNOWS".to_string(), knows_batch()); + d +} + +/// Collect (a.name, b.name) rows as a sorted Vec for order-independent compare.
+fn name_pairs(batch: &RecordBatch) -> Vec<(String, String)> { + let cols: Vec<&StringArray> = (0..batch.num_columns()) + .map(|i| batch.column(i).as_any().downcast_ref::<StringArray>().unwrap()) + .collect(); + let mut rows: Vec<(String, String)> = (0..batch.num_rows()) + .map(|r| (cols[0].value(r).to_string(), cols[1].value(r).to_string())) + .collect(); + rows.sort(); + rows +} + +#[tokio::test] +async fn test_native_expand_matches_datafusion_names() { + let q = "MATCH (a:Person)-[:KNOWS]->(b:Person) RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + + let expected = vec![ + ("alice".to_string(), "bob".to_string()), + ("alice".to_string(), "carol".to_string()), + ("bob".to_string(), "dave".to_string()), + ("carol".to_string(), "dave".to_string()), + ]; + assert_eq!(name_pairs(&native), expected); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} + +#[tokio::test] +async fn test_native_expand_with_target_filter() { + let q = "MATCH (a:Person)-[:KNOWS]->(b:Person) WHERE b.age > 30 RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); + // bob(40) and dave(50) qualify as targets: (alice,bob),(bob,dave),(carol,dave) + assert_eq!( + name_pairs(&native), + vec![ + ("alice".to_string(), "bob".to_string()), + ("bob".to_string(), "dave".to_string()), + ("carol".to_string(), "dave".to_string()), + ] + ); +} + +#[tokio::test] +async fn test_native_expand_incoming_matches_datafusion() { + let q = "MATCH (a:Person)<-[:KNOWS]-(b:Person) RETURN a.name,
b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} + +#[tokio::test] +async fn test_native_varlength_falls_back_and_matches() { + // Variable-length path is unsupported natively; LanceNative must fall back + // and produce the same result as DataFusion. + let q = "MATCH (a:Person)-[:KNOWS*1..2]->(b:Person) RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cargo test -p lance-graph --test test_lance_native_expand` +Expected: FAIL — `LanceNative` currently returns `UnsupportedFeature`. + +- [ ] **Step 3: Wire the native execution path in query.rs** + +In `crates/lance-graph/src/query.rs`: + +(a) Change the private context builder to optionally install the query planner. 
+Replace the signature and `SessionContext::new()` line of +`build_catalog_and_context_from_datasets` (around lines 608 and 627): + +```rust + async fn build_catalog_and_context_from_datasets( + &self, + datasets: HashMap<String, RecordBatch>, + native: bool, + ) -> Result<( + lance_graph_catalog::InMemoryCatalog, + datafusion::execution::context::SessionContext, + )> { + use datafusion::datasource::{DefaultTableSource, MemTable}; + use datafusion::execution::context::SessionContext; + use lance_graph_catalog::InMemoryCatalog; + use std::sync::Arc; + + if datasets.is_empty() { + return Err(GraphError::ConfigError { + message: "No input datasets provided".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + }); + } + + // Create session context (with the CSR query planner when native). + let ctx = if native { + use datafusion::execution::session_state::SessionStateBuilder; + let state = SessionStateBuilder::new() + .with_default_features() + .with_query_planner(Arc::new(crate::lance_native_planner::CsrQueryPlanner::new())) + .build(); + SessionContext::new_with_state(state) + } else { + SessionContext::new() + }; + let mut catalog = InMemoryCatalog::new(); +``` + +(Leave the rest of the method body — the dataset registration loop and the +`Ok((catalog, ctx))` return — unchanged.)
+ +(b) Update the three existing callers to pass `false`: +- `execute_datafusion` (around line 599): `self.build_catalog_and_context_from_datasets(datasets, false)` +- `explain` (around line 329): `self.build_catalog_and_context_from_datasets(datasets, false)` +- `to_sql` (around line 369): `self.build_catalog_and_context_from_datasets(datasets, false)` + +(c) Add the `create_logical_plans_native` helper next to `create_logical_plans` +(after line 841): + +```rust + fn create_logical_plans_native( + &self, + catalog: std::sync::Arc, + ) -> Result { + use crate::datafusion_planner::GraphPhysicalPlanner; + use crate::lance_native_planner::LanceNativePlanner; + use crate::semantic::SemanticAnalyzer; + + let config = self.require_config()?; + + let mut analyzer = SemanticAnalyzer::new(config.clone()); + let semantic = analyzer.analyze(&self.ast, &self.parameters)?; + if !semantic.errors.is_empty() { + return Err(GraphError::PlanError { + message: format!("Semantic analysis failed:\n{}", semantic.errors.join("\n")), + location: snafu::Location::new(file!(), line!(), column!()), + }); + } + + let mut logical_planner = crate::logical_plan::LogicalPlanner::new(config); + let logical_plan = logical_planner.plan(&semantic.ast)?; + + let native = LanceNativePlanner::with_catalog(config.clone(), catalog); + native.plan(&logical_plan) + } +``` + +Make sure `LanceNativePlanner` is re-exported: in +`crates/lance-graph/src/lance_native_planner/mod.rs` it is already `pub struct`, +and `crate::lance_native_planner::LanceNativePlanner` is accessible since +`lib.rs` has `pub mod lance_native_planner;`. 
+ +(d) Add the native execute method (after `execute_datafusion`, ~line 604): + +```rust + async fn execute_lance_native( + &self, + datasets: HashMap<String, RecordBatch>, + ) -> Result<RecordBatch> { + use arrow::compute::concat_batches; + use std::sync::Arc; + + let (catalog, ctx) = self + .build_catalog_and_context_from_datasets(datasets, true) + .await?; + + let df_logical_plan = self.create_logical_plans_native(Arc::new(catalog))?; + + let df = ctx + .execute_logical_plan(df_logical_plan) + .await + .map_err(|e| GraphError::ExecutionError { + message: format!("Failed to execute native plan: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let result_schema = df.schema().inner().clone(); + let batches = df.collect().await.map_err(|e| GraphError::ExecutionError { + message: format!("Failed to collect native results: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + concat_batches(&result_schema, &batches).map_err(|e| GraphError::ExecutionError { + message: format!("Failed to concat native results: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) + } +``` + +(e) Route the in-memory `execute` arm (around lines 234-240): replace the +`ExecutionStrategy::LanceNative => Err(...)` arm with: + +```rust + ExecutionStrategy::LanceNative => self.execute_lance_native(datasets).await, +``` + +(Leave the `execute_with_namespace_internal` `LanceNative` arm returning +`UnsupportedFeature` — namespace native execution is Phase 4.) + +- [ ] **Step 4: Run the end-to-end tests** + +Run: `cargo test -p lance-graph --test test_lance_native_expand` +Expected: PASS (4 tests). + +- [ ] **Step 5: Run the full crate test suite** + +Run: `cargo test -p lance-graph` +Expected: PASS (all prior tests + new ones). Then `cargo clippy -p lance-graph --all-targets` — fix any warnings (e.g. unused imports).
+ +- [ ] **Step 6: Commit** + +```bash +git add crates/lance-graph/src/query.rs crates/lance-graph/tests/test_lance_native_expand.rs +git commit -m "feat(native): wire LanceNative execution strategy + e2e parity tests" +``` + +--- + +## Self-Review + +**Spec coverage:** +- Custom DataFusion `ExecutionPlan` execution model → Tasks 3, 4, 5 (nodes, execs, planner). ✓ +- Dense-ROWID id model (vertex id == row id; source `a__`, neighbor `b__`) → Task 6 `build_expand_native`. ✓ +- Output via `take()`, materialize all target columns except id → Task 4 + Task 6 `take_cols`. ✓ +- Separate `LanceTakeExec` + `RowMaterializer` (arrow take now, Lance later) → Task 4. ✓ +- Planner reuse + override only `Expand`; fallback for var-length/multi-hop/multi-type/undirected/Join/Unwind → Task 6 `can_plan_natively`/`build_native`. ✓ +- CSR built at physical-planning time with real `rel_map` columns (reversed for incoming) → Task 1 + Task 5. ✓ +- query.rs wiring: in-memory `execute` native path wired; namespace native stays `UnsupportedFeature` → Task 7. ✓ +- Tests: operator units (Tasks 3,4), planner native/fallback (Task 6), e2e parity incl. filter, incoming, fallback (Task 7). ✓ + +**Placeholder scan:** No TBD/TODO; all steps contain concrete code and commands. + +**Type consistency:** `expand_batch`/`take_batch` signatures match their `execute` call sites; `CsrExpandNode`/`LanceTakeNode` field names match construction in Task 6 and downcast use in Task 5; `add_edges_from_batch_with_columns` signature matches Task 5 usage; `CsrQueryPlanner::new` / `with_catalog` / `build_simple_project` / `build_project_with_aggregates` / `catalog_ref` names consistent across tasks. + +**Risk notes for the implementer:** +- DataFusion 50.3 API drift: if `PlanProperties::new`, `EmissionType`/`Boundedness` import paths, or `ExtensionPlanner`/`QueryPlanner` async signatures differ, consult `cargo doc -p datafusion --open` for the exact paths; the field/method *names* used here are stable across 49–50. 
+- If `build_project_with_aggregates` is private, widen it to `pub(crate)` (Task 6, Step 3). +- `with_default_features()` on `SessionStateBuilder` is required so standard scalar/aggregate functions resolve in the native context. From 3df32d7dd0e8afcb737dd514218f0605069928a5 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 15:17:26 -0700 Subject: [PATCH 03/10] feat(csr): add_edges_from_batch_with_columns for custom edge column names --- crates/lance-graph/src/csr_index.rs | 75 +++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/crates/lance-graph/src/csr_index.rs b/crates/lance-graph/src/csr_index.rs index 176ef54d..1d3d7200 100644 --- a/crates/lance-graph/src/csr_index.rs +++ b/crates/lance-graph/src/csr_index.rs @@ -237,32 +237,43 @@ impl CsrIndexBuilder { } /// Add edges from an Arrow RecordBatch with `src_id` and `dst_id` columns. - pub fn add_edges_from_batch(mut self, batch: &RecordBatch) -> Result { - let src_col = batch - .column_by_name("src_id") - .ok_or_else(|| GraphError::PlanError { - message: "Edge batch missing 'src_id' column".to_string(), - location: snafu::Location::new(file!(), line!(), column!()), - })?; - let dst_col = batch - .column_by_name("dst_id") + pub fn add_edges_from_batch(self, batch: &RecordBatch) -> Result { + self.add_edges_from_batch_with_columns(batch, "src_id", "dst_id") + } + + /// Add edges from an Arrow RecordBatch, reading source vertex ids from + /// `src_col` and destination vertex ids from `dst_col`. + /// + /// Both columns must be `UInt64`. To build a reversed (incoming/CSC) index, + /// pass the destination column name as `src_col` and vice versa. 
+ pub fn add_edges_from_batch_with_columns( + mut self, + batch: &RecordBatch, + src_col: &str, + dst_col: &str, + ) -> Result { + let src_array = batch + .column_by_name(src_col) .ok_or_else(|| GraphError::PlanError { - message: "Edge batch missing 'dst_id' column".to_string(), + message: format!("Edge batch missing '{}' column", src_col), location: snafu::Location::new(file!(), line!(), column!()), - })?; - - let src_array = src_col + })? .as_any() .downcast_ref::() .ok_or_else(|| GraphError::PlanError { - message: "src_id column must be UInt64".to_string(), + message: format!("'{}' column must be UInt64", src_col), location: snafu::Location::new(file!(), line!(), column!()), })?; - let dst_array = dst_col + let dst_array = batch + .column_by_name(dst_col) + .ok_or_else(|| GraphError::PlanError { + message: format!("Edge batch missing '{}' column", dst_col), + location: snafu::Location::new(file!(), line!(), column!()), + })? .as_any() .downcast_ref::() .ok_or_else(|| GraphError::PlanError { - message: "dst_id column must be UInt64".to_string(), + message: format!("'{}' column must be UInt64", dst_col), location: snafu::Location::new(file!(), line!(), column!()), })?; @@ -651,4 +662,36 @@ mod tests { assert_eq!(idx.neighbors(0), &[1, 1, 1]); assert_eq!(idx.degree(0), 3); } + + #[test] + fn test_build_from_record_batch_custom_columns() { + let schema = Arc::new(Schema::new(vec![ + Field::new("src_person_id", DataType::UInt64, false), + Field::new("dst_person_id", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0, 0, 1])), + Arc::new(UInt64Array::from(vec![1, 2, 2])), + ], + ) + .unwrap(); + + // Forward (outgoing): src -> dst + let idx = CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch, "src_person_id", "dst_person_id") + .unwrap() + .build(); + assert_eq!(idx.neighbors(0), &[1, 2]); + assert_eq!(idx.neighbors(1), &[2]); + + // Reversed (incoming): swap the column args -> 
dst -> src + let rev = CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch, "dst_person_id", "src_person_id") + .unwrap() + .build(); + assert_eq!(rev.neighbors(2), &[0, 1]); + assert_eq!(rev.neighbors(1), &[0]); + } } From 0c434085ba507a6cb83d9c3b863550c6515d2b53 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 15:24:53 -0700 Subject: [PATCH 04/10] refactor(native): promote lance_native_planner to module dir; add NativeDirection --- .../lance-graph/src/lance_native_planner.rs | 77 ------------------- .../src/lance_native_planner/direction.rs | 14 ++++ .../src/lance_native_planner/mod.rs | 42 ++++++++++ 3 files changed, 56 insertions(+), 77 deletions(-) delete mode 100644 crates/lance-graph/src/lance_native_planner.rs create mode 100644 crates/lance-graph/src/lance_native_planner/direction.rs create mode 100644 crates/lance-graph/src/lance_native_planner/mod.rs diff --git a/crates/lance-graph/src/lance_native_planner.rs b/crates/lance-graph/src/lance_native_planner.rs deleted file mode 100644 index a500d6a9..00000000 --- a/crates/lance-graph/src/lance_native_planner.rs +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Lance Native physical planner (placeholder) -//! -//! This planner is intended to compile logical graph plans into a physical -//! execution plan that leverages Lance's native scan and filter engine. -//! -//! For now, this is a placeholder implementation that conforms to the -//! `GraphPhysicalPlanner` trait and returns an empty DataFusion logical plan -//! until the native pipeline is wired up. 
- -use crate::config::GraphConfig; -use crate::datafusion_planner::GraphPhysicalPlanner; -use crate::error::Result; -use crate::logical_plan::LogicalOperator; -use datafusion::common::DFSchema; -use datafusion::logical_expr::{EmptyRelation, LogicalPlan}; -use std::sync::Arc; - -/// Placeholder Lance-native planner -pub struct LanceNativePlanner { - #[allow(dead_code)] - config: GraphConfig, -} - -impl LanceNativePlanner { - pub fn new(config: GraphConfig) -> Self { - Self { config } - } -} - -impl GraphPhysicalPlanner for LanceNativePlanner { - fn plan(&self, _logical_plan: &LogicalOperator) -> Result { - // Placeholder: return an empty relation. A future implementation will - // produce a runnable pipeline using Lance's native execution engine. - let schema = Arc::new(DFSchema::empty()); - Ok(LogicalPlan::EmptyRelation(EmptyRelation { - produce_one_row: false, - schema, - })) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_lance_native_planner_placeholder() { - let cfg = GraphConfig::builder() - .with_node_label("Person", "id") - .build() - .unwrap(); - let planner = LanceNativePlanner::new(cfg); - // Minimal logical plan to feed into placeholder - let lp = LogicalOperator::Distinct { - input: Box::new(LogicalOperator::Limit { - input: Box::new(LogicalOperator::Project { - input: Box::new(LogicalOperator::ScanByLabel { - variable: "n".to_string(), - label: "Person".to_string(), - properties: Default::default(), - }), - projections: vec![], - }), - count: 1, - }), - }; - let df_plan = planner.plan(&lp).unwrap(); - // Empty relation is acceptable as a placeholder - match df_plan { - LogicalPlan::EmptyRelation(_) => {} - _ => panic!("expected empty relation placeholder"), - } - } -} diff --git a/crates/lance-graph/src/lance_native_planner/direction.rs b/crates/lance-graph/src/lance_native_planner/direction.rs new file mode 100644 index 00000000..5bc9c105 --- /dev/null +++ b/crates/lance-graph/src/lance_native_planner/direction.rs @@ -0,0 
+1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Traversal direction for the native CSR expand operators. + +/// Direction a single-hop expand traverses. `Undirected` is intentionally +/// absent — undirected expands fall back to the DataFusion join planner. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NativeDirection { + /// Follow edges source -> destination (CSR). + Outgoing, + /// Follow edges destination -> source (CSC / reversed). + Incoming, +} diff --git a/crates/lance-graph/src/lance_native_planner/mod.rs b/crates/lance-graph/src/lance_native_planner/mod.rs new file mode 100644 index 00000000..1a12aed1 --- /dev/null +++ b/crates/lance-graph/src/lance_native_planner/mod.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Lance Native physical planner (placeholder) +//! +//! Rewritten in a later task to lower single-hop `Expand` onto CSR-backed +//! extension nodes. For now it keeps the original placeholder behavior so the +//! crate compiles between tasks. 
+ +mod direction; + +pub use direction::NativeDirection; + +use crate::config::GraphConfig; +use crate::datafusion_planner::GraphPhysicalPlanner; +use crate::error::Result; +use crate::logical_plan::LogicalOperator; +use datafusion::common::DFSchema; +use datafusion::logical_expr::{EmptyRelation, LogicalPlan}; +use std::sync::Arc; + +/// Placeholder Lance-native planner +pub struct LanceNativePlanner { + #[allow(dead_code)] + config: GraphConfig, +} + +impl LanceNativePlanner { + pub fn new(config: GraphConfig) -> Self { + Self { config } + } +} + +impl GraphPhysicalPlanner for LanceNativePlanner { + fn plan(&self, _logical_plan: &LogicalOperator) -> Result { + let schema = Arc::new(DFSchema::empty()); + Ok(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema, + })) + } +} From 5fc76648a6499d0b6a00076093927cbe30b85d0d Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 15:34:57 -0700 Subject: [PATCH 05/10] feat(native): CsrExpandNode/Exec and expand_batch core Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../src/lance_native_planner/csr_expand.rs | 351 ++++++++++++++++++ .../src/lance_native_planner/mod.rs | 1 + 2 files changed, 352 insertions(+) create mode 100644 crates/lance-graph/src/lance_native_planner/csr_expand.rs diff --git a/crates/lance-graph/src/lance_native_planner/csr_expand.rs b/crates/lance-graph/src/lance_native_planner/csr_expand.rs new file mode 100644 index 00000000..7f83da7f --- /dev/null +++ b/crates/lance-graph/src/lance_native_planner/csr_expand.rs @@ -0,0 +1,351 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Native single-hop expand: logical node + physical operator + core function. +//! +//! `CsrExpandExec` does topology only — for each input row it looks up the +//! source vertex's neighbors in the CSR index and emits one output row per +//! neighbor, carrying through all input columns and appending the neighbor row +//! 
id as a new column. Target property materialization is handled separately by +//! `LanceTakeExec`. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::compute::{cast, take}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt32Array, UInt64Array}; +use arrow_schema::{DataType, Field, SchemaRef}; +use datafusion::common::{DFSchemaRef, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, +}; +use futures::StreamExt; + +use super::direction::NativeDirection; +use crate::csr_index::CsrIndex; +use crate::error::{GraphError, Result}; + +/// Expand one input batch: for every input row, append one output row per +/// neighbor of that row's source vertex. +/// +/// `source_id_idx` is the column index of the source vertex id within `input`. +/// `neighbor_field` is the appended column (its data type is the target id +/// field's type; neighbor ids are cast into it). `out_schema` must equal +/// `input.schema()` fields followed by `neighbor_field`. +pub(crate) fn expand_batch( + input: &RecordBatch, + source_id_idx: usize, + csr: &CsrIndex, + neighbor_field: &Field, + out_schema: &SchemaRef, +) -> Result { + let map_err = |e: arrow_schema::ArrowError, what: &str| GraphError::ExecutionError { + message: format!("CsrExpand {}: {}", what, e), + location: snafu::Location::new(file!(), line!(), column!()), + }; + + // Source ids may be any integer type; normalize to u64. 
+ let src_u64 = cast(input.column(source_id_idx), &DataType::UInt64) + .map_err(|e| map_err(e, "cast source id to u64"))?; + // Arrow 56.x: cast(_, UInt64) always yields a plain UInt64Array. + let src = src_u64 + .as_any() + .downcast_ref::() + .expect("cast to UInt64 yields UInt64Array"); + + let mut parent_idx: Vec = Vec::with_capacity(input.num_rows()); + let mut neighbors: Vec = Vec::with_capacity(input.num_rows()); + for row in 0..input.num_rows() { + if src.is_null(row) { + continue; + } + let row_u32 = u32::try_from(row).map_err(|_| GraphError::ExecutionError { + message: "CsrExpand: input batch row index exceeds u32::MAX".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + for &n in csr.neighbors(src.value(row)) { + parent_idx.push(row_u32); + neighbors.push(n); + } + } + + let take_idx = UInt32Array::from(parent_idx); + let mut cols: Vec = Vec::with_capacity(input.num_columns() + 1); + for c in input.columns() { + cols.push(take(c, &take_idx, None).map_err(|e| map_err(e, "take carried column"))?); + } + let neigh_u64 = Arc::new(UInt64Array::from(neighbors)) as ArrayRef; + let neigh_col = cast(&neigh_u64, neighbor_field.data_type()) + .map_err(|e| map_err(e, "cast neighbor id"))?; + cols.push(neigh_col); + + RecordBatch::try_new(out_schema.clone(), cols).map_err(|e| GraphError::ExecutionError { + message: format!("CsrExpand build output batch: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) +} + +/// Logical extension node for a single-hop CSR expand. +/// +/// Holds only hashable metadata; the physical operator (and its `CsrIndex`) is +/// constructed by the extension planner at physical-planning time. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct CsrExpandNode { + /// Source subplan (a node scan, optionally with a source-only filter). + pub input: LogicalPlan, + /// Relationship type (lowercased table name to look up the edge table). 
+ pub rel_type: String, + /// Edge table column holding source vertex ids. + pub src_field: String, + /// Edge table column holding destination vertex ids. + pub dst_field: String, + /// Traversal direction. + pub direction: NativeDirection, + /// Qualified column in `input` carrying the source vertex id (e.g. `a__id`). + pub source_id_column: String, + /// Qualified output column for the neighbor row id (e.g. `b__id`). + pub neighbor_column: String, + /// Arrow data type of the neighbor column (target id field's type). + pub neighbor_data_type: DataType, + /// Output schema = input schema + neighbor column. + pub schema: DFSchemaRef, +} + +impl PartialOrd for CsrExpandNode { + fn partial_cmp(&self, other: &Self) -> Option { + ( + &self.rel_type, + &self.src_field, + &self.dst_field, + &self.source_id_column, + &self.neighbor_column, + ) + .partial_cmp(&( + &other.rel_type, + &other.src_field, + &other.dst_field, + &other.source_id_column, + &other.neighbor_column, + )) + } +} + +impl UserDefinedLogicalNodeCore for CsrExpandNode { + fn name(&self) -> &str { + "CsrExpand" + } + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + fn expressions(&self) -> Vec { + vec![] + } + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "CsrExpand: rel={}, dir={:?}, src={}, neighbor={}", + self.rel_type, self.direction, self.source_id_column, self.neighbor_column + ) + } + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> DFResult { + Ok(Self { + input: inputs.remove(0), + ..self.clone() + }) + } +} + +/// Physical operator for `CsrExpandNode`. 
+#[derive(Debug)]
+pub struct CsrExpandExec {
+    /// Upstream plan that produces the source rows.
+    input: Arc<dyn ExecutionPlan>,
+    /// Adjacency index consulted for each source id.
+    csr: Arc<CsrIndex>,
+    /// Position of the source-id column within the input batches.
+    source_id_idx: usize,
+    /// Field describing the appended neighbor-id column.
+    neighbor_field: Field,
+    /// Schema of the batches this operator emits.
+    out_schema: SchemaRef,
+    /// Plan properties, computed once in `new`.
+    props: PlanProperties,
+}
+
+impl CsrExpandExec {
+    /// Create the operator over `input`.
+    ///
+    /// `source_id_idx` is the index of the source-id column in `input`'s
+    /// batches; `out_schema` is the schema handed to `expand_batch` and
+    /// reported by the output stream.
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        csr: Arc<CsrIndex>,
+        source_id_idx: usize,
+        neighbor_field: Field,
+        out_schema: SchemaRef,
+    ) -> Self {
+        // `execute(partition)` forwards `partition` straight to the input, so
+        // this operator has exactly as many output partitions as its input.
+        // Advertising a fixed single partition would leave partitions >= 1 of
+        // a multi-partition input unpolled and silently drop their rows.
+        let partition_count = input.properties().output_partitioning().partition_count();
+        let props = PlanProperties::new(
+            EquivalenceProperties::new(out_schema.clone()),
+            Partitioning::UnknownPartitioning(partition_count),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        );
+        Self {
+            input,
+            csr,
+            source_id_idx,
+            neighbor_field,
+            out_schema,
+            props,
+        }
+    }
+}
+
+impl DisplayAs for CsrExpandExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "CsrExpandExec: neighbor={}", self.neighbor_field.name())
+    }
+}
+
+impl ExecutionPlan for CsrExpandExec {
+    fn name(&self) -> &str {
+        "CsrExpandExec"
+    }
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn properties(&self) -> &PlanProperties {
+        &self.props
+    }
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+    /// Rebuild the operator over a replacement input; every other setting is
+    /// carried over. Errors (instead of panicking) unless exactly one child
+    /// is supplied.
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        if children.len() != 1 {
+            return Err(datafusion::error::DataFusionError::Internal(format!(
+                "CsrExpandExec: expected 1 child, got {}",
+                children.len()
+            )));
+        }
+        Ok(Arc::new(CsrExpandExec::new(
+            children.swap_remove(0),
+            self.csr.clone(),
+            self.source_id_idx,
+            self.neighbor_field.clone(),
+            self.out_schema.clone(),
+        )))
+    }
+    /// Stream partition `partition` of the input, expanding one batch at a
+    /// time with `expand_batch`.
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let input = self.input.execute(partition, context)?;
+        let csr = self.csr.clone();
+        let idx = self.source_id_idx;
+        let field = self.neighbor_field.clone();
+        let out_schema = self.out_schema.clone();
+        // Second handle: the closure below takes ownership of `out_schema`.
+        let out_schema_for_stream = out_schema.clone();
+        let stream = input.map(move |rb| {
+            let rb = rb?;
+            expand_batch(&rb, idx, &csr, &field, &out_schema)
+                .map_err(|e| datafusion::error::DataFusionError::Execution(e.to_string()))
+        });
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            out_schema_for_stream,
+            stream,
+        )))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_schema::Schema;
+    use crate::csr_index::CsrIndexBuilder;
+
+    fn input_batch() -> RecordBatch {
+        // a__id = [0,1,2,3], a__name = ["n0","n1","n2","n3"]
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a__id", DataType::UInt64, false),
+            Field::new("a__name", DataType::Utf8, false),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(UInt64Array::from(vec![0u64, 1, 2, 3])),
+                Arc::new(arrow_array::StringArray::from(vec!["n0", "n1", "n2", "n3"])),
+            ],
+        )
+        .unwrap()
+    }
+
+    fn out_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("a__id", DataType::UInt64, false),
+            Field::new("a__name", DataType::Utf8, false),
+            Field::new("b__id", DataType::UInt64, true),
+        ]))
+    }
+
+    #[test]
+    fn test_expand_batch_outgoing() {
+        // 0->1, 0->2, 1->2, 3-> (none)
+        let csr = CsrIndexBuilder::new()
+            .with_num_vertices(4)
+            .add_edge(0, 1)
+            .add_edge(0, 2)
+            .add_edge(1, 2)
+            .build();
+        let neighbor_field = Field::new("b__id", DataType::UInt64, true);
+        let out = expand_batch(&input_batch(), 0, &csr, &neighbor_field, &out_schema()).unwrap();
+
+        assert_eq!(out.num_rows(), 3);
+        let a_id = out.column(0).as_any().downcast_ref::<UInt64Array>().unwrap();
+        let b_id = out.column(2).as_any().downcast_ref::<UInt64Array>().unwrap();
+        let a_name = out
+            .column(1)
+            .as_any()
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap();
+        // Rows: (0,n0,1) (0,n0,2) (1,n1,2)
+        assert_eq!(a_id.values(), &[0, 0, 1]);
+        assert_eq!(b_id.values(), &[1, 2, 2]);
+        assert_eq!(a_name.value(0), "n0");
+        assert_eq!(a_name.value(1), "n0");
+        assert_eq!(a_name.value(2), "n1");
+    }
+
+    #[test]
+    fn test_expand_batch_no_neighbors_and_out_of_range() {
+        let csr = CsrIndexBuilder::new().with_num_vertices(2).build(); // no edges
+        let neighbor_field = Field::new("b__id", DataType::UInt64, true);
+        let out = expand_batch(&input_batch(), 0, &csr, &neighbor_field, &out_schema()).unwrap();
+        assert_eq!(out.num_rows(), 0);
+    }
+
+    #[test]
+    fn test_expand_batch_casts_source_id_from_int64() {
+        // Source id column is Int64 (not UInt64): must still work.
+        let schema = Arc::new(Schema::new(vec![Field::new("a__id", DataType::Int64, false)]));
+        let input = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(arrow_array::Int64Array::from(vec![0i64, 1]))],
+        )
+        .unwrap();
+        let out_schema = Arc::new(Schema::new(vec![
+            Field::new("a__id", DataType::Int64, false),
+            Field::new("b__id", DataType::UInt64, true),
+        ]));
+        let csr = CsrIndexBuilder::new()
+            .with_num_vertices(2)
+            .add_edge(0, 1)
+            .build();
+        let neighbor_field = Field::new("b__id", DataType::UInt64, true);
+        let out = expand_batch(&input, 0, &csr, &neighbor_field, &out_schema).unwrap();
+        assert_eq!(out.num_rows(), 1);
+        let b_id = out.column(1).as_any().downcast_ref::<UInt64Array>().unwrap();
+        assert_eq!(b_id.values(), &[1]);
+    }
+
+    #[test]
+    fn test_exec_mirrors_input_partition_count() {
+        // A multi-partition input must surface as the same number of output
+        // partitions, otherwise partitions >= 1 would never be executed.
+        use datafusion::physical_plan::empty::EmptyExec;
+        let input = Arc::new(EmptyExec::new(input_batch().schema()).with_partitions(3));
+        let csr = CsrIndexBuilder::new().with_num_vertices(1).build();
+        let exec = CsrExpandExec::new(
+            input,
+            Arc::new(csr),
+            0,
+            Field::new("b__id", DataType::UInt64, true),
+            out_schema(),
+        );
+        assert_eq!(
+            exec.properties().output_partitioning().partition_count(),
+            3
+        );
+    }
+}
diff --git a/crates/lance-graph/src/lance_native_planner/mod.rs b/crates/lance-graph/src/lance_native_planner/mod.rs
index 1a12aed1..570025a4 100644
--- a/crates/lance-graph/src/lance_native_planner/mod.rs
+++ b/crates/lance-graph/src/lance_native_planner/mod.rs
@@ -8,6 +8,7 @@
 //! crate compiles between tasks.
mod direction; +mod csr_expand; pub use direction::NativeDirection; From fce3edb83ade55f7509b904666a01e4f95ffbd0a Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 15:50:36 -0700 Subject: [PATCH 06/10] feat(native): RowMaterializer, LanceTakeNode/Exec, take_batch core --- .../src/lance_native_planner/mod.rs | 1 + .../src/lance_native_planner/take.rs | 386 ++++++++++++++++++ 2 files changed, 387 insertions(+) create mode 100644 crates/lance-graph/src/lance_native_planner/take.rs diff --git a/crates/lance-graph/src/lance_native_planner/mod.rs b/crates/lance-graph/src/lance_native_planner/mod.rs index 570025a4..06f1ae28 100644 --- a/crates/lance-graph/src/lance_native_planner/mod.rs +++ b/crates/lance-graph/src/lance_native_planner/mod.rs @@ -9,6 +9,7 @@ mod direction; mod csr_expand; +mod take; pub use direction::NativeDirection; diff --git a/crates/lance-graph/src/lance_native_planner/take.rs b/crates/lance-graph/src/lance_native_planner/take.rs new file mode 100644 index 00000000..fc57d21f --- /dev/null +++ b/crates/lance-graph/src/lance_native_planner/take.rs @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Native materialization: take target node columns by row id. +//! +//! `CsrExpandExec` produces target *row ids*; `LanceTakeExec` turns those into +//! target *properties* via a `RowMaterializer`. Under the dense-ROWID model the +//! in-memory materializer is a direct `arrow::compute::take` by offset — the +//! concrete reason CSR beats a hash join. 
+
+use std::any::Any;
+use std::fmt;
+use std::sync::Arc;
+
+use arrow::compute::{cast, take};
+use arrow_array::{ArrayRef, RecordBatch, UInt64Array};
+use arrow_schema::{DataType, Field, SchemaRef};
+use datafusion::common::{DFSchemaRef, Result as DFResult};
+use datafusion::execution::TaskContext;
+use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore};
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    SendableRecordBatchStream,
+};
+use futures::StreamExt;
+
+use crate::error::{GraphError, Result};
+
+/// Materializes rows of a target node table by row id.
+///
+/// Implementations must be `Send + Sync` because the materializer is shared
+/// (behind an `Arc`) with the stream that `LanceTakeExec::execute` returns.
+pub trait RowMaterializer: Send + Sync + fmt::Debug {
+    /// Take `columns` (raw, unqualified names) for the given `row_ids`.
+    ///
+    /// The returned batch has one row per element of `row_ids`, in the same
+    /// order, with columns in the requested order and named by their raw
+    /// names.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when a requested column does not exist or the
+    /// underlying take fails.
+    fn take(&self, row_ids: &UInt64Array, columns: &[String]) -> Result<RecordBatch>;
+}
+
+/// In-memory materializer over a fully-collected target node batch. Row id ==
+/// offset into the batch (dense-ROWID model).
+#[derive(Debug)]
+pub struct InMemoryMaterializer {
+    /// The whole target node table; row id `i` addresses row `i` of this batch.
+    batch: RecordBatch,
+}
+
+impl InMemoryMaterializer {
+    /// Wrap an already-collected target node batch.
+    pub fn new(batch: RecordBatch) -> Self {
+        Self { batch }
+    }
+}
+
+impl RowMaterializer for InMemoryMaterializer {
+    /// Gather `columns` at the offsets given by `row_ids`.
+    ///
+    /// A null row id produces a null in every returned column. A row id past
+    /// the end of the batch, or an unknown column name, is reported as an
+    /// `ExecutionError`.
+    fn take(&self, row_ids: &UInt64Array, columns: &[String]) -> Result<RecordBatch> {
+        let schema = self.batch.schema();
+        // Null row ids become null output values, so a field that is
+        // non-nullable in the source table has to be widened; otherwise
+        // `RecordBatch::try_new` rejects the batch.
+        let ids_have_nulls = arrow_array::Array::null_count(row_ids) > 0;
+
+        let mut fields: Vec<Field> = Vec::with_capacity(columns.len());
+        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(columns.len());
+        for name in columns {
+            let idx = schema
+                .index_of(name)
+                .map_err(|_| GraphError::ExecutionError {
+                    message: format!("take: target column '{}' not found", name),
+                    location: snafu::Location::new(file!(), line!(), column!()),
+                })?;
+            let source_field = schema.field(idx);
+            // With `check_bounds` an out-of-range row id is returned as an
+            // error; the default (unchecked) kernel panics instead.
+            let taken = take(
+                self.batch.column(idx),
+                row_ids,
+                Some(arrow::compute::TakeOptions { check_bounds: true }),
+            )
+            .map_err(|e| GraphError::ExecutionError {
+                message: format!("take: failed on column '{}': {}", name, e),
+                location: snafu::Location::new(file!(), line!(), column!()),
+            })?;
+            fields.push(Field::new(
+                name,
+                source_field.data_type().clone(),
+                source_field.is_nullable() || ids_have_nulls,
+            ));
+            arrays.push(taken);
+        }
+        RecordBatch::try_new(Arc::new(arrow_schema::Schema::new(fields)), arrays).map_err(|e| {
+            GraphError::ExecutionError {
+                message: format!("take: build batch: {}", e),
+                location: snafu::Location::new(file!(), line!(), column!()),
+            }
+        })
+    }
+}
+
+/// Append materialized target columns to one input batch.
+///
+/// `row_id_idx` is the index of the row-id column in `input`. `take_cols` are
+/// the raw target column names to materialize, in the same order as the
+/// appended fields of `out_schema`. `out_schema` = `input.schema()` followed by
+/// the qualified materialized columns.
+///
+/// The row-id column is cast to `UInt64` before the lookup, so other integer
+/// id types are accepted.
+pub(crate) fn take_batch(
+    input: &RecordBatch,
+    row_id_idx: usize,
+    materializer: &dyn RowMaterializer,
+    take_cols: &[String],
+    out_schema: &SchemaRef,
+) -> Result<RecordBatch> {
+    // Nothing to materialize: output is the input re-stamped with out_schema.
+    // (When take_cols is empty, out_schema == input.schema().)
+    if take_cols.is_empty() {
+        return RecordBatch::try_new(out_schema.clone(), input.columns().to_vec()).map_err(|e| {
+            GraphError::ExecutionError {
+                message: format!("take: build output batch: {}", e),
+                location: snafu::Location::new(file!(), line!(), column!()),
+            }
+        });
+    }
+
+    let ids_u64 = cast(input.column(row_id_idx), &DataType::UInt64).map_err(|e| {
+        GraphError::ExecutionError {
+            message: format!("take: cast row id to u64: {}", e),
+            location: snafu::Location::new(file!(), line!(), column!()),
+        }
+    })?;
+    // Arrow 56.x: cast(_, UInt64) always yields a plain UInt64Array.
+    let ids = ids_u64
+        .as_any()
+        .downcast_ref::<UInt64Array>()
+        .expect("cast to UInt64 yields UInt64Array");
+
+    let materialized = materializer.take(ids, take_cols)?;
+
+    // Output columns = input columns followed by the materialized ones; the
+    // names and nullability come from `out_schema`, not from `materialized`.
+    let mut cols: Vec<ArrayRef> = input.columns().to_vec();
+    cols.extend(materialized.columns().iter().cloned());
+
+    RecordBatch::try_new(out_schema.clone(), cols).map_err(|e| GraphError::ExecutionError {
+        message: format!("take: build output batch: {}", e),
+        location: snafu::Location::new(file!(), line!(), column!()),
+    })
+}
+
+/// Logical extension node for materializing target columns via take().
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct LanceTakeNode {
+    /// Input subplan (a `CsrExpandNode`).
+    pub input: LogicalPlan,
+    /// Lowercased target node table name (to collect rows from).
+    pub target_table: String,
+    /// Qualified column in `input` holding the row ids (e.g. `b__id`).
+    pub row_id_column: String,
+    /// Raw (unqualified, lowercased) target columns to materialize, in output order.
+    pub take_cols: Vec<String>,
+    /// Output schema = input schema + qualified materialized columns.
+    pub schema: DFSchemaRef,
+}
+
+// Ordering compares only the node's own settings; `input` and `schema` do not
+// take part in it.
+impl PartialOrd for LanceTakeNode {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        (&self.target_table, &self.row_id_column, &self.take_cols).partial_cmp(&(
+            &other.target_table,
+            &other.row_id_column,
+            &other.take_cols,
+        ))
+    }
+}
+
+impl UserDefinedLogicalNodeCore for LanceTakeNode {
+    fn name(&self) -> &str {
+        "LanceTake"
+    }
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+    fn schema(&self) -> &DFSchemaRef {
+        &self.schema
+    }
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+    fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(
+            f,
+            "LanceTake: table={}, row_id={}, cols={:?}",
+            self.target_table, self.row_id_column, self.take_cols
+        )
+    }
+    /// Rebuild the node over a replacement input. The node carries no
+    /// expressions, so `_exprs` is ignored. Errors (instead of panicking)
+    /// unless exactly one input is supplied.
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        mut inputs: Vec<LogicalPlan>,
+    ) -> DFResult<Self> {
+        if inputs.len() != 1 {
+            return Err(datafusion::error::DataFusionError::Internal(format!(
+                "LanceTakeNode: expected 1 input, got {}",
+                inputs.len()
+            )));
+        }
+        // Clone field by field so the old input plan, which is being
+        // replaced anyway, is not cloned.
+        Ok(Self {
+            input: inputs.swap_remove(0),
+            target_table: self.target_table.clone(),
+            row_id_column: self.row_id_column.clone(),
+            take_cols: self.take_cols.clone(),
+            schema: self.schema.clone(),
+        })
+    }
+}
+
+/// Physical operator for `LanceTakeNode`.
+#[derive(Debug)]
+pub struct LanceTakeExec {
+    /// Upstream plan whose batches carry the row-id column.
+    input: Arc<dyn ExecutionPlan>,
+    /// Source of the target columns.
+    materializer: Arc<dyn RowMaterializer>,
+    /// Position of the row-id column within the input batches.
+    row_id_idx: usize,
+    /// Raw target column names to materialize, in output order.
+    take_cols: Vec<String>,
+    /// Schema of the batches this operator emits.
+    out_schema: SchemaRef,
+    /// Plan properties, computed once in `new`.
+    props: PlanProperties,
+}
+
+impl LanceTakeExec {
+    /// Create the operator over `input`.
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        materializer: Arc<dyn RowMaterializer>,
+        row_id_idx: usize,
+        take_cols: Vec<String>,
+        out_schema: SchemaRef,
+    ) -> Self {
+        // `execute(partition)` forwards `partition` straight to the input, so
+        // this operator has exactly as many output partitions as its input.
+        // Advertising a fixed single partition would leave partitions >= 1 of
+        // a multi-partition input unpolled and silently drop their rows.
+        let partition_count = input.properties().output_partitioning().partition_count();
+        let props = PlanProperties::new(
+            EquivalenceProperties::new(out_schema.clone()),
+            Partitioning::UnknownPartitioning(partition_count),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        );
+        Self {
+            input,
+            materializer,
+            row_id_idx,
+            take_cols,
+            out_schema,
+            props,
+        }
+    }
+}
+
+impl DisplayAs for LanceTakeExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "LanceTakeExec: cols={:?}", self.take_cols)
+    }
+}
+
+impl ExecutionPlan for LanceTakeExec {
+    fn name(&self) -> &str {
+        "LanceTakeExec"
+    }
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn properties(&self) -> &PlanProperties {
+        &self.props
+    }
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+    /// Rebuild the operator over a replacement input; every other setting is
+    /// carried over. Errors (instead of panicking) unless exactly one child
+    /// is supplied.
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        if children.len() != 1 {
+            return Err(datafusion::error::DataFusionError::Internal(format!(
+                "LanceTakeExec: expected 1 child, got {}",
+                children.len()
+            )));
+        }
+        Ok(Arc::new(LanceTakeExec::new(
+            children.swap_remove(0),
+            self.materializer.clone(),
+            self.row_id_idx,
+            self.take_cols.clone(),
+            self.out_schema.clone(),
+        )))
+    }
+    /// Stream partition `partition` of the input, appending the materialized
+    /// columns to one batch at a time with `take_batch`.
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let input = self.input.execute(partition, context)?;
+        let materializer = self.materializer.clone();
+        let row_id_idx = self.row_id_idx;
+        let take_cols = self.take_cols.clone();
+        let out_schema = self.out_schema.clone();
+        // Second handle: the closure below takes ownership of `out_schema`.
+        let out_schema_for_stream = out_schema.clone();
+        let stream = input.map(move |rb| {
+            let rb = rb?;
+            take_batch(&rb, row_id_idx, materializer.as_ref(), &take_cols, &out_schema)
+                .map_err(|e| datafusion::error::DataFusionError::Execution(e.to_string()))
+        });
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            out_schema_for_stream,
+            stream,
+        )))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_array::StringArray;
+    use arrow_schema::Schema;
+
+    fn target_batch() -> RecordBatch {
+        // person table: id, name, age (raw, lowercased column names)
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::UInt64, false),
+            Field::new("name", DataType::Utf8, false),
+            Field::new("age", DataType::Int64, false),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(UInt64Array::from(vec![0u64, 1, 2])),
+                Arc::new(StringArray::from(vec!["alice", "bob", "carol"])),
+                Arc::new(arrow_array::Int64Array::from(vec![30i64, 40, 50])),
+            ],
+        )
+        .unwrap()
+    }
+
+    #[test]
+    fn test_in_memory_materializer_take_subset() {
+        let m = InMemoryMaterializer::new(target_batch());
+        let ids = UInt64Array::from(vec![2u64, 0]);
+        let out = m.take(&ids, &["name".to_string()]).unwrap();
+        assert_eq!(out.num_columns(), 1);
+        let names = out.column(0).as_any().downcast_ref::<StringArray>().unwrap();
+        assert_eq!(names.value(0), "carol");
+        assert_eq!(names.value(1), "alice");
+    }
+
+    #[test]
+    fn test_in_memory_materializer_missing_column_errors() {
+        let m = InMemoryMaterializer::new(target_batch());
+        let ids = UInt64Array::from(vec![0u64]);
+        assert!(m.take(&ids, &["nonexistent".to_string()]).is_err());
+    }
+
+    #[test]
+    fn test_in_memory_materializer_null_row_id_yields_null_value() {
+        // "name" is non-nullable in the target table; a null row id must still
+        // produce a valid batch whose field is widened to nullable.
+        let m = InMemoryMaterializer::new(target_batch());
+        let ids = UInt64Array::from(vec![Some(1u64), None]);
+        let out = m.take(&ids, &["name".to_string()]).unwrap();
+        assert!(out.schema().field(0).is_nullable());
+        assert!(out.column(0).is_valid(0));
+        assert!(out.column(0).is_null(1));
+    }
+
+    #[test]
+    fn test_in_memory_materializer_out_of_range_row_id_errors() {
+        // The target table has 3 rows; row id 3 must be an error, not a panic.
+        let m = InMemoryMaterializer::new(target_batch());
+        let ids = UInt64Array::from(vec![3u64]);
+        assert!(m.take(&ids, &["name".to_string()]).is_err());
+    }
+
+    #[test]
+    fn test_take_batch_appends_qualified_columns() {
+        // input: a__name, b__id (b__id is the neighbor row id)
+        let in_schema = Arc::new(Schema::new(vec![
+            Field::new("a__name", DataType::Utf8, false),
+            Field::new("b__id", DataType::UInt64, true),
+        ]));
+        let input = RecordBatch::try_new(
+            in_schema,
+            vec![
+                Arc::new(StringArray::from(vec!["x", "y"])),
+                Arc::new(UInt64Array::from(vec![1u64, 2])),
+            ],
+        )
+        .unwrap();
+        // out: a__name, b__id, b__name, b__age
+        let out_schema = Arc::new(Schema::new(vec![
+            Field::new("a__name", DataType::Utf8, false),
+            Field::new("b__id", DataType::UInt64, true),
+            Field::new("b__name", DataType::Utf8, true),
+            Field::new("b__age", DataType::Int64, true),
+        ]));
+        let m = InMemoryMaterializer::new(target_batch());
+        let out = take_batch(
+            &input,
+            1,
+            &m,
+            &["name".to_string(), "age".to_string()],
+            &out_schema,
+        )
+        .unwrap();
+        assert_eq!(out.num_rows(), 2);
+        let b_name = out.column(2).as_any().downcast_ref::<StringArray>().unwrap();
+        let b_age = out
+            .column(3)
+            .as_any()
+            .downcast_ref::<arrow_array::Int64Array>()
+            .unwrap();
+        assert_eq!(b_name.value(0), "bob"); // row id 1
+        assert_eq!(b_name.value(1), "carol"); // row id 2
+        assert_eq!(b_age.values(), &[40, 50]);
+    }
+
+    #[test]
+    fn test_take_batch_empty_take_cols_passthrough() {
+        // With no target columns to materialize, output == input (same rows/cols).
+        let in_schema = Arc::new(Schema::new(vec![
+            Field::new("a__name", DataType::Utf8, false),
+            Field::new("b__id", DataType::UInt64, true),
+        ]));
+        let input = RecordBatch::try_new(
+            in_schema.clone(),
+            vec![
+                Arc::new(StringArray::from(vec!["x", "y"])),
+                Arc::new(UInt64Array::from(vec![1u64, 2])),
+            ],
+        )
+        .unwrap();
+        let m = InMemoryMaterializer::new(target_batch());
+        let out = take_batch(&input, 1, &m, &[], &in_schema).unwrap();
+        assert_eq!(out.num_rows(), 2);
+        assert_eq!(out.num_columns(), 2);
+    }
+}
From f2a1526fe2f06d0f54e5847c23b79eec991b4b7f Mon Sep 17 00:00:00 2001
From: "jianjian.xie"
Date: Mon, 22 Jun 2026 16:00:47 -0700
Subject: [PATCH 07/10] feat(native): CsrExtensionPlanner + CsrQueryPlanner physical planning

---
 .../lance_native_planner/extension_planner.rs | 153 ++++++++++++++++++
 .../src/lance_native_planner/mod.rs | 2 +
 2 files changed, 155 insertions(+)
 create mode 100644 crates/lance-graph/src/lance_native_planner/extension_planner.rs

diff --git a/crates/lance-graph/src/lance_native_planner/extension_planner.rs b/crates/lance-graph/src/lance_native_planner/extension_planner.rs
new file mode 100644
index 00000000..ce569402
--- /dev/null
+++ b/crates/lance-graph/src/lance_native_planner/extension_planner.rs
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Physical planning for the native CSR extension nodes.
+//!
+//! `CsrQueryPlanner` is registered on the execution `SessionContext`. It runs
+//! the `DefaultPhysicalPlanner` with `CsrExtensionPlanner`, which builds the
+//! `CsrIndex` (from the edge table) and the `InMemoryMaterializer` (from the
+//! target node table) at physical-planning time.
+
+use std::sync::Arc;
+
+use arrow::compute::concat_batches;
+use arrow_schema::Field;
+use async_trait::async_trait;
+use datafusion::common::Result as DFResult;
+use datafusion::error::DataFusionError;
+use datafusion::execution::context::{QueryPlanner, SessionContext, SessionState};
+use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNode};
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner};
+
+use super::csr_expand::{CsrExpandExec, CsrExpandNode};
+use super::direction::NativeDirection;
+use super::take::{InMemoryMaterializer, LanceTakeExec, LanceTakeNode};
+use crate::csr_index::CsrIndexBuilder;
+
+/// Collect a registered table to a single `RecordBatch`.
+///
+/// Phase 2 builds the CSR / materializer eagerly at physical-planning time, so
+/// the whole edge / target table is read into memory here. An empty table yields
+/// an empty batch (correct schema), producing an empty CSR — i.e. no traversals.
+///
+/// # Errors
+///
+/// Fails when `table` cannot be resolved in the session, when collecting it
+/// fails, or when the collected batches cannot be concatenated.
+async fn collect_table(
+    session_state: &SessionState,
+    table: &str,
+) -> DFResult<arrow_array::RecordBatch> {
+    // A throwaway context over a copy of the planning session's state, used
+    // only to resolve and scan the table by name.
+    let ctx = SessionContext::new_with_state(session_state.clone());
+    let df = ctx.table(table).await?;
+    // Capture the schema first: `collect` consumes the DataFrame, and
+    // `concat_batches` needs a schema even when there are zero batches.
+    let schema = df.schema().inner().clone();
+    let batches = df.collect().await?;
+    concat_batches(&schema, &batches).map_err(|e| {
+        DataFusionError::Execution(format!("collect table '{}': {}", table, e))
+    })
+}
+
+/// Extension planner that lowers `CsrExpandNode` and `LanceTakeNode`.
+#[derive(Debug)]
+pub struct CsrExtensionPlanner;
+
+impl CsrExtensionPlanner {
+    /// Return the node's only physical input, or an internal error naming
+    /// `node_name` when the planner handed over any other number of inputs.
+    fn single_input(
+        node_name: &str,
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        match physical_inputs {
+            [input] => Ok(Arc::clone(input)),
+            other => Err(DataFusionError::Internal(format!(
+                "{}: expected 1 physical input, got {}",
+                node_name,
+                other.len()
+            ))),
+        }
+    }
+
+    /// Lower a `CsrExpandNode`: read the edge table, build the CSR and wrap
+    /// `input` in a `CsrExpandExec`.
+    async fn plan_csr_expand(
+        expand: &CsrExpandNode,
+        input: Arc<dyn ExecutionPlan>,
+        session_state: &SessionState,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        // Build CSR from the edge table (reverse columns for Incoming).
+        let edges = collect_table(session_state, &expand.rel_type).await?;
+        let (src_col, dst_col) = match expand.direction {
+            NativeDirection::Outgoing => (&expand.src_field, &expand.dst_field),
+            NativeDirection::Incoming => (&expand.dst_field, &expand.src_field),
+        };
+        let csr = CsrIndexBuilder::new()
+            .add_edges_from_batch_with_columns(&edges, src_col, dst_col)
+            .map_err(|e| DataFusionError::Execution(e.to_string()))?
+            .build();
+
+        let source_id_idx = input
+            .schema()
+            .index_of(&expand.source_id_column)
+            .map_err(|e| {
+                DataFusionError::Execution(format!(
+                    "CsrExpand: source id column '{}' not found in input: {}",
+                    expand.source_id_column, e
+                ))
+            })?;
+        let neighbor_field =
+            Field::new(&expand.neighbor_column, expand.neighbor_data_type.clone(), true);
+
+        Ok(Arc::new(CsrExpandExec::new(
+            input,
+            Arc::new(csr),
+            source_id_idx,
+            neighbor_field,
+            expand.schema.inner().clone(),
+        )))
+    }
+
+    /// Lower a `LanceTakeNode`: read the target node table into an
+    /// `InMemoryMaterializer` and wrap `input` in a `LanceTakeExec`.
+    async fn plan_lance_take(
+        take: &LanceTakeNode,
+        input: Arc<dyn ExecutionPlan>,
+        session_state: &SessionState,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        let target = collect_table(session_state, &take.target_table).await?;
+        let materializer = Arc::new(InMemoryMaterializer::new(target));
+
+        let row_id_idx = input.schema().index_of(&take.row_id_column).map_err(|e| {
+            DataFusionError::Execution(format!(
+                "LanceTake: row id column '{}' not found in input: {}",
+                take.row_id_column, e
+            ))
+        })?;
+
+        Ok(Arc::new(LanceTakeExec::new(
+            input,
+            materializer,
+            row_id_idx,
+            take.take_cols.clone(),
+            take.schema.inner().clone(),
+        )))
+    }
+}
+
+#[async_trait]
+impl ExtensionPlanner for CsrExtensionPlanner {
+    /// Lower the two native extension nodes; any other node yields `Ok(None)`
+    /// so that another extension planner may handle it.
+    async fn plan_extension(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        node: &dyn UserDefinedLogicalNode,
+        _logical_inputs: &[&LogicalPlan],
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+        session_state: &SessionState,
+    ) -> DFResult<Option<Arc<dyn ExecutionPlan>>> {
+        if let Some(expand) = node.as_any().downcast_ref::<CsrExpandNode>() {
+            // Validate the input before paying for the table scan.
+            let input = Self::single_input("CsrExpandNode", physical_inputs)?;
+            return Self::plan_csr_expand(expand, input, session_state)
+                .await
+                .map(Some);
+        }
+
+        if let Some(take) = node.as_any().downcast_ref::<LanceTakeNode>() {
+            let input = Self::single_input("LanceTakeNode", physical_inputs)?;
+            return Self::plan_lance_take(take, input, session_state)
+                .await
+                .map(Some);
+        }
+
+        Ok(None)
+    }
+}
+
+/// Query planner that installs `CsrExtensionPlanner`.
+#[derive(Debug, Default)]
+pub struct CsrQueryPlanner;
+
+impl CsrQueryPlanner {
+    /// Create the planner; it holds no state.
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[async_trait]
+impl QueryPlanner for CsrQueryPlanner {
+    /// Plan with DataFusion's default physical planner, extended by
+    /// `CsrExtensionPlanner` for the native extension nodes.
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session_state: &SessionState,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        let planner =
+            DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(CsrExtensionPlanner)]);
+        planner
+            .create_physical_plan(logical_plan, session_state)
+            .await
+    }
+}
diff --git a/crates/lance-graph/src/lance_native_planner/mod.rs b/crates/lance-graph/src/lance_native_planner/mod.rs
index 06f1ae28..f8493480 100644
--- a/crates/lance-graph/src/lance_native_planner/mod.rs
+++ b/crates/lance-graph/src/lance_native_planner/mod.rs
@@ -10,8 +10,10 @@
 mod direction;
 mod csr_expand;
 mod take;
+mod extension_planner;
 
 pub use direction::NativeDirection;
+pub use extension_planner::CsrQueryPlanner;
 
 use crate::config::GraphConfig;
 use crate::datafusion_planner::GraphPhysicalPlanner;
From f622fb4c2577f2d304f04f039059dccd96c74718 Mon Sep 17 00:00:00 2001
From: "jianjian.xie"
Date: Mon, 22 Jun 2026 16:14:23 -0700
Subject: [PATCH 08/10] feat(native): LanceNativePlanner single-hop lowering with DataFusion fallback

---
 .../lance-graph/src/datafusion_planner/mod.rs | 9 +-
 .../src/lance_native_planner/mod.rs | 501
+++++++++++++++++- 2 files changed, 488 insertions(+), 22 deletions(-) diff --git a/crates/lance-graph/src/datafusion_planner/mod.rs b/crates/lance-graph/src/datafusion_planner/mod.rs index dfb4bb5b..8ac64671 100644 --- a/crates/lance-graph/src/datafusion_planner/mod.rs +++ b/crates/lance-graph/src/datafusion_planner/mod.rs @@ -17,14 +17,14 @@ pub mod analysis; mod builder; mod config_helpers; -mod expression; +pub(crate) mod expression; mod join_ops; mod scan_ops; mod udf; pub mod vector_ops; #[cfg(test)] -mod test_fixtures; +pub(crate) mod test_fixtures; // Re-export public types pub use analysis::{PlanningContext, QueryAnalysis, RelationshipInstance}; @@ -62,6 +62,11 @@ impl DataFusionPlanner { } } + /// Access the catalog, if any (used by the native planner). + pub(crate) fn catalog_ref(&self) -> Option<&Arc> { + self.catalog.as_ref() + } + /// Helper to convert DataFusion builder errors into GraphError::PlanError with context pub(crate) fn plan_error( &self, diff --git a/crates/lance-graph/src/lance_native_planner/mod.rs b/crates/lance-graph/src/lance_native_planner/mod.rs index f8493480..07fbc7f6 100644 --- a/crates/lance-graph/src/lance_native_planner/mod.rs +++ b/crates/lance-graph/src/lance_native_planner/mod.rs @@ -1,46 +1,507 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -//! Lance Native physical planner (placeholder) +//! Lance Native physical planner. //! -//! Rewritten in a later task to lower single-hop `Expand` onto CSR-backed -//! extension nodes. For now it keeps the original placeholder behavior so the -//! crate compiles between tasks. +//! Lowers a supported single-hop `Expand` onto CSR-backed extension nodes +//! (`CsrExpandNode` + `LanceTakeNode`). Any plan it cannot serve natively is +//! delegated wholesale to `DataFusionPlanner`, so `LanceNative` execution is +//! always correct on valid Cypher — it simply uses joins when CSR cannot help. 
-mod direction; mod csr_expand; -mod take; +mod direction; mod extension_planner; +mod take; pub use direction::NativeDirection; pub use extension_planner::CsrQueryPlanner; -use crate::config::GraphConfig; -use crate::datafusion_planner::GraphPhysicalPlanner; -use crate::error::Result; -use crate::logical_plan::LogicalOperator; -use datafusion::common::DFSchema; -use datafusion::logical_expr::{EmptyRelation, LogicalPlan}; use std::sync::Arc; -/// Placeholder Lance-native planner +use arrow_schema::{Field, Schema}; +use datafusion::common::DFSchema; +use datafusion::logical_expr::{Extension, LogicalPlan, LogicalPlanBuilder}; + +use crate::ast::RelationshipDirection; +use crate::case_insensitive::qualify_column; +use crate::config::GraphConfig; +use crate::datafusion_planner::expression::{to_df_boolean_expr, to_df_value_expr}; +use crate::datafusion_planner::{analysis, DataFusionPlanner, GraphPhysicalPlanner, PlanningContext}; +use crate::error::{GraphError, Result}; +use crate::logical_plan::{LogicalOperator, ProjectionItem, SortItem}; + +use csr_expand::CsrExpandNode; +use take::LanceTakeNode; + +/// Lance-native planner: CSR single-hop expand with DataFusion fallback. 
pub struct LanceNativePlanner { - #[allow(dead_code)] config: GraphConfig, + df: DataFusionPlanner, } impl LanceNativePlanner { pub fn new(config: GraphConfig) -> Self { - Self { config } + Self { + df: DataFusionPlanner::new(config.clone()), + config, + } + } + + pub fn with_catalog( + config: GraphConfig, + catalog: Arc, + ) -> Self { + Self { + df: DataFusionPlanner::with_catalog(config.clone(), catalog), + config, + } } } impl GraphPhysicalPlanner for LanceNativePlanner { - fn plan(&self, _logical_plan: &LogicalOperator) -> Result { - let schema = Arc::new(DFSchema::empty()); - Ok(LogicalPlan::EmptyRelation(EmptyRelation { - produce_one_row: false, - schema, + fn plan(&self, logical_plan: &LogicalOperator) -> Result { + if !can_plan_natively(logical_plan) { + return self.df.plan(logical_plan); + } + let analysis = analysis::analyze(logical_plan)?; + let mut ctx = PlanningContext::new(&analysis); + self.build_native(&mut ctx, logical_plan) + } +} + +impl LanceNativePlanner { + fn build_native( + &self, + ctx: &mut PlanningContext, + op: &LogicalOperator, + ) -> Result { + match op { + LogicalOperator::ScanByLabel { + variable, + label, + properties, + } => self.df.build_scan(ctx, variable, label, properties), + + LogicalOperator::Filter { input, predicate } => { + let child = self.build_native(ctx, input)?; + let expr = to_df_boolean_expr(predicate); + LogicalPlanBuilder::from(child) + .filter(expr) + .map_err(|e| self.plan_err("filter", e))? 
+ .build() + .map_err(|e| self.plan_err("filter build", e)) + } + + LogicalOperator::Project { input, projections } => { + let child = self.build_native(ctx, input)?; + self.build_project_on(child, projections) + } + + LogicalOperator::Sort { input, sort_items } => { + let child = self.build_native(ctx, input)?; + self.build_sort_on(child, sort_items) + } + + LogicalOperator::Limit { input, count } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .limit(0, Some(*count as usize)) + .map_err(|e| self.plan_err("limit", e))? + .build() + .map_err(|e| self.plan_err("limit build", e)) + } + + LogicalOperator::Offset { input, offset } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .limit(*offset as usize, None) + .map_err(|e| self.plan_err("offset", e))? + .build() + .map_err(|e| self.plan_err("offset build", e)) + } + + LogicalOperator::Distinct { input } => { + let child = self.build_native(ctx, input)?; + LogicalPlanBuilder::from(child) + .distinct() + .map_err(|e| self.plan_err("distinct", e))? + .build() + .map_err(|e| self.plan_err("distinct build", e)) + } + + LogicalOperator::Expand { + input, + source_variable, + target_variable, + target_label, + relationship_types, + direction, + .. 
+ } => self.build_expand_native( + ctx, + input, + source_variable, + target_variable, + target_label, + relationship_types, + direction, + ), + + other => Err(GraphError::PlanError { + message: format!("native planner reached unsupported operator: {:?}", other), + location: snafu::Location::new(file!(), line!(), column!()), + }), + } + } + + #[allow(clippy::too_many_arguments)] + fn build_expand_native( + &self, + ctx: &mut PlanningContext, + input: &LogicalOperator, + source_variable: &str, + target_variable: &str, + target_label: &str, + relationship_types: &[String], + direction: &RelationshipDirection, + ) -> Result { + let source_plan = self.build_native(ctx, input)?; + + let rel_type = &relationship_types[0]; + let rel_map = self + .config + .get_relationship_mapping(rel_type) + .ok_or_else(|| GraphError::ConfigError { + message: format!("No relationship mapping for '{}'", rel_type), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + let src_label = ctx + .analysis + .var_to_label + .get(source_variable) + .ok_or_else(|| GraphError::PlanError { + message: format!("No label for source variable '{}'", source_variable), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let src_node = self + .config + .get_node_mapping(src_label) + .ok_or_else(|| GraphError::ConfigError { + message: format!("No node mapping for label '{}'", src_label), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let tgt_node = self + .config + .get_node_mapping(target_label) + .ok_or_else(|| GraphError::ConfigError { + message: format!("No node mapping for label '{}'", target_label), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + let catalog = self.df.catalog_ref().ok_or_else(|| GraphError::ConfigError { + message: "LanceNativePlanner requires a catalog for native expand".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let tgt_source = 
catalog.node_source(target_label).ok_or_else(|| { + GraphError::ConfigError { + message: format!("No table source for target label '{}'", target_label), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?; + let tgt_arrow = tgt_source.schema(); + + let native_direction = match direction { + RelationshipDirection::Outgoing => NativeDirection::Outgoing, + RelationshipDirection::Incoming => NativeDirection::Incoming, + RelationshipDirection::Undirected => { + return Err(GraphError::PlanError { + message: "undirected expand is not natively supported".to_string(), + location: snafu::Location::new(file!(), line!(), column!()), + }); + } + }; + + let source_id_column = qualify_column(source_variable, &src_node.id_field); + let neighbor_column = qualify_column(target_variable, &tgt_node.id_field); + let id_lower = tgt_node.id_field.to_lowercase(); + let neighbor_data_type = tgt_arrow + .field_with_name(&id_lower) + .map_err(|e| GraphError::ConfigError { + message: format!( + "target id field '{}' not found in '{}': {}", + tgt_node.id_field, target_label, e + ), + location: snafu::Location::new(file!(), line!(), column!()), + })? + .data_type() + .clone(); + + // CsrExpandNode output schema = source schema + neighbor column. + let src_arrow = source_plan.schema().inner(); + let mut expand_fields: Vec = + src_arrow.fields().iter().map(|f| f.as_ref().clone()).collect(); + expand_fields.push(Field::new(&neighbor_column, neighbor_data_type.clone(), true)); + let expand_arrow = Schema::new(expand_fields); + // Fields are intentionally unqualified: names are already the pre-qualified + // `{var}__{col}` strings the reused DataFusion builders resolve by name. 
+ let expand_schema = Arc::new( + DFSchema::try_from(expand_arrow).map_err(|e| self.plan_err("expand schema", e))?, + ); + + let expand_node = CsrExpandNode { + input: source_plan, + rel_type: rel_type.to_lowercase(), + src_field: rel_map.source_id_field.to_lowercase(), + dst_field: rel_map.target_id_field.to_lowercase(), + direction: native_direction, + source_id_column, + neighbor_column: neighbor_column.clone(), + neighbor_data_type, + schema: expand_schema, + }; + let expand_plan = LogicalPlan::Extension(Extension { + node: Arc::new(expand_node), + }); + + // LanceTakeNode: materialize all target columns except the id field. + let take_cols: Vec = tgt_arrow + .fields() + .iter() + .map(|f| f.name().to_lowercase()) + .filter(|n| n != &id_lower) + .collect(); + + let mut take_fields: Vec = expand_plan + .schema() + .inner() + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + for raw in &take_cols { + let f = tgt_arrow + .field_with_name(raw) + .map_err(|e| self.plan_err("target field", e))?; + take_fields.push(Field::new( + qualify_column(target_variable, raw), + f.data_type().clone(), + true, + )); + } + let take_arrow = Schema::new(take_fields); + let take_schema = + Arc::new(DFSchema::try_from(take_arrow).map_err(|e| self.plan_err("take schema", e))?); + + let take_node = LanceTakeNode { + input: expand_plan, + target_table: target_label.to_lowercase(), + row_id_column: neighbor_column, + take_cols, + schema: take_schema, + }; + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(take_node), })) } + + fn build_project_on( + &self, + input: LogicalPlan, + projections: &[ProjectionItem], + ) -> Result { + let has_agg = projections + .iter() + .any(|p| crate::datafusion_planner::expression::contains_aggregate(&p.expression)); + if has_agg { + self.df.build_project_with_aggregates(input, projections) + } else { + self.df.build_simple_project(input, projections) + } + } + + fn build_sort_on(&self, input: LogicalPlan, sort_items: &[SortItem]) -> 
Result { + use datafusion::logical_expr::SortExpr; + let sort_exprs: Vec = sort_items + .iter() + .map(|item| { + let expr = to_df_value_expr(&item.expression); + let asc = matches!(item.direction, crate::ast::SortDirection::Ascending); + SortExpr { + expr, + asc, + nulls_first: true, + } + }) + .collect(); + LogicalPlanBuilder::from(input) + .sort(sort_exprs) + .map_err(|e| self.plan_err("sort", e))? + .build() + .map_err(|e| self.plan_err("sort build", e)) + } + + fn plan_err(&self, what: &str, e: E) -> GraphError { + GraphError::PlanError { + message: format!("native {}: {}", what, e), + location: snafu::Location::new(file!(), line!(), column!()), + } + } +} + +/// True iff the plan is a single-hop expand the native planner can serve. +fn can_plan_natively(op: &LogicalOperator) -> bool { + let mut expands = 0usize; + if !walk_supported(op, &mut expands) { + return false; + } + expands == 1 +} + +/// Recursively check every operator is natively supportable, accumulating the +/// total number of `Expand` nodes into `expands`. Returns false on any +/// unsupported operator. The caller enforces exactly one expand (`expands == 1`); +/// this function does not early-exit on the second expand. +fn walk_supported(op: &LogicalOperator, expands: &mut usize) -> bool { + match op { + LogicalOperator::ScanByLabel { .. } => true, + LogicalOperator::Filter { input, .. } + | LogicalOperator::Project { input, .. } + | LogicalOperator::Sort { input, .. } + | LogicalOperator::Limit { input, .. } + | LogicalOperator::Offset { input, .. } + | LogicalOperator::Distinct { input } => walk_supported(input, expands), + LogicalOperator::Expand { + input, + relationship_types, + direction, + properties, + target_properties, + .. 
+ } => { + *expands += 1; + if relationship_types.len() != 1 { + return false; + } + if matches!(direction, RelationshipDirection::Undirected) { + return false; + } + // Inline relationship/target-node property filters are not handled by + // the native path yet; fall back to the DataFusion join planner. + if !properties.is_empty() || !target_properties.is_empty() { + return false; + } + walk_supported(input, expands) + } + LogicalOperator::VariableLengthExpand { .. } + | LogicalOperator::Join { .. } + | LogicalOperator::Unwind { .. } => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ast::{PropertyRef, ValueExpression}; + use crate::datafusion_planner::test_fixtures::{make_catalog, person_knows_config, person_scan}; + use crate::logical_plan::{LogicalOperator, ProjectionItem}; + + fn knows_expand(direction: RelationshipDirection) -> LogicalOperator { + LogicalOperator::Expand { + input: Box::new(person_scan("a")), + source_variable: "a".to_string(), + target_variable: "b".to_string(), + target_label: "Person".to_string(), + relationship_types: vec!["KNOWS".to_string()], + direction, + relationship_variable: None, + properties: Default::default(), + target_properties: Default::default(), + } + } + + #[test] + fn test_can_plan_natively_single_hop() { + let plan = LogicalOperator::Project { + input: Box::new(knows_expand(RelationshipDirection::Outgoing)), + projections: vec![ProjectionItem { + expression: ValueExpression::Property(PropertyRef::new("b", "name")), + alias: None, + }], + }; + assert!(can_plan_natively(&plan)); + } + + #[test] + fn test_cannot_plan_undirected_or_multitype() { + assert!(!can_plan_natively(&knows_expand( + RelationshipDirection::Undirected + ))); + let mut multi = knows_expand(RelationshipDirection::Outgoing); + if let LogicalOperator::Expand { + relationship_types, .. 
+ } = &mut multi + { + relationship_types.push("LIKES".to_string()); + } + assert!(!can_plan_natively(&multi)); + } + + #[test] + fn test_cannot_plan_zero_expands() { + assert!(!can_plan_natively(&person_scan("a"))); + } + + #[test] + fn test_native_plan_contains_extension_nodes() { + let plan = LogicalOperator::Project { + input: Box::new(knows_expand(RelationshipDirection::Outgoing)), + projections: vec![ProjectionItem { + expression: ValueExpression::Property(PropertyRef::new("b", "name")), + alias: None, + }], + }; + let planner = LanceNativePlanner::with_catalog(person_knows_config(), make_catalog()); + let df_plan = planner.plan(&plan).unwrap(); + let s = format!("{:?}", df_plan); + assert!(s.contains("CsrExpand"), "missing CsrExpand: {}", s); + assert!(s.contains("LanceTake"), "missing LanceTake: {}", s); + } + + #[test] + fn test_unsupported_falls_back_to_join() { + let vlexpand = LogicalOperator::VariableLengthExpand { + input: Box::new(person_scan("a")), + source_variable: "a".into(), + target_variable: "b".into(), + relationship_types: vec!["KNOWS".into()], + direction: RelationshipDirection::Outgoing, + relationship_variable: None, + min_length: Some(1), + max_length: Some(2), + target_properties: Default::default(), + }; + let planner = LanceNativePlanner::with_catalog(person_knows_config(), make_catalog()); + let df_plan = planner.plan(&vlexpand).unwrap(); + let s = format!("{:?}", df_plan); + assert!(!s.contains("CsrExpand"), "should not be native: {}", s); + } + + #[test] + fn test_cannot_plan_expand_with_inline_property_filters() { + use crate::ast::PropertyValue; + // Target-node inline filter -> must fall back. + let mut op = knows_expand(RelationshipDirection::Outgoing); + if let LogicalOperator::Expand { target_properties, .. } = &mut op { + target_properties.insert("active".into(), PropertyValue::Boolean(true)); + } + assert!(!can_plan_natively(&op)); + + // Relationship inline filter -> must fall back. 
+ let mut op2 = knows_expand(RelationshipDirection::Outgoing); + if let LogicalOperator::Expand { properties, .. } = &mut op2 { + properties.insert("since".into(), PropertyValue::Integer(2020)); + } + assert!(!can_plan_natively(&op2)); + } } From 2df8fccefad5e3249c95c9dc6f6845008efb228a Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 16:31:07 -0700 Subject: [PATCH 09/10] feat(native): wire LanceNative execution strategy + e2e parity tests Co-Authored-By: Claude Sonnet 4.6 (1M context) --- crates/lance-graph/src/query.rs | 103 ++++++++++- .../tests/test_lance_native_expand.rs | 160 ++++++++++++++++++ 2 files changed, 254 insertions(+), 9 deletions(-) create mode 100644 crates/lance-graph/tests/test_lance_native_expand.rs diff --git a/crates/lance-graph/src/query.rs b/crates/lance-graph/src/query.rs index e63e2a6a..eb5d8ff6 100644 --- a/crates/lance-graph/src/query.rs +++ b/crates/lance-graph/src/query.rs @@ -233,10 +233,7 @@ impl CypherQuery { let strategy = strategy.unwrap_or_default(); match strategy { ExecutionStrategy::DataFusion => self.execute_datafusion(datasets).await, - ExecutionStrategy::LanceNative => Err(GraphError::UnsupportedFeature { - feature: "Lance native execution strategy is not yet implemented".to_string(), - location: snafu::Location::new(file!(), line!(), column!()), - }), + ExecutionStrategy::LanceNative => self.execute_lance_native(datasets).await, } } @@ -328,7 +325,7 @@ impl CypherQuery { // Build catalog and context from datasets let (catalog, ctx) = self - .build_catalog_and_context_from_datasets(datasets) + .build_catalog_and_context_from_datasets(datasets, false) .await?; // Delegate to the internal explain method @@ -366,7 +363,7 @@ impl CypherQuery { // Build catalog and context from datasets using the helper let (catalog, ctx) = self - .build_catalog_and_context_from_datasets(datasets) + .build_catalog_and_context_from_datasets(datasets, false) .await?; // Generate Logical Plan @@ -596,7 +593,7 @@ impl 
CypherQuery { // Build catalog and context from datasets let (catalog, ctx) = self - .build_catalog_and_context_from_datasets(datasets) + .build_catalog_and_context_from_datasets(datasets, false) .await?; // Delegate to common execution logic @@ -604,10 +601,87 @@ impl CypherQuery { .await } + /// Execute using the Lance native CSR strategy with in-memory datasets. + /// + /// Installs `CsrQueryPlanner` on the session so that CSR extension nodes in the + /// logical plan are lowered to `CsrExpandExec` / `LanceTakeExec` at physical-plan + /// time. Unsupported plans (e.g. variable-length paths) fall back automatically to + /// the DataFusion join path via `LanceNativePlanner`'s internal delegate. + async fn execute_lance_native( + &self, + datasets: HashMap, + ) -> Result { + use arrow::compute::concat_batches; + use std::sync::Arc; + + // Build catalog and a CSR-enabled session context. + let (catalog, ctx) = self + .build_catalog_and_context_from_datasets(datasets, true) + .await?; + + // Lower the graph logical plan through LanceNativePlanner. 
+ let df_logical_plan = self.create_logical_plans_native(Arc::new(catalog))?; + + let df = ctx + .execute_logical_plan(df_logical_plan) + .await + .map_err(|e| GraphError::ExecutionError { + message: format!("Failed to execute native plan: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let result_schema = df.schema().inner().clone(); + let batches = df.collect().await.map_err(|e| GraphError::ExecutionError { + message: format!("Failed to collect native results: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + if batches.is_empty() { + return Ok(arrow::record_batch::RecordBatch::new_empty(result_schema)); + } + concat_batches(&result_schema, &batches).map_err(|e| GraphError::ExecutionError { + message: format!("Failed to concat native results: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) + } + + /// Build a DataFusion `LogicalPlan` via `LanceNativePlanner`. + /// + /// Mirrors `create_logical_plans` exactly but uses `LanceNativePlanner` for phase 3 + /// instead of `DataFusionPlanner`. Unsupported patterns (variable-length paths, etc.) + /// are transparently delegated back to the DataFusion join planner. 
+ fn create_logical_plans_native( + &self, + catalog: std::sync::Arc, + ) -> Result { + use crate::datafusion_planner::GraphPhysicalPlanner; + use crate::lance_native_planner::LanceNativePlanner; + use crate::semantic::SemanticAnalyzer; + + let config = self.require_config()?; + + // Phase 1: Semantic Analysis + let mut analyzer = SemanticAnalyzer::new(config.clone()); + let semantic = analyzer.analyze(&self.ast, &self.parameters)?; + if !semantic.errors.is_empty() { + return Err(GraphError::PlanError { + message: format!("Semantic analysis failed:\n{}", semantic.errors.join("\n")), + location: snafu::Location::new(file!(), line!(), column!()), + }); + } + + // Phase 2: Graph Logical Plan + let mut logical_planner = LogicalPlanner::new(config); + let logical_plan = logical_planner.plan(&semantic.ast)?; + + // Phase 3: Native DataFusion Logical Plan (with CSR extension nodes where possible) + let native = LanceNativePlanner::with_catalog(config.clone(), catalog); + native.plan(&logical_plan) + } + /// Helper to build catalog and context from in-memory datasets async fn build_catalog_and_context_from_datasets( &self, datasets: HashMap, + native: bool, ) -> Result<( lance_graph_catalog::InMemoryCatalog, datafusion::execution::context::SessionContext, @@ -624,8 +698,19 @@ impl CypherQuery { }); } - // Create session context and catalog - let ctx = SessionContext::new(); + // Create session context — with CSR query planner when native=true + let ctx = if native { + use datafusion::execution::session_state::SessionStateBuilder; + let state = SessionStateBuilder::new() + .with_default_features() + .with_query_planner(Arc::new( + crate::lance_native_planner::CsrQueryPlanner::new(), + )) + .build(); + SessionContext::new_with_state(state) + } else { + SessionContext::new() + }; let mut catalog = InMemoryCatalog::new(); // Register all datasets as tables diff --git a/crates/lance-graph/tests/test_lance_native_expand.rs b/crates/lance-graph/tests/test_lance_native_expand.rs 
new file mode 100644 index 00000000..b8b88eda --- /dev/null +++ b/crates/lance-graph/tests/test_lance_native_expand.rs @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! End-to-end parity tests: native CSR expand vs DataFusion join path. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Int64Array, RecordBatch, StringArray, UInt64Array}; +use arrow_schema::{DataType, Field, Schema}; +use lance_graph::config::GraphConfig; +use lance_graph::query::{CypherQuery, ExecutionStrategy}; + +fn person_batch() -> RecordBatch { + // Dense ids 0..4 (row id == id_field value). + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt64, false), + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1, 2, 3])), + Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])), + Arc::new(Int64Array::from(vec![30i64, 40, 25, 50])), + ], + ) + .unwrap() +} + +fn knows_batch() -> RecordBatch { + // 0->1, 0->2, 1->3, 2->3 + let schema = Arc::new(Schema::new(vec![ + Field::new("src_id", DataType::UInt64, false), + Field::new("dst_id", DataType::UInt64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64, 0, 1, 2])), + Arc::new(UInt64Array::from(vec![1u64, 2, 3, 3])), + ], + ) + .unwrap() +} + +fn config() -> GraphConfig { + GraphConfig::builder() + .with_node_label("Person", "id") + .with_relationship("KNOWS", "src_id", "dst_id") + .build() + .unwrap() +} + +fn datasets() -> HashMap { + let mut d = HashMap::new(); + d.insert("Person".to_string(), person_batch()); + d.insert("KNOWS".to_string(), knows_batch()); + d +} + +/// Collect (a.name, b.name) rows as a sorted Vec for order-independent compare. 
+fn name_pairs(batch: &RecordBatch) -> Vec<(String, String)> { + let cols: Vec<&StringArray> = (0..batch.num_columns()) + .map(|i| { + batch + .column(i) + .as_any() + .downcast_ref::() + .unwrap() + }) + .collect(); + let mut rows: Vec<(String, String)> = (0..batch.num_rows()) + .map(|r| (cols[0].value(r).to_string(), cols[1].value(r).to_string())) + .collect(); + rows.sort(); + rows +} + +#[tokio::test] +async fn test_native_expand_matches_datafusion_names() { + let q = "MATCH (a:Person)-[:KNOWS]->(b:Person) RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + + let expected = vec![ + ("alice".to_string(), "bob".to_string()), + ("alice".to_string(), "carol".to_string()), + ("bob".to_string(), "dave".to_string()), + ("carol".to_string(), "dave".to_string()), + ]; + assert_eq!(name_pairs(&native), expected); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} + +#[tokio::test] +async fn test_native_expand_with_target_filter() { + let q = "MATCH (a:Person)-[:KNOWS]->(b:Person) WHERE b.age > 30 RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); + // bob(40) and dave(50) qualify as targets: (alice,bob),(bob,dave),(carol,dave) + assert_eq!( + name_pairs(&native), + vec![ + ("alice".to_string(), "bob".to_string()), + ("bob".to_string(), "dave".to_string()), + ("carol".to_string(), "dave".to_string()), + ] + ); +} + +#[tokio::test] +async fn test_native_expand_incoming_matches_datafusion() { + let q = "MATCH 
(a:Person)<-[:KNOWS]-(b:Person) RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} + +#[tokio::test] +async fn test_native_varlength_falls_back_and_matches() { + // Variable-length path is unsupported natively; LanceNative must fall back + // and produce the same result as DataFusion. + let q = "MATCH (a:Person)-[:KNOWS*1..2]->(b:Person) RETURN a.name, b.name"; + let query = CypherQuery::new(q).unwrap().with_config(config()); + let native = query + .execute(datasets(), Some(ExecutionStrategy::LanceNative)) + .await + .unwrap(); + let df = query + .execute(datasets(), Some(ExecutionStrategy::DataFusion)) + .await + .unwrap(); + assert_eq!(name_pairs(&native), name_pairs(&df)); +} From 033a1329fc301d887d0da513a8669cdff0c6c471 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Mon, 22 Jun 2026 23:20:28 -0700 Subject: [PATCH 10/10] test(csr): add error-path tests and doc comment for add_edges_from_batch_with_columns Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/lance-graph/src/csr_index.rs | 41 ++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/crates/lance-graph/src/csr_index.rs b/crates/lance-graph/src/csr_index.rs index 1d3d7200..526a38e3 100644 --- a/crates/lance-graph/src/csr_index.rs +++ b/crates/lance-graph/src/csr_index.rs @@ -236,7 +236,8 @@ impl CsrIndexBuilder { self } - /// Add edges from an Arrow RecordBatch with `src_id` and `dst_id` columns. + /// Add edges from an Arrow RecordBatch. Expects columns named `src_id` and + /// `dst_id`; use [`Self::add_edges_from_batch_with_columns`] for other names. 
pub fn add_edges_from_batch(self, batch: &RecordBatch) -> Result { self.add_edges_from_batch_with_columns(batch, "src_id", "dst_id") } @@ -693,5 +694,43 @@ mod tests { .build(); assert_eq!(rev.neighbors(2), &[0, 1]); assert_eq!(rev.neighbors(1), &[0]); + assert_eq!(rev.neighbors(0), &[] as &[u64]); + } + + #[test] + fn test_add_edges_from_batch_with_columns_errors() { + // Missing column name -> error + let schema = Arc::new(Schema::new(vec![ + Field::new("src_person_id", DataType::UInt64, false), + Field::new("dst_person_id", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![0u64])), + Arc::new(UInt64Array::from(vec![1u64])), + ], + ) + .unwrap(); + assert!(CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch, "missing", "dst_person_id") + .is_err()); + + // Wrong column type (Int64 instead of UInt64) -> error + let schema2 = Arc::new(Schema::new(vec![ + Field::new("src_person_id", DataType::Int64, false), + Field::new("dst_person_id", DataType::UInt64, false), + ])); + let batch2 = RecordBatch::try_new( + schema2, + vec![ + Arc::new(arrow_array::Int64Array::from(vec![0i64])), + Arc::new(UInt64Array::from(vec![1u64])), + ], + ) + .unwrap(); + assert!(CsrIndexBuilder::new() + .add_edges_from_batch_with_columns(&batch2, "src_person_id", "dst_person_id") + .is_err()); } }