lance-format · jja725 · Jun 11, 2026 · Jun 11, 2026
diff --git a/include/lance/lance.h b/include/lance/lance.h
@@ -559,6 +559,114 @@ int32_t lance_dataset_alter_columns(
     size_t num_alterations
 );
 
+/* ─── lance_dataset_add_columns ───────────────────────────────────────────── */
+
+/**
+ * One entry of the `columns` array passed to lance_dataset_add_columns_sql():
+ * a new column computed by a SQL expression over the dataset's existing
+ * columns, e.g. { .name = "doubled", .expression = "x * 2" }. Both strings are
+ * read by shared reference, and only for the duration of the call.
+ */
+typedef struct LanceSqlColumn {
+    /* Name of the new column. Required: non-NULL, non-empty UTF-8. */
+    const char* name;
+    /* SQL expression over existing columns. Required, non-empty UTF-8. */
+    const char* expression;
+} LanceSqlColumn;
+
+/**
+ * Add one or more columns computed from SQL expressions over the dataset's
+ * existing columns, committing a new manifest. Each fragment is scanned, the
+ * expressions are evaluated, and the results are written as new column files.
+ *
+ * Mutates `dataset` in place — the same handle remains valid afterward and
+ * sees the new version. Scanners already in flight keep their pre-add view.
+ *
+ * @param dataset      Open dataset (not consumed). Mutated in place. Must not
+ *                     be NULL.
+ * @param columns      Array of `LanceSqlColumn`. Must not be NULL; each entry's
+ *                     `name` and `expression` must be non-NULL and non-empty.
+ * @param num_columns  Length of `columns`. Must be > 0.
+ * @param batch_size   Rows per scan batch while evaluating expressions;
+ *                     0 = upstream default. Must not exceed UINT32_MAX.
+ * @return 0 on success, -1 on error. Error codes:
+ *         - LANCE_ERR_INVALID_ARGUMENT: NULL `dataset` or `columns`,
+ *           `num_columns` of 0, NULL / empty / non-UTF-8 `name` or
+ *           `expression`, malformed SQL *syntax*, a new column name that
+ *           collides with an existing column, or `batch_size` > UINT32_MAX.
+ *         - LANCE_ERR_INTERNAL: an expression that references a
+ *           *non-existent column* (an upstream schema error, the same path
+ *           as lance_dataset_delete) — not LANCE_ERR_INVALID_ARGUMENT.
+ *         - LANCE_ERR_COMMIT_CONFLICT: a concurrent writer.
+ */
+int32_t lance_dataset_add_columns_sql(
+    LanceDataset* dataset,
+    const LanceSqlColumn* columns,
+    size_t num_columns,
+    uint64_t batch_size
+);
+
+/**
+ * Add one or more all-null columns described by an Arrow C Data Interface
+ * schema, committing a new manifest. On non-legacy datasets this is a
+ * metadata-only operation — no data files are rewritten. Every field in the
+ * schema must be nullable.
+ *
+ * Mutates `dataset` in place — the same handle remains valid afterward and
+ * sees the new version. Scanners already in flight keep their pre-add view.
+ *
+ * @param dataset  Open dataset (not consumed). Mutated in place. Must not be
+ *                 NULL.
+ * @param schema   Arrow C `ArrowSchema` describing the new columns. Borrowed:
+ *                 only read, and its `release` callback is never invoked.
+ *                 Must not be NULL. Only the top-level schema is validated
+ *                 before it is handed to arrow-rs; the caller is responsible for
+ *                 providing fully-initialised child fields.
+ * @return 0 on success, -1 on error. Error codes:
+ *         - LANCE_ERR_INVALID_ARGUMENT: NULL `dataset` or `schema`, an
+ *           uninitialised or already-released schema, an invalid Arrow schema,
+ *           a non-nullable field, or a name colliding with an existing column.
+ *         - LANCE_ERR_NOT_SUPPORTED: a legacy-format dataset (which cannot
+ *           take all-null columns as a metadata-only change).
+ *         - LANCE_ERR_COMMIT_CONFLICT: a concurrent writer.
+ */
+int32_t lance_dataset_add_columns_nulls(
+    LanceDataset* dataset,
+    const struct ArrowSchema* schema
+);
+
+/**
+ * Add columns by splicing precomputed data from an Arrow C Data Interface
+ * stream into the dataset, committing a new manifest. The stream's batches are
+ * consumed in order and aligned positionally to the dataset's existing rows;
+ * the total row count must match the dataset exactly.
+ *
+ * Mutates `dataset` in place — the same handle remains valid afterward and
+ * sees the new version. Scanners already in flight keep their pre-add view.
+ *
+ * @param dataset     Open dataset (not consumed). Mutated in place. Must not
+ *                    be NULL.
+ * @param stream      Arrow C stream of new column data. Ownership transfers
+ *                    to this call: a non-NULL stream is consumed (released) on
+ *                    every return path, including error returns, so the caller
+ *                    must not use it again. A NULL stream is rejected before
+ *                    anything is consumed. The stream's schema defines the new
+ *                    columns and must not collide with existing column names.
+ * @param batch_size  Rows per write batch while aligning the stream to
+ *                    fragments; 0 = upstream default. At most UINT32_MAX.
+ * @return 0 on success, -1 on error. Error codes:
+ *         - LANCE_ERR_INVALID_ARGUMENT: NULL `dataset` or `stream`, a stream
+ *           missing a mandatory get_schema/get_next/release callback, a total
+ *           row count that does not match the dataset, a new column name that
+ *           collides with an existing column, or `batch_size` > UINT32_MAX.
+ *         - LANCE_ERR_COMMIT_CONFLICT: a concurrent writer.
+ */
+int32_t lance_dataset_add_columns_stream(
+    LanceDataset* dataset,
+    struct ArrowArrayStream* stream,
+    uint64_t batch_size
+);
+
 /**
  * Export the dataset schema via Arrow C Data Interface.
  * @param out  Pointer to caller-allocated ArrowSchema struct

diff --git a/include/lance/lance.hpp b/include/lance/lance.hpp
@@ -136,6 +136,16 @@ struct ColumnAlteration {
     const ArrowSchema*         data_type     = nullptr;
 };
 
+// ─── New column (SQL) ────────────────────────────────────────────────────────
+
+/// One new column for `Dataset::add_columns_sql`, computed by a SQL expression
+/// over the dataset's existing columns. Both fields are required and
+/// non-empty, e.g. `{ "doubled", "x * 2" }`.
+struct SqlColumn {
+    std::string name;        ///< Name of the column to create.
+    std::string expression;  ///< SQL expression that computes the column.
+};
+
 // ─── Dataset ─────────────────────────────────────────────────────────────────
 
 class Dataset {
@@ -511,6 +521,78 @@ class Dataset {
         }
     }
 
+    /// Append columns whose values are computed by SQL expressions over the
+    /// dataset's existing columns, committing a new manifest. Passing
+    /// `batch_size = 0` selects the upstream default scan batch size.
+    ///
+    /// `columns` must be non-empty, and every entry needs a non-empty `name`
+    /// and `expression`. Throws lance::Error on failure (empty list, empty
+    /// name/expression, malformed SQL syntax, name collision with an existing
+    /// column, commit conflict, ...). An expression naming a non-existent
+    /// column throws with code `LANCE_ERR_INTERNAL` (an upstream schema
+    /// error), not `LANCE_ERR_INVALID_ARGUMENT` — see the C header.
+    void add_columns_sql(const std::vector<SqlColumn>& columns,
+                         uint64_t batch_size = 0) {
+        // Build the C-ABI view of `columns`. Each LanceSqlColumn merely
+        // borrows the `c_str()` pointers of the caller's std::strings, which
+        // outlive this call; the entries are plain pointer pairs, so storing
+        // them by value in the vector never changes the pointer values.
+        std::vector<LanceSqlColumn> c_columns;
+        c_columns.reserve(columns.size());
+        for (const SqlColumn& col : columns) {
+            c_columns.emplace_back();  // value-initialised slot
+            LanceSqlColumn& slot = c_columns.back();
+            slot.name       = col.name.c_str();
+            slot.expression = col.expression.c_str();
+        }
+        // `c_columns.data()` is forwarded even when `columns` is empty, as in
+        // the `alter_columns` and `drop_columns` siblings, whose inputs must
+        // also be non-empty: the resulting `num_columns == 0` is rejected by
+        // the Rust layer before the pointer is ever indexed.
+        const auto rc = lance_dataset_add_columns_sql(
+            handle_.get(), c_columns.data(), c_columns.size(), batch_size);
+        if (rc == 0) return;
+        check_error();
+    }
+
+    /// Append all-null columns described by an Arrow schema, committing a new
+    /// manifest; on non-legacy datasets this is metadata-only. Every field in
+    /// `schema` must be nullable. `schema` stays owned by the caller, who must
+    /// keep it alive for the duration of the call; it is never released here.
+    ///
+    /// Throws lance::Error on failure (invalid schema, non-nullable field, name
+    /// collision with an existing column, commit conflict, ...). A legacy-format
+    /// dataset throws with code `LANCE_ERR_NOT_SUPPORTED` (all-null columns are
+    /// metadata-only and the legacy format cannot represent them that way).
+    void add_columns_nulls(const ArrowSchema* schema) {
+        const auto rc = lance_dataset_add_columns_nulls(handle_.get(), schema);
+        if (rc == 0) return;
+        check_error();
+    }
+
+    /// Splice precomputed column data from an Arrow C stream into the dataset,
+    /// committing a new manifest. `batch_size = 0` selects the upstream
+    /// default. A non-null `stream` is consumed (released) on every return
+    /// path — including a null-dataset error and when this method throws — so
+    /// it must not be used again afterward. Only a null `stream` is rejected
+    /// without consuming anything.
+    ///
+    /// The stream's total row count must match the dataset exactly. Throws
+    /// lance::Error on failure (row-count mismatch, name collision with an
+    /// existing column, commit conflict, ...).
+    void add_columns_stream(ArrowArrayStream* stream, uint64_t batch_size = 0) {
+        // `stream` goes straight to the C API, which takes ownership and
+        // releases it on every path. Unlike `write`, which builds vectors
+        // before its C call, nothing here can throw before the call is made,
+        // so no RAII release guard is required and the stream can never be
+        // stranded by an exception. WARNING: arm a stream-release guard first
+        // if throwing code is ever added ahead of the C call.
+        const auto rc = lance_dataset_add_columns_stream(
+            handle_.get(), stream, batch_size);
+        if (rc == 0) return;
+        check_error();
+    }
+
     /// Export the schema as an Arrow C Data Interface struct.
     void schema(ArrowSchema* out) const {
         if (lance_dataset_schema(handle_.get(), out) != 0) {