Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions include/lance/lance.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ typedef struct LanceDataset LanceDataset;
typedef struct LanceScanner LanceScanner;
typedef struct LanceBatch LanceBatch;
typedef struct LanceVersions LanceVersions;
typedef struct LanceDataStatistics LanceDataStatistics;

/* ─── Dataset lifecycle ─── */

Expand Down Expand Up @@ -205,6 +206,48 @@ int64_t lance_versions_timestamp_ms_at(const LanceVersions* versions, size_t ind
/** Close and free a versions handle. Safe to call with NULL. */
void lance_versions_close(LanceVersions* versions);

/* ─── Data statistics ─── */

/**
* Compute per-field data statistics (compressed on-disk byte size) for query
* planning. Walks every fragment, so this performs I/O. Caller frees the
* returned handle with lance_data_statistics_close().
*
* Entries are ordered by schema field id, one per field (including nested
* struct/list children).
* @return handle on success, or NULL on error
*/
LanceDataStatistics* lance_dataset_calculate_data_stats(const LanceDataset* dataset);

/**
* Number of fields in the statistics snapshot. Clears the thread-local error
* on success. Returns 0 and sets LANCE_ERR_INVALID_ARGUMENT on a NULL handle;
* a dataset with an empty schema also yields 0 with no error set, so check
* lance_last_error_code() to distinguish the error case from an empty result.
*/
uint64_t lance_data_statistics_count(const LanceDataStatistics* stats);

/**
* Schema field id at `index` (0 <= index < count).
* Returns 0 on error (NULL handle or out-of-range index), setting
* LANCE_ERR_INVALID_ARGUMENT. Because 0 is itself a valid field id, check
* lance_last_error_code() when passing an untrusted index; iterating
* `0..count` never errors.
*/
uint32_t lance_data_statistics_field_id_at(const LanceDataStatistics* stats, size_t index);

/**
* Compressed on-disk byte size of the field at `index`.
* Returns 0 on error (NULL handle or out-of-range index), setting
* LANCE_ERR_INVALID_ARGUMENT. A field written with the legacy (v1) storage
* format also reports 0 but sets no error, so check lance_last_error_code() to
* distinguish a genuine 0 from the error sentinel.
*/
uint64_t lance_data_statistics_bytes_on_disk_at(const LanceDataStatistics* stats, size_t index);

/** Close and free a data statistics handle. Safe to call with NULL. */
void lance_data_statistics_close(LanceDataStatistics* stats);

/**
* Restore the dataset to an older version by committing a new manifest that
* carries the fragments of `version`. If `version` is already the latest,
Expand Down
30 changes: 30 additions & 0 deletions include/lance/lance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ struct VersionInfo {
int64_t timestamp_ms;
};

/// Per-field storage statistics for query planning.
/// `id` is the schema field id; `bytes_on_disk` is the compressed on-disk size
/// (0 for datasets written with the legacy v1 storage format).
struct FieldStatistics {
uint32_t id;
uint64_t bytes_on_disk;
};

// ─── Write mode ──────────────────────────────────────────────────────────────

enum class WriteMode : int32_t {
Expand Down Expand Up @@ -349,6 +357,28 @@ class Dataset {
return out;
}

/// Compute per-field data statistics (compressed on-disk byte size) for
/// query planning, ordered by schema field id. Performs I/O over every
/// fragment. Throws lance::Error on failure.
std::vector<FieldStatistics> calculate_data_stats() const {
auto* raw = lance_dataset_calculate_data_stats(handle_.get());
if (!raw) check_error();
Handle<LanceDataStatistics, lance_data_statistics_close> snap(raw);

uint64_t n = lance_data_statistics_count(snap.get());
std::vector<FieldStatistics> out;
out.reserve(static_cast<size_t>(n));
for (uint64_t i = 0; i < n; i++) {
FieldStatistics fs;
fs.id = lance_data_statistics_field_id_at(snap.get(), static_cast<size_t>(i));
fs.bytes_on_disk =
lance_data_statistics_bytes_on_disk_at(snap.get(), static_cast<size_t>(i));
if (lance_last_error_code() != LANCE_OK) check_error();
out.push_back(fs);
}
return out;
}

/// Commit a new manifest that aliases `version` as the latest. The
/// returned Dataset points at the target version; this handle is
/// unchanged. If `version` is already the latest, no new manifest is
Expand Down
159 changes: 159 additions & 0 deletions src/data_statistics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Data statistics C API: per-field storage statistics for query planning.
//!
//! `lance_dataset_calculate_data_stats` walks every fragment to total each
//! field's compressed on-disk byte size, returning the result as an opaque
//! `LanceDataStatistics` snapshot. Accessors read entries by index, and
//! `lance_data_statistics_close` frees it.

use lance::dataset::statistics::DatasetStatisticsExt;
use lance_core::Result;

use crate::dataset::LanceDataset;
use crate::error::{LanceErrorCode, clear_last_error, ffi_try, set_last_error};
use crate::runtime::block_on;

/// Opaque snapshot of a dataset's per-field data statistics.
pub struct LanceDataStatistics {
fields: Vec<FieldStat>,
}

#[derive(Clone, Copy)]
struct FieldStat {
id: u32,
bytes_on_disk: u64,
}

/// Compute per-field data statistics for the dataset. The caller frees the
/// returned handle with `lance_data_statistics_close`. Returns NULL on error.
///
/// Entries are ordered by the dataset's schema field id, one per field
/// (including nested struct/list children). `bytes_on_disk` is the field's
/// compressed on-disk size; it is 0 for datasets written with the legacy (v1)
/// storage format, which does not track per-field sizes.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn lance_dataset_calculate_data_stats(
dataset: *const LanceDataset,
) -> *mut LanceDataStatistics {
ffi_try!(unsafe { calculate_inner(dataset) }, null)
}

unsafe fn calculate_inner(dataset: *const LanceDataset) -> Result<*mut LanceDataStatistics> {
if dataset.is_null() {
return Err(lance_core::Error::InvalidInput {
source: "dataset must not be NULL".into(),
location: snafu::location!(),
});
}
// SAFETY: `dataset` is non-null (checked above) and points at a live
// `LanceDataset` created by `lance_dataset_open`; we take only a shared
// borrow, which is sound for the duration of this call.
let ds = unsafe { &*dataset };
let snapshot = ds.snapshot();
let stats = block_on(snapshot.calculate_data_stats())?;
let fields = stats
.fields
.into_iter()
.map(|f| FieldStat {
id: f.id,
bytes_on_disk: f.bytes_on_disk,
})
.collect();
Ok(Box::into_raw(Box::new(LanceDataStatistics { fields })))
}

/// Return the number of fields in the statistics snapshot.
///
/// Clears the thread-local error on success. Returns 0 and sets
/// `InvalidArgument` on a NULL handle. A dataset with an empty schema also
/// yields 0 with no error set, so check `lance_last_error_code()` to
/// distinguish the error case from an empty result.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn lance_data_statistics_count(stats: *const LanceDataStatistics) -> u64 {
if stats.is_null() {
set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL");
return 0;
}
// SAFETY: `stats` is non-null (checked above) and was produced by
// `lance_dataset_calculate_data_stats` via `Box::into_raw`; the accessors
// only ever take shared borrows, so no mutable alias exists.
let s = unsafe { &*stats };
let count = s.fields.len() as u64;
clear_last_error();
count
}

/// Return the schema field id at `index` (0 <= index < count).
///
/// Returns 0 and sets the thread-local error on NULL or out-of-range input.
/// Because 0 is itself a valid field id, check `lance_last_error_code()` when
/// passing an untrusted index; iterating `0..count` never triggers the error
/// path.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn lance_data_statistics_field_id_at(
stats: *const LanceDataStatistics,
index: usize,
) -> u32 {
unsafe { entry_at(stats, index) }.map(|f| f.id).unwrap_or(0)
}

/// Return the compressed on-disk byte size of the field at `index`.
///
/// Returns 0 and sets the thread-local error on NULL or out-of-range input.
/// A genuine 0 (legacy storage, or an empty field) is indistinguishable from
/// the error sentinel by value alone — check `lance_last_error_code()`.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn lance_data_statistics_bytes_on_disk_at(
stats: *const LanceDataStatistics,
index: usize,
) -> u64 {
unsafe { entry_at(stats, index) }
.map(|f| f.bytes_on_disk)
.unwrap_or(0)
}

/// Close and free a data statistics handle. Safe to call with NULL.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn lance_data_statistics_close(stats: *mut LanceDataStatistics) {
if !stats.is_null() {
unsafe {
let _ = Box::from_raw(stats);
}
}
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Copy the field stat at `index` out of the handle. Sets the thread-local
/// error and returns `None` on NULL handle or out-of-range index.
unsafe fn entry_at(stats: *const LanceDataStatistics, index: usize) -> Option<FieldStat> {
if stats.is_null() {
set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL");
return None;
}
// SAFETY: `stats` is non-null (checked above) and was produced by
// `lance_dataset_calculate_data_stats` via `Box::into_raw`; we take only a
// shared borrow.
let s = unsafe { &*stats };
match s.fields.get(index).copied() {
Some(f) => {
clear_last_error();
Some(f)
}
None => {
set_last_error(
LanceErrorCode::InvalidArgument,
format!(
"field statistics index {} out of range; count = {}",
index,
s.fields.len()
),
);
None
}
}
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod alter_columns;
mod async_dispatcher;
mod batch;
mod compact;
mod data_statistics;
mod dataset;
mod delete;
mod drop_columns;
Expand All @@ -40,6 +41,7 @@ pub use add_columns::*;
pub use alter_columns::*;
pub use batch::*;
pub use compact::*;
pub use data_statistics::*;
pub use dataset::*;
pub use delete::*;
pub use drop_columns::*;
Expand Down
Loading
Loading