diff --git a/include/lance/lance.h b/include/lance/lance.h index fdd01a0..1de72fa 100644 --- a/include/lance/lance.h +++ b/include/lance/lance.h @@ -140,6 +140,7 @@ typedef struct LanceDataset LanceDataset; typedef struct LanceScanner LanceScanner; typedef struct LanceBatch LanceBatch; typedef struct LanceVersions LanceVersions; +typedef struct LanceDataStatistics LanceDataStatistics; /* ─── Dataset lifecycle ─── */ @@ -205,6 +206,48 @@ int64_t lance_versions_timestamp_ms_at(const LanceVersions* versions, size_t ind /** Close and free a versions handle. Safe to call with NULL. */ void lance_versions_close(LanceVersions* versions); +/* ─── Data statistics ─── */ + +/** + * Compute per-field data statistics (compressed on-disk byte size) for query + * planning. Walks every fragment, so this performs I/O. Caller frees the + * returned handle with lance_data_statistics_close(). + * + * Entries are ordered by schema field id, one per field (including nested + * struct/list children). + * @return handle on success, or NULL on error + */ +LanceDataStatistics* lance_dataset_calculate_data_stats(const LanceDataset* dataset); + +/** + * Number of fields in the statistics snapshot. Clears the thread-local error + * on success. Returns 0 and sets LANCE_ERR_INVALID_ARGUMENT on a NULL handle; + * a dataset with an empty schema also yields 0 with no error set, so check + * lance_last_error_code() to distinguish the error case from an empty result. + */ +uint64_t lance_data_statistics_count(const LanceDataStatistics* stats); + +/** + * Schema field id at `index` (0 <= index < count). + * Returns 0 on error (NULL handle or out-of-range index), setting + * LANCE_ERR_INVALID_ARGUMENT. Because 0 is itself a valid field id, check + * lance_last_error_code() when passing an untrusted index; iterating + * `0..count` never errors. 
+ */ +uint32_t lance_data_statistics_field_id_at(const LanceDataStatistics* stats, size_t index); + +/** + * Compressed on-disk byte size of the field at `index`. + * Returns 0 on error (NULL handle or out-of-range index), setting + * LANCE_ERR_INVALID_ARGUMENT. A field written with the legacy (v1) storage + * format also reports 0 but sets no error, so check lance_last_error_code() to + * distinguish a genuine 0 from the error sentinel. + */ +uint64_t lance_data_statistics_bytes_on_disk_at(const LanceDataStatistics* stats, size_t index); + +/** Close and free a data statistics handle. Safe to call with NULL. */ +void lance_data_statistics_close(LanceDataStatistics* stats); + /** * Restore the dataset to an older version by committing a new manifest that * carries the fragments of `version`. If `version` is already the latest, diff --git a/include/lance/lance.hpp b/include/lance/lance.hpp index ad17789..40aa9e3 100644 --- a/include/lance/lance.hpp +++ b/include/lance/lance.hpp @@ -97,6 +97,14 @@ struct VersionInfo { int64_t timestamp_ms; }; +/// Per-field storage statistics for query planning. +/// `id` is the schema field id; `bytes_on_disk` is the compressed on-disk size +/// (0 for datasets written with the legacy v1 storage format). +struct FieldStatistics { + uint32_t id; + uint64_t bytes_on_disk; +}; + // ─── Write mode ────────────────────────────────────────────────────────────── enum class WriteMode : int32_t { @@ -349,6 +357,28 @@ class Dataset { return out; } + /// Compute per-field data statistics (compressed on-disk byte size) for + /// query planning, ordered by schema field id. Performs I/O over every + /// fragment. Throws lance::Error on failure. 
+ std::vector<FieldStatistics> calculate_data_stats() const { + auto* raw = lance_dataset_calculate_data_stats(handle_.get()); + if (!raw) check_error(); + Handle snap(raw); + + uint64_t n = lance_data_statistics_count(snap.get()); + std::vector<FieldStatistics> out; + out.reserve(static_cast<size_t>(n)); + for (uint64_t i = 0; i < n; i++) { + FieldStatistics fs; + fs.id = lance_data_statistics_field_id_at(snap.get(), static_cast<size_t>(i)); + fs.bytes_on_disk = + lance_data_statistics_bytes_on_disk_at(snap.get(), static_cast<size_t>(i)); + if (lance_last_error_code() != LANCE_OK) check_error(); + out.push_back(fs); + } + return out; + } + /// Commit a new manifest that aliases `version` as the latest. The /// returned Dataset points at the target version; this handle is /// unchanged. If `version` is already the latest, no new manifest is diff --git a/src/data_statistics.rs b/src/data_statistics.rs new file mode 100644 index 0000000..c7a59e7 --- /dev/null +++ b/src/data_statistics.rs @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Data statistics C API: per-field storage statistics for query planning. +//! +//! `lance_dataset_calculate_data_stats` walks every fragment to total each +//! field's compressed on-disk byte size, returning the result as an opaque +//! `LanceDataStatistics` snapshot. Accessors read entries by index, and +//! `lance_data_statistics_close` frees it. + +use lance::dataset::statistics::DatasetStatisticsExt; +use lance_core::Result; + +use crate::dataset::LanceDataset; +use crate::error::{LanceErrorCode, clear_last_error, ffi_try, set_last_error}; +use crate::runtime::block_on; + +/// Opaque snapshot of a dataset's per-field data statistics. +pub struct LanceDataStatistics { + fields: Vec<FieldStat>, +} + +#[derive(Clone, Copy)] +struct FieldStat { + id: u32, + bytes_on_disk: u64, +} + +/// Compute per-field data statistics for the dataset. The caller frees the +/// returned handle with `lance_data_statistics_close`.
Returns NULL on error. +/// +/// Entries are ordered by the dataset's schema field id, one per field +/// (including nested struct/list children). `bytes_on_disk` is the field's +/// compressed on-disk size; it is 0 for datasets written with the legacy (v1) +/// storage format, which does not track per-field sizes. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_dataset_calculate_data_stats( + dataset: *const LanceDataset, +) -> *mut LanceDataStatistics { + ffi_try!(unsafe { calculate_inner(dataset) }, null) +} + +unsafe fn calculate_inner(dataset: *const LanceDataset) -> Result<*mut LanceDataStatistics> { + if dataset.is_null() { + return Err(lance_core::Error::InvalidInput { + source: "dataset must not be NULL".into(), + location: snafu::location!(), + }); + } + // SAFETY: `dataset` is non-null (checked above) and points at a live + // `LanceDataset` created by `lance_dataset_open`; we take only a shared + // borrow, which is sound for the duration of this call. + let ds = unsafe { &*dataset }; + let snapshot = ds.snapshot(); + let stats = block_on(snapshot.calculate_data_stats())?; + let fields = stats + .fields + .into_iter() + .map(|f| FieldStat { + id: f.id, + bytes_on_disk: f.bytes_on_disk, + }) + .collect(); + Ok(Box::into_raw(Box::new(LanceDataStatistics { fields }))) +} + +/// Return the number of fields in the statistics snapshot. +/// +/// Clears the thread-local error on success. Returns 0 and sets +/// `InvalidArgument` on a NULL handle. A dataset with an empty schema also +/// yields 0 with no error set, so check `lance_last_error_code()` to +/// distinguish the error case from an empty result. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_data_statistics_count(stats: *const LanceDataStatistics) -> u64 { + if stats.is_null() { + set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL"); + return 0; + } + // SAFETY: `stats` is non-null (checked above) and was produced by + // `lance_dataset_calculate_data_stats` via `Box::into_raw`; the accessors + // only ever take shared borrows, so no mutable alias exists. + let s = unsafe { &*stats }; + let count = s.fields.len() as u64; + clear_last_error(); + count +} + +/// Return the schema field id at `index` (0 <= index < count). +/// +/// Returns 0 and sets the thread-local error on NULL or out-of-range input. +/// Because 0 is itself a valid field id, check `lance_last_error_code()` when +/// passing an untrusted index; iterating `0..count` never triggers the error +/// path. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_data_statistics_field_id_at( + stats: *const LanceDataStatistics, + index: usize, +) -> u32 { + unsafe { entry_at(stats, index) }.map(|f| f.id).unwrap_or(0) +} + +/// Return the compressed on-disk byte size of the field at `index`. +/// +/// Returns 0 and sets the thread-local error on NULL or out-of-range input. +/// A genuine 0 (legacy storage, or an empty field) is indistinguishable from +/// the error sentinel by value alone — check `lance_last_error_code()`. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_data_statistics_bytes_on_disk_at( + stats: *const LanceDataStatistics, + index: usize, +) -> u64 { + unsafe { entry_at(stats, index) } + .map(|f| f.bytes_on_disk) + .unwrap_or(0) +} + +/// Close and free a data statistics handle. Safe to call with NULL. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn lance_data_statistics_close(stats: *mut LanceDataStatistics) { + if !stats.is_null() { + unsafe { + let _ = Box::from_raw(stats); + } + } +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +/// Copy the field stat at `index` out of the handle. Sets the thread-local +/// error and returns `None` on NULL handle or out-of-range index. +unsafe fn entry_at(stats: *const LanceDataStatistics, index: usize) -> Option<FieldStat> { + if stats.is_null() { + set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL"); + return None; + } + // SAFETY: `stats` is non-null (checked above) and was produced by + // `lance_dataset_calculate_data_stats` via `Box::into_raw`; we take only a + // shared borrow. + let s = unsafe { &*stats }; + match s.fields.get(index).copied() { + Some(f) => { + clear_last_error(); + Some(f) + } + None => { + set_last_error( + LanceErrorCode::InvalidArgument, + format!( + "field statistics index {} out of range; count = {}", + index, + s.fields.len() + ), + ); + None + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 1fafd4b..123f961 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ mod alter_columns; mod async_dispatcher; mod batch; mod compact; +mod data_statistics; mod dataset; mod delete; mod drop_columns; @@ -40,6 +41,7 @@ pub use add_columns::*; pub use alter_columns::*; pub use batch::*; pub use compact::*; +pub use data_statistics::*; pub use dataset::*; pub use delete::*; pub use drop_columns::*; diff --git a/tests/c_api_test.rs b/tests/c_api_test.rs index 92ee5f5..2babda6 100644 --- a/tests/c_api_test.rs +++ b/tests/c_api_test.rs @@ -1811,6 +1811,295 @@ fn test_versions_close_null_is_safe() { unsafe { lance_versions_close(ptr::null_mut()) }; } +// --------------------------------------------------------------------------- +// Data statistics
(lance_dataset_calculate_data_stats) +// --------------------------------------------------------------------------- + +/// Sum `bytes_on_disk` across every field in a statistics handle. +fn total_bytes_on_disk(stats: *const LanceDataStatistics) -> u64 { + let count = unsafe { lance_data_statistics_count(stats) }; + (0..count) + .map(|i| unsafe { lance_data_statistics_bytes_on_disk_at(stats, i as usize) }) + .sum() +} + +/// Collect the field ids of a statistics handle in index order. +fn field_ids(stats: *const LanceDataStatistics) -> Vec<u32> { + let count = unsafe { lance_data_statistics_count(stats) }; + (0..count) + .map(|i| unsafe { lance_data_statistics_field_id_at(stats, i as usize) }) + .collect() +} + +#[test] +fn test_data_statistics_single_fragment() { + // create_test_dataset has two top-level fields (id=0, name=1) written with + // the default (modern, v2+) storage format, so bytes_on_disk is populated. + let (_tmp, uri) = create_test_dataset(); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + assert!(!stats.is_null()); + assert_eq!(unsafe { lance_data_statistics_count(stats) }, 2); + assert_eq!(field_ids(stats), vec![0, 1]); + // Field id 0 is a legitimate value that collides with the error sentinel; + // reading it on the success path must leave the error state clear so callers + // can disambiguate a real 0 from an error via lance_last_error_code(). + assert_eq!(unsafe { lance_data_statistics_field_id_at(stats, 0) }, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::Ok); + assert!( + total_bytes_on_disk(stats) > 0, + "modern storage should report non-zero on-disk size" + ); + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_data_statistics_field_count_matches_schema() { + // create_large_dataset has three fields (id, value, label).
+ let (_tmp, uri) = create_large_dataset(50); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + assert!(!stats.is_null()); + assert_eq!(unsafe { lance_data_statistics_count(stats) }, 3); + assert_eq!(field_ids(stats), vec![0, 1, 2]); + // Every field carries data, so each reports a non-zero size. + for i in 0..3 { + assert!( + unsafe { lance_data_statistics_bytes_on_disk_at(stats, i) } > 0, + "field {i} should report non-zero on-disk size" + ); + } + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +/// Write a single-fragment dataset with one Int32 `id` field holding `ids` and +/// return the on-disk byte size of that field. +fn single_fragment_id_field_bytes(ids: Vec<i32>) -> u64 { + let tmp = tempfile::tempdir().unwrap(); + let uri = tmp + .path() + .join("one_frag_stats_ds") + .to_str() + .unwrap() + .to_string(); + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + lance_c::runtime::block_on(async { + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap(); + Dataset::write( + arrow::record_batch::RecordBatchIterator::new(vec![Ok(batch)], schema.clone()), + &uri, + None, + ) + .await + .unwrap(); + }); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + let bytes = unsafe { lance_data_statistics_bytes_on_disk_at(stats, 0) }; + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; + bytes +} + +#[test] +fn test_data_statistics_multi_fragment_sums_across_fragments() { + // The two-fragment dataset's first fragment (ids 0..5) is identical to a + // standalone single-fragment dataset of the same rows.
If calculate_data_stats + // counted only one fragment, the two byte totals would match; genuine + // aggregation makes the two-fragment total strictly larger. + let one_fragment_bytes = single_fragment_id_field_bytes(vec![0, 1, 2, 3, 4]); + assert!( + one_fragment_bytes > 0, + "single fragment should report non-zero size" + ); + + let (_tmp, uri) = create_multi_fragment_dataset(); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + assert!(!stats.is_null()); + assert_eq!(unsafe { lance_data_statistics_count(stats) }, 1); + assert_eq!(field_ids(stats), vec![0]); + + let two_fragment_bytes = unsafe { lance_data_statistics_bytes_on_disk_at(stats, 0) }; + assert!( + two_fragment_bytes > one_fragment_bytes, + "two-fragment on-disk size ({two_fragment_bytes}) must exceed single-fragment \ + size ({one_fragment_bytes}); calculate_data_stats must sum across fragments" + ); + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_data_statistics_legacy_storage_reports_zero_bytes() { + // The legacy (v1) file format does not track per-field on-disk sizes, so + // upstream reports every field with bytes_on_disk == 0. The field list + // itself is still fully populated. 
+ let tmp = tempfile::tempdir().unwrap(); + let uri = tmp + .path() + .join("legacy_stats_ds") + .to_str() + .unwrap() + .to_string(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + lance_c::runtime::block_on(async { + let params = lance::dataset::WriteParams { + data_storage_version: Some(lance_file::version::LanceFileVersion::Legacy), + ..Default::default() + }; + Dataset::write( + arrow::record_batch::RecordBatchIterator::new(vec![Ok(batch)], schema), + &uri, + Some(params), + ) + .await + .unwrap(); + }); + + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + assert!(!stats.is_null()); + + assert_eq!(unsafe { lance_data_statistics_count(stats) }, 2); + assert_eq!(field_ids(stats), vec![0, 1]); + assert_eq!( + total_bytes_on_disk(stats), + 0, + "legacy storage does not track per-field on-disk size" + ); + // The zeros above are genuine (legacy storage), not error sentinels: reading + // an in-range field leaves the error clear, which is the documented way to + // tell a real 0 from the out-of-range/NULL error sentinel. + assert_eq!(lance_last_error_code(), LanceErrorCode::Ok); + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_data_statistics_empty_schema_yields_zero_count_no_error() { + // Lance permits a zero-field dataset. calculate_data_stats then returns a + // valid (non-NULL) but empty snapshot: count 0 with NO error set. This is + // exactly how a caller distinguishes it from the NULL-handle error, which + // returns count 0 *with* InvalidArgument — the contract the count doc states. 
+ let tmp = tempfile::tempdir().unwrap(); + let uri = tmp + .path() + .join("empty_schema_stats_ds") + .to_str() + .unwrap() + .to_string(); + let schema = Arc::new(Schema::new(Vec::<Field>::new())); + lance_c::runtime::block_on(async { + let batch = RecordBatch::new_empty(schema.clone()); + Dataset::write( + arrow::record_batch::RecordBatchIterator::new(vec![Ok(batch)], schema), + &uri, + None, + ) + .await + .unwrap(); + }); + + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + + assert!( + !stats.is_null(), + "empty-schema dataset still yields a snapshot" + ); + assert_eq!(unsafe { lance_data_statistics_count(stats) }, 0); + assert_eq!( + lance_last_error_code(), + LanceErrorCode::Ok, + "an empty snapshot must leave the error clear, unlike the NULL-handle case" + ); + assert!(field_ids(stats).is_empty()); + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_data_statistics_null_dataset() { + let stats = unsafe { lance_dataset_calculate_data_stats(ptr::null()) }; + assert!(stats.is_null()); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); +} + +#[test] +fn test_data_statistics_count_null_handle() { + let n = unsafe { lance_data_statistics_count(ptr::null()) }; + assert_eq!(n, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); +} + +#[test] +fn test_data_statistics_index_out_of_range() { + let (_tmp, uri) = create_test_dataset(); + let c_uri = c_str(&uri); + let ds = unsafe { lance_dataset_open(c_uri.as_ptr(), ptr::null(), 0) }; + let stats = unsafe { lance_dataset_calculate_data_stats(ds) }; + + let count = unsafe { lance_data_statistics_count(stats) } as usize; + // Exercise the exact boundary (index == count) and a clearly-past-end index.
+ for index in [count, 99] { + let id = unsafe { lance_data_statistics_field_id_at(stats, index) }; + assert_eq!(id, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + + let bytes = unsafe { lance_data_statistics_bytes_on_disk_at(stats, index) }; + assert_eq!(bytes, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + } + + unsafe { lance_data_statistics_close(stats) }; + unsafe { lance_dataset_close(ds) }; +} + +#[test] +fn test_data_statistics_accessors_null_handle() { + let id = unsafe { lance_data_statistics_field_id_at(ptr::null(), 0) }; + assert_eq!(id, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); + + let bytes = unsafe { lance_data_statistics_bytes_on_disk_at(ptr::null(), 0) }; + assert_eq!(bytes, 0); + assert_eq!(lance_last_error_code(), LanceErrorCode::InvalidArgument); +} + +#[test] +fn test_data_statistics_close_null_is_safe() { + unsafe { lance_data_statistics_close(ptr::null_mut()) }; +} + // --------------------------------------------------------------------------- // Restore (lance_dataset_restore) // --------------------------------------------------------------------------- diff --git a/tests/cpp/test_c_api.c b/tests/cpp/test_c_api.c index 7ea6631..0bbeb74 100644 --- a/tests/cpp/test_c_api.c +++ b/tests/cpp/test_c_api.c @@ -272,6 +272,64 @@ static void test_dataset_write_roundtrip(const char *src_uri, const char *dst_ur printf("OK\n"); } +/* Exercises `lance_dataset_calculate_data_stats` on the freshly-written + * (modern, v2+) dataset, where per-field on-disk sizes are populated. Runs + * before the mutation tests reshape or empty the dataset. */ +static void test_data_statistics(const char *write_uri) { + printf(" test_data_statistics... 
"); + + LanceDataset *ds = lance_dataset_open(write_uri, NULL, 0); + ASSERT(ds != NULL, "open failed"); + + LanceDataStatistics *stats = lance_dataset_calculate_data_stats(ds); + ASSERT(stats != NULL, "data stats failed"); + + uint64_t n = lance_data_statistics_count(stats); + CHECK_OK(); + ASSERT(n >= 1, "at least one field expected"); + + uint64_t total = 0; + for (uint64_t i = 0; i < n; i++) { + uint32_t id = lance_data_statistics_field_id_at(stats, (size_t)i); + uint64_t bytes = lance_data_statistics_bytes_on_disk_at(stats, (size_t)i); + CHECK_OK(); + (void)id; + total += bytes; + } + ASSERT(total > 0, "modern storage should report non-zero on-disk size"); + + /* Out-of-range index is rejected with INVALID_ARGUMENT on both accessors. */ + (void)lance_data_statistics_field_id_at(stats, (size_t)n); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "out-of-range field_id must fail"); + (void)lance_data_statistics_bytes_on_disk_at(stats, (size_t)n); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "out-of-range bytes_on_disk must fail"); + + lance_data_statistics_close(stats); + lance_dataset_close(ds); + + /* NULL dataset is rejected before anything is allocated. */ + ASSERT(lance_dataset_calculate_data_stats(NULL) == NULL, + "NULL dataset must yield NULL handle"); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "expected INVALID_ARGUMENT"); + + /* NULL handle is rejected by every accessor. 
*/ + ASSERT(lance_data_statistics_count(NULL) == 0, "NULL handle count must be 0"); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "NULL handle count must set INVALID_ARGUMENT"); + (void)lance_data_statistics_field_id_at(NULL, 0); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "NULL handle field_id must fail"); + (void)lance_data_statistics_bytes_on_disk_at(NULL, 0); + ASSERT(lance_last_error_code() == LANCE_ERR_INVALID_ARGUMENT, + "NULL handle bytes_on_disk must fail"); + lance_data_statistics_close(NULL); /* must be a safe no-op */ + + printf("fields=%llu... OK\n", (unsigned long long)n); +} + /* Re-opens the dataset just written by `test_dataset_write_roundtrip` and * exercises `lance_dataset_update`. Must run before `test_delete`, which * empties the dataset. */ @@ -690,6 +748,7 @@ int main(int argc, char **argv) { test_restore_to_current(uri); test_error_handling(); test_dataset_write_roundtrip(uri, write_uri); + test_data_statistics(write_uri); test_update(write_uri); test_merge_insert(write_uri); test_alter_columns(write_uri); diff --git a/tests/cpp/test_cpp_api.cpp b/tests/cpp/test_cpp_api.cpp index df5bcb2..43491f2 100644 --- a/tests/cpp/test_cpp_api.cpp +++ b/tests/cpp/test_cpp_api.cpp @@ -335,6 +335,26 @@ static void test_dataset_write_roundtrip(const std::string& src_uri, PASS(); } +// Exercises `Dataset::calculate_data_stats` on the freshly-written (modern, +// v2+) dataset, where per-field on-disk sizes are populated. Runs before the +// mutation tests reshape or empty the dataset. +static void test_data_statistics(const std::string& dst_uri) { + TEST(test_data_statistics); + + auto ds = lance::Dataset::open(dst_uri); + auto stats = ds.calculate_data_stats(); + + assert(!stats.empty() && "at least one field expected"); + uint64_t total = 0; + for (const auto& f : stats) { + total += f.bytes_on_disk; + } + assert(total > 0 && "modern storage should report non-zero on-disk size"); + printf("fields=%zu... 
", stats.size()); + + PASS(); +} + // Re-opens the dataset just written by `test_dataset_write_roundtrip` and // exercises `Dataset::update`. Must run before `test_delete_rows`, which // empties the dataset. @@ -682,6 +702,7 @@ int main(int argc, char** argv) { test_index_segments_smoke(uri); test_fts_smoke(uri); test_dataset_write_roundtrip(uri, write_uri); + test_data_statistics(write_uri); test_update(write_uri); test_merge_insert(write_uri); test_alter_columns(write_uri);