From a1ad17b6a070ebdfcb4c530f67b22674cea4a164 Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Thu, 26 Feb 2026 20:19:24 +0100 Subject: [PATCH 1/3] fix: validate catalog entity names to reject empty, control chars, and overlength - Add shared validate_name() function rejecting empty names, control characters, and names exceeding 1024 characters - Apply to column names (ColumnDef::new, ColumnDef::from_arrow), schema names (get_or_create_schema), and table names (get_or_create_table, validate_table_name, begin_write_transaction) Found during Feb 2026 security review Co-Authored-By: Claude Opus 4.6 --- src/metadata_writer.rs | 149 +++++++++++++++++++++++++++++++++- src/metadata_writer_sqlite.rs | 75 ++++++++++++++++- src/schema.rs | 21 +++-- 3 files changed, 234 insertions(+), 11 deletions(-) diff --git a/src/metadata_writer.rs b/src/metadata_writer.rs index 15ca7b6..d1f3852 100644 --- a/src/metadata_writer.rs +++ b/src/metadata_writer.rs @@ -3,7 +3,37 @@ //! This module provides the `MetadataWriter` trait for writing metadata to DuckLake catalogs, //! along with helper types for column definitions and data file registration. -use crate::Result; +use crate::{DuckLakeError, Result}; + +/// Maximum allowed length for catalog entity names (schemas, tables, columns). +const MAX_NAME_LENGTH: usize = 1024; + +/// Validate a catalog entity name (schema, table, or column). +/// +/// Rejects names that are: +/// - Empty or whitespace-only +/// - Contain ASCII control characters (0x00-0x1F, 0x7F) +/// - Exceed [`MAX_NAME_LENGTH`] characters +pub fn validate_name(name: &str, kind: &str) -> Result<()> { + if name.trim().is_empty() { + return Err(DuckLakeError::InvalidConfig(format!( + "{kind} name cannot be empty or whitespace-only" + ))); + } + if let Some(pos) = name.find(|c: char| c.is_ascii_control()) { + let byte = name.as_bytes()[pos]; + return Err(DuckLakeError::InvalidConfig(format!( + "{kind} name contains control character 0x{byte:02X} at position {pos}" + ))); + } + if name.len() > MAX_NAME_LENGTH { + return Err(DuckLakeError::InvalidConfig(format!( + "{kind} name exceeds maximum length of {MAX_NAME_LENGTH} characters (got {})", + name.len() + ))); + } + Ok(()) +} /// Write mode for table operations. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -55,12 +85,14 @@ impl ColumnDef { ducklake_type: impl Into, is_nullable: bool, ) -> Result { + let name = name.into(); + validate_name(&name, "Column")?; let ducklake_type = ducklake_type.into(); // Validate the type string by attempting to convert it to an Arrow type. // We discard the result; we only care that the conversion succeeds. ducklake_to_arrow_type(&ducklake_type)?; Ok(Self { - name: name.into(), + name, ducklake_type, is_nullable, }) @@ -76,11 +108,13 @@ impl ColumnDef { data_type: &DataType, is_nullable: bool, ) -> Result { + let name = name.into(); + validate_name(&name, "Column")?; let ducklake_type = arrow_to_ducklake_type(data_type)?; // We use direct struct construction here since the ducklake_type was just // produced by arrow_to_ducklake_type, so it is guaranteed to be valid. Ok(Self { - name: name.into(), + name, ducklake_type, is_nullable, }) @@ -300,4 +334,113 @@ mod tests { let file = DataFileInfo::new("/absolute/path.parquet", 1024, 100).with_absolute_path(); assert!(!file.path_is_relative); } + + #[test] + fn test_column_def_empty_name_rejected() { + let result = ColumnDef::new("", "int32", true); + assert!(result.is_err()); + match result { + Err(DuckLakeError::InvalidConfig(msg)) => { + assert!(msg.contains("empty"), "Expected 'empty' in: {msg}"); + } + other => panic!("Expected InvalidConfig, got {:?}", other), + } + } + + #[test] + fn test_column_def_control_char_name_rejected() { + let result = ColumnDef::new("col\0name", "int32", true); + assert!(result.is_err()); + match result { + Err(DuckLakeError::InvalidConfig(msg)) => { + assert!( + msg.contains("control character"), + "Expected 'control character' in: {msg}" + ); + } + other => panic!("Expected InvalidConfig, got {:?}", other), + } + } + + #[test] + fn test_column_def_from_arrow_empty_name_rejected() { + let result = ColumnDef::from_arrow("", &DataType::Int64, false); + assert!(result.is_err()); + match result { + Err(DuckLakeError::InvalidConfig(msg)) => { + assert!(msg.contains("empty"), "Expected 'empty' in: {msg}"); + } + other => panic!("Expected InvalidConfig, got {:?}", other), + } + } + + #[test] + fn test_column_def_from_arrow_control_char_rejected() { + let result = ColumnDef::from_arrow("col\nnewline", &DataType::Int64, false); + assert!(result.is_err()); + match result { + Err(DuckLakeError::InvalidConfig(msg)) => { + assert!( + msg.contains("control character"), + "Expected 'control character' in: {msg}" + ); + } + other => panic!("Expected InvalidConfig, got {:?}", other), + } + } + + #[test] + fn test_validate_name_valid() { + assert!(validate_name("users", "Table").is_ok()); + assert!(validate_name("my_column", "Column").is_ok()); + assert!(validate_name("Schema123", "Schema").is_ok()); + assert!(validate_name("a", "Column").is_ok()); + } + + #[test] + fn test_validate_name_empty() { + let result = validate_name("", "Table"); + assert!(result.is_err()); + let result = validate_name(" ", "Table"); + assert!(result.is_err()); + } + + #[test] + fn test_validate_name_control_chars() { + // Null byte + assert!(validate_name("col\0", "Column").is_err()); + // Newline + assert!(validate_name("col\n", "Column").is_err()); + // Tab + assert!(validate_name("col\t", "Column").is_err()); + // DEL (0x7F) + assert!(validate_name("col\x7F", "Column").is_err()); + } + + #[test] + fn test_validate_name_length_limit() { + // Exactly at limit should succeed + let at_limit = "a".repeat(MAX_NAME_LENGTH); + assert!(validate_name(&at_limit, "Table").is_ok()); + + // One over should fail + let over_limit = "a".repeat(MAX_NAME_LENGTH + 1); + assert!(validate_name(&over_limit, "Table").is_err()); + } + + #[test] + fn test_column_def_long_name_rejected() { + let long_name = "x".repeat(MAX_NAME_LENGTH + 1); + let result = ColumnDef::new(long_name, "int32", true); + assert!(result.is_err()); + match result { + Err(DuckLakeError::InvalidConfig(msg)) => { + assert!( + msg.contains("exceeds maximum length"), + "Expected 'exceeds maximum length' in: {msg}" + ); + } + other => panic!("Expected InvalidConfig, got {:?}", other), + } + } } diff --git a/src/metadata_writer_sqlite.rs b/src/metadata_writer_sqlite.rs index ebd5a46..3af744d 100644 --- a/src/metadata_writer_sqlite.rs +++ b/src/metadata_writer_sqlite.rs @@ -5,7 +5,7 @@ use crate::Result; use crate::metadata_provider::block_on; use crate::metadata_writer::{ - ColumnDef, DataFileInfo, MetadataWriter, WriteMode, WriteSetupResult, + ColumnDef, DataFileInfo, MetadataWriter, WriteMode, WriteSetupResult, validate_name, }; use sqlx::Row; use sqlx::sqlite::{SqlitePool, SqlitePoolOptions}; @@ -133,6 +133,7 @@ impl MetadataWriter for SqliteMetadataWriter { path: Option<&str>, snapshot_id: i64, ) -> Result<(i64, bool)> { + validate_name(name, "Schema")?; block_on(async { let existing = sqlx::query( "SELECT schema_id FROM ducklake_schema @@ -168,6 +169,7 @@ impl MetadataWriter for SqliteMetadataWriter { path: Option<&str>, snapshot_id: i64, ) -> Result<(i64, bool)> { + validate_name(name, "Table")?; block_on(async { let existing = sqlx::query( "SELECT table_id FROM ducklake_table @@ -333,6 +335,8 @@ impl MetadataWriter for SqliteMetadataWriter { columns: &[ColumnDef], mode: WriteMode, ) -> Result { + validate_name(schema_name, "Schema")?; + validate_name(table_name, "Table")?; if columns.is_empty() { return Err(crate::DuckLakeError::InvalidConfig( "Table must have at least one column".to_string(), @@ -660,4 +664,73 @@ mod tests { let path2 = writer.get_data_path().unwrap(); assert_eq!(path2, "/new/path"); } + + #[tokio::test(flavor = "multi_thread")] + async fn test_get_or_create_schema_empty_name_rejected() { + let (writer, _temp) = create_test_writer().await; + let snapshot_id = writer.create_snapshot().unwrap(); + let result = writer.get_or_create_schema("", None, snapshot_id); + assert!(result.is_err()); + let err_msg = format!("{}", result.unwrap_err()); + assert!( + err_msg.contains("empty"), + "Expected 'empty' in: {err_msg}" + ); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_get_or_create_schema_control_char_rejected() { + let (writer, _temp) = create_test_writer().await; + let snapshot_id = writer.create_snapshot().unwrap(); + let result = writer.get_or_create_schema("bad\0schema", None, snapshot_id); + assert!(result.is_err()); + let err_msg = format!("{}", result.unwrap_err()); + assert!( + err_msg.contains("control character"), + "Expected 'control character' in: {err_msg}" + ); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_begin_write_transaction_empty_schema_name_rejected() { + let (writer, _temp) = create_test_writer().await; + let columns = vec![ColumnDef::new("id", "int64", false).unwrap()]; + let result = writer.begin_write_transaction("", "table", &columns, WriteMode::Replace); + assert!(result.is_err()); + let err_msg = format!("{}", result.unwrap_err()); + assert!( + err_msg.contains("empty"), + "Expected 'empty' in: {err_msg}" + ); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_begin_write_transaction_empty_table_name_rejected() { + let (writer, _temp) = create_test_writer().await; + let columns = vec![ColumnDef::new("id", "int64", false).unwrap()]; + let result = + writer.begin_write_transaction("main", "", &columns, WriteMode::Replace); + assert!(result.is_err()); + let err_msg = format!("{}", result.unwrap_err()); + assert!( + err_msg.contains("empty"), + "Expected 'empty' in: {err_msg}" + ); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_begin_write_transaction_control_char_names_rejected() { + let (writer, _temp) = create_test_writer().await; + let columns = vec![ColumnDef::new("id", "int64", false).unwrap()]; + + // Control char in schema name + let result = + writer.begin_write_transaction("bad\nschema", "table", &columns, WriteMode::Replace); + assert!(result.is_err()); + + // Control char in table name + let result = + writer.begin_write_transaction("main", "bad\ttable", &columns, WriteMode::Replace); + assert!(result.is_err()); + } } diff --git a/src/schema.rs b/src/schema.rs index 7f5d257..f96247b 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -13,20 +13,19 @@ use crate::path_resolver::resolve_path; use crate::table::DuckLakeTable; #[cfg(feature = "write")] -use crate::metadata_writer::{ColumnDef, MetadataWriter, WriteMode}; +use crate::metadata_writer::{ColumnDef, MetadataWriter, WriteMode, validate_name}; #[cfg(feature = "write")] use datafusion::error::DataFusionError; -/// Validate table name to prevent path traversal attacks. +/// Validate table name to prevent path traversal attacks and reject +/// empty, control-character, or overlength names. +/// /// Table names are used to construct file paths, so we must ensure they /// don't contain path separators or parent directory references. #[cfg(feature = "write")] fn validate_table_name(name: &str) -> DataFusionResult<()> { - if name.is_empty() { - return Err(DataFusionError::Plan( - "Table name cannot be empty".to_string(), - )); - } + // Shared name validation (empty, control chars, length) + validate_name(name, "Table").map_err(|e| DataFusionError::External(Box::new(e)))?; if name.contains('/') || name.contains('\\') || name.contains("..") { return Err(DataFusionError::Plan(format!( "Invalid table name '{}': must not contain path separators or '..'", @@ -270,4 +269,12 @@ mod tests { assert!(validate_table_name("..").is_err()); assert!(validate_table_name("...").is_err()); } + + #[test] + fn test_validate_table_name_control_chars() { + assert!(validate_table_name("table\0name").is_err()); + assert!(validate_table_name("table\nname").is_err()); + assert!(validate_table_name("table\tname").is_err()); + assert!(validate_table_name("\x7Ftable").is_err()); + } } From dfc421e84fa2dd5915633a24174d1fe44a999591 Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Sun, 1 Mar 2026 03:20:41 +0100 Subject: [PATCH 2/3] fix: run rustfmt --- src/metadata_writer.rs | 10 +++++----- src/metadata_writer_sqlite.rs | 18 ++++-------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/metadata_writer.rs b/src/metadata_writer.rs index d1f3852..c040507 100644 --- a/src/metadata_writer.rs +++ b/src/metadata_writer.rs @@ -342,7 +342,7 @@ mod tests { match result { Err(DuckLakeError::InvalidConfig(msg)) => { assert!(msg.contains("empty"), "Expected 'empty' in: {msg}"); - } + }, other => panic!("Expected InvalidConfig, got {:?}", other), } } @@ -357,7 +357,7 @@ mod tests { msg.contains("control character"), "Expected 'control character' in: {msg}" ); - } + }, other => panic!("Expected InvalidConfig, got {:?}", other), } } @@ -369,7 +369,7 @@ mod tests { match result { Err(DuckLakeError::InvalidConfig(msg)) => { assert!(msg.contains("empty"), "Expected 'empty' in: {msg}"); - } + }, other => panic!("Expected InvalidConfig, got {:?}", other), } } @@ -384,7 +384,7 @@ mod tests { msg.contains("control character"), "Expected 'control character' in: {msg}" ); - } + }, other => panic!("Expected InvalidConfig, got {:?}", other), } } @@ -439,7 +439,7 @@ mod tests { msg.contains("exceeds maximum length"), "Expected 'exceeds maximum length' in: {msg}" ); - } + }, other => panic!("Expected InvalidConfig, got {:?}", other), } } diff --git a/src/metadata_writer_sqlite.rs b/src/metadata_writer_sqlite.rs index 3af744d..f5cfe04 100644 --- a/src/metadata_writer_sqlite.rs +++ b/src/metadata_writer_sqlite.rs @@ -672,10 +672,7 @@ mod tests { let result = writer.get_or_create_schema("", None, snapshot_id); assert!(result.is_err()); let err_msg = format!("{}", result.unwrap_err()); - assert!( - err_msg.contains("empty"), - "Expected 'empty' in: {err_msg}" - ); + assert!(err_msg.contains("empty"), "Expected 'empty' in: {err_msg}"); } #[tokio::test(flavor = "multi_thread")] @@ -698,24 +695,17 @@ mod tests { let result = writer.begin_write_transaction("", "table", &columns, WriteMode::Replace); assert!(result.is_err()); let err_msg = format!("{}", result.unwrap_err()); - assert!( - err_msg.contains("empty"), - "Expected 'empty' in: {err_msg}" - ); + assert!(err_msg.contains("empty"), "Expected 'empty' in: {err_msg}"); } #[tokio::test(flavor = "multi_thread")] async fn test_begin_write_transaction_empty_table_name_rejected() { let (writer, _temp) = create_test_writer().await; let columns = vec![ColumnDef::new("id", "int64", false).unwrap()]; - let result = - writer.begin_write_transaction("main", "", &columns, WriteMode::Replace); + let result = writer.begin_write_transaction("main", "", &columns, WriteMode::Replace); assert!(result.is_err()); let err_msg = format!("{}", result.unwrap_err()); - assert!( - err_msg.contains("empty"), - "Expected 'empty' in: {err_msg}" - ); + assert!(err_msg.contains("empty"), "Expected 'empty' in: {err_msg}"); } #[tokio::test(flavor = "multi_thread")] From e4d74c11683083a64a9da90be04166bdf4208674 Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Sun, 1 Mar 2026 22:58:02 +0100 Subject: [PATCH 3/3] fix: make MAX_NAME_LENGTH pub to fix rustdoc private-intra-doc-links error --- src/metadata_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metadata_writer.rs b/src/metadata_writer.rs index c040507..fc55d25 100644 --- a/src/metadata_writer.rs +++ b/src/metadata_writer.rs @@ -6,7 +6,7 @@ use crate::{DuckLakeError, Result}; /// Maximum allowed length for catalog entity names (schemas, tables, columns). -const MAX_NAME_LENGTH: usize = 1024; +pub const MAX_NAME_LENGTH: usize = 1024; /// Validate a catalog entity name (schema, table, or column). ///