diff --git a/src/duckdb/extension/core_functions/scalar/date/date_diff.cpp b/src/duckdb/extension/core_functions/scalar/date/date_diff.cpp index aff668e20..4cf5739f4 100644 --- a/src/duckdb/extension/core_functions/scalar/date/date_diff.cpp +++ b/src/duckdb/extension/core_functions/scalar/date/date_diff.cpp @@ -102,7 +102,7 @@ struct DateDiff { template static inline TR Operation(TA startdate, TB enddate) { // Weeks do not count Monday crossings, just distance - return (enddate.days - startdate.days) / Interval::DAYS_PER_WEEK; + return (TR(enddate.days) - TR(startdate.days)) / Interval::DAYS_PER_WEEK; } }; @@ -116,7 +116,9 @@ struct DateDiff { struct MicrosecondsOperator { template static inline TR Operation(TA startdate, TB enddate) { - return Date::EpochMicroseconds(enddate) - Date::EpochMicroseconds(startdate); + const auto start = Date::EpochMicroseconds(startdate); + const auto end = Date::EpochMicroseconds(enddate); + return SubtractOperatorOverflowCheck::Operation(end, start); } }; diff --git a/src/duckdb/extension/json/json_extension.cpp b/src/duckdb/extension/json/json_extension.cpp index e4ca49e13..3d6aae37e 100644 --- a/src/duckdb/extension/json/json_extension.cpp +++ b/src/duckdb/extension/json/json_extension.cpp @@ -21,7 +21,8 @@ static const DefaultMacro JSON_MACROS[] = { "json_group_object", {"n", "v", nullptr}, {{nullptr, nullptr}}, - "CAST('{' || string_agg(to_json(n::VARCHAR) || ':' || CASE WHEN v IS NULL THEN 'null'::JSON ELSE to_json(v) END, " + "CAST('{' || string_agg(CASE WHEN n IS NULL THEN error('json_group_object key cannot be NULL') ELSE " + "to_json(n::VARCHAR) END || ':' || CASE WHEN v IS NULL THEN 'null'::JSON ELSE to_json(v) END, " "',') || '}' AS JSON)"}, {DEFAULT_SCHEMA, "json_group_structure", diff --git a/src/duckdb/extension/json/json_functions/json_create.cpp b/src/duckdb/extension/json/json_functions/json_create.cpp index 56f93707e..f7322ff29 100644 --- a/src/duckdb/extension/json/json_functions/json_create.cpp +++ 
b/src/duckdb/extension/json/json_functions/json_create.cpp @@ -155,8 +155,8 @@ static unique_ptr ArrayToJSONBind(ClientContext &context, ScalarFu if (arguments[0]->HasParameter()) { throw ParameterNotResolvedException(); } - if (arg_id != LogicalTypeId::LIST && arg_id != LogicalTypeId::SQLNULL) { - throw BinderException("array_to_json() argument type must be LIST"); + if (arg_id != LogicalTypeId::LIST && arg_id != LogicalTypeId::ARRAY && arg_id != LogicalTypeId::SQLNULL) { + throw BinderException("array_to_json() argument type must be LIST or ARRAY"); } return JSONCreateBindParams(bound_function, arguments, false); } @@ -259,7 +259,7 @@ static void AddKeyValuePairs(yyjson_mut_doc *doc, yyjson_mut_val *objs[], Vector for (idx_t i = 0; i < count; i++) { auto key_idx = key_data.sel->get_index(i); if (!key_data.validity.RowIsValid(key_idx)) { - continue; + throw InvalidInputException("JSON key cannot be NULL"); } auto key = CreateJSONValue::Operation(doc, keys[key_idx]); yyjson_mut_obj_add(objs[i], key, vals[i]); diff --git a/src/duckdb/extension/json/json_functions/json_transform.cpp b/src/duckdb/extension/json/json_functions/json_transform.cpp index f05feb7b2..aea3b46d6 100644 --- a/src/duckdb/extension/json/json_functions/json_transform.cpp +++ b/src/duckdb/extension/json/json_functions/json_transform.cpp @@ -39,7 +39,7 @@ static LogicalType StructureToTypeObject(yyjson_val *obj, ClientContext &context yyjson_val *key, *val; yyjson_obj_foreach(obj, idx, max, key, val) { val = yyjson_obj_iter_get_val(key); - auto key_str = unsafe_yyjson_get_str(key); + string key_str(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key)); if (names.find(key_str) != names.end()) { JSONCommon::ThrowValFormatError("Duplicate keys in object in JSON structure: %s", val); } diff --git a/src/duckdb/extension/json/json_multi_file_info.cpp b/src/duckdb/extension/json/json_multi_file_info.cpp index 1f131e6af..c1ac09abd 100644 --- a/src/duckdb/extension/json/json_multi_file_info.cpp +++ 
b/src/duckdb/extension/json/json_multi_file_info.cpp @@ -232,7 +232,9 @@ bool JSONMultiFileInfo::ParseCopyOption(ClientContext &context, const string &ke } else { JSONCheckSingleParameter(key, values); options.auto_detect = BooleanValue::Get(values.back().DefaultCastAs(LogicalTypeId::BOOLEAN)); - options.format = JSONFormat::NEWLINE_DELIMITED; + if (options.format == JSONFormat::AUTO_DETECT) { + options.format = JSONFormat::NEWLINE_DELIMITED; + } } return true; } diff --git a/src/duckdb/extension/json/json_reader.cpp b/src/duckdb/extension/json/json_reader.cpp index 25feff588..832301836 100644 --- a/src/duckdb/extension/json/json_reader.cpp +++ b/src/duckdb/extension/json/json_reader.cpp @@ -658,16 +658,21 @@ bool JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_sta err.pos = json_size; AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format"); return false; - } else if (!options.ignore_errors && read_size < json_size) { + } + if (read_size < json_size) { idx_t off = read_size; idx_t rem = json_size; SkipWhitespace(json_start, off, rem); if (off != rem) { // Between end of document and boundary should be whitespace only - err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT; - err.msg = "unexpected content after document"; - err.pos = read_size; - AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format"); - return false; + if (!options.ignore_errors) { + err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT; + err.msg = "unexpected content after document"; + err.pos = read_size; + AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, + "Try auto-detecting the JSON format"); + return false; + } + doc = nullptr; } } diff --git a/src/duckdb/extension/parquet/include/decode_utils.hpp b/src/duckdb/extension/parquet/include/decode_utils.hpp index 20ba91dd3..5dad16705 100644 --- a/src/duckdb/extension/parquet/include/decode_utils.hpp +++ 
b/src/duckdb/extension/parquet/include/decode_utils.hpp @@ -36,6 +36,11 @@ class ParquetDecodeUtils { static void BitUnpack(ByteBuffer &src, bitpacking_width_t &bitpack_pos, T *dst, idx_t count, const bitpacking_width_t width) { CheckWidth(width); + if (width > sizeof(T) * BITPACK_DLEN) { + throw IOException("The width (%d) of the bitpacked data exceeds the maximum width (%d) for " + "the target type, the file might be corrupted.", + width, sizeof(T) * BITPACK_DLEN); + } const auto mask = BITPACK_MASKS[width]; src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { @@ -88,6 +93,12 @@ class ParquetDecodeUtils { template static void BitUnpackAlignedInternal(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) { D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); + if (width > sizeof(T) * BITPACK_DLEN) { + throw IOException("The width (%d) of the bitpacked data exceeds the maximum width (%d) for " + "the target type, the file might be corrupted.", + width, sizeof(T) * BITPACK_DLEN); + } + if (cast_pointer_to_uint64(src.ptr) % sizeof(T) == 0) { // Fast path: aligned BitpackingPrimitives::UnPackBuffer(data_ptr_cast(dst), src.ptr, count, width); diff --git a/src/duckdb/extension/parquet/include/parquet_reader.hpp b/src/duckdb/extension/parquet/include/parquet_reader.hpp index 3fd7dbb8d..5f4b259fa 100644 --- a/src/duckdb/extension/parquet/include/parquet_reader.hpp +++ b/src/duckdb/extension/parquet/include/parquet_reader.hpp @@ -82,6 +82,10 @@ struct ParquetReaderScanState { //! (optional) pointer to the PhysicalOperator for logging optional_ptr op; + + //! Number of row groups actually scanned (i.e. not pruned by filters) by this scan state. + //! Accumulates across all files processed by the owning (per-thread) local state. 
+ idx_t row_groups_scanned = 0; }; struct ParquetColumnDefinition { @@ -156,7 +160,7 @@ class ParquetReader : public BaseFileReader { unique_ptr root_schema; shared_ptr encryption_util; //! How many rows have been read from this file - atomic rows_read; + atomic rows_read {0}; public: string GetReaderType() const override { diff --git a/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp b/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp index 0f5d4e91c..d0d92c0fc 100644 --- a/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp +++ b/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp @@ -33,8 +33,9 @@ struct VariantMetadata { public: VariantMetadataHeader header; - const_data_ptr_t offsets; - const_data_ptr_t bytes; + + //! Total byte length of the metadata region. + idx_t total_size = 0; //! The json object keys have to be null-terminated //! But we don't receive them null-terminated @@ -135,15 +136,18 @@ class VariantBinaryDecoder { VariantBinaryDecoder() = delete; public: - static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data); + static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data, idx_t data_offset, + idx_t data_size); public: - static VariantValue PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data); - static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data); + static VariantValue PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size); + static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size); static VariantValue ObjectDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); + const_data_ptr_t data, idx_t 
data_offset, idx_t data_size); static VariantValue ArrayDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); + const_data_ptr_t data, idx_t data_offset, idx_t data_size); }; } // namespace duckdb diff --git a/src/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp b/src/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp index 8c7462b75..5ef709ccc 100644 --- a/src/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp +++ b/src/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp @@ -1,6 +1,7 @@ #pragma once #include "duckdb/common/types/variant_value.hpp" +#include "duckdb/function/scalar_function.hpp" #include "reader/variant/variant_binary_decoder.hpp" namespace duckdb { @@ -11,12 +12,16 @@ class VariantShreddedConversion { public: static vector Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length, idx_t total_size); + static void ConvertBinaryToVariant(Vector &metadata_and_value, idx_t offset, idx_t length, idx_t total_size, + Vector &result); static vector ConvertShreddedLeaf(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset, idx_t length, idx_t total_size); static vector ConvertShreddedArray(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset, idx_t length, idx_t total_size); static vector ConvertShreddedObject(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset, idx_t length, idx_t total_size); + //! Inverse of GetTransformFunction: decode a binary Variant value (metadata followed by value) into a VARIANT. 
+ static ScalarFunction GetBytesToVariantFunction(); }; } // namespace duckdb diff --git a/src/duckdb/extension/parquet/parquet_extension.cpp b/src/duckdb/extension/parquet/parquet_extension.cpp index b7a64c80f..caad5e2de 100644 --- a/src/duckdb/extension/parquet/parquet_extension.cpp +++ b/src/duckdb/extension/parquet/parquet_extension.cpp @@ -17,6 +17,7 @@ #include "zstd_file_system.hpp" #include "writer/primitive_column_writer.hpp" #include "writer/variant_column_writer.hpp" +#include "reader/variant_column_reader.hpp" #include #include @@ -56,6 +57,7 @@ #include "duckdb/logging/log_manager.hpp" #include "duckdb/main/settings.hpp" #include "parquet_multi_file_info.hpp" +#include "reader/variant/variant_shredded_conversion.hpp" namespace duckdb { @@ -879,6 +881,9 @@ static void LoadInternal(ExtensionLoader &loader) { // variant_to_parquet_variant loader.RegisterFunction(VariantColumnWriter::GetTransformFunction()); + // bytes_to_variant + loader.RegisterFunction(VariantShreddedConversion::GetBytesToVariantFunction()); + CopyFunction function("parquet"); function.copy_to_select = ParquetWriteSelect; function.copy_to_bind = ParquetWriteBind; diff --git a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp index e47befe2a..c8b742f55 100644 --- a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +++ b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp @@ -53,12 +53,15 @@ struct ParquetReadBindData : public TableFunctionData { struct ParquetReadGlobalState : public GlobalTableFunctionState { explicit ParquetReadGlobalState(optional_ptr op_p) - : row_group_index(0), batch_index(0), op(op_p) { + : row_group_index(0), batch_index(0), total_row_groups_to_scan(0), op(op_p) { } //! Index of row group within file currently up for scanning idx_t row_group_index; //! Batch index of the next row group to be scanned idx_t batch_index; + //! 
Total number of row groups dispatched for scanning across all files. + //! Updated under the MultiFileGlobalState lock as row groups are handed out. + idx_t total_row_groups_to_scan; //! (Optional) pointer to physical operator performing the scan optional_ptr op; }; @@ -366,6 +369,24 @@ static vector ParquetGetPartitionStats(ClientContext &conte return result; } +static void ParquetGetMetrics(ClientContext &, const FunctionData *, GlobalTableFunctionState &global_state_p, + LocalTableFunctionState &local_state_p, const profiler_settings_t &requested_metrics, + profiler_metrics_t &metrics) { + auto &mf_gstate = global_state_p.Cast(); + auto &mf_lstate = local_state_p.Cast(); + auto &gstate = mf_gstate.global_state->Cast(); + auto &lstate = mf_lstate.local_state->Cast(); + + if (requested_metrics.find(MetricType::OPERATOR_ROW_GROUPS_SCANNED) != requested_metrics.end()) { + // per-thread count of row groups actually read; summed across threads by the profiler + metrics[MetricType::OPERATOR_ROW_GROUPS_SCANNED] = Value::UBIGINT(lstate.scan_state.row_groups_scanned); + } + if (requested_metrics.find(MetricType::OPERATOR_TOTAL_ROW_GROUPS_TO_SCAN) != requested_metrics.end()) { + // shared total across all files; reported identically by every thread + metrics[MetricType::OPERATOR_TOTAL_ROW_GROUPS_TO_SCAN] = Value::UBIGINT(gstate.total_row_groups_to_scan); + } +} + TableFunctionSet ParquetScanFunction::GetFunctionSet() { MultiFileFunction table_function("parquet_scan"); table_function.named_parameters["binary_as_string"] = LogicalType::BOOLEAN; @@ -383,6 +404,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() { table_function.get_row_id_columns = ParquetGetRowIdColumns; table_function.pushdown_expression = ParquetScanPushdownExpression; table_function.get_partition_stats = ParquetGetPartitionStats; + table_function.get_metrics = ParquetGetMetrics; table_function.filter_pushdown = true; table_function.filter_prune = true; table_function.late_materialization = true; 
@@ -687,6 +709,8 @@ bool ParquetReader::TryInitializeScan(ClientContext &context, GlobalTableFunctio // The current reader has rowgroups left to be scanned lstate.group_indexes = {gstate.row_group_index}; gstate.row_group_index++; + // Count this row group towards the total to be scanned (called under the MultiFileGlobalState lock) + gstate.total_row_groups_to_scan++; return true; } diff --git a/src/duckdb/extension/parquet/parquet_reader.cpp b/src/duckdb/extension/parquet/parquet_reader.cpp index f73e229b7..243077353 100644 --- a/src/duckdb/extension/parquet/parquet_reader.cpp +++ b/src/duckdb/extension/parquet/parquet_reader.cpp @@ -795,8 +795,14 @@ MultiFileColumnDefinition ParquetReader::ParseColumnDefinition(const FileMetaDat result.identifier = Value::INTEGER(parent_column_schema.field_id); } } - for (auto &child : element.children) { - result.children.push_back(ParseColumnDefinition(file_meta_data, child)); + // A GEOMETRY column is a leaf at the logical level - it only wraps an inner BLOB child internally so that the + // reader can validate/transform the WKB. Exposing that child here would make the column definition diverge from + // the (childless) global GEOMETRY column, breaking trivial column mapping and disabling row group pruning for + // spatial predicates. Treat it as a leaf. 
+ if (element.schema_type != ParquetColumnSchemaType::GEOMETRY) { + for (auto &child : element.children) { + result.children.push_back(ParseColumnDefinition(file_meta_data, child)); + } } return result; } @@ -894,11 +900,11 @@ ParquetReader::ParquetReader(ClientContext &context_p, OpenFileInfo file_p, Parq metadata = LoadMetadata(context_p, allocator, *file_handle, parquet_options.encryption_config, encryption_util, footer_size); } else { - metadata = ObjectCache::GetObjectCache(context_p).Get(file.path); + metadata = ObjectCache::GetObjectCache(context_p).GetWithTypePrefix(file.path); if (!metadata || !metadata->IsValid(*file_handle)) { metadata = LoadMetadata(context_p, allocator, *file_handle, parquet_options.encryption_config, encryption_util, footer_size); - ObjectCache::GetObjectCache(context_p).Put(file.path, metadata); + ObjectCache::GetObjectCache(context_p).PutWithTypePrefix(file.path, metadata); } } } else { @@ -915,7 +921,7 @@ bool ParquetReader::MetadataCacheEnabled(ClientContext &context) { shared_ptr ParquetReader::GetMetadataCacheEntry(ClientContext &context, const OpenFileInfo &file) { - return ObjectCache::GetObjectCache(context).Get(file.path); + return ObjectCache::GetObjectCache(context).GetWithTypePrefix(file.path); } ParquetUnionData::~ParquetUnionData() { @@ -1421,9 +1427,14 @@ AsyncResult ParquetReader::Scan(ClientContext &context, ParquetReaderScanState & } auto &group = GetGroup(state); + const bool row_group_pruned = state.offset_in_group == (idx_t)group.num_rows; + if (!row_group_pruned) { + // the row group survived filter pruning and will actually be read + state.row_groups_scanned++; + } if (state.op) { DUCKDB_LOG(context, PhysicalOperatorLogType, *state.op, "ParquetReader", - state.offset_in_group == (idx_t)group.num_rows ? "SkipRowGroup" : "ReadRowGroup", + row_group_pruned ? 
"SkipRowGroup" : "ReadRowGroup", {{"file", file.path}, {"row_group_id", to_string(state.group_idx_list[state.current_group])}}); } diff --git a/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp b/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp index 8ab99fd61..b9bd88a66 100644 --- a/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp +++ b/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp @@ -9,7 +9,6 @@ #include "duckdb/common/types/uuid.hpp" #include "duckdb/common/types/time.hpp" #include "duckdb/common/types/date.hpp" -#include "duckdb/common/types/blob.hpp" static constexpr uint8_t VERSION_MASK = 0xF; static constexpr uint8_t SORTED_STRINGS_MASK = 0x1; @@ -40,13 +39,18 @@ namespace duckdb { namespace { -static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t &ptr) { +static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t ptr, idx_t &offset, + const idx_t capacity) { if (length_in_bytes > sizeof(idx_t)) { throw NotImplementedException("Can't read little-endian value of %d bytes", length_in_bytes); } + if (offset + length_in_bytes > capacity) { + throw IOException("Data corruption detected, read of length_in_bytes (%d) would exceed buffer capacity", + length_in_bytes); + } idx_t result = 0; - memcpy(reinterpret_cast(&result), ptr, length_in_bytes); - ptr += length_in_bytes; + memcpy(reinterpret_cast(&result), ptr + offset, length_in_bytes); + offset += length_in_bytes; return result; } @@ -67,21 +71,34 @@ VariantMetadataHeader VariantMetadataHeader::FromHeaderByte(uint8_t byte) { } VariantMetadata::VariantMetadata(const string_t &metadata) : metadata(metadata) { - auto metadata_data = metadata.GetData(); + auto metadata_data = reinterpret_cast(metadata.GetData()); + const auto metadata_buffer_capacity = metadata.GetSize(); + if (!metadata_data || metadata.GetSize() < 1) { + throw IOException("Corrupted VARIANT 'metadata' 
buffer, empty or nullptr"); + } - header = VariantMetadataHeader::FromHeaderByte(metadata_data[0]); + idx_t metadata_offset = 0; + header = VariantMetadataHeader::FromHeaderByte(metadata_data[metadata_offset]); + metadata_offset += sizeof(uint8_t); - const_data_ptr_t ptr = reinterpret_cast(metadata_data + sizeof(uint8_t)); - idx_t dictionary_size = ReadVariableLengthLittleEndian(header.offset_size, ptr); + idx_t dictionary_size = + ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, metadata_buffer_capacity); - auto offsets = ptr; - auto bytes = offsets + ((dictionary_size + 1) * header.offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr); + auto data_start = metadata_offset + ((dictionary_size + 1) * header.offset_size); + idx_t last_offset = + ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, metadata_buffer_capacity); for (idx_t i = 0; i < dictionary_size; i++) { - auto next_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr); - strings.emplace_back(reinterpret_cast(bytes + last_offset), next_offset - last_offset); + auto next_offset = ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, + metadata_buffer_capacity); + const idx_t string_size = next_offset - last_offset; + if (data_start + last_offset + string_size > metadata_buffer_capacity) { + throw IOException("Corrupted VARIANT 'metadata' buffer"); + } + strings.emplace_back(reinterpret_cast(metadata_data + data_start + last_offset), string_size); last_offset = next_offset; } + //! 
header byte + offsets region + string bytes + total_size = metadata_offset + last_offset; } VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) { @@ -115,11 +132,14 @@ VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) { } template -static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { - scale = Load(data); - data++; +static T DecodeDecimal(const_data_ptr_t data, idx_t data_offset, idx_t data_size, uint8_t &scale, uint8_t &width) { + if (data_offset + sizeof(uint8_t) + sizeof(T) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + scale = Load(data + data_offset); + data_offset += sizeof(uint8_t); - auto result = Load(data); + auto result = Load(data + data_offset); auto abs_val = result; if (abs_val < 0) { abs_val = -abs_val; @@ -130,13 +150,17 @@ static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { } template <> -hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { - scale = Load(data); - data++; +hugeint_t DecodeDecimal(const_data_ptr_t data, idx_t data_offset, idx_t data_size, uint8_t &scale, uint8_t &width) { + if (data_offset + sizeof(uint8_t) + sizeof(uint64_t) + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + scale = Load(data + data_offset); + data_offset += sizeof(uint8_t); hugeint_t result; - result.lower = Load(data); - result.upper = Load(data + sizeof(uint64_t)); + result.lower = Load(data + data_offset); + data_offset += sizeof(uint64_t); + result.upper = Load(data + data_offset); //! FIXME: The spec says: //! 
The implied precision of a decimal value is `floor(log_10(val)) + 1` width = DecimalWidth::max; @@ -144,7 +168,7 @@ hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { } VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, - const_data_ptr_t data) { + const_data_ptr_t data, idx_t data_offset, idx_t data_size) { switch (value_metadata.primitive_type) { case VariantPrimitiveType::NULL_TYPE: { return VariantValue::NullValue(); @@ -156,91 +180,138 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadat return VariantValue(Value::BOOLEAN(false)); } case VariantPrimitiveType::INT8: { - auto value = Load(data); + if (data_offset + sizeof(int8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::TINYINT(value)); } case VariantPrimitiveType::INT16: { - auto value = Load(data); + if (data_offset + sizeof(int16_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::SMALLINT(value)); } case VariantPrimitiveType::INT32: { - auto value = Load(data); + if (data_offset + sizeof(int32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::INTEGER(value)); } case VariantPrimitiveType::INT64: { - auto value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::BIGINT(value)); } case VariantPrimitiveType::DOUBLE: { - double value = Load(data); + if (data_offset + sizeof(double) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + double value = Load(data + data_offset); return VariantValue(Value::DOUBLE(value)); } case VariantPrimitiveType::FLOAT: { 
- float value = Load(data); + if (data_offset + sizeof(float) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + float value = Load(data + data_offset); return VariantValue(Value::FLOAT(value)); } case VariantPrimitiveType::DECIMAL4: { uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); return VariantValue(Value::DECIMAL(value, width, scale)); } case VariantPrimitiveType::DECIMAL8: { uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); return VariantValue(Value::DECIMAL(value, width, scale)); } case VariantPrimitiveType::DECIMAL16: { uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); return VariantValue(Value::DECIMAL(value, width, scale)); } case VariantPrimitiveType::DATE: { date_t value; - value.days = Load(data); + if (data_offset + sizeof(int32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + value.days = Load(data + data_offset); return VariantValue(Value::DATE(value)); } case VariantPrimitiveType::TIMESTAMP_MICROS: { timestamp_tz_t micros_ts_tz; - micros_ts_tz.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_ts_tz.value = Load(data + data_offset); return VariantValue(Value::TIMESTAMPTZ(micros_ts_tz)); } case VariantPrimitiveType::TIMESTAMP_NTZ_MICROS: { timestamp_t micros_ts; - micros_ts.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_ts.value = Load(data + data_offset); auto value = Value::TIMESTAMP(micros_ts); return VariantValue(std::move(value)); } case VariantPrimitiveType::BINARY: { - //! 
Follow the JSON serialization guide by converting BINARY to Base64: - //! For example: `"dmFyaWFudAo="` - auto size = Load(data); - auto string_data = reinterpret_cast(data + sizeof(uint32_t)); - auto base64_string = Blob::ToBase64(string_t(string_data, size)); - return VariantValue(Value(base64_string)); + //! Keep the raw bytes as a BLOB so the type is preserved when reconstructing a VARIANT. The conversion to + //! Base64 happens now in VariantValue::ToJSON. + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto size = Load(data + data_offset); + data_offset += sizeof(uint32_t); + + if (data_offset + size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + return VariantValue(Value::BLOB(data + data_offset, size)); } case VariantPrimitiveType::STRING: { - auto size = Load(data); - auto string_data = reinterpret_cast(data + sizeof(uint32_t)); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto size = Load(data + data_offset); + data_offset += sizeof(uint32_t); + + auto string_data = reinterpret_cast(data + data_offset); + if (data_offset + size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } if (!Utf8Proc::IsValid(string_data, size)) { - throw InternalException("Can't decode Variant short-string, string isn't valid UTF8"); + throw IOException("Can't decode Variant short-string, string isn't valid UTF8"); } return VariantValue(Value(string(string_data, size))); } case VariantPrimitiveType::TIME_NTZ_MICROS: { dtime_t micros_time; - micros_time.micros = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_time.micros = Load(data + data_offset); return VariantValue(Value::TIME(micros_time)); } case VariantPrimitiveType::TIMESTAMP_NANOS: { timestamp_ns_t nanos_ts; - nanos_ts.value = Load(data); + if 
(data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + nanos_ts.value = Load(data + data_offset); //! Convert the nanos timestamp to a micros timestamp (not lossless) auto micros_ts = Timestamp::FromEpochNanoSeconds(nanos_ts.value); @@ -248,13 +319,19 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadat } case VariantPrimitiveType::TIMESTAMP_NTZ_NANOS: { timestamp_ns_t nanos_ts; - nanos_ts.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + nanos_ts.value = Load(data + data_offset); auto value = Value::TIMESTAMPNS(nanos_ts); return VariantValue(std::move(value)); } case VariantPrimitiveType::UUID: { - auto uuid_value = UUIDValueConversion::ReadParquetUUID(data); + if (data_offset + sizeof(hugeint_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto uuid_value = UUIDValueConversion::ReadParquetUUID(data + data_offset); return VariantValue(Value::UUID(uuid_value)); } default: @@ -263,18 +340,24 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadat } } -VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantValueMetadata &value_metadata, - const_data_ptr_t data) { - D_ASSERT(value_metadata.string_size < 64); - auto string_data = reinterpret_cast(data); +VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { + if (value_metadata.string_size >= 64) { + throw IOException("Corrupted VARIANT 'metadata' buffer"); + } + auto string_data = reinterpret_cast(data + data_offset); + if (data_offset + value_metadata.string_size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } if (!Utf8Proc::IsValid(string_data, value_metadata.string_size)) { - throw InternalException("Can't decode Variant short-string, string isn't 
valid UTF8"); + throw IOException("Can't decode Variant short-string, string isn't valid UTF8"); } return VariantValue(Value(string(string_data, value_metadata.string_size))); } VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, - const VariantValueMetadata &value_metadata, const_data_ptr_t data) { + const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { VariantValue ret(VariantValueType::OBJECT); auto field_offset_size = value_metadata.field_offset_size; @@ -283,23 +366,32 @@ VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, idx_t num_elements; if (is_large) { - num_elements = Load(data); - data += sizeof(uint32_t); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint32_t); } else { - num_elements = Load(data); - data += sizeof(uint8_t); + if (data_offset + sizeof(uint8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint8_t); } - auto field_ids = data; - auto field_offsets = data + (num_elements * field_id_size); - auto values = field_offsets + ((num_elements + 1) * field_offset_size); + auto field_ids_offset = data_offset; + auto field_offsets_offset = data_offset + (num_elements * field_id_size); + auto values_offset = field_offsets_offset + ((num_elements + 1) * field_offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); for (idx_t i = 0; i < num_elements; i++) { - auto field_id = ReadVariableLengthLittleEndian(field_id_size, field_ids); - auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + auto field_id = 
ReadVariableLengthLittleEndian(field_id_size, data, field_ids_offset, data_size); + auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); - auto value = Decode(metadata, values + last_offset); + auto value = Decode(metadata, data, values_offset + last_offset, data_size); + if (field_id >= metadata.strings.size()) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } auto &key = metadata.strings[field_id]; ret.AddChild(key, std::move(value)); @@ -309,7 +401,8 @@ VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, } VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata, - const VariantValueMetadata &value_metadata, const_data_ptr_t data) { + const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { VariantValue ret(VariantValueType::ARRAY); auto field_offset_size = value_metadata.field_offset_size; @@ -317,45 +410,55 @@ VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata, uint32_t num_elements; if (is_large) { - num_elements = Load(data); - data += sizeof(uint32_t); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint32_t); } else { - num_elements = Load(data); - data += sizeof(uint8_t); + if (data_offset + sizeof(uint8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint8_t); } - auto field_offsets = data; - auto values = field_offsets + ((num_elements + 1) * field_offset_size); + auto field_offsets_offset = data_offset; + auto values_offset = field_offsets_offset + ((num_elements + 1) * field_offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + idx_t last_offset = 
ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); for (idx_t i = 0; i < num_elements; i++) { - auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); - ret.AddItem(Decode(metadata, values + last_offset)); + ret.AddItem(Decode(metadata, data, values_offset + last_offset, data_size)); last_offset = next_offset; } return ret; } -VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data) { - auto value_metadata = VariantValueMetadata::FromHeaderByte(data[0]); +VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { + if (data_offset + 1 > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value_metadata = VariantValueMetadata::FromHeaderByte(data[data_offset]); + data_offset += sizeof(uint8_t); - data++; switch (value_metadata.basic_type) { case VariantBasicType::PRIMITIVE: { - return PrimitiveTypeDecode(value_metadata, data); + return PrimitiveTypeDecode(value_metadata, data, data_offset, data_size); } case VariantBasicType::SHORT_STRING: { - return ShortStringDecode(value_metadata, data); + return ShortStringDecode(value_metadata, data, data_offset, data_size); } case VariantBasicType::OBJECT: { - return ObjectDecode(variant_metadata, value_metadata, data); + return ObjectDecode(variant_metadata, value_metadata, data, data_offset, data_size); } case VariantBasicType::ARRAY: { - return ArrayDecode(variant_metadata, value_metadata, data); + return ArrayDecode(variant_metadata, value_metadata, data, data_offset, data_size); } default: - throw InternalException("Unexpected value for VariantBasicType"); + throw IOException("Unexpected value for VariantBasicType"); } } diff --git 
a/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp b/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp index 0a289bb3d..193df51b2 100644 --- a/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp +++ b/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp @@ -7,10 +7,8 @@ #include "duckdb/common/types/uuid.hpp" #include "duckdb/common/types/time.hpp" #include "duckdb/common/types/date.hpp" -#include "duckdb/common/types/blob.hpp" namespace duckdb { - template struct ConvertShreddedValue { static VariantValue Convert(T val); @@ -108,7 +106,9 @@ VariantValue ConvertShreddedValue::Convert(timestamp_ns_t val) { //! binary template <> VariantValue ConvertShreddedValue::ConvertBlob(string_t val) { - return VariantValue(Value(Blob::ToBase64(val))); + //! Keep the raw bytes as a BLOB so the type is preserved when reconstructing a VARIANT. The conversion to Base64 + //! happens now in VariantValue::ToJSON. + return VariantValue(Value::BLOB(const_data_ptr_cast(val.GetData()), val.GetSize())); } //! 
string template <> @@ -184,8 +184,10 @@ vector ConvertTypedValues(Vector &vec, Vector &metadata, Vector &b D_ASSERT(value_validity.RowIsValid(value_index)); auto metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, - const_data_ptr_cast(value_data[value_index].GetData())); + + auto &value_buffer = value_data[value_index]; + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_buffer.GetData()), 0, + value_buffer.GetSize()); } } } @@ -322,7 +324,7 @@ struct ShreddedVariantField { } // namespace static vector ConvertBinaryEncoding(Vector &metadata, Vector &value, idx_t offset, idx_t length, - idx_t total_size) { + idx_t total_size, bool add_metadata_offset = false) { UnifiedVectorFormat value_format; value.ToUnifiedFormat(total_size, value_format); auto value_data = value_format.GetData(value_format); @@ -337,12 +339,25 @@ static vector ConvertBinaryEncoding(Vector &metadata, Vector &valu vector ret(length); for (idx_t i = 0; i < length; i++) { auto index = value_format.sel->get_index(i + offset); - if (validity.RowIsValid(index)) { - auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; - VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[index].GetData(); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + if (!validity.RowIsValid(index)) { + continue; } + //! 'metadata' and 'value' are the same vector: each row holds the full binary Variant value (metadata + //! followed by the value blob). Decode the metadata, then read the value right after it. + auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; + VariantMetadata variant_metadata(metadata_value); + + auto &value_buffer = value_data[index]; + auto binary_value = value_buffer.GetData(); + + idx_t value_offset = 0; + if (add_metadata_offset) { + //! 
For a full variant binary value (metadata followed by value) + //! The value bytes start directly after the metadata bytes + value_offset += variant_metadata.total_size; + } + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), value_offset, + value_buffer.GetSize()); } return ret; } @@ -366,8 +381,11 @@ static VariantValue ConvertPartiallyShreddedObject(vector //! Object is partially shredded, decode the object and merge the values auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[index].GetData(); - auto unshredded = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[index]; + auto binary_value = value_buffer.GetData(); + auto unshredded = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); if (unshredded.value_type != VariantValueType::OBJECT) { throw InvalidInputException("Partially shredded objects have to encode Object Variants in the 'value'"); } @@ -438,8 +456,11 @@ vector VariantShreddedConversion::ConvertShreddedObject(Vector &me D_ASSERT(validity.RowIsValid(value_index)); auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[value_index].GetData(); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[value_index]; + auto binary_value = value_buffer.GetData(); + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); if (ret[i].value_type == VariantValueType::OBJECT) { throw InvalidInputException( "When 'typed_value' for a shredded Object is NULL, 'value' can not contain an Object value"); @@ -500,8 +521,10 @@ vector 
VariantShreddedConversion::ConvertShreddedArray(Vector &met D_ASSERT(value_validity.RowIsValid(value_index)); auto metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, - const_data_ptr_cast(value_data[value_index].GetData())); + + const auto &value_buffer = value_data[value_index]; + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_buffer.GetData()), 0, + value_buffer.GetSize()); } } } @@ -550,4 +573,24 @@ vector VariantShreddedConversion::Convert(Vector &metadata, Vector } } +void VariantShreddedConversion::ConvertBinaryToVariant(Vector &metadata_and_value, idx_t offset, idx_t length, + idx_t total_size, Vector &result) { + auto res = ConvertBinaryEncoding(metadata_and_value, metadata_and_value, offset, length, total_size, true); + VariantValue::ToVARIANT(res, result); +} + +static void FromParquetVariant(DataChunk &input, ExpressionState &state, Vector &result) { + auto num_values = input.size(); + auto &metadata_value = input.data[0]; + + VariantShreddedConversion::ConvertBinaryToVariant(metadata_value, 0, num_values, num_values, result); +} + +ScalarFunction VariantShreddedConversion::GetBytesToVariantFunction() { + ScalarFunction transform("variant_bytes_to_variant", {LogicalType::BLOB}, LogicalType::VARIANT(), + FromParquetVariant); + transform.SetNullHandling(FunctionNullHandling::SPECIAL_HANDLING); + return transform; +} + } // namespace duckdb diff --git a/src/duckdb/src/catalog/catalog.cpp b/src/duckdb/src/catalog/catalog.cpp index b66f0452d..9f9fd57c8 100644 --- a/src/duckdb/src/catalog/catalog.cpp +++ b/src/duckdb/src/catalog/catalog.cpp @@ -1295,7 +1295,10 @@ void Catalog::OnDetach(ClientContext &context) { bool Catalog::HasConflictingAttachOptions(const string &path, const AttachOptions &options) { auto const db_type = options.db_type.empty() ? 
"duckdb" : options.db_type; - return GetDBPath() != path || GetCatalogType() != db_type; + // Normalize through the extension alias table so that equivalent forms + auto canonical_actual = ExtensionHelper::ApplyExtensionAlias(GetCatalogType()); + auto canonical_requested = ExtensionHelper::ApplyExtensionAlias(db_type); + return GetDBPath() != path || !StringUtil::CIEquals(canonical_actual, canonical_requested); } } // namespace duckdb diff --git a/src/duckdb/src/catalog/default/default_types.cpp b/src/duckdb/src/catalog/default/default_types.cpp index a2c8f5abe..edb4329d6 100644 --- a/src/duckdb/src/catalog/default/default_types.cpp +++ b/src/duckdb/src/catalog/default/default_types.cpp @@ -45,7 +45,7 @@ LogicalType BindDecimalType(BindLogicalTypeInput &input) { if (scale_value.DefaultTryCastAs(LogicalTypeId::UTINYINT)) { scale = scale_value.GetValueUnsafe(); } else { - throw BinderException("DECIMAL type scale must be between 0 and %d", Decimal::MAX_WIDTH_DECIMAL - 1); + throw BinderException("DECIMAL type scale must be between 0 and %d", Decimal::MAX_WIDTH_DECIMAL); } } diff --git a/src/duckdb/src/common/adbc/adbc.cpp b/src/duckdb/src/common/adbc/adbc.cpp index 5a78ca317..5e3762ab8 100644 --- a/src/duckdb/src/common/adbc/adbc.cpp +++ b/src/duckdb/src/common/adbc/adbc.cpp @@ -18,6 +18,15 @@ #include #include static void ReleaseError(struct AdbcError *error); +static void ReleaseErrorWithDuckDBDetails(struct AdbcError *error); +static void ReleaseStreamErrorDetails(struct AdbcError *error); +static const char *DuckDBErrorTypeToString(duckdb_error_type type); +static void AppendDuckDBErrorDetails(struct AdbcError *error, duckdb_error_type type); +static void AppendDuckDBErrorDetails(struct AdbcError *error, duckdb_error_data res); + +struct DuckDBErrorDetails { + std::vector> entries; +}; #include @@ -67,9 +76,8 @@ AdbcStatusCode duckdb_adbc_init(int version, void *driver, struct AdbcError *err // Initialize 1.1.0 function pointers if version >= 1.1.0 if (version >= 
ADBC_VERSION_1_1_0) { - // TODO: ADBC 1.1.0 adds support for these functions - adbc_driver->ErrorGetDetailCount = nullptr; - adbc_driver->ErrorGetDetail = nullptr; + adbc_driver->ErrorGetDetailCount = duckdb_adbc::ErrorGetDetailCount; + adbc_driver->ErrorGetDetail = duckdb_adbc::ErrorGetDetail; adbc_driver->ErrorFromArrayStream = duckdb_adbc::ErrorFromArrayStream; adbc_driver->DatabaseGetOption = duckdb_adbc::DatabaseGetOption; @@ -92,7 +100,7 @@ AdbcStatusCode duckdb_adbc_init(int version, void *driver, struct AdbcError *err adbc_driver->ConnectionSetOptionDouble = duckdb_adbc::ConnectionSetOptionDouble; adbc_driver->StatementCancel = duckdb_adbc::StatementCancel; - adbc_driver->StatementExecuteSchema = nullptr; + adbc_driver->StatementExecuteSchema = duckdb_adbc::StatementExecuteSchema; adbc_driver->StatementGetOption = duckdb_adbc::StatementGetOption; adbc_driver->StatementGetOptionBytes = duckdb_adbc::StatementGetOptionBytes; adbc_driver->StatementGetOptionDouble = duckdb_adbc::StatementGetOptionDouble; @@ -239,9 +247,11 @@ void InitializeADBCError(AdbcError *error) { if (!error) { return; } - // Avoid leaking any DuckDB-owned error message. - // Only call DuckDB's own release callback. - if (error->message && error->release == ::ReleaseError) { + // Only call release for callbacks DuckDB owns. The stream wrapper sets + // adbc_error.message = last_error (strdup) with release = nullptr; calling + // delete[] on that would be a double-free and allocator mismatch. 
+ if (error->release == ::ReleaseError || error->release == ::ReleaseErrorWithDuckDBDetails || + error->release == ::ReleaseStreamErrorDetails) { error->release(error); } error->message = nullptr; @@ -1051,6 +1061,10 @@ static int get_next(struct ArrowArrayStream *stream, struct ArrowArray *out) { if (result_wrapper->materialized) { auto mat = result_wrapper->materialized; if (mat->current >= mat->count) { + // Surface any error that was encountered during materialization + if (result_wrapper->last_error) { + return DuckDBError; + } return DuckDBSuccess; // end of stream } // Transfer ownership of the batch to the caller @@ -1070,10 +1084,22 @@ static int get_next(struct ArrowArrayStream *stream, struct ArrowArray *out) { } result_wrapper->last_error = strdup(err); result_wrapper->status_code = IsInterruptError(err) ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INTERNAL; - // Populate adbc_error for AdbcErrorFromArrayStream + // Populate adbc_error for AdbcErrorFromArrayStream with rich metadata result_wrapper->adbc_error.message = result_wrapper->last_error; - result_wrapper->adbc_error.vendor_code = 0; - result_wrapper->adbc_error.release = nullptr; + result_wrapper->adbc_error.vendor_code = ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA; + if (result_wrapper->adbc_error.private_data) { + delete static_cast(result_wrapper->adbc_error.private_data); + result_wrapper->adbc_error.private_data = nullptr; + } + auto *details = new (std::nothrow) DuckDBErrorDetails(); + if (details) { + details->entries.emplace_back( + "duckdb:error_type", DuckDBErrorTypeToString(duckdb_result_error_type(&result_wrapper->result))); + result_wrapper->adbc_error.private_data = details; + result_wrapper->adbc_error.release = ::ReleaseStreamErrorDetails; + } else { + result_wrapper->adbc_error.release = nullptr; + } return DuckDBError; } return DuckDBSuccess; @@ -1085,6 +1111,29 @@ static int get_next(struct ArrowArrayStream *stream, struct ArrowArray *out) { duckdb_destroy_data_chunk(&duckdb_chunk); if 
(conversion_success) { + auto conv_err_msg = duckdb_error_data_message(conversion_success); + if (conv_err_msg && conv_err_msg[0] != '\0') { + if (result_wrapper->last_error) { + free(result_wrapper->last_error); + } + result_wrapper->last_error = strdup(conv_err_msg); + result_wrapper->status_code = ADBC_STATUS_INTERNAL; + result_wrapper->adbc_error.message = result_wrapper->last_error; + result_wrapper->adbc_error.vendor_code = ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA; + if (result_wrapper->adbc_error.private_data) { + delete static_cast(result_wrapper->adbc_error.private_data); + result_wrapper->adbc_error.private_data = nullptr; + } + auto *details = new (std::nothrow) DuckDBErrorDetails(); + if (details) { + details->entries.emplace_back( + "duckdb:error_type", DuckDBErrorTypeToString(duckdb_error_data_error_type(conversion_success))); + result_wrapper->adbc_error.private_data = details; + result_wrapper->adbc_error.release = ::ReleaseStreamErrorDetails; + } else { + result_wrapper->adbc_error.release = nullptr; + } + } duckdb_destroy_error_data(&conversion_success); return DuckDBError; } @@ -1290,6 +1339,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c auto res = duckdb_schema_from_arrow(connection, &arrow_schema_wrapper.arrow_schema, out_types.GetPtr()); if (res) { SetError(error, duckdb_error_data_message(res)); + AppendDuckDBErrorDetails(error, res); duckdb_destroy_error_data(&res); return ADBC_STATUS_INTERNAL; } @@ -1309,6 +1359,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c bool already_exists = error_msg && std::string(error_msg).find("already exists") != std::string::npos; bool interrupted = IsInterruptError(error_msg); SetError(error, error_msg); + AppendDuckDBErrorDetails(error, duckdb_result_error_type(&result)); duckdb_destroy_result(&result); if (interrupted) { return ADBC_STATUS_CANCELLED; @@ -1333,6 +1384,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char 
*catalog, const c if (duckdb_query(connection, create_sql.c_str(), &result) == DuckDBError) { auto err = duckdb_result_error(&result); SetError(error, err); + AppendDuckDBErrorDetails(error, duckdb_result_error_type(&result)); bool interrupted = IsInterruptError(err); duckdb_destroy_result(&result); return interrupted ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INTERNAL; @@ -1347,6 +1399,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c if (duckdb_query(connection, sql.c_str(), &result) == DuckDBError) { auto err = duckdb_result_error(&result); SetError(error, err); + AppendDuckDBErrorDetails(error, duckdb_result_error_type(&result)); bool interrupted = IsInterruptError(err); duckdb_destroy_result(&result); return interrupted ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INTERNAL; @@ -1359,6 +1412,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c if (!appender.Valid()) { if (!appender.CreateError().empty()) { set_ingest_error(appender.CreateError()); + AppendDuckDBErrorDetails(error, appender.CreateErrorType()); } else { SetError(error, missing_table_error); } @@ -1376,6 +1430,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c &out_chunk.chunk); if (res) { SetError(error, duckdb_error_data_message(res)); + AppendDuckDBErrorDetails(error, res); duckdb_destroy_error_data(&res); } // Count rows for rows_affected, if a chunk was produced @@ -1392,6 +1447,7 @@ AdbcStatusCode Ingest(duckdb_connection connection, const char *catalog, const c SetError(error, missing_table_error); } bool interrupted = IsInterruptError(err); + AppendDuckDBErrorDetails(error, error_data); duckdb_destroy_error_data(&error_data); return interrupted ? 
ADBC_STATUS_CANCELLED : ADBC_STATUS_INTERNAL; } @@ -1493,6 +1549,63 @@ AdbcStatusCode StatementCancel(struct AdbcStatement *statement, struct AdbcError return ADBC_STATUS_OK; } +AdbcStatusCode StatementExecuteSchema(struct AdbcStatement *statement, struct ArrowSchema *schema, + struct AdbcError *error) { + if (!statement) { + SetError(error, "Missing statement object"); + return ADBC_STATUS_INVALID_ARGUMENT; + } + if (!statement->private_data) { + SetError(error, "Invalid statement object"); + return ADBC_STATUS_INVALID_ARGUMENT; + } + if (!schema) { + SetError(error, "Missing schema object"); + return ADBC_STATUS_INVALID_ARGUMENT; + } + auto wrapper = static_cast(statement->private_data); + if (!wrapper->statement) { + SetError(error, "Must call StatementSetSqlQuery before StatementExecuteSchema"); + return ADBC_STATUS_INVALID_STATE; + } + + if (wrapper->conn_wrapper) { + wrapper->conn_wrapper->MaterializeStreams(); + } + + auto count = duckdb_prepared_statement_column_count(wrapper->statement); + std::vector types(count); + std::vector owned_names; + owned_names.reserve(count); + duckdb::vector names(count); + + for (idx_t i = 0; i < count; i++) { + types[i] = duckdb_prepared_statement_column_logical_type(wrapper->statement, i); + auto column_name = duckdb_prepared_statement_column_name(wrapper->statement, i); + owned_names.emplace_back(column_name ? 
column_name : ""); + names[i] = owned_names.back().c_str(); + duckdb_free(const_cast(column_name)); + } + + duckdb_arrow_options arrow_options; + duckdb_connection_get_arrow_options(wrapper->connection, &arrow_options); + + auto res = duckdb_to_arrow_schema(arrow_options, types.data(), names.data(), count, schema); + + for (auto &type : types) { + duckdb_destroy_logical_type(&type); + } + duckdb_destroy_arrow_options(&arrow_options); + + if (res) { + SetError(error, duckdb_error_data_message(res)); + AppendDuckDBErrorDetails(error, res); + duckdb_destroy_error_data(&res); + return ADBC_STATUS_INVALID_ARGUMENT; + } + return ADBC_STATUS_OK; +} + AdbcStatusCode StatementGetParameterSchema(struct AdbcStatement *statement, struct ArrowSchema *schema, struct AdbcError *error) { if (!statement) { @@ -1544,6 +1657,7 @@ AdbcStatusCode StatementGetParameterSchema(struct AdbcStatement *statement, stru if (res) { SetError(error, duckdb_error_data_message(res)); + AppendDuckDBErrorDetails(error, res); duckdb_destroy_error_data(&res); return ADBC_STATUS_INVALID_ARGUMENT; } @@ -1641,6 +1755,7 @@ AdbcStatusCode StatementExecuteQuery(struct AdbcStatement *statement, struct Arr duckdb_schema_from_arrow(wrapper->connection, &arrow_schema_wrapper.arrow_schema, out_types.GetPtr()); if (res) { SetError(error, duckdb_error_data_message(res)); + AppendDuckDBErrorDetails(error, res); duckdb_destroy_error_data(&res); return ADBC_STATUS_INVALID_ARGUMENT; } @@ -1659,6 +1774,7 @@ AdbcStatusCode StatementExecuteQuery(struct AdbcStatement *statement, struct Arr out_types.Get(), &out_chunk.chunk); if (res_conv) { SetError(error, duckdb_error_data_message(res_conv)); + AppendDuckDBErrorDetails(error, res_conv); duckdb_destroy_error_data(&res_conv); return ADBC_STATUS_INVALID_ARGUMENT; } @@ -1696,6 +1812,7 @@ AdbcStatusCode StatementExecuteQuery(struct AdbcStatement *statement, struct Arr if (res != DuckDBSuccess) { auto err = duckdb_result_error(&stream_wrapper->result); SetError(error, err); + 
AppendDuckDBErrorDetails(error, duckdb_result_error_type(&stream_wrapper->result)); bool interrupted = IsInterruptError(err); return interrupted ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INVALID_ARGUMENT; } @@ -1708,6 +1825,7 @@ AdbcStatusCode StatementExecuteQuery(struct AdbcStatement *statement, struct Arr if (res != DuckDBSuccess) { auto err = duckdb_result_error(&stream_wrapper->result); SetError(error, err); + AppendDuckDBErrorDetails(error, duckdb_result_error_type(&stream_wrapper->result)); bool interrupted = IsInterruptError(err); return interrupted ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INVALID_ARGUMENT; } @@ -2332,26 +2450,92 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth LIST({ column_name: column_name, ordinal_position: ordinal_position, - remarks: '', - xdbc_data_type: NULL::SMALLINT, - xdbc_type_name: NULL::VARCHAR, - xdbc_column_size: NULL::INTEGER, - xdbc_decimal_digits: NULL::SMALLINT, - xdbc_num_prec_radix: NULL::SMALLINT, - xdbc_nullable: NULL::SMALLINT, - xdbc_column_def: NULL::VARCHAR, - xdbc_sql_data_type: NULL::SMALLINT, - xdbc_datetime_sub: NULL::SMALLINT, - xdbc_char_octet_length: NULL::INTEGER, - xdbc_is_nullable: NULL::VARCHAR, - xdbc_scope_catalog: NULL::VARCHAR, + remarks: comment, + xdbc_data_type: NULL::SMALLINT, -- Arrow type ID not derivable from SQL; SQL type codes are in xdbc_sql_data_type + xdbc_type_name: data_type, + xdbc_column_size: CASE + WHEN base_type = 'DATE' THEN 10::INTEGER + WHEN data_type IN ('TIME', 'TIME WITH TIME ZONE', 'TIME_NS') THEN 15::INTEGER + WHEN data_type LIKE 'TIMESTAMP%%' THEN 26::INTEGER + ELSE numeric_precision::INTEGER + END, + xdbc_decimal_digits: numeric_scale::SMALLINT, + xdbc_num_prec_radix: numeric_precision_radix::SMALLINT, + xdbc_nullable: CASE is_nullable + WHEN FALSE THEN 0::SMALLINT + WHEN TRUE THEN 1::SMALLINT + ELSE 2::SMALLINT + END, + xdbc_column_def: column_default, + xdbc_sql_data_type: CASE + WHEN data_type = 'TIMESTAMP WITH TIME ZONE' THEN 
2014::SMALLINT + WHEN data_type LIKE 'TIMESTAMP%%' THEN 93::SMALLINT + WHEN data_type = 'TIME WITH TIME ZONE' THEN 2013::SMALLINT + WHEN data_type LIKE '%%]' THEN 2003::SMALLINT + WHEN type_codes[base_type] IS NOT NULL THEN type_codes[base_type]::SMALLINT + ELSE 1111::SMALLINT -- Types.OTHER: aligned with DuckDB JDBC default for unmapped types + END, + xdbc_datetime_sub: CASE + WHEN base_type = 'DATE' THEN 1::SMALLINT + WHEN data_type LIKE 'TIMESTAMP%%' THEN 3::SMALLINT + WHEN data_type IN ('TIME', 'TIME WITH TIME ZONE', 'TIME_NS') THEN 2::SMALLINT + ELSE NULL::SMALLINT + END, + xdbc_char_octet_length: CASE + WHEN base_type IN ('VARCHAR', 'BLOB') THEN character_maximum_length::INTEGER + ELSE NULL::INTEGER + END, + xdbc_is_nullable: CASE is_nullable + WHEN FALSE THEN 'NO' + WHEN TRUE THEN 'YES' + ELSE '' + END, + xdbc_scope_catalog: NULL::VARCHAR, -- REF types not supported in DuckDB xdbc_scope_schema: NULL::VARCHAR, xdbc_scope_table: NULL::VARCHAR, - xdbc_is_autoincrement: NULL::BOOLEAN, - xdbc_is_generatedcolumn: NULL::BOOLEAN, + xdbc_is_autoincrement: NULL::BOOLEAN, -- not exposed via duckdb_columns() + xdbc_is_generatedcolumn: NULL::BOOLEAN, -- not exposed via duckdb_columns() }) table_columns - FROM information_schema.columns - WHERE column_name LIKE %s + FROM ( + SELECT + database_name AS table_catalog, + schema_name AS table_schema, + table_name, + column_name, + column_index AS ordinal_position, + comment, + column_default, + is_nullable, + numeric_scale, + numeric_precision, + numeric_precision_radix, + character_maximum_length, + data_type, + STRING_SPLIT(data_type, '(')[1] AS base_type, -- normalize typemods for type-code lookup + -- JDBC java.sql.Types-compatible codes, matching DuckDB JDBC where possible. 
+ MAP { + 'BOOLEAN': 16, + 'TINYINT': -6, + 'UTINYINT': 5, + 'SMALLINT': 5, + 'USMALLINT': 4, + 'INTEGER': 4, + 'UINTEGER': -5, + 'BIGINT': -5, + 'FLOAT': 6, + 'DOUBLE': 8, + 'DATE': 91, + 'TIME': 92, + 'TIME_NS': 92, + 'VARCHAR': 12, + 'BLOB': 2004, + 'DECIMAL': 3, + 'BIT': -7, + 'STRUCT': 2002, + } AS type_codes + FROM duckdb_columns() + WHERE column_name LIKE %s + ) cols GROUP BY table_catalog, table_schema, table_name ), constraints AS ( @@ -2444,6 +2628,26 @@ AdbcStatusCode ConnectionGetTableTypes(struct AdbcConnection *connection, struct return QueryInternal(connection, out, q, error); } +int ErrorGetDetailCount(const struct AdbcError *error) { + if (!error || error->vendor_code != ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA || !error->private_data) { + return 0; + } + const auto *details = static_cast(error->private_data); + return static_cast(details->entries.size()); +} + +struct AdbcErrorDetail ErrorGetDetail(const struct AdbcError *error, int index) { + if (!error || error->vendor_code != ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA || !error->private_data) { + return {nullptr, nullptr, 0}; + } + const auto *details = static_cast(error->private_data); + if (index < 0 || static_cast(index) >= details->entries.size()) { + return {nullptr, nullptr, 0}; + } + const auto &entry = details->entries[static_cast(index)]; + return {entry.first.c_str(), reinterpret_cast(entry.second.c_str()), entry.second.size()}; +} + } // namespace duckdb_adbc void duckdb::DuckDBAdbcConnectionWrapper::RegisterStream(duckdb_adbc::DuckDBAdbcStreamWrapper *stream) { @@ -2466,7 +2670,9 @@ void duckdb::DuckDBAdbcConnectionWrapper::MaterializeStreams() { continue; } - // Collect remaining batches from the streaming result + // Collect remaining batches from the streaming result. Errors encountered mid-stream + // are stored on result_wrapper so that get_next can return buffered batches first + // and then surface the error once they are exhausted. 
duckdb::vector batches; auto arrow_options = duckdb_result_get_arrow_options(&result_wrapper->result); while (true) { @@ -2475,12 +2681,62 @@ void duckdb::DuckDBAdbcConnectionWrapper::MaterializeStreams() { auto duckdb_chunk = duckdb_fetch_chunk(result_wrapper->result); if (!duckdb_chunk) { + // End of stream or error; distinguish by checking the result error message. + auto err = duckdb_result_error(&result_wrapper->result); + if (err && err[0] != '\0') { + if (result_wrapper->last_error) { + free(result_wrapper->last_error); + } + result_wrapper->last_error = strdup(err); + result_wrapper->status_code = + duckdb_adbc::IsInterruptError(err) ? ADBC_STATUS_CANCELLED : ADBC_STATUS_INTERNAL; + result_wrapper->adbc_error.message = result_wrapper->last_error; + result_wrapper->adbc_error.vendor_code = ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA; + if (result_wrapper->adbc_error.private_data) { + delete static_cast(result_wrapper->adbc_error.private_data); + result_wrapper->adbc_error.private_data = nullptr; + } + auto *details = new (std::nothrow) DuckDBErrorDetails(); + if (details) { + details->entries.emplace_back( + "duckdb:error_type", + DuckDBErrorTypeToString(duckdb_result_error_type(&result_wrapper->result))); + result_wrapper->adbc_error.private_data = details; + result_wrapper->adbc_error.release = ::ReleaseStreamErrorDetails; + } else { + result_wrapper->adbc_error.release = nullptr; + } + } break; } auto conversion_err = duckdb_data_chunk_to_arrow(arrow_options, duckdb_chunk, &array); duckdb_destroy_data_chunk(&duckdb_chunk); if (conversion_err) { + // Store error before freeing so get_next can surface it after buffered batches + auto conv_err_msg = duckdb_error_data_message(conversion_err); + if (conv_err_msg && conv_err_msg[0] != '\0') { + if (result_wrapper->last_error) { + free(result_wrapper->last_error); + } + result_wrapper->last_error = strdup(conv_err_msg); + result_wrapper->status_code = ADBC_STATUS_INTERNAL; + result_wrapper->adbc_error.message = 
result_wrapper->last_error; + result_wrapper->adbc_error.vendor_code = ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA; + if (result_wrapper->adbc_error.private_data) { + delete static_cast(result_wrapper->adbc_error.private_data); + result_wrapper->adbc_error.private_data = nullptr; + } + auto *details = new (std::nothrow) DuckDBErrorDetails(); + if (details) { + details->entries.emplace_back( + "duckdb:error_type", DuckDBErrorTypeToString(duckdb_error_data_error_type(conversion_err))); + result_wrapper->adbc_error.private_data = details; + result_wrapper->adbc_error.release = ::ReleaseStreamErrorDetails; + } else { + result_wrapper->adbc_error.release = nullptr; + } + } duckdb_destroy_error_data(&conversion_err); if (array.release) { array.release(&array); @@ -2572,3 +2828,134 @@ void SetError(struct AdbcError *error, const std::string &message) { } error->release = ReleaseError; } + +static void ReleaseStreamErrorDetails(struct AdbcError *error) { + if (!error) { + return; + } + // message is owned by the stream wrapper (last_error), not freed here + delete static_cast(error->private_data); + error->private_data = nullptr; + error->release = nullptr; +} + +static void ReleaseErrorWithDuckDBDetails(struct AdbcError *error) { + if (!error) { + return; + } + delete[] error->message; + error->message = nullptr; + delete static_cast(error->private_data); + error->private_data = nullptr; + error->release = nullptr; +} + +static const char *DuckDBErrorTypeToString(duckdb_error_type type) { + switch (type) { + case DUCKDB_ERROR_INVALID: + return "Invalid"; + case DUCKDB_ERROR_OUT_OF_RANGE: + return "OutOfRange"; + case DUCKDB_ERROR_CONVERSION: + return "Conversion"; + case DUCKDB_ERROR_UNKNOWN_TYPE: + return "UnknownType"; + case DUCKDB_ERROR_DECIMAL: + return "Decimal"; + case DUCKDB_ERROR_MISMATCH_TYPE: + return "MismatchType"; + case DUCKDB_ERROR_DIVIDE_BY_ZERO: + return "DivideByZero"; + case DUCKDB_ERROR_OBJECT_SIZE: + return "ObjectSize"; + case DUCKDB_ERROR_INVALID_TYPE: + 
return "InvalidType"; + case DUCKDB_ERROR_SERIALIZATION: + return "Serialization"; + case DUCKDB_ERROR_TRANSACTION: + return "Transaction"; + case DUCKDB_ERROR_NOT_IMPLEMENTED: + return "NotImplemented"; + case DUCKDB_ERROR_EXPRESSION: + return "Expression"; + case DUCKDB_ERROR_CATALOG: + return "Catalog"; + case DUCKDB_ERROR_PARSER: + return "Parser"; + case DUCKDB_ERROR_PLANNER: + return "Planner"; + case DUCKDB_ERROR_SCHEDULER: + return "Scheduler"; + case DUCKDB_ERROR_EXECUTOR: + return "Executor"; + case DUCKDB_ERROR_CONSTRAINT: + return "Constraint"; + case DUCKDB_ERROR_INDEX: + return "Index"; + case DUCKDB_ERROR_STAT: + return "Stat"; + case DUCKDB_ERROR_CONNECTION: + return "Connection"; + case DUCKDB_ERROR_SYNTAX: + return "Syntax"; + case DUCKDB_ERROR_SETTINGS: + return "Settings"; + case DUCKDB_ERROR_BINDER: + return "Binder"; + case DUCKDB_ERROR_NETWORK: + return "Network"; + case DUCKDB_ERROR_OPTIMIZER: + return "Optimizer"; + case DUCKDB_ERROR_NULL_POINTER: + return "NullPointer"; + case DUCKDB_ERROR_IO: + return "IO"; + case DUCKDB_ERROR_INTERRUPT: + return "Interrupt"; + case DUCKDB_ERROR_FATAL: + return "Fatal"; + case DUCKDB_ERROR_INTERNAL: + return "Internal"; + case DUCKDB_ERROR_INVALID_INPUT: + return "InvalidInput"; + case DUCKDB_ERROR_OUT_OF_MEMORY: + return "OutOfMemory"; + case DUCKDB_ERROR_PERMISSION: + return "Permission"; + case DUCKDB_ERROR_PARAMETER_NOT_RESOLVED: + return "ParameterNotResolved"; + case DUCKDB_ERROR_PARAMETER_NOT_ALLOWED: + return "ParameterNotAllowed"; + case DUCKDB_ERROR_DEPENDENCY: + return "Dependency"; + case DUCKDB_ERROR_HTTP: + return "HTTP"; + case DUCKDB_ERROR_MISSING_EXTENSION: + return "MissingExtension"; + case DUCKDB_ERROR_AUTOLOAD: + return "Autoload"; + case DUCKDB_ERROR_SEQUENCE: + return "Sequence"; + case DUCKDB_INVALID_CONFIGURATION: + return "InvalidConfiguration"; + default: + return "Unknown"; + } +} + +static void AppendDuckDBErrorDetails(struct AdbcError *error, duckdb_error_data res) { + 
AppendDuckDBErrorDetails(error, duckdb_error_data_error_type(res)); +} + +static void AppendDuckDBErrorDetails(struct AdbcError *error, duckdb_error_type type) { + if (!error || error->vendor_code != ADBC_ERROR_VENDOR_CODE_PRIVATE_DATA) { + return; + } + auto *details = new (std::nothrow) DuckDBErrorDetails(); + if (!details) { + return; + } + details->entries.emplace_back("duckdb:error_type", DuckDBErrorTypeToString(type)); + error->private_data = details; + error->release = ::ReleaseErrorWithDuckDBDetails; +} diff --git a/src/duckdb/src/common/arrow/appender/append_data.cpp b/src/duckdb/src/common/arrow/appender/append_data.cpp index 06ccbc1ad..a42d7c804 100644 --- a/src/duckdb/src/common/arrow/appender/append_data.cpp +++ b/src/duckdb/src/common/arrow/appender/append_data.cpp @@ -26,4 +26,18 @@ void ArrowAppendData::AppendValidity(UnifiedVectorFormat &format, idx_t from, id } } +void ArrowAppendData::AppendChild(Vector &input, idx_t from, idx_t to, idx_t input_size) { + if (extension_data && extension_data->duckdb_to_arrow) { + // Convert the DuckDB-typed input into the extension's internal Arrow type before + // handing it to the (internal-typed) child appender. Size the internal vector to the + // actual input_size: container children can exceed STANDARD_VECTOR_SIZE (e.g. a 2048-row + // LIST whose elements average two entries), and duckdb_to_arrow writes input_size values. 
+ Vector internal(extension_data->GetInternalType(), MaxValue(input_size, STANDARD_VECTOR_SIZE)); + extension_data->duckdb_to_arrow(*options.client_context, input, internal, input_size); + append_vector(*this, internal, from, to, input_size); + } else { + append_vector(*this, input, from, to, input_size); + } +} + } // namespace duckdb diff --git a/src/duckdb/src/common/arrow/appender/fixed_size_list_data.cpp b/src/duckdb/src/common/arrow/appender/fixed_size_list_data.cpp index a8cbc16d9..38856913c 100644 --- a/src/duckdb/src/common/arrow/appender/fixed_size_list_data.cpp +++ b/src/duckdb/src/common/arrow/appender/fixed_size_list_data.cpp @@ -23,7 +23,7 @@ void ArrowFixedSizeListData::Append(ArrowAppendData &append_data, Vector &input, auto array_size = ArrayType::GetSize(input.GetType()); auto &child_vector = ArrayVector::GetEntry(input); auto &child_data = *append_data.child_data[0]; - child_data.append_vector(child_data, child_vector, from * array_size, to * array_size, size * array_size); + child_data.AppendChild(child_vector, from * array_size, to * array_size, size * array_size); append_data.row_count += size; } diff --git a/src/duckdb/src/common/arrow/appender/struct_data.cpp b/src/duckdb/src/common/arrow/appender/struct_data.cpp index 28cee72a9..ade5179f1 100644 --- a/src/duckdb/src/common/arrow/appender/struct_data.cpp +++ b/src/duckdb/src/common/arrow/appender/struct_data.cpp @@ -24,7 +24,7 @@ void ArrowStructData::Append(ArrowAppendData &append_data, Vector &input, idx_t for (idx_t child_idx = 0; child_idx < children.size(); child_idx++) { auto &child = children[child_idx]; auto &child_data = *append_data.child_data[child_idx]; - child_data.append_vector(child_data, *child, from, to, size); + child_data.AppendChild(*child, from, to, input_size); } append_data.row_count += size; } diff --git a/src/duckdb/src/common/arrow/appender/union_data.cpp b/src/duckdb/src/common/arrow/appender/union_data.cpp index 29faa5222..e47db5ccb 100644 --- 
a/src/duckdb/src/common/arrow/appender/union_data.cpp +++ b/src/duckdb/src/common/arrow/appender/union_data.cpp @@ -49,7 +49,7 @@ void ArrowUnionData::Append(ArrowAppendData &append_data, Vector &input, idx_t f for (idx_t child_idx = 0; child_idx < child_vectors.size(); child_idx++) { auto &child_buffer = append_data.child_data[child_idx]; auto &child = child_vectors[child_idx]; - child_buffer->append_vector(*child_buffer, child, 0, size, size); + child_buffer->AppendChild(child, 0, size, size); } append_data.row_count += size; } diff --git a/src/duckdb/src/common/arrow/arrow_appender.cpp b/src/duckdb/src/common/arrow/arrow_appender.cpp index 72556d403..152bbff76 100644 --- a/src/duckdb/src/common/arrow/arrow_appender.cpp +++ b/src/duckdb/src/common/arrow/arrow_appender.cpp @@ -8,6 +8,7 @@ #include "duckdb/common/arrow/appender/append_data.hpp" #include "duckdb/common/arrow/appender/list.hpp" #include "duckdb/function/table/arrow/arrow_duck_schema.hpp" +#include "duckdb/main/config.hpp" namespace duckdb { @@ -19,14 +20,13 @@ ArrowAppender::ArrowAppender(vector types_p, const idx_t initial_ca unordered_map> extension_type_cast) : types(std::move(types_p)), options(options) { for (idx_t i = 0; i < types.size(); i++) { - unique_ptr entry; - bool bitshift_boolean = types[i].id() == LogicalTypeId::BOOLEAN && !options.arrow_lossless_conversion; - if (extension_type_cast.find(i) != extension_type_cast.end() && !bitshift_boolean) { - entry = InitializeChild(types[i], initial_capacity, options, extension_type_cast[i]); - } else { - entry = InitializeChild(types[i], initial_capacity, options); - } - root_data.push_back(std::move(entry)); + // Pass any explicit per-column extension override through to InitializeChild; when none + // is supplied it auto-resolves the extension (and applies the bitshift_boolean gate) so + // children of nested types pick up the same extension SetArrowFormat uses for the schema. 
+ auto extension_it = extension_type_cast.find(i); + shared_ptr extension = + extension_it != extension_type_cast.end() ? extension_it->second : nullptr; + root_data.push_back(InitializeChild(types[i], initial_capacity, options, extension)); } } @@ -38,14 +38,7 @@ void ArrowAppender::Append(DataChunk &input, const idx_t from, const idx_t to, c D_ASSERT(types == input.GetTypes()); D_ASSERT(to >= from); for (idx_t i = 0; i < input.ColumnCount(); i++) { - if (root_data[i]->extension_data && root_data[i]->extension_data->duckdb_to_arrow) { - Vector input_data(root_data[i]->extension_data->GetInternalType()); - root_data[i]->extension_data->duckdb_to_arrow(*options.client_context, input.data[i], input_data, - input_size); - root_data[i]->append_vector(*root_data[i], input_data, from, to, input_size); - } else { - root_data[i]->append_vector(*root_data[i], input.data[i], from, to, input_size); - } + root_data[i]->AppendChild(input.data[i], from, to, input_size); } row_count += to - from; } @@ -315,12 +308,28 @@ unique_ptr ArrowAppender::InitializeChild(const LogicalType &ty ClientProperties &options, const shared_ptr &extension_type) { auto result = make_uniq(options); + + // Resolve the effective extension. An explicit override (from the top-level appender) wins. + // Otherwise auto-resolve from DBConfig so nested children use the same extension SetArrowFormat + // declares in the schema. BOOLEAN stays plain bit-packed when arrow_lossless_conversion is off + // (the bitshift_boolean gate), applied here so it holds at every nesting level. 
+ shared_ptr effective_extension = extension_type; + const bool bitshift_boolean = type.id() == LogicalTypeId::BOOLEAN && !options.arrow_lossless_conversion; + if (bitshift_boolean) { + effective_extension = nullptr; + } else if (!effective_extension && options.client_context) { + const auto &db_config = DBConfig::GetConfig(*options.client_context); + if (db_config.HasArrowExtension(type)) { + effective_extension = db_config.GetArrowExtension(type).GetTypeExtension(); + } + } + LogicalType array_type = type; - if (extension_type) { - array_type = extension_type->GetInternalType(); + if (effective_extension) { + array_type = effective_extension->GetInternalType(); } InitializeFunctionPointers(*result, array_type); - result->extension_data = extension_type; + result->extension_data = effective_extension; const auto byte_count = (capacity + 7) / 8; result->GetValidityBuffer().reserve(byte_count); diff --git a/src/duckdb/src/common/arrow/arrow_type_extension.cpp b/src/duckdb/src/common/arrow/arrow_type_extension.cpp index 63fb46737..d18654da1 100644 --- a/src/duckdb/src/common/arrow/arrow_type_extension.cpp +++ b/src/duckdb/src/common/arrow/arrow_type_extension.cpp @@ -351,6 +351,9 @@ struct ArrowBignum { struct ArrowBool8 { static void ArrowToDuck(ClientContext &context, Vector &source, Vector &result, idx_t count) { + // The caller (ColumnArrowToDuckDB) always builds a flat storage vector for the + // extension's internal type, so reading it flat is safe. + D_ASSERT(source.GetVectorType() == VectorType::FLAT_VECTOR); auto source_ptr = reinterpret_cast(FlatVector::GetData(source)); auto result_ptr = reinterpret_cast(FlatVector::GetData(result)); for (idx_t i = 0; i < count; i++) { @@ -358,14 +361,20 @@ struct ArrowBool8 { } } static void DuckToArrow(ClientContext &context, Vector &source, Vector &result, idx_t count) { + // The source may be dictionary/constant/sliced (container appenders hand us such + // children), so resolve every row through the selection vector. 
The result is flat, + // so its validity is keyed by the logical row index. UnifiedVectorFormat format; source.ToUnifiedFormat(count, format); - FlatVector::SetValidity(result, format.validity); - auto source_ptr = reinterpret_cast(format.data); - auto result_ptr = reinterpret_cast(FlatVector::GetData(result)); + auto source_ptr = UnifiedVectorFormat::GetData(format); + auto result_ptr = FlatVector::GetData(result); + auto &result_validity = FlatVector::Validity(result); for (idx_t i = 0; i < count; i++) { - if (format.validity.RowIsValid(i)) { - result_ptr[i] = static_cast(source_ptr[i]); + auto source_idx = format.sel->get_index(i); + if (format.validity.RowIsValid(source_idx)) { + result_ptr[i] = static_cast(source_ptr[source_idx]); + } else { + result_validity.SetInvalid(i); } } } diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index d91879e1c..2f1e9b8c5 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -197,20 +197,19 @@ namespace duckdb { const StringUtil::EnumStringLiteral *GetARTConflictTypeValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(ARTConflictType::NO_CONFLICT), "NO_CONFLICT" }, - { static_cast(ARTConflictType::CONSTRAINT), "CONSTRAINT" }, - { static_cast(ARTConflictType::TRANSACTION), "TRANSACTION" } + { static_cast(ARTConflictType::CONSTRAINT), "CONSTRAINT" } }; return values; } template<> const char* EnumUtil::ToChars(ARTConflictType value) { - return StringUtil::EnumToString(GetARTConflictTypeValues(), 3, "ARTConflictType", static_cast(value)); + return StringUtil::EnumToString(GetARTConflictTypeValues(), 2, "ARTConflictType", static_cast(value)); } template<> ARTConflictType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetARTConflictTypeValues(), 3, "ARTConflictType", value)); + return static_cast(StringUtil::StringToEnum(GetARTConflictTypeValues(), 2, "ARTConflictType", value)); } const 
StringUtil::EnumStringLiteral *GetARTHandlingResultValues() { diff --git a/src/duckdb/src/common/gzip_file_system.cpp b/src/duckdb/src/common/gzip_file_system.cpp index f0bc5c7ff..21420d8d9 100644 --- a/src/duckdb/src/common/gzip_file_system.cpp +++ b/src/duckdb/src/common/gzip_file_system.cpp @@ -238,8 +238,11 @@ void MiniZStreamWrapper::Write(CompressedFile &file, StreamData &sd, data_ptr_t while (remaining > 0) { auto output_remaining = UnsafeNumericCast((sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start); + // miniz's avail_in is a platform-dependent unsigned int, cap ingestion bytes to avoid overflow. + auto avail_in = MinValue(remaining, NumericLimits::Maximum()); + mz_stream_ptr->next_in = reinterpret_cast(uncompressed_data); - mz_stream_ptr->avail_in = NumericCast(remaining); + mz_stream_ptr->avail_in = NumericCast(avail_in); mz_stream_ptr->next_out = sd.out_buff_start; mz_stream_ptr->avail_out = NumericCast(output_remaining); @@ -255,9 +258,9 @@ void MiniZStreamWrapper::Write(CompressedFile &file, StreamData &sd, data_ptr_t UnsafeNumericCast(sd.out_buff_start - sd.out_buff.get())); sd.out_buff_start = sd.out_buff.get(); } - auto written = UnsafeNumericCast(remaining - mz_stream_ptr->avail_in); + auto written = NumericCast(avail_in - mz_stream_ptr->avail_in); uncompressed_data += written; - remaining = mz_stream_ptr->avail_in; + remaining -= NumericCast(written); } } diff --git a/src/duckdb/src/common/local_file_system.cpp b/src/duckdb/src/common/local_file_system.cpp index 29f04735a..ba49e4249 100644 --- a/src/duckdb/src/common/local_file_system.cpp +++ b/src/duckdb/src/common/local_file_system.cpp @@ -880,7 +880,7 @@ static timestamp_t FiletimeToTimeStamp(FILETIME file_time) { // Adapted from: https://stackoverflow.com/questions/6161776/convert-windows-filetime-to-second-in-unix-linux const auto WINDOWS_TICK = 10000000; const auto SEC_TO_UNIX_EPOCH = 11644473600LL; - return Timestamp::FromTimeT(fileTime64 / WINDOWS_TICK - SEC_TO_UNIX_EPOCH); + 
return Timestamp::FromEpochSeconds(fileTime64 / WINDOWS_TICK - SEC_TO_UNIX_EPOCH); } static FileMetadata StatsInternal(HANDLE hFile, const string &path) { @@ -890,13 +890,13 @@ static FileMetadata StatsInternal(HANDLE hFile, const string &path) { if (handle_type == FILE_TYPE_CHAR) { file_metadata.file_type = FileType::FILE_TYPE_CHARDEV; file_metadata.file_size = 0; - file_metadata.last_modification_time = Timestamp::FromTimeT(0); + file_metadata.last_modification_time = Timestamp::FromEpochSeconds(0); return file_metadata; } if (handle_type == FILE_TYPE_PIPE) { file_metadata.file_type = FileType::FILE_TYPE_FIFO; file_metadata.file_size = 0; - file_metadata.last_modification_time = Timestamp::FromTimeT(0); + file_metadata.last_modification_time = Timestamp::FromEpochSeconds(0); return file_metadata; } diff --git a/src/duckdb/src/common/types/selection_vector.cpp b/src/duckdb/src/common/types/selection_vector.cpp index a1232340c..ca8b41263 100644 --- a/src/duckdb/src/common/types/selection_vector.cpp +++ b/src/duckdb/src/common/types/selection_vector.cpp @@ -16,6 +16,8 @@ SelectionData::SelectionData(idx_t count) { #endif } +SelectionData::~SelectionData() = default; + // LCOV_EXCL_START string SelectionVector::ToString(idx_t count) const { string result = "Selection Vector (" + to_string(count) + ") ["; diff --git a/src/duckdb/src/common/types/variant/variant_value.cpp b/src/duckdb/src/common/types/variant/variant_value.cpp index 47e1d2eec..d8ba50098 100644 --- a/src/duckdb/src/common/types/variant/variant_value.cpp +++ b/src/duckdb/src/common/types/variant/variant_value.cpp @@ -7,6 +7,7 @@ #include "duckdb/common/types/datetime.hpp" #include "duckdb/common/types/timestamp.hpp" #include "duckdb/common/types/date.hpp" +#include "duckdb/common/types/blob.hpp" #include "duckdb/common/types/interval.hpp" #include "duckdb/common/types/decimal.hpp" #include "duckdb/common/types/variant.hpp" @@ -785,6 +786,12 @@ yyjson_mut_val *VariantValue::ToJSON(ClientContext &context, 
yyjson_mut_doc *doc return yyjson_mut_real(doc, primitive_value.GetValue()); case LogicalTypeId::DOUBLE: return yyjson_mut_real(doc, primitive_value.GetValue()); + case LogicalTypeId::BLOB: { + //! Follow the JSON serialization guide by converting BINARY to Base64: + //! For example: `"dmFyaWFudAo="` + auto value_str = Blob::ToBase64(primitive_value.GetValueUnsafe()); + return yyjson_mut_strncpy(doc, value_str.c_str(), value_str.size()); + } case LogicalTypeId::DATE: case LogicalTypeId::TIME: case LogicalTypeId::VARCHAR: { diff --git a/src/duckdb/src/execution/index/art/art.cpp b/src/duckdb/src/execution/index/art/art.cpp index 130df0af7..86fefd83c 100644 --- a/src/duckdb/src/execution/index/art/art.cpp +++ b/src/duckdb/src/execution/index/art/art.cpp @@ -556,12 +556,6 @@ ErrorData ART::InsertKeys(ArenaAllocator &arena, unsafe_vector &keys, un VerifyAllocationsInternal(); } - if (conflict_type == ARTConflictType::TRANSACTION) { - // chunk is only null when called from MergeCheckpointDeltas. - auto msg = chunk ? AppendRowError(*chunk, conflict_idx.GetIndex()) : string("???"); - return ErrorData(TransactionException("write-write conflict on key: \"%s\"", msg)); - } - if (conflict_type == ARTConflictType::CONSTRAINT) { // chunk is only null when called from MergeCheckpointDeltas. auto msg = chunk ? 
AppendRowError(*chunk, conflict_idx.GetIndex()) : string("???"); diff --git a/src/duckdb/src/execution/index/art/art_index.cpp b/src/duckdb/src/execution/index/art/art_index.cpp index c4ba2c504..e9d3660d4 100644 --- a/src/duckdb/src/execution/index/art/art_index.cpp +++ b/src/duckdb/src/execution/index/art/art_index.cpp @@ -103,7 +103,6 @@ void ARTBuildSinkUnsorted(IndexBuildSinkInput &input, DataChunk &key_chunk, Data auto conflict_type = ARTOperator::Insert(l_state.arena_allocator, art, art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, DeleteIndexInfo(), IndexAppendMode::DEFAULT); - D_ASSERT(conflict_type != ARTConflictType::TRANSACTION); if (conflict_type == ARTConflictType::CONSTRAINT) { throw ConstraintException("Data contains duplicates on indexed column(s)"); } diff --git a/src/duckdb/src/execution/operator/persistent/csv_rejects_table.cpp b/src/duckdb/src/execution/operator/persistent/csv_rejects_table.cpp index 33435736b..b1d436eec 100644 --- a/src/duckdb/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/duckdb/src/execution/operator/persistent/csv_rejects_table.cpp @@ -34,8 +34,8 @@ shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, throw BinderException("The names of the rejects scan and rejects error tables can't be the same. 
Use different " "names for these tables."); } - auto key = - "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(rejects_scan) + "_" + StringUtil::Upper(rejects_error); + auto key = StringUtil::Format("CSV_REJECTS_TABLE_CACHE_ENTRY_%s_%s", StringUtil::Upper(rejects_scan), + StringUtil::Upper(rejects_error)); auto &cache = ObjectCache::GetObjectCache(context); auto &catalog = Catalog::GetCatalog(context, TEMP_CATALOG); auto rejects_scan_exist = catalog.GetEntry(context, DEFAULT_SCHEMA, rejects_scan, diff --git a/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp b/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp index 50db9a3f5..a665516c7 100644 --- a/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +++ b/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp @@ -427,6 +427,10 @@ void PhysicalTableScan::GetMetrics(ClientContext &context, GlobalSourceState &gs } auto &gstate = gstate_p.Cast(); auto &state = lstate.Cast(); + if (!state.local_state) { + // FIXME: We should be able to retrieve metrics from table functions with only a global state. 
+ return; + } if (function.get_metrics) { function.get_metrics(context, bind_data.get(), *gstate.global_state, *state.local_state, requested_metrics, metrics); diff --git a/src/duckdb/src/function/cast/variant/from_variant.cpp b/src/duckdb/src/function/cast/variant/from_variant.cpp index 7b1a547f1..17d95e766 100644 --- a/src/duckdb/src/function/cast/variant/from_variant.cpp +++ b/src/duckdb/src/function/cast/variant/from_variant.cpp @@ -335,7 +335,9 @@ static bool ConvertVariantToArray(FromVariantConversionData &conversion_data, Ve } FindValues(conversion_data.variant, row_index, new_sel, child_data_entry); - CastVariant(conversion_data, child, new_sel, total_offset, array_size, row_index); + if (!CastVariant(conversion_data, child, new_sel, total_offset, array_size, row_index)) { + return false; + } total_offset += array_size; } return true; diff --git a/src/duckdb/src/function/scalar/list/list_select.cpp b/src/duckdb/src/function/scalar/list/list_select.cpp index ebaef993c..7ddc1402c 100644 --- a/src/duckdb/src/function/scalar/list/list_select.cpp +++ b/src/duckdb/src/function/scalar/list/list_select.cpp @@ -44,13 +44,15 @@ struct SetSelectionVectorWhere { return; } - selection_vector.set_index(target_offset, input_offset + child_idx); - if (!input_validity.RowIsValid(input_offset + child_idx)) { + if (child_idx >= target_length) { + selection_vector.set_index(target_offset, 0); validity_mask.SetInvalid(target_offset); + target_offset++; + return; } - if (child_idx >= target_length) { - selection_vector.set_index(target_offset, 0); + selection_vector.set_index(target_offset, input_offset + child_idx); + if (!input_validity.RowIsValid(input_offset + child_idx)) { validity_mask.SetInvalid(target_offset); } diff --git a/src/duckdb/src/function/table/system/logging_utils.cpp b/src/duckdb/src/function/table/system/logging_utils.cpp index 02e975af1..84daf92f2 100644 --- a/src/duckdb/src/function/table/system/logging_utils.cpp +++ 
b/src/duckdb/src/function/table/system/logging_utils.cpp @@ -91,6 +91,24 @@ static unique_ptr BindEnableLogging(ClientContext &context, TableF } } + // File logging requires a path. Reject switching to file storage without one before mutating any + // state, so the active storage is preserved instead of becoming a path-less storage that throws + // on every later flush (end-of-query and shutdown included). + if (StringUtil::Lower(result->config.storage) == LogConfig::FILE_STORAGE_NAME) { + auto current_storage = StringUtil::Lower(context.db->GetLogManager().GetConfig().storage); + // Already-active file storage keeps its existing path; only guard a fresh switch. + if (current_storage != LogConfig::FILE_STORAGE_NAME) { + auto path_entry = result->storage_config.find("path"); + bool has_usable_path = path_entry != result->storage_config.end() && !path_entry->second.IsNull() && + !path_entry->second.ToString().empty(); + if (!has_usable_path) { + throw InvalidInputException( + "Cannot enable 'file' log storage without a valid path. Provide one via storage_path, " + "e.g. 
CALL enable_logging(storage='file', storage_path='mylog.csv');"); + } + } + } + // Process positional params if (!input.inputs.empty()) { if (input.inputs[0].type() == LogicalType::VARCHAR) { diff --git a/src/duckdb/src/function/table/table_scan.cpp b/src/duckdb/src/function/table/table_scan.cpp index 60cb1f95f..8fdf83f98 100644 --- a/src/duckdb/src/function/table/table_scan.cpp +++ b/src/duckdb/src/function/table/table_scan.cpp @@ -293,8 +293,10 @@ class DuckTableScanState : public TableScanGlobalState { } if (bind_data.order_options) { - l_state->scan_state.table_state.reorderer = make_uniq(*bind_data.order_options); - l_state->scan_state.local_state.reorderer = make_uniq(*bind_data.order_options); + l_state->scan_state.table_state.reorderer = + make_uniq(*bind_data.order_options, TransactionData(tx)); + l_state->scan_state.local_state.reorderer = + make_uniq(*bind_data.order_options, TransactionData(tx)); } l_state->scan_state.Initialize(std::move(storage_ids), context.client, input.filters, input.sample_options); @@ -405,8 +407,9 @@ unique_ptr DuckTableScanInitGlobal(ClientContext &cont DataTable &storage, const TableScanBindData &bind_data) { auto g_state = make_uniq(context, input.bind_data.get()); if (bind_data.order_options) { - g_state->state.scan_state.reorderer = make_uniq(*bind_data.order_options); - g_state->state.local_state.reorderer = make_uniq(*bind_data.order_options); + auto transaction = TransactionData(DuckTransaction::Get(context, storage.GetAttached())); + g_state->state.scan_state.reorderer = make_uniq(*bind_data.order_options, transaction); + g_state->state.local_state.reorderer = make_uniq(*bind_data.order_options, transaction); } storage.InitializeParallelScan(context, g_state->state, input.column_indexes); @@ -723,8 +726,8 @@ unique_ptr TableScanInitGlobal(ClientContext &context, // row groups while we hold row IDs from the ART, ensuring we always see a consistent // pairing. 
unique_ptr vacuum_lock; - auto &db = DatabaseInstance::GetDatabase(context); - if (Settings::Get(db) > 0) { + const auto &attached = storage.GetAttached(); + if (attached.GetVacuumRebuildIndexThreshold() > 0) { auto &transaction_manager = DuckTransactionManager::Get(storage.GetAttached()); vacuum_lock = transaction_manager.SharedVacuumLock(); } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index f868efcd0..da8449c69 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev41" +#define DUCKDB_PATCH_VERSION "4-dev320" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 5 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.5.4-dev41" +#define DUCKDB_VERSION "v1.5.4-dev320" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "0bf2a03cb1" +#define DUCKDB_SOURCE_ID "f086d79427" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb.h b/src/duckdb/src/include/duckdb.h index a96cb3ee2..750934870 100644 --- a/src/duckdb/src/include/duckdb.h +++ b/src/duckdb/src/include/duckdb.h @@ -502,7 +502,7 @@ typedef struct { } duckdb_bit; //! BIGNUMs are composed of a byte pointer, a size, and an `is_negative` bool. -//! The absolute value of the number is stored in `data` in little endian format. +//! The absolute value of the number is stored in `data` in big endian format. //! You must free `data` with `duckdb_free`. typedef struct { uint8_t *data; @@ -2516,8 +2516,11 @@ DUCKDB_C_API duckdb_value duckdb_create_bignum(duckdb_bignum input); /*! Creates a DECIMAL value from a duckdb_decimal +The width must be between 1 and 38, and the scale must not exceed the width. 
+ * @param input The duckdb_decimal value -* @return The value. This must be destroyed with `duckdb_destroy_value`. +* @return The value, or `nullptr` if the width or scale are out of range. This must be destroyed with +`duckdb_destroy_value`. */ DUCKDB_C_API duckdb_value duckdb_create_decimal(duckdb_decimal input); @@ -3138,9 +3141,9 @@ DUCKDB_C_API duckdb_logical_type duckdb_create_enum_type(const char **member_nam Creates a DECIMAL type with the specified width and scale. The resulting type should be destroyed with `duckdb_destroy_logical_type`. -* @param width The width of the decimal type -* @param scale The scale of the decimal type -* @return The logical type. +* @param width The width of the decimal type. Must be between 1 and 38. +* @param scale The scale of the decimal type. Must not exceed the width. +* @return The logical type, or `nullptr` if the width or scale are out of range. */ DUCKDB_C_API duckdb_logical_type duckdb_create_decimal_type(uint8_t width, uint8_t scale); diff --git a/src/duckdb/src/include/duckdb/common/adbc/adbc.hpp b/src/duckdb/src/include/duckdb/common/adbc/adbc.hpp index 235a02e2a..a1466dd63 100644 --- a/src/duckdb/src/include/duckdb/common/adbc/adbc.hpp +++ b/src/duckdb/src/include/duckdb/common/adbc/adbc.hpp @@ -19,7 +19,7 @@ namespace duckdb_adbc { class AppenderWrapper { public: AppenderWrapper(duckdb_connection conn, const char *catalog, const char *schema, const char *table) - : appender(nullptr) { + : appender(nullptr), create_error_type(DUCKDB_ERROR_UNKNOWN_TYPE) { // Note: duckdb_appender_create_ext allocates an internal wrapper even on failure. // If creation fails, make sure to destroy it to avoid leaking. 
auto created = duckdb_appender(nullptr); @@ -30,6 +30,7 @@ class AppenderWrapper { if (error_message) { create_error = error_message; } + create_error_type = duckdb_error_data_error_type(error_data); duckdb_destroy_error_data(&error_data); duckdb_appender_destroy(&created); } @@ -52,10 +53,14 @@ class AppenderWrapper { const std::string &CreateError() const { return create_error; } + duckdb_error_type CreateErrorType() const { + return create_error_type; + } private: duckdb_appender appender; std::string create_error; + duckdb_error_type create_error_type; }; class DataChunkWrapper { @@ -191,6 +196,9 @@ AdbcStatusCode StatementSetOptionDouble(struct AdbcStatement *statement, const c const AdbcError *ErrorFromArrayStream(struct ArrowArrayStream *stream, AdbcStatusCode *status); +int ErrorGetDetailCount(const struct AdbcError *error); +struct AdbcErrorDetail ErrorGetDetail(const struct AdbcError *error, int index); + AdbcStatusCode StatementNew(struct AdbcConnection *connection, struct AdbcStatement *statement, struct AdbcError *error); @@ -214,6 +222,9 @@ AdbcStatusCode StatementBind(struct AdbcStatement *statement, struct ArrowArray AdbcStatusCode StatementBindStream(struct AdbcStatement *statement, struct ArrowArrayStream *stream, struct AdbcError *error); +AdbcStatusCode StatementExecuteSchema(struct AdbcStatement *statement, struct ArrowSchema *schema, + struct AdbcError *error); + AdbcStatusCode StatementGetParameterSchema(struct AdbcStatement *statement, struct ArrowSchema *schema, struct AdbcError *error); diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp index 4a1e594d0..cdd5b2d16 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp @@ -93,6 +93,13 @@ struct ArrowAppendData { } void AppendValidity(UnifiedVectorFormat &format, idx_t from, idx_t to); + //! 
Append a (child) vector, routing it through the Arrow extension's duckdb_to_arrow + //! conversion first when one is set. Container appenders must call this instead of + //! append_vector so nested extension types (e.g. arrow.bool8 BOOLEAN) get the same + //! conversion the top-level appender applies, keeping the data layout in sync with + //! the schema declared by SetArrowFormat. + void AppendChild(Vector &input, idx_t from, idx_t to, idx_t input_size); + public: idx_t row_count = 0; idx_t null_count = 0; diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp index 627e5fbbc..2d6f79658 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp @@ -37,7 +37,7 @@ struct ArrowListData { auto child_size = child_indices.size(); Vector child_copy(child.GetType()); child_copy.Slice(child, child_sel, child_size); - append_data.child_data[0]->append_vector(*append_data.child_data[0], child_copy, 0, child_size, child_size); + append_data.child_data[0]->AppendChild(child_copy, 0, child_size, child_size); append_data.row_count += size; } diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/list_view_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/list_view_data.hpp index f326b7648..0cbe995b5 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/list_view_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/list_view_data.hpp @@ -39,7 +39,7 @@ struct ArrowListViewData { auto child_size = child_indices.size(); Vector child_copy(child.GetType()); child_copy.Slice(child, child_sel, child_size); - append_data.child_data[0]->append_vector(*append_data.child_data[0], child_copy, 0, child_size, child_size); + append_data.child_data[0]->AppendChild(child_copy, 0, child_size, child_size); append_data.row_count += size; } diff --git 
a/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp index b73b0016b..e08dca01b 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp @@ -56,8 +56,8 @@ struct ArrowMapData { key_vector_copy.Slice(key_vector, child_sel, list_size); Vector value_vector_copy(value_vector.GetType()); value_vector_copy.Slice(value_vector, child_sel, list_size); - key_data.append_vector(key_data, key_vector_copy, 0, list_size, list_size); - value_data.append_vector(value_data, value_vector_copy, 0, list_size, list_size); + key_data.AppendChild(key_vector_copy, 0, list_size, list_size); + value_data.AppendChild(value_vector_copy, 0, list_size, list_size); append_data.row_count += size; struct_data.row_count += size; diff --git a/src/duckdb/src/include/duckdb/common/http_util.hpp b/src/duckdb/src/include/duckdb/common/http_util.hpp index bf57b8a7b..ecb0fa8d3 100644 --- a/src/duckdb/src/include/duckdb/common/http_util.hpp +++ b/src/duckdb/src/include/duckdb/common/http_util.hpp @@ -134,7 +134,7 @@ struct BaseRequest { BaseRequest(RequestType type, const string &url, const HTTPHeaders &headers, HTTPParams ¶ms); RequestType type; - const string &url; + string url; string path; string proto_host_port; HTTPHeaders headers; diff --git a/src/duckdb/src/include/duckdb/common/types/decimal.hpp b/src/duckdb/src/include/duckdb/common/types/decimal.hpp index e08e544a1..7c4fe7d27 100644 --- a/src/duckdb/src/include/duckdb/common/types/decimal.hpp +++ b/src/duckdb/src/include/duckdb/common/types/decimal.hpp @@ -45,6 +45,10 @@ class Decimal { static constexpr uint8_t MAX_WIDTH_DECIMAL = MAX_WIDTH_INT128; public: + //! Whether width/scale form a valid DECIMAL type: width in [1, MAX_WIDTH_DECIMAL] and scale not exceeding width. 
+ static bool IsValidWidthScale(uint8_t width, uint8_t scale) { + return width >= 1 && width <= MAX_WIDTH_DECIMAL && scale <= width; + } static string ToString(int16_t value, uint8_t width, uint8_t scale); static string ToString(int32_t value, uint8_t width, uint8_t scale); static string ToString(int64_t value, uint8_t width, uint8_t scale); diff --git a/src/duckdb/src/include/duckdb/common/types/geometry.hpp b/src/duckdb/src/include/duckdb/common/types/geometry.hpp index faaa4d614..1796c5f9e 100644 --- a/src/duckdb/src/include/duckdb/common/types/geometry.hpp +++ b/src/duckdb/src/include/duckdb/common/types/geometry.hpp @@ -113,10 +113,27 @@ class GeometryExtent { return GeometryExtent {EMPTY_MIN, EMPTY_MIN, EMPTY_MIN, EMPTY_MIN, EMPTY_MAX, EMPTY_MAX, EMPTY_MAX, EMPTY_MAX}; } - // Does this extent have any X/Y values set? - // In other words, is the range of the x/y axes not empty and not unknown? + // Does this extent have the X axis set? + // In other words, is the range of the x-axis not empty and not unknown? + bool HasX() const { + return std::isfinite(x_min) && std::isfinite(x_max); + } + // Does this extent have the Y axis set? + // In other words, is the range of the y-axis not empty and not unknown? + bool HasY() const { + return std::isfinite(y_min) && std::isfinite(y_max); + } + // Does this extent have both X and Y axes set? + // In other words, are the ranges of both the x and y axes not empty and not unknown? + // Used to gate serialization, where a non-finite axis cannot be represented. bool HasXY() const { - return std::isfinite(x_min) && std::isfinite(y_min) && std::isfinite(x_max) && std::isfinite(y_max); + return HasX() && HasY(); + } + // Can this extent be used for X/Y zonemap pruning? + // A single finite axis is enough: an unknown axis is treated as an infinite range, + // which intersects everything, so pruning simply degrades to the finite axis. 
+ bool CanPruneXY() const { + return HasX() || HasY(); } // Does this extent have any Z values set? // In other words, is the range of the Z-axis not empty and not unknown? diff --git a/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp b/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp index 5575e5a08..c695e04e9 100644 --- a/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp +++ b/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp @@ -19,6 +19,11 @@ class VectorBuffer; struct SelectionData { DUCKDB_API explicit SelectionData(idx_t count); + // Out-of-line destructor: prevents GCC IPA-ICF from folding + // _Sp_counted_ptr_inplace::_M_dispose with the + // corresponding instantiation for TemplatedValidityData, which produces + // a spurious -Warray-bounds with g++ >= 14. + DUCKDB_API ~SelectionData(); AllocatedData owned_data; }; diff --git a/src/duckdb/src/include/duckdb/common/types/string_type.hpp b/src/duckdb/src/include/duckdb/common/types/string_type.hpp index 2d7bbcf56..15f6391c9 100644 --- a/src/duckdb/src/include/duckdb/common/types/string_type.hpp +++ b/src/duckdb/src/include/duckdb/common/types/string_type.hpp @@ -40,6 +40,7 @@ struct string_t { string_t() = default; explicit string_t(uint32_t len) { value.inlined.length = len; + memset(value.inlined.inlined, 0, INLINE_BYTES); } string_t(const char *data, uint32_t len) { value.inlined.length = len; diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp index ca6ead12f..86fab038c 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp @@ -15,7 +15,7 @@ namespace duckdb { enum class VerifyExistenceType : uint8_t { APPEND = 0, APPEND_FK = 1, DELETE_FK = 2 }; -enum class ARTConflictType : uint8_t { NO_CONFLICT = 0, CONSTRAINT = 1, TRANSACTION = 2 }; +enum class ARTConflictType : uint8_t { NO_CONFLICT = 
0, CONSTRAINT = 1 }; enum class ARTHandlingResult : uint8_t { CONTINUE = 0, SKIP = 1, YIELD = 2, NONE = 3 }; class ConflictManager; diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp index d882e3ba7..f6d8673a2 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp @@ -152,13 +152,18 @@ class ARTOperator { status = GateStatus::GATE_SET; continue; } - // Unique indexes can have duplicates, if another transaction DELETE + INSERT - // the same key. In that case, the previous value must be kept alive until all - // other transactions do not depend on it anymore. - - // We restrict this transactionality to two-value leaves, so any subsequent - // incoming transaction must fail here. - return ARTConflictType::TRANSACTION; + // A unique ART may temporarily contain a gated two-row leaf during commit for + // DELETE + INSERT of the same key: commit appends the new row first, then + // commit-delete cleanup removes the old row ID. No other main-ART append should + // enter during that window because commit-time main-index appends are serialized + // by the WAL lock or transaction manager commit lock. + // + // Local append and delete indexes should not contain such gates either. + // Note that VerifyLeaf may still legitimately observe the temporary duplicate + // leaf state. 
+ throw FatalException("Corrupted unique ART index \"%s\": encountered an existing gated leaf in unique " + "index while inserting", + art.name); } const auto type = active_node.GetType(); diff --git a/src/duckdb/src/include/duckdb/function/cast/variant/json_to_variant.hpp b/src/duckdb/src/include/duckdb/function/cast/variant/json_to_variant.hpp index b1f10305f..0231f28fb 100644 --- a/src/duckdb/src/include/duckdb/function/cast/variant/json_to_variant.hpp +++ b/src/duckdb/src/include/duckdb/function/cast/variant/json_to_variant.hpp @@ -12,6 +12,7 @@ #include "duckdb/common/types/decimal.hpp" #include "duckdb/common/types/time.hpp" #include "duckdb/common/types/timestamp.hpp" +#include "duckdb/common/operator/cast_operators.hpp" namespace duckdb { namespace variant { @@ -132,6 +133,14 @@ static bool ConvertJSONObject(yyjson_val *obj, ToVariantGlobalResultData &result return true; } +namespace { + +static inline string_t GetString(yyjson_val *val) { + return string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val)); +} + +} // namespace + template static bool ConvertJSONPrimitive(yyjson_val *val, ToVariantGlobalResultData &result, idx_t result_index, bool is_root) { auto json_tag = unsafe_yyjson_get_tag(val); @@ -143,8 +152,7 @@ static bool ConvertJSONPrimitive(yyjson_val *val, ToVariantGlobalResultData &res switch (json_tag) { case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC: - case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE: - case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE: { + case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE: { WriteVariantMetadata(result, result_index, values_offset_data, blob_offset_data[result_index], nullptr, 0, VariantLogicalType::VARCHAR); uint32_t length = NumericCast(unsafe_yyjson_get_len(val)); @@ -188,11 +196,20 @@ static bool ConvertJSONPrimitive(yyjson_val *val, ToVariantGlobalResultData &res blob_offset_data[result_index] += sizeof(int64_t); break; } + case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE: case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL: { 
WriteVariantMetadata(result, result_index, values_offset_data, blob_offset_data[result_index], nullptr, 0, VariantLogicalType::DOUBLE); + double value; if (WRITE_DATA) { - auto value = unsafe_yyjson_get_real(val); + if (json_tag == (YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE)) { + auto success = TryCast::Operation(GetString(val), value, true); + if (!success) { + return false; + } + } else { + value = unsafe_yyjson_get_real(val); + } memcpy(blob_data + blob_offset_data[result_index], const_data_ptr_cast(&value), sizeof(double)); } blob_offset_data[result_index] += sizeof(double); diff --git a/src/duckdb/src/include/duckdb/function/cast/variant/variant_to_variant.hpp b/src/duckdb/src/include/duckdb/function/cast/variant/variant_to_variant.hpp index 482c3dcd5..c6c02868d 100644 --- a/src/duckdb/src/include/duckdb/function/cast/variant/variant_to_variant.hpp +++ b/src/duckdb/src/include/duckdb/function/cast/variant/variant_to_variant.hpp @@ -257,6 +257,8 @@ bool ConvertVariantToVariant(ToVariantSourceData &source_data, ToVariantGlobalRe auto &result = result_data.variant; for (idx_t source_index = 0; source_index < count; source_index++) { + //! Map the loop index through the incoming selection to the actual source row. + const auto scan_index = source_data.GetMappedIndex(source_index); auto result_index = selvec ? selvec->get_index(source_index) : source_index; auto &keys_list_entry = result.keys_data[result_index]; @@ -269,7 +271,7 @@ bool ConvertVariantToVariant(ToVariantSourceData &source_data, ToVariantGlobalRe uint32_t keys_count = 0; uint32_t blob_size = 0; - if (!source.RowIsValid(source_index)) { + if (!source.RowIsValid(scan_index)) { if (!IGNORE_NULLS) { HandleVariantNull(result_data, result_index, values_offset_data, blob_offset, values_index_selvec, source_index, is_root); @@ -287,23 +289,23 @@ bool ConvertVariantToVariant(ToVariantSourceData &source_data, ToVariantGlobalRe //! First write all children //! 
NOTE: this has to happen first because we use 'values_offset', which is increased when we write the values - auto source_children_list_entry = source.GetChildrenListEntry(source_index); + auto source_children_list_entry = source.GetChildrenListEntry(scan_index); for (idx_t source_children_index = 0; source_children_index < source_children_list_entry.length; source_children_index++) { //! values_index if (WRITE_DATA) { auto &values_offset = values_offset_data[result_index]; - auto source_value_index = source.GetValuesIndex(source_index, source_children_index); + auto source_value_index = source.GetValuesIndex(scan_index, source_children_index); result.values_index_data[children_list_entry.offset + children_offset + source_children_index] = values_offset + source_value_index; } //! keys_index - if (source.KeysIndexIsValid(source_index, source_children_index)) { + if (source.KeysIndexIsValid(scan_index, source_children_index)) { if (WRITE_DATA) { //! Look up the existing key from 'source' - auto source_key_index = source.GetKeysIndex(source_index, source_children_index); - auto &source_key_value = source.GetKey(source_index, source_key_index); + auto source_key_index = source.GetKeysIndex(scan_index, source_children_index); + auto &source_key_value = source.GetKey(scan_index, source_key_index); //! 
Now write this key to the dictionary of the result auto dict_index = result_data.GetOrCreateIndex(source_key_value); @@ -320,26 +322,25 @@ bool ConvertVariantToVariant(ToVariantSourceData &source_data, ToVariantGlobalRe } } - auto source_values_list_entry = source.GetValuesListEntry(source_index); + auto source_values_list_entry = source.GetValuesListEntry(scan_index); if (WRITE_DATA) { WriteState write_state(keys_offset, children_offset, blob_offset, blob_data, blob_size); for (uint32_t source_value_index = 0; source_value_index < source_values_list_entry.length; source_value_index++) { - auto source_type_id = source.GetTypeId(source_index, source_value_index); + auto source_type_id = source.GetTypeId(scan_index, source_value_index); WriteVariantMetadata(result_data, result_index, values_offset_data, blob_offset + blob_size, nullptr, 0, source_type_id); - VariantVisitor::Visit(source, source_index, source_value_index, - write_state); + VariantVisitor::Visit(source, scan_index, source_value_index, write_state); } } else { AnalyzeState analyze_state(children_offset); for (uint32_t source_value_index = 0; source_value_index < source_values_list_entry.length; source_value_index++) { values_offset_data[result_index]++; - blob_size += VariantVisitor::Visit(source, source_index, - source_value_index, analyze_state); + blob_size += VariantVisitor::Visit(source, scan_index, source_value_index, + analyze_state); } } diff --git a/src/duckdb/src/include/duckdb/function/variant/variant_shredding.hpp b/src/duckdb/src/include/duckdb/function/variant/variant_shredding.hpp index aa31ea84c..95cf7ee18 100644 --- a/src/duckdb/src/include/duckdb/function/variant/variant_shredding.hpp +++ b/src/duckdb/src/include/duckdb/function/variant/variant_shredding.hpp @@ -29,7 +29,7 @@ struct VariantColumnStatsData { idx_t total_count = 0; //! 
indices into the top-level 'columns' vector where the stats for the field/element live - case_insensitive_map_t field_stats; + unordered_map field_stats; idx_t element_stats = DConstants::INVALID_INDEX; }; diff --git a/src/duckdb/src/include/duckdb/main/attached_database.hpp b/src/duckdb/src/include/duckdb/main/attached_database.hpp index aef38333b..8407e5389 100644 --- a/src/duckdb/src/include/duckdb/main/attached_database.hpp +++ b/src/duckdb/src/include/duckdb/main/attached_database.hpp @@ -77,6 +77,8 @@ struct AttachOptions { AttachVisibility visibility = AttachVisibility::SHOWN; //! The stored database path (in the path manager) unique_ptr stored_database_path; + //! Per-database override of vacuum_rebuild_indexes. If not set, the global setting value is used. + optional_idx vacuum_rebuild_indexes_threshold; }; //! The AttachedDatabase represents an attached database instance. @@ -131,6 +133,9 @@ class AttachedDatabase : public CatalogEntry, public enable_shared_from_this &GetAttachOptions() const { return attach_options; } @@ -156,6 +161,7 @@ class AttachedDatabase : public CatalogEntry, public enable_shared_from_this close_lock; + optional_idx vacuum_rebuild_threshold; unordered_map attach_options; private: diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index d96910f1e..9c845f788 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -156,6 +156,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"duckdb_proj_version", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"ducklake_add_data_files", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, {"ducklake_cleanup_old_files", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, + {"ducklake_commit", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, {"ducklake_current_snapshot", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, 
{"ducklake_delete_orphaned_files", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, {"ducklake_expire_snapshots", "ducklake", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -241,6 +242,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"iceberg_table_properties", "iceberg", CatalogType::TABLE_FUNCTION_ENTRY}, {"iceberg_to_ducklake", "iceberg", CatalogType::TABLE_FUNCTION_ENTRY}, {"iceberg_truncate", "iceberg", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"iceberg_verify_equality_deletes", "iceberg", CatalogType::SCALAR_FUNCTION_ENTRY}, {"icu_calendar_names", "icu", CatalogType::TABLE_FUNCTION_ENTRY}, {"icu_collate_af", "icu", CatalogType::SCALAR_FUNCTION_ENTRY}, {"icu_collate_am", "icu", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -535,6 +537,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"pragma_rtree_index_info", "spatial", CatalogType::TABLE_FUNCTION_ENTRY}, {"printf", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"product", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, + {"quack_active_connections", "quack", CatalogType::TABLE_FUNCTION_ENTRY}, {"quack_check_token", "quack", CatalogType::SCALAR_FUNCTION_ENTRY}, {"quack_clear_cache", "quack", CatalogType::TABLE_FUNCTION_ENTRY}, {"quack_identify", "quack", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -835,6 +838,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"var_pop", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"var_samp", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"variance", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, + {"variant_bytes_to_variant", "parquet", CatalogType::SCALAR_FUNCTION_ENTRY}, {"variant_to_parquet_variant", "parquet", CatalogType::SCALAR_FUNCTION_ENTRY}, {"vector_type", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"version", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -1083,6 +1087,8 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { 
{"azure_read_transfer_concurrency", "azure"}, {"azure_storage_connection_string", "azure"}, {"azure_transport_option_type", "azure"}, + {"azure_write_block_size", "azure"}, + {"azure_write_staged_blocks_per_commit", "azure"}, {"binary_as_string", "parquet"}, {"ca_cert_file", "httpfs"}, {"calendar", "icu"}, @@ -1091,6 +1097,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"ducklake_max_retry_count", "ducklake"}, {"ducklake_retry_backoff", "ducklake"}, {"ducklake_retry_wait_ms", "ducklake"}, + {"ducklake_target_file_size", "ducklake"}, {"ducklake_write_deletion_vectors", "ducklake"}, {"enable_curl_server_cert_verification", "httpfs"}, {"enable_geoparquet_conversion", "parquet"}, @@ -1109,6 +1116,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"http_timeout", "httpfs"}, {"httpfs_client_implementation", "httpfs"}, {"httpfs_connection_caching", "httpfs"}, + {"iceberg_logging_post_body_truncate_limit", "iceberg"}, {"iceberg_test_force_token_expiry", "iceberg"}, {"iceberg_use_metadata_log", "iceberg"}, {"iceberg_via_aws_sdk_for_catalog_interactions", "iceberg"}, @@ -1195,6 +1203,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"ui_remote_url", "ui"}, {"unsafe_disable_etag_checks", "httpfs"}, {"unsafe_enable_version_guessing", "iceberg"}, + {"unsafe_iceberg_ignore_sort_order", "iceberg"}, {"whoami_hostname", "quack"}, {"whoami_meta", "quack"}, {"whoami_name", "quack"}, diff --git a/src/duckdb/src/include/duckdb/main/settings.hpp b/src/duckdb/src/include/duckdb/main/settings.hpp index 15a5570aa..25117117c 100644 --- a/src/duckdb/src/include/duckdb/main/settings.hpp +++ b/src/duckdb/src/include/duckdb/main/settings.hpp @@ -1575,7 +1575,8 @@ struct VacuumRebuildIndexesSetting { static constexpr const char *Name = "vacuum_rebuild_indexes"; static constexpr const char *Description = "(Experimental) Allow vacuum to compact row groups on tables with bound ART indexes, rebuilding the indexes " - "afterward. 
Tables with a row count exceeding this threshold are skipped. 0 = disabled."; + "afterward. Tables with a row count exceeding this threshold are skipped. 0 = disabled. Can also be set " + "per-database via the 'vacuum_rebuild_indexes' ATTACH option, which overrides this default."; static constexpr const char *InputType = "UBIGINT"; static constexpr const char *DefaultValue = "0"; static constexpr SettingScopeTarget Scope = SettingScopeTarget::GLOBAL_DEFAULT; diff --git a/src/duckdb/src/include/duckdb/planner/binder.hpp b/src/duckdb/src/include/duckdb/planner/binder.hpp index 796ac571a..0d6c918d8 100644 --- a/src/duckdb/src/include/duckdb/planner/binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/binder.hpp @@ -536,12 +536,10 @@ class Binder : public enable_shared_from_this { void ExpandDefaultInValuesList(InsertStatement &stmt, TableCatalogEntry &table, optional_ptr values_list, const vector &named_column_map); - unique_ptr BindMergeAction(LogicalMergeInto &merge_into, TableCatalogEntry &table, - LogicalGet &get, idx_t proj_index, - vector> &expressions, - unique_ptr &root, MergeIntoAction &action, - const vector &source_aliases, - const vector &source_names); + unique_ptr + BindMergeAction(LogicalMergeInto &merge_into, TableCatalogEntry &table, LogicalGet &get, idx_t proj_index, + vector> &expressions, MergeIntoAction &action, + const vector &source_aliases, const vector &source_names); unique_ptr GenerateMergeInto(InsertStatement &stmt, TableCatalogEntry &table); diff --git a/src/duckdb/src/include/duckdb/planner/expression_binder/projection_binder.hpp b/src/duckdb/src/include/duckdb/planner/expression_binder/projection_binder.hpp index 8d2a9b3e6..647b36486 100644 --- a/src/duckdb/src/include/duckdb/planner/expression_binder/projection_binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression_binder/projection_binder.hpp @@ -32,6 +32,7 @@ class ProjectionBinder : public ExpressionBinder { idx_t proj_index; vector> &proj_expressions; string clause; + 
bool in_child_projection = false; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/caching_file_system.hpp b/src/duckdb/src/include/duckdb/storage/caching_file_system.hpp index 87f5db5ec..49c94664b 100644 --- a/src/duckdb/src/include/duckdb/storage/caching_file_system.hpp +++ b/src/duckdb/src/include/duckdb/storage/caching_file_system.hpp @@ -36,7 +36,7 @@ struct CachingFileHandle { public: DUCKDB_API CachingFileHandle(QueryContext context, CachingFileSystem &caching_file_system, const OpenFileInfo &path, - FileOpenFlags flags, optional_ptr opener, CachedFile &cached_file); + FileOpenFlags flags, optional_ptr opener); DUCKDB_API ~CachingFileHandle(); public: @@ -60,6 +60,8 @@ struct CachingFileHandle { DUCKDB_API void Seek(idx_t location); private: + //! Refresh the cached file if the global cache state has changed. + shared_ptr EnsureCachedFileCurrent(); //! Get the version tag of the file (for checking cache invalidation) const string &GetVersionTag(const unique_ptr &guard); //! Tries to read from the cache, filling "overlapping_ranges" with ranges that overlap with the request. @@ -94,8 +96,8 @@ struct CachingFileHandle { optional_ptr opener; //! Cache validation mode for this file CacheValidationMode validate; - //! The associated CachedFile with cached ranges - CachedFile &cached_file; + //! Associated cached file. + shared_ptr cached_file; //! 
The underlying FileHandle (optional) unique_ptr file_handle; diff --git a/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp index 188b5f1bb..2a64c5672 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp @@ -124,7 +124,13 @@ struct AlpScanState : public SegmentScanState { // Load the offset (metadata) indicating where the vector data starts metadata_ptr -= AlpConstants::METADATA_POINTER_SIZE; auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockSize()); + const auto block_size = segment.GetBlockSize(); + + if (data_byte_offset >= block_size) { + throw IOException( + "Corrupted ALP segment: stored data_byte_offset (%d) exceeds the segments block size (%d)", + data_byte_offset, block_size); + } idx_t vector_size = MinValue((idx_t)AlpConstants::ALP_VECTOR_SIZE, (count - total_value_count)); @@ -138,7 +144,14 @@ struct AlpScanState : public SegmentScanState { if (uncompressed_mode) { if (!SKIP) { // Read uncompressed values - memcpy(value_buffer, vector_ptr, sizeof(T) * vector_size); + const idx_t value_buffer_copy_size = sizeof(T) * vector_size; + if (vector_ptr + value_buffer_copy_size > segment_data + block_size) { + const auto bytes_remaining_in_block = (segment_data + block_size) - vector_ptr; + throw IOException("Corrupted ALP segment: stored vector_size is invalid, to-copy bytes (%d) " + "would exceed bytes remaining in the block (%d)", + value_buffer_copy_size, bytes_remaining_in_block); + } + memcpy(value_buffer, vector_ptr, value_buffer_copy_size); } return; } @@ -154,21 +167,54 @@ struct AlpScanState : public SegmentScanState { vector_state.bit_width = Load(vector_ptr); vector_ptr += AlpConstants::BIT_WIDTH_SIZE; - D_ASSERT(vector_state.exceptions_count <= vector_size); - D_ASSERT(vector_state.v_factor <= vector_state.v_exponent); - 
D_ASSERT(vector_state.bit_width <= sizeof(uint64_t) * 8); + if (vector_state.exceptions_count > vector_size) { + throw IOException("Corrupted ALP segment: exceptions_count (%d) exceeds vector_size (%d)", + vector_state.exceptions_count, vector_size); + } + if (vector_state.v_factor > vector_state.v_exponent) { + throw IOException("Corrupted ALP segment: v_factor (%d) exceeds v_exponent (%d)", vector_state.v_factor, + vector_state.v_exponent); + } + if (vector_state.bit_width > sizeof(uint64_t) * 8) { + throw IOException("Corrupted ALP segment: Invalid bit_width encountered: %d", vector_state.bit_width); + } + idx_t read_bytes = 0; if (vector_state.bit_width > 0) { auto bp_size = BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.bit_width); + + const idx_t max_encoded = sizeof(vector_state.for_encoded); + if (bp_size > max_encoded || data_byte_offset + read_bytes + bp_size > block_size) { + throw IOException("Corrupted ALP segment: encoded payload too large"); + } memcpy(vector_state.for_encoded, (void *)vector_ptr, bp_size); vector_ptr += bp_size; + read_bytes += bp_size; } if (vector_state.exceptions_count > 0) { - memcpy(vector_state.exceptions, (void *)vector_ptr, sizeof(EXACT_TYPE) * vector_state.exceptions_count); - vector_ptr += sizeof(EXACT_TYPE) * vector_state.exceptions_count; - memcpy(vector_state.exceptions_positions, (void *)vector_ptr, - AlpConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count); + //! 
Load the exceptions + const idx_t max_exceptions_size = sizeof(vector_state.exceptions); + const idx_t exceptions_copy_size = sizeof(EXACT_TYPE) * vector_state.exceptions_count; + if (exceptions_copy_size > max_exceptions_size || + data_byte_offset + read_bytes + exceptions_copy_size > block_size) { + throw IOException("Corrupted ALP segment: exceptions payload too large"); + } + memcpy(vector_state.exceptions, (void *)vector_ptr, exceptions_copy_size); + vector_ptr += exceptions_copy_size; + read_bytes += exceptions_copy_size; + + //! Load the exceptions_positions + const idx_t max_exceptions_positions_size = sizeof(vector_state.exceptions_positions); + const idx_t exceptions_positions_copy_size = + AlpConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count; + if (exceptions_positions_copy_size > max_exceptions_positions_size || + data_byte_offset + read_bytes + exceptions_positions_copy_size > block_size) { + throw IOException("Corrupted ALP segment: exceptions_positions payload too large"); + } + memcpy(vector_state.exceptions_positions, (void *)vector_ptr, exceptions_positions_copy_size); + vector_ptr += exceptions_positions_copy_size; + read_bytes += exceptions_positions_copy_size; } // Decode all the vector values to the specified 'value_buffer' diff --git a/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp index 47e29434d..31e7b4eed 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp @@ -73,22 +73,48 @@ struct AlpRDScanState : public SegmentScanState { // ScanStates never exceed the boundaries of a Segment, // but are not guaranteed to start at the beginning of the Block segment_data = handle.Ptr() + segment.GetBlockOffset(); + const auto block_size = segment.GetBlockSize(); + + idx_t total_segment_offset = segment.GetBlockOffset(); auto 
metadata_offset = Load(segment_data); + auto segment_ptr = segment_data + AlpRDConstants::METADATA_POINTER_SIZE; + total_segment_offset += AlpRDConstants::METADATA_POINTER_SIZE; + metadata_ptr = segment_data + metadata_offset; + const idx_t metadata_ptr_offset = segment.GetBlockOffset() + metadata_offset; + if (metadata_ptr_offset > block_size) { + throw IOException("Corrupted ALPRD segment: metadata_offset value is corrupted"); + } + + if (total_segment_offset + AlpRDConstants::HEADER_SIZE > block_size) { + throw IOException("Corrupted ALPRD segment: reading header bytes would exceed block space"); + } // Load the Right Bit Width which is in the segment header after the pointer to the first metadata - vector_state.right_bit_width = Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE); - vector_state.left_bit_width = - Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE + AlpRDConstants::RIGHT_BIT_WIDTH_SIZE); + vector_state.right_bit_width = Load(segment_ptr); + segment_ptr += AlpRDConstants::RIGHT_BIT_WIDTH_SIZE; + + vector_state.left_bit_width = Load(segment_ptr); + segment_ptr += AlpRDConstants::LEFT_BIT_WIDTH_SIZE; - uint8_t actual_dictionary_size = - Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE + AlpRDConstants::RIGHT_BIT_WIDTH_SIZE + - AlpRDConstants::LEFT_BIT_WIDTH_SIZE); - uint8_t actual_dictionary_size_bytes = actual_dictionary_size * AlpRDConstants::DICTIONARY_ELEMENT_SIZE; + uint8_t actual_dictionary_size = Load(segment_ptr); + segment_ptr += AlpRDConstants::N_DICTIONARY_ELEMENTS_SIZE; + total_segment_offset += AlpRDConstants::HEADER_SIZE; + + if (actual_dictionary_size > AlpRDConstants::MAX_DICTIONARY_SIZE) { + throw IOException("Corrupt database file: ALPRD dictionary size exceeds maximum"); + } + idx_t actual_dictionary_size_bytes = + static_cast(actual_dictionary_size) * AlpRDConstants::DICTIONARY_ELEMENT_SIZE; + + const idx_t left_parts_dict_max_size = sizeof(vector_state.left_parts_dict); + if (total_segment_offset + 
actual_dictionary_size_bytes > metadata_ptr_offset || + actual_dictionary_size_bytes > left_parts_dict_max_size) { + throw IOException("Corrupted ALPRD segment: actual_dictionary_size is corrupted"); + } // Load the left parts dictionary which is after the segment header and is of a fixed size - memcpy(vector_state.left_parts_dict, (void *)(segment_data + AlpRDConstants::HEADER_SIZE), - actual_dictionary_size_bytes); + memcpy(vector_state.left_parts_dict, segment_ptr, actual_dictionary_size_bytes); } BufferHandle handle; @@ -143,7 +169,12 @@ struct AlpRDScanState : public SegmentScanState { // Load the offset (metadata) indicating where the vector data starts metadata_ptr -= AlpRDConstants::METADATA_POINTER_SIZE; auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockSize()); + const auto block_size = segment.GetBlockSize(); + if (data_byte_offset >= block_size) { + throw IOException( + "Corrupted ALPRD segment: stored data_byte_offset (%d) exceeds the segments block size (%d)", + data_byte_offset, block_size); + } idx_t vector_size = MinValue((idx_t)AlpRDConstants::ALP_VECTOR_SIZE, (count - total_value_count)); @@ -157,7 +188,14 @@ struct AlpRDScanState : public SegmentScanState { if (uncompressed_mode) { if (!SKIP) { // Read uncompressed values - memcpy(value_buffer, vector_ptr, sizeof(T) * vector_size); + const idx_t value_buffer_copy_size = sizeof(T) * vector_size; + if (vector_ptr + value_buffer_copy_size > segment_data + block_size) { + const auto bytes_remaining_in_block = (segment_data + block_size) - vector_ptr; + throw IOException("Corrupted ALPRD segment: stored vector_size is invalid, to-copy bytes " + "(%d) would exceed bytes remaining in the block (%d)", + value_buffer_copy_size, bytes_remaining_in_block); + } + memcpy(value_buffer, vector_ptr, value_buffer_copy_size); } return; } @@ -165,18 +203,46 @@ struct AlpRDScanState : public SegmentScanState { auto left_bp_size = 
BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.left_bit_width); auto right_bp_size = BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.right_bit_width); + idx_t read_bytes = 0; + const idx_t max_left_encoded_size = sizeof(vector_state.left_encoded); + if (left_bp_size > max_left_encoded_size || data_byte_offset + read_bytes + left_bp_size > block_size) { + throw IOException("Corrupted ALPRD segment: left_encoded payload too large"); + } memcpy(vector_state.left_encoded, (void *)vector_ptr, left_bp_size); vector_ptr += left_bp_size; + read_bytes += left_bp_size; + const idx_t max_right_encoded_size = sizeof(vector_state.right_encoded); + if (right_bp_size > max_right_encoded_size || data_byte_offset + read_bytes + right_bp_size > block_size) { + throw IOException("Corrupted ALPRD segment: left_encoded payload too large"); + } memcpy(vector_state.right_encoded, (void *)vector_ptr, right_bp_size); vector_ptr += right_bp_size; + read_bytes += right_bp_size; if (vector_state.exceptions_count > 0) { - memcpy(vector_state.exceptions, (void *)vector_ptr, - AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count); - vector_ptr += AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count; - memcpy(vector_state.exceptions_positions, (void *)vector_ptr, - AlpRDConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count); + //! Load the exceptions + const idx_t max_exceptions_size = sizeof(vector_state.exceptions); + const idx_t exceptions_copy_size = AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count; + if (exceptions_copy_size > max_exceptions_size || + data_byte_offset + read_bytes + exceptions_copy_size > block_size) { + throw IOException("Corrupted ALPRD segment: exceptions payload too large"); + } + memcpy(vector_state.exceptions, (void *)vector_ptr, exceptions_copy_size); + vector_ptr += exceptions_copy_size; + read_bytes += exceptions_copy_size; + + //! 
Load the exceptions_positions + const idx_t max_exceptions_positions_size = sizeof(vector_state.exceptions_positions); + const idx_t exceptions_positions_copy_size = + AlpRDConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count; + if (exceptions_positions_copy_size > max_exceptions_positions_size || + data_byte_offset + read_bytes + exceptions_positions_copy_size > block_size) { + throw IOException("Corrupted ALPRD segment: exceptions_positions payload too large"); + } + memcpy(vector_state.exceptions_positions, (void *)vector_ptr, exceptions_positions_copy_size); + vector_ptr += exceptions_positions_copy_size; + read_bytes += exceptions_positions_copy_size; } // Decode all the vector values to the specified 'value_buffer' diff --git a/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp index ea1f95fb0..5fcd755ce 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp @@ -67,6 +67,14 @@ struct PatasGroupState { } value_buffer[0] = (EXACT_TYPE)0; for (idx_t i = 0; i < count; i++) { + if (unpacked_data[i].index_diff > i) { + throw IOException("Corrupted Patas segment: invalid backward reference"); + } + if (unpacked_data[i].significant_bytes > sizeof(EXACT_TYPE) || + unpacked_data[i].trailing_zeros >= sizeof(EXACT_TYPE) * 8) { + throw IOException("Corrupted Patas segment: invalid packed value metadata"); + } + value_buffer[i] = patas::PatasDecompression::DecompressValue( byte_reader, unpacked_data[i].significant_bytes, unpacked_data[i].trailing_zeros, value_buffer[i - unpacked_data[i].index_diff]); @@ -95,6 +103,9 @@ struct PatasScanState : public SegmentScanState { // but are not guaranteed to start at the beginning of the Block segment_data = handle.Ptr() + segment.GetBlockOffset(); auto metadata_offset = Load(segment_data); + if (segment.GetBlockOffset() + 
metadata_offset > segment.GetBlockSize()) { + throw IOException("Corrupted Patas segment: metadata_offset reaches outside of the blocks memory"); + } metadata_ptr = segment_data + metadata_offset; } @@ -154,7 +165,9 @@ struct PatasScanState : public SegmentScanState { // Load the offset indicating where a groups data starts metadata_ptr -= sizeof(uint32_t); auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockSize()); + if (segment.GetBlockOffset() + data_byte_offset >= segment.GetBlockSize()) { + throw IOException("Corrupted Patas segment: data_byte_offset would reach outside of the blocks memory"); + } // Initialize the byte_reader with the data values for the group group_state.Init(segment_data + data_byte_offset); diff --git a/src/duckdb/src/include/duckdb/storage/external_file_cache.hpp b/src/duckdb/src/include/duckdb/storage/external_file_cache.hpp index 60fd65997..483f28084 100644 --- a/src/duckdb/src/include/duckdb/storage/external_file_cache.hpp +++ b/src/duckdb/src/include/duckdb/storage/external_file_cache.hpp @@ -13,6 +13,7 @@ #include "duckdb/common/mutex.hpp" #include "duckdb/common/shared_ptr.hpp" #include "duckdb/common/unordered_map.hpp" +#include "duckdb/common/unordered_set.hpp" #include "duckdb/storage/buffer/temporary_file_information.hpp" #include "duckdb/storage/storage_lock.hpp" #include "duckdb/common/types/timestamp.hpp" @@ -56,7 +57,7 @@ class ExternalFileCache { //! Cached files struct CachedFile { public: - explicit CachedFile(string path_p); + CachedFile(string path_p, idx_t generation_p); public: //! Verifies that none of the ranges fully overlap (must hold the lock) @@ -75,6 +76,7 @@ class ExternalFileCache { public: const string path; + const idx_t generation; StorageLock lock; private: @@ -96,23 +98,39 @@ class ExternalFileCache { bool IsEnabled() const; void SetEnabled(bool enable); + idx_t GetGeneration() const; vector GetCachedFileInformation() const; + //! 
Number of files tracked in the ObjectCache, exposed for testing. + idx_t GetCachedFileCount() const; BufferManager &GetBufferManager() const; - //! Gets the cached file, or creates it if is not yet present - CachedFile &GetOrCreateCachedFile(const string &path); + //! Gets the shared cached file for the given path, creating it if not yet present. + //! When caching is disabled, returns a transient CachedFile that is not tracked in the cached file map. + shared_ptr GetOrCreateCachedFile(const string &path); DUCKDB_API static bool IsValid(bool validate, const string &cached_version_tag, timestamp_t cached_last_modified, const string ¤t_version_tag, timestamp_t current_last_modified); private: + class ExternalFileCacheObjectCacheEntry; + + //! Registers a cached file path in the tracked set. + void InsertCachedFileKey(const string &path); + //! Removes a cached file path from the tracked set. + void EraseCachedFileKey(const string &path); + //! Delete the ObjectCache entries for the given cached file paths. + void DeleteObjectCacheEntries(const vector &paths); + //! The BufferManager used to cache files BufferManager &buffer_manager; //! Whether or not file caching is enabled atomic enable; - //! Mapping from file path to cached file with cached ranges - unordered_map> cached_files; - //! Lock for accessing the cached files + //! Generation counter, incremented whenever cache enablement changes. + atomic generation; + //! Paths of the cached files tracked in the ObjectCache. + //! Entries should only be inserted at `GetOrCreateCachedFile` and deleted at object cache entry deletion. + unordered_set cached_file_keys; + //! Lock for accessing cached_file_keys. 
mutable mutex lock; }; diff --git a/src/duckdb/src/include/duckdb/storage/object_cache.hpp b/src/duckdb/src/include/duckdb/storage/object_cache.hpp index 5d4cb6ef9..08712fde6 100644 --- a/src/duckdb/src/include/duckdb/storage/object_cache.hpp +++ b/src/duckdb/src/include/duckdb/storage/object_cache.hpp @@ -13,6 +13,7 @@ #include "duckdb/common/lru_cache.hpp" #include "duckdb/common/mutex.hpp" #include "duckdb/common/string.hpp" +#include "duckdb/common/string_util.hpp" #include "duckdb/common/unordered_map.hpp" #include "duckdb/main/database.hpp" #include "duckdb/storage/buffer/buffer_pool_reservation.hpp" @@ -149,6 +150,30 @@ class ObjectCache { lru_cache.Delete(key); } + //! Type-prefixed variants of the methods above. These namespace the caller-provided key with the entry's + //! ObjectType so that callers can pass a natural key (e.g. a file path) without having to build a unique + //! cache key themselves. + template + shared_ptr GetWithTypePrefix(const string &key) { + return Get(MakeCacheKey(key)); + } + + template + shared_ptr GetOrCreateWithTypePrefix(const string &key, ARGS &&... args) { + return GetOrCreate(MakeCacheKey(key), std::forward(args)...); + } + + template + void PutWithTypePrefix(const string &key, + shared_ptr value) { // NOLINT(performance-unnecessary-value-param) + Put(MakeCacheKey(key), std::move(value)); + } + + template + void DeleteWithTypePrefix(const string &key) { + Delete(MakeCacheKey(key)); + } + DUCKDB_API static ObjectCache &GetObjectCache(ClientContext &context); idx_t GetMaxMemory() const { @@ -173,6 +198,14 @@ class ObjectCache { return lru_cache.EvictToReduceAtLeast(target_bytes); } +private: + //! Build the internal cache key for a typed entry by namespacing the caller-provided key with the entry's + //! ObjectType. + template + static string MakeCacheKey(const string &key) { + return StringUtil::Format("%s-%s", T::ObjectType(), key); + } + private: mutable mutex lock_mutex; //! 
LRU cache for evictable entries diff --git a/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp b/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp index 191c59b0b..ba4228cd2 100644 --- a/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +++ b/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp @@ -209,9 +209,9 @@ struct UncompressedStringStorage { auto entry_offset = result_data[new_count - 1]; if (entry_offset < 0) { // overflow strings store the dict offset negatively - invert size - new_dictionary_size = -entry_offset; + new_dictionary_size = static_cast(-entry_offset); } else { - new_dictionary_size = entry_offset; + new_dictionary_size = static_cast(entry_offset); } } *dictionary_size = new_dictionary_size; diff --git a/src/duckdb/src/include/duckdb/storage/table/per_column_metadata_blocks.hpp b/src/duckdb/src/include/duckdb/storage/table/per_column_metadata_blocks.hpp index 0754878eb..b7fb99f16 100644 --- a/src/duckdb/src/include/duckdb/storage/table/per_column_metadata_blocks.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/per_column_metadata_blocks.hpp @@ -34,6 +34,8 @@ class PerColumnMetadataBlocks { void AddColumn(idx_t col_idx, const vector &blocks); //! Remove a column entry and all its block IDs (linear scan) void RemoveColumn(idx_t col_idx); + //! Merge two PerColumnMetadataBlocks sorted by column index with disjoint column sets + static PerColumnMetadataBlocks Merge(const PerColumnMetadataBlocks &a, const PerColumnMetadataBlocks &b); //! 
Iterate over all block IDs, passing (column_index, block_id) to the callback template diff --git a/src/duckdb/src/include/duckdb/storage/table/row_group.hpp b/src/duckdb/src/include/duckdb/storage/table/row_group.hpp index aa15953dc..afb8691c9 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_group.hpp @@ -175,6 +175,8 @@ class RowGroup : public SegmentBase { RowGroupWriteData WriteToDisk(RowGroupWriteInfo &info) const; //! Returns the number of committed rows (count - committed deletes) idx_t GetCommittedRowCount(); + //! Returns the number of rows visible to the given transaction + idx_t GetVisibleRowCount(TransactionData transaction); bool CanReuseMetadata(RowGroupWriter &writer) const; RowGroupWriteData WriteToDisk(RowGroupWriter &writer); RowGroupPointer Checkpoint(RowGroupWriteData write_data, RowGroupWriter &writer, TableStatistics &global_stats, diff --git a/src/duckdb/src/include/duckdb/storage/table/row_group_reorderer.hpp b/src/duckdb/src/include/duckdb/storage/table/row_group_reorderer.hpp index 665b9392e..a1229d437 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_group_reorderer.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_group_reorderer.hpp @@ -50,7 +50,7 @@ struct OffsetPruningResult { class RowGroupReorderer { public: - explicit RowGroupReorderer(const RowGroupOrderOptions &options_p); + RowGroupReorderer(const RowGroupOrderOptions &options_p, TransactionData transaction_p); optional_ptr> GetRootSegment(RowGroupSegmentTree &row_groups); optional_ptr> GetNextRowGroup(SegmentNode &row_group); @@ -62,6 +62,7 @@ class RowGroupReorderer { private: const RowGroupOrderOptions options; + const TransactionData transaction; idx_t offset; bool initialized; diff --git a/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp b/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp index 7b029feb5..ba3eb8d1b 100644 --- 
a/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp +++ b/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp @@ -25,7 +25,7 @@ struct DuckCleanupInfo { transaction_t lowest_start_time; vector> transactions; - void Cleanup() noexcept; + void Cleanup(); bool ScheduleCleanup() noexcept; }; diff --git a/src/duckdb/src/include/duckdb/transaction/transaction_context.hpp b/src/duckdb/src/include/duckdb/transaction/transaction_context.hpp index bb9016145..f402e137c 100644 --- a/src/duckdb/src/include/duckdb/transaction/transaction_context.hpp +++ b/src/duckdb/src/include/duckdb/transaction/transaction_context.hpp @@ -72,8 +72,8 @@ class TransactionContext { private: ClientContext &context; bool auto_commit; - TransactionInvalidationPolicy invalidation_policy; - bool auto_rollback; + TransactionInvalidationPolicy invalidation_policy = TransactionInvalidationPolicy::STANDARD_POLICY; + bool auto_rollback = false; unique_ptr current_transaction; diff --git a/src/duckdb/src/logging/log_manager.cpp b/src/duckdb/src/logging/log_manager.cpp index a8bc5b9ea..8435c76d6 100644 --- a/src/duckdb/src/logging/log_manager.cpp +++ b/src/duckdb/src/logging/log_manager.cpp @@ -157,6 +157,15 @@ void LogManager::SetDisabledLogTypes(optional_ptr> disable void LogManager::SetLogStorage(DatabaseInstance &db, const string &storage_name) { unique_lock lck(lock); + // 'SET logging_storage' cannot supply the path that file storage requires, so reject the switch + // here (active storage preserved) and point users at enable_logging instead of installing a + // path-less storage that throws on every later flush. + auto storage_name_to_lower = StringUtil::Lower(storage_name); + if (storage_name_to_lower == LogConfig::FILE_STORAGE_NAME && config.storage != storage_name_to_lower) { + throw InvalidConfigurationException( + "Cannot select 'file' log storage via 'SET logging_storage' because it requires a path. 
" + "Use CALL enable_logging(storage='file', storage_path='...') instead."); + } SetLogStorageInternal(db, storage_name); } diff --git a/src/duckdb/src/main/attached_database.cpp b/src/duckdb/src/main/attached_database.cpp index 6b835b1fb..9f3d88bf5 100644 --- a/src/duckdb/src/main/attached_database.cpp +++ b/src/duckdb/src/main/attached_database.cpp @@ -12,6 +12,7 @@ #include "duckdb/main/database_path_and_type.hpp" #include "duckdb/main/valid_checker.hpp" #include "duckdb/storage/block_allocator.hpp" +#include "duckdb/main/settings.hpp" namespace duckdb { @@ -68,8 +69,10 @@ AttachOptions::AttachOptions(const unordered_map &attach_options, } if (entry.first == "type") { - // Extract the database type. - db_type = StringValue::Get(entry.second.DefaultCastAs(LogicalType::VARCHAR)); + // Extract the database type. Normalize case so that + // `TYPE sqlite` and `TYPE 'SQLite'` are equivalent. + // `TYPE sqlite` and `TYPE 'sqlite3'` are NOT equivalent, aliasing to be applied on comparison + db_type = StringUtil::Lower(StringValue::Get(entry.second.DefaultCastAs(LogicalType::VARCHAR))); continue; } @@ -85,6 +88,18 @@ AttachOptions::AttachOptions(const unordered_map &attach_options, } continue; } + + if (entry.first == "vacuum_rebuild_indexes") { + const auto threshold = UBigIntValue::Get(entry.second.DefaultCastAs(LogicalType::UBIGINT)); + try { + vacuum_rebuild_indexes_threshold = threshold; + } catch (InternalException &e) { + throw InvalidInputException("Invalid setting for vacuum_rebuild_indexes: %d (valid range is 0 - %d)", + threshold, + UBigIntValue::Get(Value::MaximumValue(LogicalType::UBIGINT)) - 1); + } + continue; + } options.emplace(entry.first, entry.second); } } @@ -121,6 +136,7 @@ AttachedDatabase::AttachedDatabase(DatabaseInstance &db, Catalog &catalog_p, str } recovery_mode = options.recovery_mode; visibility = options.visibility; + vacuum_rebuild_threshold = options.vacuum_rebuild_indexes_threshold; // We create the storage after the catalog to 
guarantee we allow extensions to instantiate the DuckCatalog. catalog = make_uniq(*this); @@ -142,6 +158,7 @@ AttachedDatabase::AttachedDatabase(DatabaseInstance &db, Catalog &catalog_p, Sto } recovery_mode = options.recovery_mode; visibility = options.visibility; + vacuum_rebuild_threshold = options.vacuum_rebuild_indexes_threshold; optional_ptr storage_info = storage_extension->storage_info.get(); catalog = storage_extension->attach(storage_info, context, *this, name, info, options); @@ -185,6 +202,13 @@ bool AttachedDatabase::NameIsReserved(const string &name) { return name == DEFAULT_SCHEMA || name == TEMP_CATALOG || name == SYSTEM_CATALOG; } +idx_t AttachedDatabase::GetVacuumRebuildIndexThreshold() const { + if (vacuum_rebuild_threshold.IsValid()) { + return vacuum_rebuild_threshold.GetIndex(); + } + return Settings::Get(db); +} + string AttachedDatabase::StoredPath() const { if (stored_database_path) { return stored_database_path->path; diff --git a/src/duckdb/src/main/capi/duckdb_value-c.cpp b/src/duckdb/src/main/capi/duckdb_value-c.cpp index 6e33822eb..c7a9d5195 100644 --- a/src/duckdb/src/main/capi/duckdb_value-c.cpp +++ b/src/duckdb/src/main/capi/duckdb_value-c.cpp @@ -6,6 +6,7 @@ #include "duckdb/common/types/uuid.hpp" #include "duckdb/common/types/value.hpp" #include "duckdb/common/types/bignum.hpp" +#include "duckdb/common/types/decimal.hpp" #include "duckdb/main/capi/capi_internal.hpp" using duckdb::LogicalTypeId; @@ -139,6 +140,9 @@ duckdb_bignum duckdb_get_bignum(duckdb_value val) { return {data, size, is_negative}; } duckdb_value duckdb_create_decimal(duckdb_decimal input) { + if (!duckdb::Decimal::IsValidWidthScale(input.width, input.scale)) { + return nullptr; + } duckdb::hugeint_t hugeint(input.value.upper, input.value.lower); int64_t int64; if (duckdb::Hugeint::TryCast(hugeint, int64)) { diff --git a/src/duckdb/src/main/capi/logical_types-c.cpp b/src/duckdb/src/main/capi/logical_types-c.cpp index 814877ec9..588470205 100644 --- 
a/src/duckdb/src/main/capi/logical_types-c.cpp +++ b/src/duckdb/src/main/capi/logical_types-c.cpp @@ -3,6 +3,7 @@ #include "duckdb/common/type_visitor.hpp" #include "duckdb/common/helper.hpp" #include "duckdb/common/types/geometry_crs.hpp" +#include "duckdb/common/types/decimal.hpp" namespace duckdb { @@ -155,7 +156,15 @@ duckdb_logical_type duckdb_create_map_type(duckdb_logical_type key_type, duckdb_ } duckdb_logical_type duckdb_create_decimal_type(uint8_t width, uint8_t scale) { - return reinterpret_cast(new duckdb::LogicalType(duckdb::LogicalType::DECIMAL(width, scale))); + if (!duckdb::Decimal::IsValidWidthScale(width, scale)) { + return nullptr; + } + try { + return reinterpret_cast( + new duckdb::LogicalType(duckdb::LogicalType::DECIMAL(width, scale))); + } catch (...) { + return nullptr; + } } duckdb_type duckdb_get_type_id(duckdb_logical_type type) { diff --git a/src/duckdb/src/main/database_manager.cpp b/src/duckdb/src/main/database_manager.cpp index 839ad33c9..6ce1c4234 100644 --- a/src/duckdb/src/main/database_manager.cpp +++ b/src/duckdb/src/main/database_manager.cpp @@ -135,6 +135,16 @@ shared_ptr DatabaseManager::AttachDatabase(ClientContext &cont existing_db->GetCatalog().SetDefaultTable(options.default_table.schema, options.default_table.name); } if (info.on_conflict == OnCreateConflict::REPLACE_ON_CONFLICT) { + // we require the vacuuming threshold for indexed tables to be the same as the already attached db + if (options.vacuum_rebuild_indexes_threshold.IsValid()) { + auto previous_setting = existing_db->GetVacuumRebuildIndexThreshold(); + auto new_setting = options.vacuum_rebuild_indexes_threshold.GetIndex(); + if (previous_setting != new_setting) { + throw BinderException("Cannot re-attach with a different vacuum_rebuild_indexes setting " + "(previous: %d, new: %d)", + previous_setting, new_setting); + } + } // allow custom catalogs to override this behavior if (!existing_db->GetCatalog().HasConflictingAttachOptions(info.path, options)) { return 
existing_db; diff --git a/src/duckdb/src/main/database_path_and_type.cpp b/src/duckdb/src/main/database_path_and_type.cpp index 5f0d87ac5..69acc484b 100644 --- a/src/duckdb/src/main/database_path_and_type.cpp +++ b/src/duckdb/src/main/database_path_and_type.cpp @@ -12,7 +12,10 @@ void DBPathAndType::ExtractExtensionPrefix(string &path, string &db_type) { if (!extension.empty()) { // path is prefixed with an extension - remove the first occurence of it path = path.substr(extension.length() + 1); - db_type = ExtensionHelper::ApplyExtensionAlias(extension); + // Store the raw user prefix normalized to lowercase. The alias is + // applied only at lookup/comparison sites — symmetric with how the + // `TYPE 'xxx'` option preserves the user-supplied value. + db_type = StringUtil::Lower(extension); } } diff --git a/src/duckdb/src/main/extension/extension_helper.cpp b/src/duckdb/src/main/extension/extension_helper.cpp index 3319b6951..1edbd4e1d 100644 --- a/src/duckdb/src/main/extension/extension_helper.cpp +++ b/src/duckdb/src/main/extension/extension_helper.cpp @@ -2,7 +2,6 @@ #include "duckdb/common/file_system.hpp" #include "duckdb/common/local_file_system.hpp" -#include "duckdb/main/database_file_opener.hpp" #include "duckdb/common/serializer/binary_deserializer.hpp" #include "duckdb/common/serializer/buffered_file_reader.hpp" #include "duckdb/common/string_util.hpp" @@ -10,6 +9,7 @@ #include "duckdb/logging/logger.hpp" #include "duckdb/main/client_context.hpp" #include "duckdb/main/database.hpp" +#include "duckdb/main/database_file_opener.hpp" #include "duckdb/main/extension.hpp" #include "duckdb/main/extension_install_info.hpp" #include "duckdb/main/settings.hpp" @@ -122,6 +122,8 @@ static const DefaultExtension internal_extensions[] = { {"quack", "The DuckDB 'Quack' Client/Server Protocol", false}, {"vortex", "Adds support for reading and writing files using the Vortex file format", false}, {"lance", "Adds support for querying Lance datasets", false}, + {"avro", 
"Adds support for reading Avro files", false}, + {"unity_catalog", "Adds support for connecting to Unity Catalog", false}, {nullptr, nullptr, false}}; idx_t ExtensionHelper::DefaultExtensionCount() { diff --git a/src/duckdb/src/main/query_profiler.cpp b/src/duckdb/src/main/query_profiler.cpp index 2da10b6cc..5735e6439 100644 --- a/src/duckdb/src/main/query_profiler.cpp +++ b/src/duckdb/src/main/query_profiler.cpp @@ -276,7 +276,7 @@ string QueryProfiler::ToString(ProfilerPrintFormat format) const { lock_guard guard(lock); // checking the tree to ensure the query is really empty // the query string is empty when a logical plan is deserialized - if (query_metrics.query_name.empty() && !root) { + if (query_metrics.query_name.empty() || !root) { return ""; } auto renderer = TreeRenderer::CreateRenderer(GetExplainFormat(format)); diff --git a/src/duckdb/src/main/settings/autogenerated_settings.cpp b/src/duckdb/src/main/settings/autogenerated_settings.cpp index dbf091303..fd738dda3 100644 --- a/src/duckdb/src/main/settings/autogenerated_settings.cpp +++ b/src/duckdb/src/main/settings/autogenerated_settings.cpp @@ -38,6 +38,9 @@ Value AccessModeSetting::GetSetting(const ClientContext &context) { // Allow Parser Override Extension //===----------------------------------------------------------------------===// void AllowParserOverrideExtensionSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("allow_parser_override_extension setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -45,6 +48,9 @@ void AllowParserOverrideExtensionSetting::OnSet(SettingCallbackInfo &info, Value // Arrow Output Version //===----------------------------------------------------------------------===// void ArrowOutputVersionSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("arrow_output_version setting cannot be NULL"); + } 
EnumUtil::FromString(StringValue::Get(parameter)); } @@ -67,6 +73,9 @@ Value CustomUserAgentSetting::GetSetting(const ClientContext &context) { // Debug Checkpoint Abort //===----------------------------------------------------------------------===// void DebugCheckpointAbortSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("debug_checkpoint_abort setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -91,6 +100,9 @@ Value DebugForceExternalSetting::GetSetting(const ClientContext &context) { // Debug Physical Table Scan Execution Strategy //===----------------------------------------------------------------------===// void DebugPhysicalTableScanExecutionStrategySetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("debug_physical_table_scan_execution_strategy setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -98,6 +110,9 @@ void DebugPhysicalTableScanExecutionStrategySetting::OnSet(SettingCallbackInfo & // Debug Verify Vector //===----------------------------------------------------------------------===// void DebugVerifyVectorSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("debug_verify_vector setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -105,6 +120,9 @@ void DebugVerifyVectorSetting::OnSet(SettingCallbackInfo &info, Value ¶meter // Debug Window Mode //===----------------------------------------------------------------------===// void DebugWindowModeSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("debug_window_mode setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -112,6 +130,9 @@ void DebugWindowModeSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) // 
Deprecated Using Key Syntax //===----------------------------------------------------------------------===// void DeprecatedUsingKeySyntaxSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("deprecated_using_key_syntax setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -159,6 +180,9 @@ Value EnableProgressBarSetting::GetSetting(const ClientContext &context) { // Explain Output //===----------------------------------------------------------------------===// void ExplainOutputSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("explain_output setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -166,6 +190,9 @@ void ExplainOutputSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { // Force Bitpacking Mode //===----------------------------------------------------------------------===// void ForceBitpackingModeSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("force_bitpacking_mode setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -181,6 +208,9 @@ Value HTTPProxySetting::GetSetting(const ClientContext &context) { // Lambda Syntax //===----------------------------------------------------------------------===// void LambdaSyntaxSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("lambda_syntax setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -188,6 +218,9 @@ void LambdaSyntaxSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { // Pin Threads //===----------------------------------------------------------------------===// void PinThreadsSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("pin_threads setting 
cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -195,6 +228,9 @@ void PinThreadsSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { // Storage Block Prefetch //===----------------------------------------------------------------------===// void StorageBlockPrefetchSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("storage_block_prefetch setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } @@ -202,6 +238,9 @@ void StorageBlockPrefetchSetting::OnSet(SettingCallbackInfo &info, Value ¶me // Validate External File Cache //===----------------------------------------------------------------------===// void ValidateExternalFileCacheSetting::OnSet(SettingCallbackInfo &info, Value ¶meter) { + if (parameter.IsNull()) { + throw InvalidInputException("validate_external_file_cache setting cannot be NULL"); + } EnumUtil::FromString(StringValue::Get(parameter)); } diff --git a/src/duckdb/src/optimizer/row_group_pruner.cpp b/src/duckdb/src/optimizer/row_group_pruner.cpp index 32daada8a..95d15c644 100644 --- a/src/duckdb/src/optimizer/row_group_pruner.cpp +++ b/src/duckdb/src/optimizer/row_group_pruner.cpp @@ -131,7 +131,7 @@ optional_ptr RowGroupPruner::FindLogicalGet(const LogicalOrder &logi const auto &primary_order = logical_order.orders[0]; auto &colref = primary_order.expression->Cast(); - vector columns {JoinFilterPushdownColumn {colref.binding}}; + vector columns {JoinFilterPushdownColumn {colref.binding, colref.return_type}}; vector pushdown_targets; JoinFilterPushdownOptimizer::GetPushdownFilterTargets(*logical_order.children[0], std::move(columns), pushdown_targets); diff --git a/src/duckdb/src/parser/tableref/pivotref.cpp b/src/duckdb/src/parser/tableref/pivotref.cpp index 290ed346c..d34268736 100644 --- a/src/duckdb/src/parser/tableref/pivotref.cpp +++ b/src/duckdb/src/parser/tableref/pivotref.cpp @@ -371,7 +371,7 @@ string 
PivotRef::ToString() const { if (i > 0) { result += ", "; } - result += groups[i]; + result += KeywordHelper::WriteOptionallyQuoted(groups[i]); } } result += ")"; diff --git a/src/duckdb/src/planner/binder/statement/bind_insert.cpp b/src/duckdb/src/planner/binder/statement/bind_insert.cpp index d7e2d4203..1bd2e2225 100644 --- a/src/duckdb/src/planner/binder/statement/bind_insert.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_insert.cpp @@ -432,7 +432,7 @@ unique_ptr Binder::GenerateMergeInto(InsertStatement &stmt, // now push another subquery that adds the default columns auto select_stmt = make_uniq(); auto select_node = make_uniq(); - unordered_set set_columns; + case_insensitive_set_t set_columns; for (auto &set_col : stmt.columns) { set_columns.insert(set_col); } diff --git a/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp b/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp index 1919592eb..56d7d180f 100644 --- a/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp @@ -31,29 +31,17 @@ vector> GenerateColumnReferences(Binder &binder, co return result; } -unique_ptr Binder::BindMergeAction(LogicalMergeInto &merge_into, TableCatalogEntry &table, - LogicalGet &get, idx_t proj_index, - vector> &expressions, - unique_ptr &root, MergeIntoAction &action, - const vector &source_aliases, - const vector &source_names) { +unique_ptr +Binder::BindMergeAction(LogicalMergeInto &merge_into, TableCatalogEntry &table, LogicalGet &get, idx_t proj_index, + vector> &expressions, MergeIntoAction &action, + const vector &source_aliases, const vector &source_names) { auto result = make_uniq(); result->action_type = action.action_type; if (action.condition) { - if (action.condition->HasSubquery()) { - // if we have a subquery we need to execute the condition outside of the MERGE INTO statement - WhereBinder where_binder(*this, context); - auto cond = 
where_binder.Bind(action.condition); - PlanSubqueries(cond, root); - result->condition = - make_uniq(cond->return_type, ColumnBinding(proj_index, expressions.size())); - expressions.push_back(std::move(cond)); - } else { - ProjectionBinder proj_binder(*this, context, proj_index, expressions, "WHERE clause"); - proj_binder.target_type = LogicalType::BOOLEAN; - auto cond = proj_binder.Bind(action.condition); - result->condition = std::move(cond); - } + ProjectionBinder proj_binder(*this, context, proj_index, expressions, "WHERE clause"); + proj_binder.target_type = LogicalType::BOOLEAN; + auto cond = proj_binder.Bind(action.condition); + result->condition = std::move(cond); } switch (action.action_type) { case MergeActionType::MERGE_UPDATE: { @@ -82,7 +70,9 @@ unique_ptr Binder::BindMergeAction(LogicalMergeInto &merge action.update_info->expressions = GenerateColumnReferences(*this, source_aliases, source_names); } } - BindUpdateSet(proj_index, root, *action.update_info, table, result->columns, result->expressions, expressions); + unique_ptr fake_root; + BindUpdateSet(proj_index, fake_root, *action.update_info, table, result->columns, result->expressions, + expressions); // bind any additional columns that need to be bound for update constraints // FIXME: this is pretty hacky @@ -139,7 +129,6 @@ unique_ptr Binder::BindMergeAction(LogicalMergeInto &merge TryReplaceDefaultExpression(action.expressions[i], column); auto insert_expr = insert_binder.Bind(action.expressions[i]); - PlanSubqueries(insert_expr, root); insert_expressions.push_back(std::move(insert_expr)); } @@ -255,6 +244,27 @@ BoundStatement Binder::Bind(MergeIntoStatement &stmt) { source_names.push_back(column_names[c]); } } + // bind the WHEN NOT MATCHED BY SOURCE / TARGET merge actions + auto &get = bound_table.plan->Cast(); + auto merge_into = make_uniq(table); + merge_into->table_index = GenerateTableIndex(); + auto proj_index = GenerateTableIndex(); + vector> projection_expressions; + + for (auto &entry 
: stmt.actions) { + if (entry.first == MergeActionCondition::WHEN_MATCHED) { + continue; + } + auto &action_binder = + entry.first == MergeActionCondition::WHEN_NOT_MATCHED_BY_TARGET ? *source_binder : *target_binder; + vector> bound_actions; + for (auto &action : entry.second) { + CheckMergeAction(entry.first, action->action_type); + bound_actions.push_back(action_binder.BindMergeAction( + *merge_into, table, get, proj_index, projection_expressions, *action, source_aliases, source_names)); + } + merge_into->actions.emplace(entry.first, std::move(bound_actions)); + } // bind the join between the source and target // our conditions determine the join type we need @@ -274,7 +284,6 @@ BoundStatement Binder::Bind(MergeIntoStatement &stmt) { } else { join.type = JoinType::INNER; } - auto &get = bound_table.plan->Cast(); join.left = make_uniq(std::move(source_binding), std::move(source_binder)); join.right = make_uniq(std::move(bound_table), std::move(target_binder)); if (stmt.join_condition) { @@ -297,8 +306,6 @@ BoundStatement Binder::Bind(MergeIntoStatement &stmt) { bool inverted = join.type == JoinType::RIGHT; auto &source = join_ref.get().children[inverted ? 
1 : 0]; - auto merge_into = make_uniq(table); - merge_into->table_index = GenerateTableIndex(); if (!stmt.returning_list.empty()) { merge_into->return_chunk = true; } @@ -310,20 +317,25 @@ BoundStatement Binder::Bind(MergeIntoStatement &stmt) { merge_into->bound_constraints = BindConstraints(table); - // bind the merge actions - auto proj_index = GenerateTableIndex(); - vector> projection_expressions; - + // bind WHEN_MATCHED merge actions (can contain references to both source and target) for (auto &entry : stmt.actions) { + if (entry.first != MergeActionCondition::WHEN_MATCHED) { + continue; + } vector> bound_actions; for (auto &action : entry.second) { CheckMergeAction(entry.first, action->action_type); - bound_actions.push_back(BindMergeAction(*merge_into, table, get, proj_index, projection_expressions, root, + bound_actions.push_back(BindMergeAction(*merge_into, table, get, proj_index, projection_expressions, *action, source_aliases, source_names)); } merge_into->actions.emplace(entry.first, std::move(bound_actions)); } + // plan merge action subqueries + for (auto &expr : projection_expressions) { + PlanSubqueries(expr, root); + } + if (has_not_matched_by_source) { // if we have "has_not_matched_by_source" we need to push an extra marker into the source // this marker tells us if we have found a source match or not diff --git a/src/duckdb/src/planner/binder/statement/bind_update.cpp b/src/duckdb/src/planner/binder/statement/bind_update.cpp index c43ece2e6..e85ac8e9a 100644 --- a/src/duckdb/src/planner/binder/statement/bind_update.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_update.cpp @@ -58,7 +58,9 @@ void Binder::BindUpdateSet(idx_t proj_index, unique_ptr &root, UpdateBinder binder(*expr_binder_ptr, context); binder.target_type = column.Type(); auto bound_expr = binder.Bind(expr); - PlanSubqueries(bound_expr, root); + if (root) { + PlanSubqueries(bound_expr, root); + } update_expressions.push_back(make_uniq( bound_expr->return_type, 
ColumnBinding(proj_index, projection_expressions.size()))); diff --git a/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp b/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp index 8b790b727..0e27da6ff 100644 --- a/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +++ b/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp @@ -69,12 +69,16 @@ BoundStatement Binder::BindWithReplacementScan(ClientContext &context, BaseTable auto &subquery = replacement_function->Cast(); subquery.column_name_alias = ref.column_name_alias; } else { + // carry the alias to the wrapping SubqueryRef so qualified references + // like `SELECT d.x FROM _ AS d` can resolve against the outer ref + auto inner_alias = replacement_function->alias; auto select_node = make_uniq(); select_node->select_list.push_back(make_uniq()); select_node->from_table = std::move(replacement_function); auto select_stmt = make_uniq(); select_stmt->node = std::move(select_node); auto subquery = make_uniq(std::move(select_stmt)); + subquery->alias = std::move(inner_alias); subquery->column_name_alias = ref.column_name_alias; replacement_function = std::move(subquery); } diff --git a/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp b/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp index 3e1094e52..ec5d77977 100644 --- a/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp +++ b/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp @@ -700,14 +700,16 @@ struct UnpivotEntry { void Binder::ExtractUnpivotEntries(Binder &child_binder, PivotColumnEntry &entry, vector &unpivot_entries) { // Try to bind the entry expression as values - try { - auto expr_copy = entry.expr->Copy(); - BindPivotInList(expr_copy, entry.values, child_binder); - // successfully bound as values - clear the expression - entry.expr = nullptr; - } catch (...) 
{ - // ignore binder exceptions here - we fall back to expression mode - entry.values.clear(); + if (entry.expr) { + try { + auto expr_copy = entry.expr->Copy(); + BindPivotInList(expr_copy, entry.values, child_binder); + // successfully bound as values - clear the expression + entry.expr = nullptr; + } catch (...) { + // ignore binder exceptions here - we fall back to expression mode + entry.values.clear(); + } } if (!entry.expr) { diff --git a/src/duckdb/src/planner/expression_binder/projection_binder.cpp b/src/duckdb/src/planner/expression_binder/projection_binder.cpp index 331ed9315..4fc09e2ab 100644 --- a/src/duckdb/src/planner/expression_binder/projection_binder.cpp +++ b/src/duckdb/src/planner/expression_binder/projection_binder.cpp @@ -10,7 +10,12 @@ ProjectionBinder::ProjectionBinder(Binder &binder, ClientContext &context, idx_t } BindResult ProjectionBinder::BindColumnRef(unique_ptr &expr_ptr, idx_t depth, bool root_expression) { + if (in_child_projection) { + return ExpressionBinder::BindExpression(expr_ptr, depth); + } + in_child_projection = true; auto result = ExpressionBinder::BindExpression(expr_ptr, depth); + in_child_projection = false; if (result.HasError()) { return result; } @@ -33,6 +38,7 @@ BindResult ProjectionBinder::BindExpression(unique_ptr &expr_p case ExpressionClass::WINDOW: return BindUnsupportedExpression(expr, depth, clause + " cannot contain window functions!"); case ExpressionClass::COLUMN_REF: + case ExpressionClass::SUBQUERY: return BindColumnRef(expr_ptr, depth, root_expression); default: return ExpressionBinder::BindExpression(expr_ptr, depth); diff --git a/src/duckdb/src/storage/caching_file_system.cpp b/src/duckdb/src/storage/caching_file_system.cpp index 6a2d1d626..17110c4a2 100644 --- a/src/duckdb/src/storage/caching_file_system.cpp +++ b/src/duckdb/src/storage/caching_file_system.cpp @@ -61,32 +61,46 @@ CachingFileSystem CachingFileSystem::Get(ClientContext &context) { unique_ptr CachingFileSystem::OpenFile(const 
OpenFileInfo &path, FileOpenFlags flags, optional_ptr opener) { - return make_uniq(QueryContext(), *this, path, flags, opener, - external_file_cache.GetOrCreateCachedFile(path.path)); + return make_uniq(QueryContext(), *this, path, flags, opener); } unique_ptr CachingFileSystem::OpenFile(QueryContext context, const OpenFileInfo &path, FileOpenFlags flags, optional_ptr opener) { - return make_uniq(context, *this, path, flags, opener, - external_file_cache.GetOrCreateCachedFile(path.path)); + return make_uniq(context, *this, path, flags, opener); +} + +shared_ptr CachingFileHandle::EnsureCachedFileCurrent() { + if (cached_file && cached_file->generation == external_file_cache.GetGeneration()) { + return cached_file; + } + const bool needs_reopen = file_handle != nullptr; + if (needs_reopen) { + file_handle.reset(); + } + cached_file = external_file_cache.GetOrCreateCachedFile(path.path); + if (needs_reopen) { + GetFileHandle(); + } + return cached_file; } CachingFileHandle::CachingFileHandle(QueryContext context, CachingFileSystem &caching_file_system_p, const OpenFileInfo &path_p, FileOpenFlags flags_p, - optional_ptr opener_p, CachedFile &cached_file_p) + optional_ptr opener_p) : context(context), caching_file_system(caching_file_system_p), external_file_cache(caching_file_system.external_file_cache), path(path_p), flags(flags_p), opener(opener_p), validate( ExternalFileCacheUtil::GetCacheValidationMode(path_p, context.GetClientContext(), caching_file_system_p.db)), - cached_file(cached_file_p), position(0) { + cached_file(nullptr), position(0) { + cached_file = external_file_cache.GetOrCreateCachedFile(path_p.path); if (!external_file_cache.IsEnabled() || Validate()) { // If caching is disabled, or if we must validate cache entries, we always have to open the file GetFileHandle(); return; } // If we don't have any cached file ranges, we must also open the file. 
- auto guard = cached_file.lock.GetSharedLock(); - if (cached_file.Ranges(guard).empty()) { + auto guard = cached_file->lock.GetSharedLock(); + if (cached_file->Ranges(guard).empty()) { guard.reset(); GetFileHandle(); } @@ -101,15 +115,15 @@ FileHandle &CachingFileHandle::GetFileHandle() { last_modified = caching_file_system.file_system.GetLastModifiedTime(*file_handle); version_tag = caching_file_system.file_system.GetVersionTag(*file_handle); - auto guard = cached_file.lock.GetExclusiveLock(); - if (!cached_file.IsValid(guard, Validate(), version_tag, last_modified)) { - cached_file.Ranges(guard).clear(); // Invalidate entire cache + auto guard = cached_file->lock.GetExclusiveLock(); + if (!cached_file->IsValid(guard, Validate(), version_tag, last_modified)) { + cached_file->Ranges(guard).clear(); // Invalidate entire cache } - cached_file.FileSize(guard) = file_handle->GetFileSize(); - cached_file.LastModified(guard) = last_modified; - cached_file.VersionTag(guard) = version_tag; - cached_file.CanSeek(guard) = file_handle->CanSeek(); - cached_file.OnDiskFile(guard) = file_handle->OnDiskFile(); + cached_file->FileSize(guard) = file_handle->GetFileSize(); + cached_file->LastModified(guard) = last_modified; + cached_file->VersionTag(guard) = version_tag; + cached_file->CanSeek(guard) = file_handle->CanSeek(); + cached_file->OnDiskFile(guard) = file_handle->OnDiskFile(); } return *file_handle; } @@ -122,6 +136,7 @@ BufferHandle CachingFileHandle::Read(data_ptr_t &buffer, const idx_t nr_bytes, c GetFileHandle().Read(context, buffer, nr_bytes, location); return result; } + EnsureCachedFileCurrent(); // Try to read from the cache, filling overlapping_ranges in the process vector> overlapping_ranges; @@ -204,57 +219,62 @@ BufferHandle CachingFileHandle::Read(data_ptr_t &buffer, idx_t &nr_bytes) { } string CachingFileHandle::GetPath() const { - return cached_file.path; + return path.path; } idx_t CachingFileHandle::GetFileSize() { - if (file_handle || Validate()) { - 
return GetFileHandle().GetFileSize(); + if (!Validate()) { + auto current_cached_file = EnsureCachedFileCurrent(); + auto guard = current_cached_file->lock.GetSharedLock(); + return current_cached_file->FileSize(guard); } - auto guard = cached_file.lock.GetSharedLock(); - return cached_file.FileSize(guard); + return GetFileHandle().GetFileSize(); } timestamp_t CachingFileHandle::GetLastModifiedTime() { - if (file_handle || Validate()) { - GetFileHandle(); - return last_modified; + if (!Validate()) { + auto current_cached_file = EnsureCachedFileCurrent(); + auto guard = current_cached_file->lock.GetSharedLock(); + return current_cached_file->LastModified(guard); } - auto guard = cached_file.lock.GetSharedLock(); - return cached_file.LastModified(guard); + GetFileHandle(); + return last_modified; } string CachingFileHandle::GetVersionTag() { - if (file_handle || Validate()) { - GetFileHandle(); - return version_tag; + if (!Validate()) { + auto current_cached_file = EnsureCachedFileCurrent(); + auto guard = current_cached_file->lock.GetSharedLock(); + return current_cached_file->VersionTag(guard); } - auto guard = cached_file.lock.GetSharedLock(); - return cached_file.VersionTag(guard); + GetFileHandle(); + return version_tag; } bool CachingFileHandle::Validate() const { - return ShouldValidate(path, context.GetClientContext(), caching_file_system.db, cached_file.path); + return ShouldValidate(path, context.GetClientContext(), caching_file_system.db, path.path); } bool CachingFileHandle::CanSeek() { - if (file_handle || Validate()) { - return GetFileHandle().CanSeek(); + if (!Validate()) { + auto current_cached_file = EnsureCachedFileCurrent(); + auto guard = current_cached_file->lock.GetSharedLock(); + return current_cached_file->CanSeek(guard); } - auto guard = cached_file.lock.GetSharedLock(); - return cached_file.CanSeek(guard); + return GetFileHandle().CanSeek(); } bool CachingFileHandle::IsRemoteFile() const { - return FileSystem::IsRemoteFile(cached_file.path); 
+ return FileSystem::IsRemoteFile(path.path); } bool CachingFileHandle::OnDiskFile() { - if (file_handle || Validate()) { - return GetFileHandle().OnDiskFile(); + if (!Validate()) { + auto current_cached_file = EnsureCachedFileCurrent(); + auto guard = current_cached_file->lock.GetSharedLock(); + return current_cached_file->OnDiskFile(guard); } - auto guard = cached_file.lock.GetSharedLock(); - return cached_file.OnDiskFile(guard); + return GetFileHandle().OnDiskFile(); } const string &CachingFileHandle::GetVersionTag(const unique_ptr &guard) { @@ -262,7 +282,7 @@ const string &CachingFileHandle::GetVersionTag(const unique_ptr GetFileHandle(); return version_tag; } - return cached_file.VersionTag(guard); + return cached_file->VersionTag(guard); } idx_t CachingFileHandle::SeekPosition() { @@ -282,8 +302,8 @@ BufferHandle CachingFileHandle::TryReadFromCache(data_ptr_t &buffer, idx_t nr_by BufferHandle result; // Get read lock for cached ranges - auto guard = cached_file.lock.GetSharedLock(); - auto &ranges = cached_file.Ranges(guard); + auto guard = cached_file->lock.GetSharedLock(); + auto &ranges = cached_file->Ranges(guard); // First, try to see if we've read from the exact same location before auto it = ranges.find(location); @@ -354,8 +374,8 @@ BufferHandle CachingFileHandle::TryReadFromFileRange(const unique_ptr &new_file_range) { // Grab the lock again (write lock this time) to insert the newly created buffer into the ranges - auto guard = cached_file.lock.GetExclusiveLock(); - auto &ranges = cached_file.Ranges(guard); + auto guard = cached_file->lock.GetExclusiveLock(); + auto &ranges = cached_file->Ranges(guard); // Start at lower_bound (first range with location not less than location of newly created range) const auto this_end = location + nr_bytes; @@ -399,7 +419,7 @@ BufferHandle CachingFileHandle::TryInsertFileRange(BufferHandle &pin, data_ptr_t // Finally, insert newly created buffer into the map new_file_range->AddCheckSum(); ranges[location] = 
std::move(new_file_range); - cached_file.Verify(guard); + cached_file->Verify(guard); return std::move(pin); } diff --git a/src/duckdb/src/storage/compression/rle.cpp b/src/duckdb/src/storage/compression/rle.cpp index b20899e30..7400d6959 100644 --- a/src/duckdb/src/storage/compression/rle.cpp +++ b/src/duckdb/src/storage/compression/rle.cpp @@ -258,52 +258,71 @@ void RLEFinalizeCompress(CompressionState &state_p) { //===--------------------------------------------------------------------===// template struct RLEScanState : public SegmentScanState { - explicit RLEScanState(ColumnSegment &segment) { - auto &buffer_manager = BufferManager::GetBufferManager(segment.db); - handle = buffer_manager.Pin(segment.block); - entry_pos = 0; - position_in_entry = 0; - rle_count_offset = UnsafeNumericCast(Load(handle.Ptr() + segment.GetBlockOffset())); - D_ASSERT(rle_count_offset <= segment.GetBlockSize()); + explicit RLEScanState(ColumnSegment &segment) + : handle(BufferManager::GetBufferManager(segment.db).Pin(segment.block)), entry_pos(0), position_in_entry(0), + rle_count_offset(UnsafeNumericCast(Load(handle.Ptr() + segment.GetBlockOffset()))), + data_pointer(reinterpret_cast(handle.Ptr() + segment.GetBlockOffset() + RLEConstants::RLE_HEADER_SIZE)), + index_pointer(reinterpret_cast(handle.Ptr() + segment.GetBlockOffset() + rle_count_offset)), + max_entry_pos(static_cast(reinterpret_cast(handle.Ptr() + segment.GetBlockSize()) - + reinterpret_cast(index_pointer)) / + static_cast(sizeof(rle_count_t))) { + if (rle_count_offset < RLEConstants::RLE_HEADER_SIZE) { + //! This would make the index_pointer point into a region reserved for the header data + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } + if (segment.GetBlockOffset() + rle_count_offset > segment.GetBlockSize()) { + //! 
This would make the index_pointer start outside of the segment + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } + if ((rle_count_offset - RLEConstants::RLE_HEADER_SIZE) / sizeof(T) > max_entry_pos) { + //! This would make the indexing of the index_pointer[entry_pos] reach outside of the segment + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } } - inline void SkipInternal(rle_count_t *index_pointer, idx_t skip_count) { + inline void SkipInternal(idx_t skip_count) { while (skip_count > 0) { rle_count_t run_end = index_pointer[entry_pos]; idx_t skip_amount = MinValue(skip_count, run_end - position_in_entry); skip_count -= skip_amount; position_in_entry += skip_amount; - if (ExhaustedRun(index_pointer)) { + if (ExhaustedRun()) { ForwardToNextRun(); } } } void Skip(ColumnSegment &segment, idx_t skip_count) { - auto data = handle.Ptr() + segment.GetBlockOffset(); - auto index_pointer = reinterpret_cast(data + rle_count_offset); - SkipInternal(index_pointer, skip_count); + SkipInternal(skip_count); } inline void ForwardToNextRun() { // handled all entries in this RLE value // move to the next entry entry_pos++; + if (entry_pos > max_entry_pos) { + throw IOException( + "Corrupted RLE segment: index_pointer[entry_pos] would reach outside of the blocks memory"); + } position_in_entry = 0; } - inline bool ExhaustedRun(rle_count_t *index_pointer) { + inline bool ExhaustedRun() { return position_in_entry >= index_pointer[entry_pos]; } BufferHandle handle; idx_t entry_pos; idx_t position_in_entry; - uint32_t rle_count_offset; + const uint32_t rle_count_offset; //! 
If we are running a filter over the column - the runs that match the filter unsafe_unique_array matching_runs; idx_t matching_run_count = 0; + + const T *data_pointer; + const rle_count_t *index_pointer; + const idx_t max_entry_pos; }; template @@ -338,13 +357,12 @@ static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_c } template -static void RLEScanConstant(RLEScanState &scan_state, rle_count_t *index_pointer, T *data_pointer, idx_t scan_count, - Vector &result) { +static void RLEScanConstant(RLEScanState &scan_state, idx_t scan_count, Vector &result) { result.SetVectorType(VectorType::CONSTANT_VECTOR); auto result_data = ConstantVector::GetData(result); - result_data[0] = data_pointer[scan_state.entry_pos]; + result_data[0] = scan_state.data_pointer[scan_state.entry_pos]; scan_state.position_in_entry += scan_count; - if (scan_state.ExhaustedRun(index_pointer)) { + if (scan_state.ExhaustedRun()) { scan_state.ForwardToNextRun(); } return; @@ -355,14 +373,10 @@ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_ idx_t result_offset) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); - // If we are scanning an entire Vector and it contains only a single run - if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], - scan_count)) { - RLEScanConstant(scan_state, index_pointer, data_pointer, scan_count, result); + if (CanEmitConstantVector(scan_state.position_in_entry, + scan_state.index_pointer[scan_state.entry_pos], scan_count)) { + RLEScanConstant(scan_state, scan_count, result); return; } @@ -371,10 +385,10 @@ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_ idx_t result_end = result_offset + scan_count; while (result_offset < 
result_end) { - rle_count_t run_end = index_pointer[scan_state.entry_pos]; + rle_count_t run_end = scan_state.index_pointer[scan_state.entry_pos]; idx_t run_count = run_end - scan_state.position_in_entry; idx_t remaining_scan_count = result_end - result_offset; - T element = data_pointer[scan_state.entry_pos]; + T element = scan_state.data_pointer[scan_state.entry_pos]; if (DUCKDB_UNLIKELY(run_count > remaining_scan_count)) { for (idx_t i = 0; i < remaining_scan_count; i++) { result_data[result_offset + i] = element; @@ -411,13 +425,10 @@ void RLESelect(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun const SelectionVector &sel, idx_t sel_count) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); - // If we are scanning an entire Vector and it contains only a single run we don't need to select at all - if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], vector_count)) { - RLEScanConstant(scan_state, index_pointer, data_pointer, vector_count, result); + if (CanEmitConstantVector(scan_state.position_in_entry, scan_state.index_pointer[scan_state.entry_pos], + vector_count)) { + RLEScanConstant(scan_state, vector_count, result); return; } @@ -431,14 +442,14 @@ void RLESelect(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun throw InternalException("Error in RLESelect - selection vector indices are not ordered"); } // skip forward to the next index - scan_state.SkipInternal(index_pointer, next_idx - prev_idx); + scan_state.SkipInternal(next_idx - prev_idx); // read the element - result_data[i] = data_pointer[scan_state.entry_pos]; + result_data[i] = scan_state.data_pointer[scan_state.entry_pos]; // move the next to the prev prev_idx = next_idx; } // skip the tail - 
scan_state.SkipInternal(index_pointer, vector_count - prev_idx); + scan_state.SkipInternal(vector_count - prev_idx); } //===--------------------------------------------------------------------===// @@ -449,9 +460,8 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun idx_t &sel_count, const TableFilter &filter, TableFilterState &filter_state) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); + auto data_pointer = const_cast(scan_state.data_pointer); + auto index_pointer = const_cast(scan_state.index_pointer); auto total_run_count = (scan_state.rle_count_offset - RLEConstants::RLE_HEADER_SIZE) / sizeof(T); if (!scan_state.matching_runs) { @@ -532,7 +542,7 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun throw InternalException("Error in RLEFilter - selection vector indices are not ordered"); } // skip forward to the next index - scan_state.SkipInternal(index_pointer, read_idx - prev_idx); + scan_state.SkipInternal(read_idx - prev_idx); prev_idx = read_idx; if (!scan_state.matching_runs[scan_state.entry_pos]) { // this run is filtered out - we don't need to scan it @@ -543,7 +553,7 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun matching_sel.set_index(matching_count++, read_idx); } // skip the tail - scan_state.SkipInternal(index_pointer, vector_count - prev_idx); + scan_state.SkipInternal(vector_count - prev_idx); } // set up the filter result diff --git a/src/duckdb/src/storage/external_file_cache.cpp b/src/duckdb/src/storage/external_file_cache.cpp index 0a5a04963..9229dbfa8 100644 --- a/src/duckdb/src/storage/external_file_cache.cpp +++ b/src/duckdb/src/storage/external_file_cache.cpp @@ -5,9 +5,42 @@ #include "duckdb/main/database.hpp" #include 
"duckdb/storage/buffer_manager.hpp" #include "duckdb/storage/buffer/block_handle.hpp" +#include "duckdb/storage/object_cache.hpp" namespace duckdb { +class ExternalFileCache::ExternalFileCacheObjectCacheEntry : public ObjectCacheEntry { +public: + ExternalFileCacheObjectCacheEntry(ExternalFileCache &cache_p, string path_p, idx_t generation_p) + : cache(cache_p), cached_file(make_shared_ptr(std::move(path_p), generation_p)) { + cache.InsertCachedFileKey(cached_file->path); + } + + ~ExternalFileCacheObjectCacheEntry() override { + cache.EraseCachedFileKey(cached_file->path); + } + + static string ObjectType() { + return "external_file_cache"; + } + + string GetObjectType() override { + return ObjectType(); + } + + optional_idx GetEstimatedCacheMemory() const override { + return cached_file->path.size() * 2; + } + + shared_ptr GetCachedFile() const { + return cached_file; + } + +private: + ExternalFileCache &cache; + shared_ptr cached_file; +}; + ExternalFileCache::CachedFileRange::CachedFileRange(shared_ptr block_handle_p, idx_t nr_bytes_p, idx_t location_p, string version_tag_p) : block_handle(std::move(block_handle_p)), nr_bytes(nr_bytes_p), location(location_p), @@ -57,8 +90,9 @@ void ExternalFileCache::CachedFileRange::VerifyCheckSum() { #endif } -ExternalFileCache::CachedFile::CachedFile(string path_p) - : path(std::move(path_p)), file_size(0), last_modified(0), can_seek(false), on_disk_file(false) { +ExternalFileCache::CachedFile::CachedFile(string path_p, idx_t generation_p) + : path(std::move(path_p)), generation(generation_p), file_size(0), last_modified(0), can_seek(false), + on_disk_file(false) { } void ExternalFileCache::CachedFile::Verify(const unique_ptr &guard) const { @@ -131,7 +165,7 @@ ExternalFileCache::CachedFile::Ranges(const unique_ptr &guard) { } ExternalFileCache::ExternalFileCache(DatabaseInstance &db, bool enable_p) - : buffer_manager(BufferManager::GetBufferManager(db)), enable(enable_p) { + : 
buffer_manager(BufferManager::GetBufferManager(db)), enable(enable_p), generation(0) { } bool ExternalFileCache::IsEnabled() const { @@ -139,27 +173,61 @@ bool ExternalFileCache::IsEnabled() const { } void ExternalFileCache::SetEnabled(bool enable_p) { - lock_guard guard(lock); - enable = enable_p; - if (!enable) { - cached_files.clear(); + vector keys_to_delete; + { + const lock_guard guard(lock); + if (enable == enable_p) { + return; + } + enable = enable_p; + generation++; + if (!enable) { + keys_to_delete.reserve(cached_file_keys.size()); + for (auto &key : cached_file_keys) { + keys_to_delete.emplace_back(key); + } + } } + DeleteObjectCacheEntries(keys_to_delete); +} + +idx_t ExternalFileCache::GetGeneration() const { + return generation; } vector ExternalFileCache::GetCachedFileInformation() const { - unique_lock files_guard(lock); + vector keys; + { + const lock_guard files_guard(lock); + keys.reserve(cached_file_keys.size()); + for (auto &key : cached_file_keys) { + keys.emplace_back(key); + } + } + + auto &object_cache = buffer_manager.GetDatabase().GetObjectCache(); vector result; - for (const auto &file : cached_files) { - auto ranges_guard = file.second->lock.GetSharedLock(); - for (const auto &range_entry : file.second->Ranges(ranges_guard)) { + for (const auto &key : keys) { + auto entry = object_cache.GetWithTypePrefix(key); + if (!entry) { + continue; + } + auto file = entry->GetCachedFile(); + auto ranges_guard = file->lock.GetSharedLock(); + for (const auto &range_entry : file->Ranges(ranges_guard)) { const auto &range = *range_entry.second; result.push_back( - {file.first, range.nr_bytes, range.location, !range.block_handle->GetMemory().IsUnloaded()}); + {file->path, range.nr_bytes, range.location, !range.block_handle->GetMemory().IsUnloaded()}); } } return result; } +idx_t ExternalFileCache::GetCachedFileCount() const { + const lock_guard files_guard(lock); + return cached_file_keys.size(); +} + ExternalFileCache 
&ExternalFileCache::Get(DatabaseInstance &db) { return db.GetExternalFileCache(); } @@ -172,13 +240,48 @@ BufferManager &ExternalFileCache::GetBufferManager() const { return buffer_manager; } -ExternalFileCache::CachedFile &ExternalFileCache::GetOrCreateCachedFile(const string &path) { - lock_guard guard(lock); - auto &entry = cached_files[path]; - if (!entry) { - entry = make_uniq(path); +void ExternalFileCache::DeleteObjectCacheEntries(const vector &paths) { + auto &object_cache = buffer_manager.GetDatabase().GetObjectCache(); + for (auto &path : paths) { + object_cache.DeleteWithTypePrefix(path); + } +} + +shared_ptr ExternalFileCache::GetOrCreateCachedFile(const string &path) { + auto &object_cache = buffer_manager.GetDatabase().GetObjectCache(); + while (true) { + const auto current_generation = generation.load(); + if (!enable) { + return make_shared_ptr(path, current_generation); + } + + auto entry = object_cache.GetOrCreateWithTypePrefix(path, *this, path, + current_generation); + auto cached_file = entry->GetCachedFile(); + + if (!enable) { + object_cache.DeleteWithTypePrefix(path); + return make_shared_ptr(path, current_generation); + } + if (cached_file->generation != current_generation) { + object_cache.DeleteWithTypePrefix(path); + continue; + } + return cached_file; } - return *entry; +} + +void ExternalFileCache::InsertCachedFileKey(const string &path) { + const lock_guard guard(lock); + auto inserted = cached_file_keys.insert(path); + ALWAYS_ASSERT(inserted.second); +} + +void ExternalFileCache::EraseCachedFileKey(const string &path) { + const lock_guard guard(lock); + auto entry = cached_file_keys.find(path); + ALWAYS_ASSERT(entry != cached_file_keys.end()); + cached_file_keys.erase(entry); } } // namespace duckdb diff --git a/src/duckdb/src/storage/optimistic_data_writer.cpp b/src/duckdb/src/storage/optimistic_data_writer.cpp index b426d9ae5..3e5106856 100644 --- a/src/duckdb/src/storage/optimistic_data_writer.cpp +++ 
b/src/duckdb/src/storage/optimistic_data_writer.cpp @@ -137,6 +137,9 @@ void OptimisticDataWriter::WriteUnflushedRowGroups(OptimisticWriteCollection &ro } row_groups.unflushed_row_groups.clear(); row_groups.partial_block_managers.clear(); + // any new append to the row group collection needs to append a new row group + // otherwise we append to an already flushed row group + row_groups.collection->SetRowGroupAppendMode(RowGroupAppendMode::REQUIRE_NEW); } void OptimisticWriteCollection::MergeStorage(OptimisticWriteCollection &merge_collection) { @@ -168,7 +171,7 @@ void OptimisticWriteCollection::MergeStorage(OptimisticWriteCollection &merge_co // we cannot append into a row group that has been flushed if (complete_row_groups == collection->GetRowGroupCount()) { // if the last row group has been flushed move any new appends to a new row group - collection->SetRowGroupAppendMode(RowGroupAppendMode::SUGGEST_NEW); + collection->SetRowGroupAppendMode(RowGroupAppendMode::REQUIRE_NEW); } } diff --git a/src/duckdb/src/storage/statistics/base_statistics.cpp b/src/duckdb/src/storage/statistics/base_statistics.cpp index 54c3a7b11..d3c33605c 100644 --- a/src/duckdb/src/storage/statistics/base_statistics.cpp +++ b/src/duckdb/src/storage/statistics/base_statistics.cpp @@ -12,7 +12,8 @@ namespace duckdb { -BaseStatistics::BaseStatistics() : type(LogicalType::INVALID) { +BaseStatistics::BaseStatistics() : type(LogicalType::INVALID), has_null(false), has_no_null(false), distinct_count(0) { + memset(&stats_union, 0, sizeof(stats_union)); } BaseStatistics::BaseStatistics(LogicalType type) { @@ -20,7 +21,10 @@ BaseStatistics::BaseStatistics(LogicalType type) { } void BaseStatistics::Construct(BaseStatistics &stats, LogicalType type) { + stats.has_null = false; + stats.has_no_null = false; stats.distinct_count = 0; + memset(&stats.stats_union, 0, sizeof(stats.stats_union)); stats.type = std::move(type); switch (GetStatsType(stats.type)) { case StatisticsType::LIST_STATS: diff --git 
a/src/duckdb/src/storage/statistics/geometry_stats.cpp b/src/duckdb/src/storage/statistics/geometry_stats.cpp index 28a36c178..b5455a6e1 100644 --- a/src/duckdb/src/storage/statistics/geometry_stats.cpp +++ b/src/duckdb/src/storage/statistics/geometry_stats.cpp @@ -4,6 +4,7 @@ #include "duckdb/common/types/vector.hpp" #include "duckdb/common/serializer/serializer.hpp" #include "duckdb/common/serializer/deserializer.hpp" +#include "duckdb/planner/expression/bound_cast_expression.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" #include "duckdb/planner/expression/bound_function_expression.hpp" @@ -231,8 +232,10 @@ static FilterPropagateResult CheckIntersectionFilter(const GeometryStatsData &da return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - // This has been checked before and needs to be true for the checks below to be valid - D_ASSERT(data.extent.HasXY()); + // This has been checked before and needs to be true for the checks below to be valid. + // Note: only one axis needs to be set; an unknown axis is an infinite range that + // intersects everything, so the IntersectsXY/ContainsXY math below stays valid. + D_ASSERT(data.extent.CanPruneXY()); const auto &geom = StringValue::Get(constant); auto extent = GeometryExtent::Empty(); @@ -288,8 +291,23 @@ FilterPropagateResult GeometryStats::CheckZonemap(const BaseStatistics &stats, c return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - const auto lhs_kind = func.children[0]->GetExpressionType(); - const auto rhs_kind = func.children[1]->GetExpressionType(); + // The column reference may be wrapped in a GEOMETRY -> GEOMETRY cast (e.g. a CRS-erasing cast inserted to match + // the predicate's argument type). Such casts only change CRS metadata, not coordinates, so the bounding box + // remains valid. Look through them when classifying the operands. 
+ auto strip_geometry_cast = [](const Expression &child) -> const Expression * { + if (child.GetExpressionType() == ExpressionType::OPERATOR_CAST) { + auto &cast = child.Cast(); + if (cast.child->return_type.id() == LogicalTypeId::GEOMETRY) { + return cast.child.get(); + } + } + return &child; + }; + + const auto &lhs = *strip_geometry_cast(*func.children[0]); + const auto &rhs = *strip_geometry_cast(*func.children[1]); + const auto lhs_kind = lhs.GetExpressionType(); + const auto rhs_kind = rhs.GetExpressionType(); const auto lhs_is_const = lhs_kind == ExpressionType::VALUE_CONSTANT && rhs_kind == ExpressionType::BOUND_REF; const auto rhs_is_const = rhs_kind == ExpressionType::VALUE_CONSTANT && lhs_kind == ExpressionType::BOUND_REF; @@ -300,16 +318,18 @@ FilterPropagateResult GeometryStats::CheckZonemap(const BaseStatistics &stats, c auto &data = GetDataUnsafe(stats); - if (!data.extent.HasXY()) { - // If the extent is empty or unknown, we cannot prune + if (!data.extent.CanPruneXY()) { + // If neither axis is set (the extent is empty or fully unknown), we cannot prune. + // A single known axis is enough: the unknown axis is an infinite range that + // intersects everything, so pruning degrades to the known axis. 
return FilterPropagateResult::NO_PRUNING_POSSIBLE; } if (lhs_is_const) { - return CheckIntersectionFilter(data, func.children[0]->Cast().value); + return CheckIntersectionFilter(data, lhs.Cast().value); } if (rhs_is_const) { - return CheckIntersectionFilter(data, func.children[1]->Cast().value); + return CheckIntersectionFilter(data, rhs.Cast().value); } // Else, no constant argument return FilterPropagateResult::NO_PRUNING_POSSIBLE; diff --git a/src/duckdb/src/storage/storage_info.cpp b/src/duckdb/src/storage/storage_info.cpp index 7d57d2693..70fd83e5d 100644 --- a/src/duckdb/src/storage/storage_info.cpp +++ b/src/duckdb/src/storage/storage_info.cpp @@ -92,10 +92,12 @@ static const StorageVersionInfo storage_version_info[] = { {"v1.4.2", 67}, {"v1.4.3", 67}, {"v1.4.4", 67}, + {"v1.4.5", 67}, {"v1.5.0", 68}, {"v1.5.1", 68}, {"v1.5.2", 68}, {"v1.5.3", 68}, + {"v1.5.4", 68}, {nullptr, 0} }; // END OF STORAGE VERSION INFO @@ -125,10 +127,12 @@ static const SerializationVersionInfo serialization_version_info[] = { {"v1.4.2", 6}, {"v1.4.3", 6}, {"v1.4.4", 6}, + {"v1.4.5", 6}, {"v1.5.0", 7}, {"v1.5.1", 7}, {"v1.5.2", 7}, {"v1.5.3", 7}, + {"v1.5.4", 7}, {"latest", 7}, {nullptr, 0} }; diff --git a/src/duckdb/src/storage/table/geo_column_data.cpp b/src/duckdb/src/storage/table/geo_column_data.cpp index efe26a870..f13211a47 100644 --- a/src/duckdb/src/storage/table/geo_column_data.cpp +++ b/src/duckdb/src/storage/table/geo_column_data.cpp @@ -336,8 +336,15 @@ unique_ptr GeoColumnData::Checkpoint(const RowGroup &row_ auto &partial_block_manager = info.GetPartialBlockManager(); auto checkpoint_state = make_uniq(row_group, *this, partial_block_manager); - auto &old_column_stats = - base_column->GetType().id() == LogicalTypeId::GEOMETRY ? old_stats : base_column->GetStatisticsRef(); + // When the inner column is unshredded, the geometry old_stats are already correct. + // When the inner column is shredded, the base_column has no stats of its own (it is parented to us). 
+ // Shredded columns are always re-written from scratch, and the stats are recomputed, so the empty stats of the + // inner layout type are a correct default in these cases. + unique_ptr shredded_stats; + if (base_column->GetType().id() != LogicalTypeId::GEOMETRY) { + shredded_stats = BaseStatistics::CreateEmpty(base_column->GetType()).ToUnique(); + } + auto &old_column_stats = shredded_stats ? *shredded_stats : old_stats; // Are there any changes? if (!HasAnyChanges()) { diff --git a/src/duckdb/src/storage/table/per_column_metadata_blocks.cpp b/src/duckdb/src/storage/table/per_column_metadata_blocks.cpp index 5ec3f82f3..afc7d6f34 100644 --- a/src/duckdb/src/storage/table/per_column_metadata_blocks.cpp +++ b/src/duckdb/src/storage/table/per_column_metadata_blocks.cpp @@ -73,6 +73,33 @@ void PerColumnMetadataBlocks::AddColumn(idx_t col_idx, const vector &bloc } } +PerColumnMetadataBlocks PerColumnMetadataBlocks::Merge(const PerColumnMetadataBlocks &a, + const PerColumnMetadataBlocks &b) { + PerColumnMetadataBlocks result; + result.data.reserve(a.data.size() + b.data.size()); + idx_t ai = 0; + idx_t bi = 0; + // each marker is followed by its blocks until the next marker; data is sorted by column index + while (ai < a.data.size() && bi < b.data.size()) { + D_ASSERT(a.data[ai].is_column_index && b.data[bi].is_column_index); + D_ASSERT(a.data[ai].index != b.data[bi].index); + bool take_a = a.data[ai].index < b.data[bi].index; + const auto &src = take_a ? a.data : b.data; + idx_t &pos = take_a ? 
ai : bi; + result.data.push_back(src[pos++]); // marker + while (pos < src.size() && !src[pos].is_column_index) { + result.data.push_back(src[pos++]); + } + } + while (ai < a.data.size()) { + result.data.push_back(a.data[ai++]); + } + while (bi < b.data.size()) { + result.data.push_back(b.data[bi++]); + } + return result; +} + void PerColumnMetadataBlocks::RemoveColumn(idx_t col_idx) { idx_t start = data.size(); idx_t end = data.size(); diff --git a/src/duckdb/src/storage/table/row_group.cpp b/src/duckdb/src/storage/table/row_group.cpp index e333e3c31..f572c3af7 100644 --- a/src/duckdb/src/storage/table/row_group.cpp +++ b/src/duckdb/src/storage/table/row_group.cpp @@ -1211,6 +1211,21 @@ idx_t RowGroup::GetCommittedRowCount() { return count - vinfo->GetCommittedDeletedCount(count); } +idx_t RowGroup::GetVisibleRowCount(TransactionData transaction) { + auto vinfo = GetVersionInfo(); + if (!vinfo) { + return count; + } + ScanOptions options(transaction); + idx_t visible_count = 0; + SelectionVector sel(STANDARD_VECTOR_SIZE); + for (idx_t r = 0, i = 0; r < count; r += STANDARD_VECTOR_SIZE, i++) { + idx_t max_count = MinValue(STANDARD_VECTOR_SIZE, count - r); + visible_count += vinfo->GetSelVector(options, i, sel, max_count); + } + return visible_count; +} + bool RowGroup::HasUnloadedDeletes() const { if (deletes_pointers.empty()) { // no stored deletes at all @@ -1354,6 +1369,7 @@ RowGroupWriteData RowGroup::WriteToDisk(RowGroupWriter &writer) { RowGroupWriteInfo info(writer.GetPartialBlockManager(), compression_types, writer.GetCheckpointOptions()); + vector reused_columns; for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) { bool column_has_changes = true; if (partial_reuse) { @@ -1367,6 +1383,7 @@ RowGroupWriteData RowGroup::WriteToDisk(RowGroupWriter &writer) { if (!column_has_changes) { // reuse this column's metadata result.states.push_back(nullptr); + reused_columns.emplace_back(column_idx); result_row_group->column_pointers[column_idx] = 
column_pointers[column_idx]; // carry forward existing column data and statistics if (!ColumnIsLoaded(column_idx)) { @@ -1393,6 +1410,16 @@ RowGroupWriteData RowGroup::WriteToDisk(RowGroupWriter &writer) { } } + if (partial_reuse) { + // carry forward the extras for reused columns onto the new row group, so RowGroup::Checkpoint + // can look them up via this->per_column_metadata_blocks + auto extras = per_column_metadata_blocks.GetBlocksForColumns(reused_columns); + for (idx_t i = 0; i < reused_columns.size(); i++) { + result_row_group->per_column_metadata_blocks.AddColumn(reused_columns[i], extras[i]); + } + result_row_group->has_per_column_metadata_blocks = true; + } + result.result_row_group = std::move(result_row_group); return result; } @@ -1452,7 +1479,6 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite } // write path: write column metadata to disk (with optional per-column reuse) D_ASSERT(write_data.states.size() == GetColumnCount()); - vector reused_columns; // merge stats { @@ -1460,7 +1486,6 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) { bool is_reused = !write_data.states[column_idx]; if (is_reused) { - reused_columns.emplace_back(column_idx); if (!ColumnIsLoaded(column_idx) && collection.get().GetTypes()[column_idx].id() != LogicalTypeId::VARIANT) { writer.SetHasUnloadedColumn(column_idx); @@ -1477,28 +1502,17 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite // collect blocks that need to be preserved for reused columns vector reused_column_blocks; - - vector> extra_blocks_for_columns; - if (!reused_columns.empty()) { - extra_blocks_for_columns = per_column_metadata_blocks.GetBlocksForColumns(reused_columns); - } - idx_t reused_column_idx = 0; + // per-column extras for newly written columns, collected separately and merged with the + // partial map of reused-column extras 
(carried over onto this row group by WriteToDisk). + PerColumnMetadataBlocks written_column_blocks; for (idx_t column_idx = 0; column_idx < GetColumnCount(); column_idx++) { bool is_reused = !write_data.states[column_idx]; if (is_reused) { - // reuse existing column pointer and per-column blocks + // reuse existing column pointer (extras are already carried on this->per_column_metadata_blocks) auto col_ptr = column_pointers[column_idx]; row_group_pointer.data_pointers.push_back(col_ptr); - auto &col_blocks = extra_blocks_for_columns[reused_column_idx]; - row_group_pointer.per_column_metadata_blocks.AddColumn(column_idx, col_blocks); - - // collect all blocks for this reused column for ClearModifiedBlocks reused_column_blocks.push_back(col_ptr); - for (auto &block_id : col_blocks) { - reused_column_blocks.emplace_back(block_id, 0); - } - ++reused_column_idx; continue; } // write new metadata for this column @@ -1538,16 +1552,35 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite col_extra_blocks.push_back(written_ptr.block_pointer); } } - row_group_pointer.per_column_metadata_blocks.AddColumn(column_idx, col_extra_blocks); + written_column_blocks.AddColumn(column_idx, col_extra_blocks); } if (GetCollection().SupportsPerColumnWrites()) { - row_group_pointer.has_per_column_metadata_blocks = true; // blocks already populated above + row_group_pointer.has_per_column_metadata_blocks = true; + if (write_data.write_action == RowGroupWriteAction::PARTIALLY_REUSE_COLUMN_METADATA) { + // merge reused-column extras (on this) with newly-written-column extras + D_ASSERT(has_per_column_metadata_blocks); + row_group_pointer.per_column_metadata_blocks = + PerColumnMetadataBlocks::Merge(per_column_metadata_blocks, written_column_blocks); + + // reused column blocks must be preserved by ClearModifiedBlocks + per_column_metadata_blocks.ForEachBlock( + [&](idx_t, idx_t block_id) { reused_column_blocks.emplace_back(block_id, 0); }); + } else { + 
row_group_pointer.per_column_metadata_blocks = written_column_blocks; + } + row_group_pointer.has_metadata_blocks = false; + row_group_pointer.extra_metadata_blocks.clear(); } else { + // Per-column reuse is not supported, so instead flatten the newly-written per-column extras into + // extra_metadata_blocks to still allow reusing the full row-group metadata on future checkpoints. + D_ASSERT(write_data.write_action != RowGroupWriteAction::PARTIALLY_REUSE_COLUMN_METADATA); + row_group_pointer.has_per_column_metadata_blocks = false; + row_group_pointer.per_column_metadata_blocks = {}; row_group_pointer.has_metadata_blocks = true; - row_group_pointer.per_column_metadata_blocks.ForEachBlock( + row_group_pointer.extra_metadata_blocks.clear(); + written_column_blocks.ForEachBlock( [&](idx_t, idx_t block_id) { row_group_pointer.extra_metadata_blocks.push_back(block_id); }); - row_group_pointer.per_column_metadata_blocks = {}; } if (metadata_manager) { diff --git a/src/duckdb/src/storage/table/row_group_collection.cpp b/src/duckdb/src/storage/table/row_group_collection.cpp index 37cb8495b..699599dd0 100644 --- a/src/duckdb/src/storage/table/row_group_collection.cpp +++ b/src/duckdb/src/storage/table/row_group_collection.cpp @@ -24,6 +24,19 @@ namespace duckdb { +static bool CanRebuildExistingIndexesAfterVacuum(DataTableInfo &info, AttachedDatabase &attached, idx_t total_rows) { + auto &indexes = info.GetIndexes(); + if (indexes.Empty() || indexes.HasUnbound()) { + return false; + } + auto vacuum_rebuild_threshold = attached.GetVacuumRebuildIndexThreshold(); + if (vacuum_rebuild_threshold == 0 || total_rows > vacuum_rebuild_threshold) { + return false; + } + auto index_types = indexes.DistinctIndexTypes(); + return index_types.size() == 1 && index_types.count(ART::TYPE_NAME); +} + //===--------------------------------------------------------------------===// // Row Group Segment Tree //===--------------------------------------------------------------------===// @@ -488,12 
+501,13 @@ void RowGroupCollection::InitializeAppend(TransactionData transaction, TableAppe bool needs_new_row_group = state.row_groups->IsEmpty(l) || row_group_append_mode == RowGroupAppendMode::REQUIRE_NEW; // Otherwise we evaluate the row_group_append_mode if (!needs_new_row_group) { - if (info->GetIndexes().Empty()) { - // We honor SUGGEST_NEW unless the table has indexes because there is no vacuuming for indexed tables... + if (info->GetIndexes().Empty() || CanRebuildExistingIndexesAfterVacuum(*info, GetAttached(), GetTotalRows())) { + // Honor SUGGEST_NEW if vacuum can compact the table later, either because there are no indexes or because + // the existing indexes can be rebuilt after vacuuming. needs_new_row_group = row_group_append_mode == RowGroupAppendMode::SUGGEST_NEW; } else { - // ... and if it has indexes we will ignore row_group_append_mode and try to append, unless the last row - // group is full already. + // If the table has indexes that vacuum cannot rebuild, ignore row_group_append_mode and try to append, + // unless the last row group is full already. needs_new_row_group = row_group_size < state.row_groups->GetLastSegment(l)->GetNode().count; } } @@ -1344,11 +1358,8 @@ void RowGroupCollection::InitializeVacuumState(CollectionCheckpointState &checkp // *unless* vacuum_rebuild_indexes threshold is set, the table's row count // is within the threshold, and all indexes are bound ART indexes, // in which case we allow vacuuming and rebuild the indexes afterward. 
- auto vacuum_rebuild_threshold = Settings::Get(checkpoint_state.writer.GetDatabase()); - auto index_types = info->GetIndexes().DistinctIndexTypes(); - state.can_rebuild_indexes = has_indexes && !info->GetIndexes().HasUnbound() && index_types.size() == 1 && - index_types.count(ART::TYPE_NAME) && vacuum_rebuild_threshold > 0 && - GetTotalRows() <= vacuum_rebuild_threshold; + state.can_rebuild_indexes = + CanRebuildExistingIndexesAfterVacuum(*info, checkpoint_state.writer.GetAttached(), GetTotalRows()); // We can move around rowids if we either 1) don't have any indexes at all or 2) can_rebuild_indexes is true (in // which case indexes are entirely rebuilt after vacuuming). diff --git a/src/duckdb/src/storage/table/row_group_reorderer.cpp b/src/duckdb/src/storage/table/row_group_reorderer.cpp index 642b8571f..5913102ac 100644 --- a/src/duckdb/src/storage/table/row_group_reorderer.cpp +++ b/src/duckdb/src/storage/table/row_group_reorderer.cpp @@ -22,9 +22,10 @@ bool CompareValues(const Value &v1, const Value &v2, const OrderByStatistics ord return (order == OrderByStatistics::MAX && v1 < v2) || (order == OrderByStatistics::MIN && v1 > v2); } -idx_t GetQualifyingTupleCount(RowGroup &row_group, BaseStatistics &stats, const OrderByColumnType type) { +idx_t GetMinimumQualifyingTupleCount(RowGroup &row_group, BaseStatistics &stats, const OrderByColumnType type, + TransactionData transaction) { if (!stats.CanHaveNull()) { - return row_group.GetCommittedRowCount(); + return row_group.GetVisibleRowCount(transaction); } if (type == OrderByColumnType::NUMERIC) { @@ -44,7 +45,7 @@ idx_t GetQualifyingTupleCount(RowGroup &row_group, BaseStatistics &stats, const template void AddRowGroups(multimap &row_group_map, It it, End end, vector>> &ordered_row_groups, const idx_t row_limit, - const OrderByColumnType column_type, const OrderByStatistics stat_type) { + const OrderByColumnType column_type, const OrderByStatistics stat_type, TransactionData transaction) { const auto 
opposite_stat_type = stat_type == OrderByStatistics::MAX ? OrderByStatistics::MIN : OrderByStatistics::MAX; @@ -57,7 +58,7 @@ void AddRowGroups(multimap &row_group_map, It i idx_t qualify_later = 0; idx_t last_unresolved_row_group_sum = - GetQualifyingTupleCount(it->second.row_group.get().GetNode(), *last_stats, column_type); + GetMinimumQualifyingTupleCount(it->second.row_group.get().GetNode(), *last_stats, column_type, transaction); for (; it != end; ++it) { auto &current_key = it->first; auto &row_group = it->second.row_group; @@ -82,7 +83,8 @@ void AddRowGroups(multimap &row_group_map, It i auto &upcoming_row_group = last_unresolved_entry->second.row_group.get().GetNode(); auto &upcoming_stats = *last_unresolved_entry->second.stats; - last_unresolved_row_group_sum += GetQualifyingTupleCount(upcoming_row_group, upcoming_stats, column_type); + last_unresolved_row_group_sum += + GetMinimumQualifyingTupleCount(upcoming_row_group, upcoming_stats, column_type, transaction); last_unresolved_boundary = RowGroupReorderer::RetrieveStat(upcoming_stats, opposite_stat_type, column_type); } if (qualifying_tuples >= row_limit) { @@ -110,7 +112,7 @@ void InsertAllRowGroups(It it, End end, vector>> void SetRowGroupVector(multimap &row_group_map, const optional_idx row_limit, const idx_t row_group_offset, const OrderType order_type, const OrderByColumnType column_type, - vector>> &ordered_row_groups) { + vector>> &ordered_row_groups, TransactionData transaction) { const auto stat_type = order_type == OrderType::ASCENDING ? 
OrderByStatistics::MIN : OrderByStatistics::MAX; if (order_type == OrderType::ASCENDING) { auto end = row_group_map.end(); @@ -119,7 +121,8 @@ void SetRowGroupVector(multimap &row_group_map, return; } if (row_limit.IsValid()) { - AddRowGroups(row_group_map, it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type); + AddRowGroups(row_group_map, it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type, + transaction); } else { InsertAllRowGroups(it, end, ordered_row_groups); } @@ -130,7 +133,8 @@ void SetRowGroupVector(multimap &row_group_map, return; } if (row_limit.IsValid()) { - AddRowGroups(row_group_map, it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type); + AddRowGroups(row_group_map, it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type, + transaction); } else { InsertAllRowGroups(it, end, ordered_row_groups); } @@ -199,8 +203,8 @@ OffsetPruningResult FindOffsetPrunableChunks(It it, End end, const OrderByStatis } // namespace -RowGroupReorderer::RowGroupReorderer(const RowGroupOrderOptions &options_p) - : options(options_p), offset(0), initialized(false) { +RowGroupReorderer::RowGroupReorderer(const RowGroupOrderOptions &options_p, TransactionData transaction_p) + : options(options_p), transaction(transaction_p), offset(0), initialized(false) { } optional_ptr> RowGroupReorderer::GetNextRowGroup(SegmentNode &row_group) { @@ -260,6 +264,9 @@ OffsetPruningResult RowGroupReorderer::GetOffsetAfterPruning(const OrderByStatis } auto column_stats = partition_stats.partition_row_group->GetColumnStatistics(storage_index); + if (!column_stats) { + return {new_row_offset, 0, leading_null_group_offset}; + } if (null_order == OrderByNullType::NULLS_FIRST && IsNullOnly(*column_stats)) { if (new_row_offset < partition_stats.count) { return {new_row_offset, 0, leading_null_group_offset}; @@ -328,6 +335,10 @@ optional_ptr> RowGroupReorderer::GetRootSegment(RowGroupSe multimap row_group_map; for (auto 
&row_group : row_groups.SegmentNodes()) { auto stats = row_group.GetNode().GetStatistics(options.column_idx); + if (!stats) { + ambiguous_groups.push_back(row_group); + continue; + } if (IsNullOnly(*stats)) { null_only_groups.push_back(row_group); continue; @@ -347,10 +358,10 @@ optional_ptr> RowGroupReorderer::GetRootSegment(RowGroupSe AppendRowGroups(null_only_groups, options.leading_null_group_offset, ordered_row_groups); AppendRowGroups(ambiguous_groups, 0, ordered_row_groups); SetRowGroupVector(row_group_map, options.row_limit, options.row_group_offset, options.order_type, - options.column_type, ordered_row_groups); + options.column_type, ordered_row_groups, transaction); } else { SetRowGroupVector(row_group_map, options.row_limit, options.row_group_offset, options.order_type, - options.column_type, ordered_row_groups); + options.column_type, ordered_row_groups, transaction); AppendRowGroups(ambiguous_groups, 0, ordered_row_groups); AppendRowGroups(null_only_groups, 0, ordered_row_groups); } diff --git a/src/duckdb/src/storage/table/variant/variant_shredding.cpp b/src/duckdb/src/storage/table/variant/variant_shredding.cpp index f4739ce8b..7be0b48fd 100644 --- a/src/duckdb/src/storage/table/variant/variant_shredding.cpp +++ b/src/duckdb/src/storage/table/variant/variant_shredding.cpp @@ -427,6 +427,10 @@ bool VariantShreddingStats::GetShreddedTypeInternal(const VariantColumnStatsData child_list_t child_types; for (auto &entry : column.field_stats) { auto &child_column = GetColumnStats(entry.second); + if (entry.first.empty()) { + //! Do not include empty field names in the shredded type! 
+ continue; + } LogicalType child_type; if (GetShreddedTypeInternal(child_column, child_type, total_value_count)) { child_types.emplace_back(entry.first, child_type); @@ -770,7 +774,18 @@ void VariantColumnData::ShredVariantData(Vector &input, Vector &output, idx_t co for (idx_t i = 0; i < count; i++) { auto input_val = input.GetValue(i); auto roundtripped_val = roundtrip_result.GetValue(i); - if (!ValueOperations::NotDistinctFrom(input_val, roundtripped_val)) { + + Vector input_vec(input_val); + Vector roundtripped_vec(roundtripped_val); + + Vector normalized_input(LogicalType::VARIANT(), 1); + Vector normalized_roundtrip(LogicalType::VARIANT(), 1); + VariantNormalizer::Normalize(input_vec, normalized_input, 1); + VariantNormalizer::Normalize(roundtripped_vec, normalized_roundtrip, 1); + + auto normalized_input_value = normalized_input.GetValue(0); + auto normalized_roundtrip_value = normalized_roundtrip.GetValue(0); + if (!ValueOperations::NotDistinctFrom(normalized_input_value, normalized_roundtrip_value)) { throw InternalException("Shredding roundtrip verification failed for row: %d, expected: %s, actual: %s", i, input_val.ToString(), roundtripped_val.ToString()); } diff --git a/src/duckdb/src/transaction/duck_transaction_manager.cpp b/src/duckdb/src/transaction/duck_transaction_manager.cpp index f8d9362fd..bb52aadec 100644 --- a/src/duckdb/src/transaction/duck_transaction_manager.cpp +++ b/src/duckdb/src/transaction/duck_transaction_manager.cpp @@ -20,7 +20,7 @@ namespace duckdb { -void DuckCleanupInfo::Cleanup() noexcept { +void DuckCleanupInfo::Cleanup() { for (auto &transaction : transactions) { if (transaction->awaiting_cleanup) { transaction->Cleanup(lowest_start_time); diff --git a/src/duckdb/third_party/fmt/include/fmt/format.h b/src/duckdb/third_party/fmt/include/fmt/format.h index 4c5163010..b5ab10c1e 100644 --- a/src/duckdb/third_party/fmt/include/fmt/format.h +++ b/src/duckdb/third_party/fmt/include/fmt/format.h @@ -321,24 +321,13 @@ inline typename 
Container::value_type* get_data(Container& c) { return c.data(); } -#ifdef _SECURE_SCL -// Make a checked iterator to avoid MSVC warnings. -template using checked_ptr = stdext::checked_array_iterator; -template checked_ptr make_checked(T* p, std::size_t size) { - return {p, size}; -} -#else -template using checked_ptr = T*; -template inline T* make_checked(T* p, std::size_t) { return p; } -#endif - template ::value)> -inline checked_ptr reserve( +inline typename Container::value_type* reserve( std::back_insert_iterator& it, std::size_t n) { Container& c = get_container(it); std::size_t size = c.size(); c.resize(size + n); - return make_checked(get_data(c) + size, n); + return get_data(c) + size; } template @@ -540,7 +529,7 @@ template void buffer::append(const U* begin, const U* end) { std::size_t new_size = size_ + to_unsigned(end - begin); reserve(new_size); - std::uninitialized_copy(begin, end, make_checked(ptr_, capacity_) + size_); + std::uninitialized_copy(begin, end, ptr_ + size_); size_ = new_size; } } // namespace internal @@ -642,7 +631,7 @@ class basic_memory_buffer : private Allocator, public internal::buffer { if (data == other.store_) { this->set(store_, capacity); std::uninitialized_copy(other.store_, other.store_ + size, - internal::make_checked(store_, capacity)); + store_); } else { this->set(data, capacity); // Set pointer to the inline array so that delete is not called @@ -689,7 +678,7 @@ void basic_memory_buffer::grow(std::size_t size) { T* new_data = std::allocator_traits::allocate(*this, new_capacity); // The following code doesn't throw, so the raw pointer above doesn't leak. 
std::uninitialized_copy(old_data, old_data + this->size(), - internal::make_checked(new_data, new_capacity)); + new_data); this->set(new_data, new_capacity); // deallocate must not throw according to the standard, but even if it does, // the buffer already uses the new storage and will deallocate it in @@ -1565,7 +1554,7 @@ template class basic_writer { } buffer -= s.size(); std::uninitialized_copy(s.data(), s.data() + s.size(), - make_checked(buffer, s.size())); + buffer); }); } }; diff --git a/src/duckdb/third_party/thrift/thrift/protocol/TProtocol.h b/src/duckdb/third_party/thrift/thrift/protocol/TProtocol.h index 517aceb75..f1bccd43c 100644 --- a/src/duckdb/third_party/thrift/thrift/protocol/TProtocol.h +++ b/src/duckdb/third_party/thrift/thrift/protocol/TProtocol.h @@ -89,6 +89,18 @@ static inline To bitwise_cast(From from) { # define __THRIFT_BYTE_ORDER BYTE_ORDER # define __THRIFT_LITTLE_ENDIAN LITTLE_ENDIAN # define __THRIFT_BIG_ENDIAN BIG_ENDIAN +# elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && defined(__ORDER_BIG_ENDIAN__) + // GCC / Clang builtin (macOS, Linux, MinGW, ...). Reliable without relying on system headers happening to have + // defined BYTE_ORDER already. +# define __THRIFT_BYTE_ORDER __BYTE_ORDER__ +# define __THRIFT_LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ +# define __THRIFT_BIG_ENDIAN __ORDER_BIG_ENDIAN__ +# elif defined(_WIN32) + // All Windows targets (x86, x64, ARM, ARM64) are little-endian. MSVC does not define BYTE_ORDER, so without this + // we would fall through to the broken default below and byteswap every double on the wire. +# define __THRIFT_BYTE_ORDER 1234 +# define __THRIFT_LITTLE_ENDIAN __THRIFT_BYTE_ORDER +# define __THRIFT_BIG_ENDIAN 0 # else //# include # if BOOST_ENDIAN_BIG_BYTE @@ -106,6 +118,13 @@ static inline To bitwise_cast(From from) { # endif #endif +// Guard against silently falling into the big-endian byteswap path. 
+// if detection failed above, __THRIFT_BYTE_ORDER and __THRIFT_BIG_ENDIAN both expand to 0 and the comparison below +// would be (0 == 0) -> true, byte-swapping every double. +#if !defined(__THRIFT_BYTE_ORDER) || !defined(__THRIFT_LITTLE_ENDIAN) || !defined(__THRIFT_BIG_ENDIAN) +# error "Could not detect endianness for Thrift; define __THRIFT_BYTE_ORDER explicitly." +#endif + #if __THRIFT_BYTE_ORDER == __THRIFT_BIG_ENDIAN # if !defined(THRIFT_ntohll) # define THRIFT_ntohll(n) (n) diff --git a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp index ea3403fca..006e56973 100644 --- a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +++ b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp @@ -348,17 +348,17 @@ #include "extension/icu/third_party/icu/i18n/wintzimpl.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp"