From 3cc5165d3d30a3fd32b867e632431ffcc190c48b Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 10 Dec 2025 16:49:37 +0800 Subject: [PATCH 01/12] change the null map encode --- .../olap/rowset/segment_v2/column_writer.cpp | 87 +++++++++++++++++-- be/src/olap/rowset/segment_v2/column_writer.h | 4 +- be/src/olap/rowset/segment_v2/parsed_page.h | 19 ++-- gensrc/proto/segment_v2.proto | 2 + 4 files changed, 98 insertions(+), 14 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index b165b2b766a6d1..19409824612706 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "common/config.h" #include "common/logging.h" @@ -52,7 +53,28 @@ namespace doris::segment_v2 { #include "common/compile_check_begin.h" -class NullBitmapBuilder { +// Abstract base class for null bitmap builders +class NullBitmapBuilderBase { +public: + virtual ~NullBitmapBuilderBase() = default; + + // Add a run of 'run' values, all equal to 'value' + virtual void add_run(bool value, size_t run) = 0; + + // Returns whether the building nullmap contains any null values + virtual bool has_null() const = 0; + + // Finish building the null bitmap and write the result to 'slice' + virtual Status finish(OwnedSlice* slice) = 0; + + // Reset the builder to its initial state + virtual void reset() = 0; + + // Return the current size of the buffer in bytes + virtual uint64_t size() = 0; +}; + +class NullBitmapBuilder : public NullBitmapBuilderBase { public: NullBitmapBuilder() : _has_null(false), _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) {} @@ -61,26 +83,26 @@ class NullBitmapBuilder { _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) {} - void add_run(bool value, size_t run) { + void add_run(bool value, size_t run) override { _has_null |= value; _rle_encoder.Put(value, run); } // Returns whether the building nullmap contains nullptr - bool has_null() const { return _has_null; } + bool has_null() const override { return _has_null; } - Status finish(OwnedSlice* slice) { + Status finish(OwnedSlice* slice) override { _rle_encoder.Flush(); RETURN_IF_CATCH_EXCEPTION({ *slice = _bitmap_buf.build(); }); return Status::OK(); } - void reset() { + void reset() override { _has_null = false; _rle_encoder.Clear(); } - uint64_t size() { return _bitmap_buf.size(); } + uint64_t size() override { return _bitmap_buf.size(); } private: bool _has_null; @@ -88,6 +110,56 @@ class NullBitmapBuilder { RleEncoder _rle_encoder; }; +// PlainNullBitmapBuilder uses std::vector to store null values directly without RLE encoding +// Each uint8_t represents a single null value: 0 = non-null, 1 = null +class PlainNullBitmapBuilder : public NullBitmapBuilderBase { +public: + PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} + + explicit PlainNullBitmapBuilder(size_t reserve_bits) + : _has_null(false), + _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits + + void add_run(bool value, size_t run) override { + _has_null |= value; + const uint8_t val = value ? 
1 : 0; + + // Ensure the buffer has enough bytes to hold all values + const size_t current_size = _bitmap_buf.size(); + _bitmap_buf.resize(current_size + run, 0); + + if (val) { + // Fill the new bytes with the value + std::fill(_bitmap_buf.begin() + current_size, _bitmap_buf.end(), val); + } + } + + // Returns whether the building nullmap contains nullptr + bool has_null() const override { return _has_null; } + + Status finish(OwnedSlice* slice) override { + // No need to flush, just build the slice from the buffer + RETURN_IF_CATCH_EXCEPTION({ + // Create a new OwnedSlice and copy the data + OwnedSlice result(_bitmap_buf.size()); + memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + *slice = std::move(result); + }); + return Status::OK(); + } + + void reset() override { + _has_null = false; + _bitmap_buf.clear(); + } + + uint64_t size() override { return _bitmap_buf.size(); } + +private: + bool _has_null; + std::vector _bitmap_buf; +}; + inline ScalarColumnWriter* get_null_writer(const ColumnWriterOptions& opts, io::FileWriter* file_writer, uint32_t id) { if (!opts.meta->is_nullable()) { @@ -458,7 +530,7 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - _null_bitmap_builder = std::make_unique(); + _null_bitmap_builder = std::make_unique(); } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -743,6 +815,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); + data_page_footer->set_new_null_map(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 9e39ef45bb4c00..89d544ea2e918d 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -92,7 +92,9 @@ struct ColumnWriterOptions { class BitmapIndexWriter; class EncodingInfo; +class NullBitmapBuilderBase; class NullBitmapBuilder; +class PlainNullBitmapBuilder; class OrdinalIndexWriter; class PageBuilder; class BloomFilterIndexWriter; @@ -268,7 +270,7 @@ class ScalarColumnWriter : public ColumnWriter { private: std::unique_ptr _page_builder; - std::unique_ptr _null_bitmap_builder; + std::unique_ptr _null_bitmap_builder; ColumnWriterOptions _opts; diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 7ef20adecfed64..b654b73b10d9ef 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -51,11 +51,17 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); if (null_size > 0) { - auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); - // Decode all null values into null_maps in advance - auto num_rows = footer.num_values(); - page->null_maps.resize(num_rows); - null_decoder.get_values((bool*)page->null_maps.data(), num_rows); + if (footer.has_new_null_map() && footer.new_null_map()) { + page->null_maps = std::span((uint8_t*)null_bitmap.data, null_size); + } else { + auto null_decoder = + RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); + // Decode all null values into null_maps in advance + auto num_rows = 
footer.num_values(); + page->null_bitmap.resize(num_rows); + null_decoder.get_values((bool*)page->null_bitmap.data(), num_rows); + page->null_maps = std::span(page->null_bitmap.data(), num_rows); + } } Slice data_slice(body.data, body.size - null_size); @@ -84,7 +90,8 @@ struct ParsedPage { PageHandle page_handle; - std::vector null_maps; + std::span null_maps; + std::vector null_bitmap; std::unique_ptr data_decoder; // ordinal of the first value in this page diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 653d565d546973..535c270d40b811 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -74,6 +74,8 @@ message DataPageFooterPB { // only for array column // Save the offset of next page optional uint64 next_array_item_ordinal = 4; + + optional bool new_null_map = 5; } message IndexPageFooterPB { From 69297e7631205e957e07aec0f6e948210d18473c Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 10 Dec 2025 19:13:32 +0800 Subject: [PATCH 02/12] plain lz4 null page encode --- .../olap/rowset/segment_v2/column_writer.cpp | 32 ++++++++++++++++--- be/src/olap/rowset/segment_v2/parsed_page.h | 21 ++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 19409824612706..03bb4bfffdbc2c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -140,10 +140,34 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { Status finish(OwnedSlice* slice) override { // No need to flush, just build the slice from the buffer RETURN_IF_CATCH_EXCEPTION({ - // Create a new OwnedSlice and copy the data - OwnedSlice result(_bitmap_buf.size()); - memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); - *slice = std::move(result); + // Check if we should compress the data + if (!_bitmap_buf.empty()) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + // Use compressed data if compression is successful and reduces size + // if (compressed_buf.size() < _bitmap_buf.size()) { + // Directly build OwnedSlice from compressed_buf to avoid memory copy + *slice = compressed_buf.build(); + return Status::OK(); + // } + } else { + return status; + } + } + } + // // Fallback to uncompressed data if compression fails or doesn't reduce size + // // Create OwnedSlice directly from _bitmap_buf data + // OwnedSlice result(_bitmap_buf.size()); + // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + // *slice = std::move(result); }); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index b654b73b10d9ef..6137ee5e1f2d53 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -29,6 +29,7 @@ #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/page_decoder.h" #include "olap/rowset/segment_v2/page_handle.h" +#include "util/block_compression.h" #include "util/rle_encoding.h" #include "util/slice.h" @@ -52,15 +53,24 @@ struct ParsedPage { if (null_size > 0) { if (footer.has_new_null_map() 
&& footer.new_null_map()) { - page->null_maps = std::span((uint8_t*)null_bitmap.data, null_size); + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + page->null_maps.resize(footer.num_values()); + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); - page->null_bitmap.resize(num_rows); - null_decoder.get_values((bool*)page->null_bitmap.data(), num_rows); - page->null_maps = std::span(page->null_bitmap.data(), num_rows); + page->null_maps.resize(num_rows); + null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } @@ -90,8 +100,7 @@ struct ParsedPage { PageHandle page_handle; - std::span null_maps; - std::vector null_bitmap; + std::vector null_maps; std::unique_ptr data_decoder; // ordinal of the first value in this page From ae0e410294be3fe138da5d4864b88c3358024fc2 Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 11 Dec 2025 10:56:51 +0800 Subject: [PATCH 03/12] use bitshuffle to decoding --- .../olap/rowset/segment_v2/column_writer.cpp | 78 ++++++++++++------- be/src/olap/rowset/segment_v2/parsed_page.h | 32 +++++--- gensrc/proto/segment_v2.proto | 4 +- 3 files changed, 72 insertions(+), 42 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 03bb4bfffdbc2c..0a21bea7a50721 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -29,6 +29,7 @@ #include "io/fs/file_writer.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" +#include "olap/rowset/segment_v2/bitshuffle_page.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -72,6 +73,11 @@ class NullBitmapBuilderBase { // Return the current size of the buffer in bytes virtual uint64_t size() = 0; + + EncodingTypePB encoding() const { return _encoding_type; } + +protected: + EncodingTypePB _encoding_type = RLE; }; class NullBitmapBuilder : public NullBitmapBuilderBase { @@ -114,11 +120,9 @@ class NullBitmapBuilder : public NullBitmapBuilderBase { // Each uint8_t represents a single null value: 0 = non-null, 1 = null class PlainNullBitmapBuilder : public NullBitmapBuilderBase { public: - PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} - - explicit PlainNullBitmapBuilder(size_t reserve_bits) - : _has_null(false), - _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits + PlainNullBitmapBuilder(EncodingTypePB encoding_type) : _has_null(false), _bitmap_buf() { + _encoding_type = encoding_type; + } void add_run(bool value, size_t run) override { _has_null |= value; @@ -142,32 +146,40 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { RETURN_IF_CATCH_EXCEPTION({ // Check if we should compress the data if (!_bitmap_buf.empty()) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR( - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); - if (codec != nullptr) { - // 
Compress the data + if (_encoding_type == EncodingTypePB::BIT_SHUFFLE) { faststring compressed_buf; - Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); - Status status = codec->compress(raw_slice, &compressed_buf); - if (status.ok()) { - // Use compressed data if compression is successful and reduces size - // if (compressed_buf.size() < _bitmap_buf.size()) { - // Directly build OwnedSlice from compressed_buf to avoid memory copy - *slice = compressed_buf.build(); - return Status::OK(); - // } - } else { - return status; + compressed_buf.resize( + bitshuffle::compress_lz4_bound(_bitmap_buf.size(), sizeof(uint8_t), 0)); + int64_t r = bitshuffle::compress_lz4(_bitmap_buf.data(), compressed_buf.data(), + _bitmap_buf.size(), sizeof(uint8_t), 0); + if (UNLIKELY(r < 0)) { + return Status::InternalError("bitshuffle compress failed"); + } + // before build(), update buffer length to the actual compressed size + compressed_buf.resize(r); + *slice = compressed_buf.build(); + return Status::OK(); + } else if (_encoding_type == EncodingTypePB::PLAIN_ENCODING) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, + &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + *slice = compressed_buf.build(); + return Status::OK(); + } else { + return status; + } } + } else { + return Status::Corruption("unsupported null map encoding"); } } - // // Fallback to uncompressed data if compression fails or doesn't reduce size - // // Create OwnedSlice directly from _bitmap_buf data - // OwnedSlice result(_bitmap_buf.size()); - // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); - // *slice = std::move(result); }); return Status::OK(); } @@ -554,7 +566,15 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - _null_bitmap_builder = std::make_unique(); + if (_opts.meta->has_null_map_encoding()) { + if (config::cooldown_thread_num < 10) { + _null_bitmap_builder = std::make_unique(BIT_SHUFFLE); + } else { + _null_bitmap_builder = std::make_unique(PLAIN_ENCODING); + } + } else { + _null_bitmap_builder = std::make_unique(); + } } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -839,7 +859,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); - data_page_footer->set_new_null_map(true); + data_page_footer->set_null_map_encoding(_null_bitmap_builder->encoding()); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 6137ee5e1f2d53..69783f9e26f0f0 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -22,6 +22,7 @@ #include #include +#include "bitshuffle_wrapper.h" #include "common/status.h" #include "olap/rowset/segment_v2/binary_dict_page.h" #include "olap/rowset/segment_v2/common.h" @@ -52,24 +53,31 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); 
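+        // Null-map decoding below is selected by the footer's null_map_encoding:
+        // BIT_SHUFFLE pages are decoded with bitshuffle::decompress_lz4,
+        // PLAIN_ENCODING pages with the LZ4 block codec, and pages without the
+        // field fall back to the legacy RLE-encoded bitmap.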
if (null_size > 0) { - if (footer.has_new_null_map() && footer.new_null_map()) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR( - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - page->null_maps.resize(footer.num_values()); - auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); - RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + page->null_maps.resize(footer.num_values()); + if (footer.has_null_map_encoding()) { + if (footer.null_map_encoding() == BIT_SHUFFLE) { + int64_t r = bitshuffle::decompress_lz4(null_bitmap.data, page->null_maps.data(), + null_size, sizeof(uint8_t), 0); + if (UNLIKELY(r < 0)) { + return Status::Corruption("bitshuffle decompress failed"); + } + } else if (footer.null_map_encoding() == PLAIN_ENCODING) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, + &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + } } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); - page->null_maps.resize(num_rows); null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 535c270d40b811..933d9a56b5d377 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -75,7 +75,7 @@ message DataPageFooterPB { // Save the offset of next page optional uint64 next_array_item_ordinal = 4; - optional bool new_null_map = 5; + optional EncodingTypePB null_map_encoding = 5; } message IndexPageFooterPB { @@ -220,6 +220,8 @@ message ColumnMetaPB { optional uint64 compressed_data_bytes = 24; optional uint64 uncompressed_data_bytes = 25; optional uint64 raw_data_bytes = 26; + + optional EncodingTypePB null_map_encoding = 27; } message PrimaryKeyIndexMetaPB { From 9a33e3ec66d98ad77587d3b7650b22a0eac6912a Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 11 Dec 2025 23:04:13 +0800 Subject: [PATCH 04/12] Revert "use bitshuffle to decoding" This reverts commit 5a54f831420141840afd12883d00ee97a237a09c. 
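
The layout kept after this revert is the one from PATCH 02: the null map is
written as one uint8_t per row (0 = non-null, 1 = null) and LZ4-compressed as
a single block, and the footer's new_null_map flag tells the reader to
LZ4-decompress it instead of running the RLE decoder. A minimal sketch of that
round trip, for illustration only, using the raw LZ4 C API rather than Doris's
BlockCompressionCodec wrapper (helper names here are hypothetical):

    #include <lz4.h>

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // One byte per row: 0 = non-null, 1 = null.
    std::vector<char> encode_null_map(const std::vector<uint8_t>& null_map) {
        std::vector<char> out(LZ4_compressBound(static_cast<int>(null_map.size())));
        int n = LZ4_compress_default(reinterpret_cast<const char*>(null_map.data()),
                                     out.data(), static_cast<int>(null_map.size()),
                                     static_cast<int>(out.size()));
        if (n <= 0) throw std::runtime_error("LZ4 compress failed");
        out.resize(n); // only compressed bytes are stored; the footer keeps num_values
        return out;
    }

    std::vector<uint8_t> decode_null_map(const std::vector<char>& compressed,
                                         size_t num_values) {
        std::vector<uint8_t> null_map(num_values); // decompress back to one byte per row
        int n = LZ4_decompress_safe(compressed.data(),
                                    reinterpret_cast<char*>(null_map.data()),
                                    static_cast<int>(compressed.size()),
                                    static_cast<int>(num_values));
        if (n < 0 || static_cast<size_t>(n) != num_values)
            throw std::runtime_error("LZ4 decompress failed");
        return null_map;
    }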
--- .../olap/rowset/segment_v2/column_writer.cpp | 78 +++++++------------ be/src/olap/rowset/segment_v2/parsed_page.h | 32 +++----- gensrc/proto/segment_v2.proto | 4 +- 3 files changed, 42 insertions(+), 72 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 0a21bea7a50721..03bb4bfffdbc2c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -29,7 +29,6 @@ #include "io/fs/file_writer.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" -#include "olap/rowset/segment_v2/bitshuffle_page.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -73,11 +72,6 @@ class NullBitmapBuilderBase { // Return the current size of the buffer in bytes virtual uint64_t size() = 0; - - EncodingTypePB encoding() const { return _encoding_type; } - -protected: - EncodingTypePB _encoding_type = RLE; }; class NullBitmapBuilder : public NullBitmapBuilderBase { @@ -120,9 +114,11 @@ class NullBitmapBuilder : public NullBitmapBuilderBase { // Each uint8_t represents a single null value: 0 = non-null, 1 = null class PlainNullBitmapBuilder : public NullBitmapBuilderBase { public: - PlainNullBitmapBuilder(EncodingTypePB encoding_type) : _has_null(false), _bitmap_buf() { - _encoding_type = encoding_type; - } + PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} + + explicit PlainNullBitmapBuilder(size_t reserve_bits) + : _has_null(false), + _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits void add_run(bool value, size_t run) override { _has_null |= value; @@ -146,40 +142,32 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { RETURN_IF_CATCH_EXCEPTION({ // Check if we should compress the data if (!_bitmap_buf.empty()) { - if (_encoding_type == EncodingTypePB::BIT_SHUFFLE) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data faststring compressed_buf; - compressed_buf.resize( - bitshuffle::compress_lz4_bound(_bitmap_buf.size(), sizeof(uint8_t), 0)); - int64_t r = bitshuffle::compress_lz4(_bitmap_buf.data(), compressed_buf.data(), - _bitmap_buf.size(), sizeof(uint8_t), 0); - if (UNLIKELY(r < 0)) { - return Status::InternalError("bitshuffle compress failed"); - } - // before build(), update buffer length to the actual compressed size - compressed_buf.resize(r); - *slice = compressed_buf.build(); - return Status::OK(); - } else if (_encoding_type == EncodingTypePB::PLAIN_ENCODING) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, - &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); - Status status = codec->compress(raw_slice, &compressed_buf); - if (status.ok()) { - *slice = compressed_buf.build(); - return Status::OK(); - } else { - return status; - } + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + // Use compressed data if compression is successful and reduces size + // if (compressed_buf.size() < _bitmap_buf.size()) 
{ + // Directly build OwnedSlice from compressed_buf to avoid memory copy + *slice = compressed_buf.build(); + return Status::OK(); + // } + } else { + return status; } - } else { - return Status::Corruption("unsupported null map encoding"); } } + // // Fallback to uncompressed data if compression fails or doesn't reduce size + // // Create OwnedSlice directly from _bitmap_buf data + // OwnedSlice result(_bitmap_buf.size()); + // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + // *slice = std::move(result); }); return Status::OK(); } @@ -566,15 +554,7 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - if (_opts.meta->has_null_map_encoding()) { - if (config::cooldown_thread_num < 10) { - _null_bitmap_builder = std::make_unique(BIT_SHUFFLE); - } else { - _null_bitmap_builder = std::make_unique(PLAIN_ENCODING); - } - } else { - _null_bitmap_builder = std::make_unique(); - } + _null_bitmap_builder = std::make_unique(); } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -859,7 +839,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); - data_page_footer->set_null_map_encoding(_null_bitmap_builder->encoding()); + data_page_footer->set_new_null_map(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 69783f9e26f0f0..6137ee5e1f2d53 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -22,7 +22,6 @@ #include #include -#include "bitshuffle_wrapper.h" #include "common/status.h" #include "olap/rowset/segment_v2/binary_dict_page.h" #include "olap/rowset/segment_v2/common.h" @@ -53,31 +52,24 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); if (null_size > 0) { - page->null_maps.resize(footer.num_values()); - if (footer.has_null_map_encoding()) { - if (footer.null_map_encoding() == BIT_SHUFFLE) { - int64_t r = bitshuffle::decompress_lz4(null_bitmap.data, page->null_maps.data(), - null_size, sizeof(uint8_t), 0); - if (UNLIKELY(r < 0)) { - return Status::Corruption("bitshuffle decompress failed"); - } - } else if (footer.null_map_encoding() == PLAIN_ENCODING) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, - &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); - RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); - } + if (footer.has_new_null_map() && footer.new_null_map()) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + page->null_maps.resize(footer.num_values()); + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, 
null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); + page->null_maps.resize(num_rows); null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 933d9a56b5d377..535c270d40b811 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -75,7 +75,7 @@ message DataPageFooterPB { // Save the offset of next page optional uint64 next_array_item_ordinal = 4; - optional EncodingTypePB null_map_encoding = 5; + optional bool new_null_map = 5; } message IndexPageFooterPB { @@ -220,8 +220,6 @@ message ColumnMetaPB { optional uint64 compressed_data_bytes = 24; optional uint64 uncompressed_data_bytes = 25; optional uint64 raw_data_bytes = 26; - - optional EncodingTypePB null_map_encoding = 27; } message PrimaryKeyIndexMetaPB { From 1432172e29bfee2bc80a4ecb4123142e4110a3cf Mon Sep 17 00:00:00 2001 From: happenlee Date: Tue, 16 Dec 2025 02:21:59 +0800 Subject: [PATCH 05/12] contine page --- .../olap/rowset/segment_v2/column_reader.cpp | 180 ++++++++++++------ .../olap/rowset/segment_v2/column_writer.cpp | 101 +++++----- be/src/olap/rowset/segment_v2/column_writer.h | 27 +-- be/src/olap/rowset/segment_v2/parsed_page.h | 3 + .../variant/variant_column_writer_impl.cpp | 4 +- gensrc/proto/segment_v2.proto | 2 + tools/tpcds-tools/conf/doris-cluster.conf | 4 +- 7 files changed, 198 insertions(+), 123 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index e9154e920cb888..e490e8f306116f 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -1636,6 +1636,9 @@ Status FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t offs } auto num_nulls = [this](ordinal_t start, ordinal_t end) { + if (_page.is_continue) { + return 0; + } auto null_count = 0; for (auto i = start; i < end; i++) { null_count += _page.null_maps[i]; @@ -1712,21 +1715,38 @@ Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& d auto& null_map = null_col->get_null_map_data(); auto nest_column = null_col->get_nested_column_ptr(); - while (nrows_to_read > 0) { - bool is_null; - int i; - std::tie(is_null, i) = null_count(nrows_to_read); - if (is_null) { - null_col->insert_many_defaults(i); - } else { - null_map.resize_fill(null_map.size() + i, 0); - size_t num_rows = i; - RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); - DCHECK_EQ(i, num_rows); + // Optimization: if is_continue is true, skip null_count check and read directly + if (_page.is_continue) { + // Copy null values from page's null_maps + size_t start_offset = _page.offset_in_page; + // Reserve space for new null values + null_map.resize(null_map.size() + nrows_to_read); + // Copy from page's null_maps to column's null_map using memcpy for better performance + memcpy(null_map.data() + null_map.size() - nrows_to_read, + _page.null_maps.data() + start_offset, nrows_to_read); + + size_t num_rows = nrows_to_read; + RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); + DCHECK_EQ(nrows_to_read, num_rows); + _page.offset_in_page += nrows_to_read; + _current_ordinal += nrows_to_read; + } else { + while (nrows_to_read > 0) { + bool is_null; + int i; + std::tie(is_null, i) = null_count(nrows_to_read); + if (is_null) { + null_col->insert_many_defaults(i); + } else { + null_map.resize_fill(null_map.size() + i, 0); + 
size_t num_rows = i; + RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); + DCHECK_EQ(i, num_rows); + } + nrows_to_read -= i; + _page.offset_in_page += i; + _current_ordinal += i; } - nrows_to_read -= i; - _page.offset_in_page += i; - _current_ordinal += i; } } else { RETURN_IF_ERROR(_page.data_decoder->next_batch(&nrows_to_read, dst)); @@ -1760,65 +1780,107 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co nrows_to_read = std::min(remaining, _page.remaining()); if (!_page.null_maps.empty()) { - size_t already_read = 0; - while ((nrows_to_read - already_read) > 0) { - bool is_null = false; - size_t this_run = std::min(nrows_to_read - already_read, _page.remaining()); - if (UNLIKELY(this_run == 0)) { - break; - } - std::tie(is_null, this_run) = null_count(this_run); - size_t offset = total_read_count + already_read; + auto* null_col = + vectorized::check_and_get_column(dst.get()); + if (UNLIKELY(null_col == nullptr)) { + return Status::InternalError("unexpected column type in column reader"); + } + auto& null_map = null_col->get_null_map_data(); + auto nest_column = null_col->get_nested_column_ptr(); + + // Optimization: if is_continue is true, skip null_count check and read directly + if (_page.is_continue) { + size_t offset = total_read_count; size_t this_read_count = 0; - rowid_t current_ordinal_in_page = - cast_set(_page.offset_in_page + _page.first_ordinal); - for (size_t i = 0; i < this_run; ++i) { - if (rowids[offset + i] - current_ordinal_in_page >= this_run) { + // rowid_t current_ordinal_in_page = + // cast_set(_page.offset_in_page + _page.first_ordinal); + + // Calculate how many rowids in this batch belong to the current page + for (size_t i = 0; i < nrows_to_read; ++i) { + // Check if this rowid is within the current page's range + if (rowids[offset + i] >= _page.first_ordinal + _page.num_rows) { break; } this_read_count++; } - auto origin_index = _page.data_decoder->current_index(); if (this_read_count > 0) { - auto* null_col = - vectorized::check_and_get_column(dst.get()); - if (UNLIKELY(null_col == nullptr)) { - return Status::InternalError("unexpected column type in column reader"); - } - auto& null_map = null_col->get_null_map_data(); - auto nest_column = null_col->get_nested_column_ptr(); + // Read data for the rows + size_t read_count = this_read_count; - if (is_null) { - null_col->insert_many_defaults(this_read_count); - } else { - size_t read_count = this_read_count; - - // ordinal in nullable columns' data buffer maybe be not continuously(the data doesn't contain null value), - // so we need use `page_start_off_in_decoder` to calculate the actual offset in `data_decoder` - size_t page_start_off_in_decoder = - _page.first_ordinal + _page.offset_in_page - origin_index; - RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( - &rowids[offset], page_start_off_in_decoder, &read_count, - nest_column)); - null_map.resize_fill(null_map.size() + read_count, 0); - DCHECK_EQ(read_count, this_read_count); + // Read data using data_decoder's read_by_rowids + RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( + &rowids[offset], _page.first_ordinal, &read_count, nest_column)); + + // Update null map with data from page's null_maps + size_t null_map_start_offset = null_map.size(); + null_map.resize(null_map.size() + read_count); + + // Copy null flags from page's null_maps for the rows we just read + for (size_t i = 0; i < read_count; ++i) { + size_t idx_in_page = rowids[offset + i] - _page.first_ordinal; + 
null_map[null_map_start_offset + i] = _page.null_maps[idx_in_page]; } + DCHECK_EQ(read_count, this_read_count); } - if (!is_null) { - RETURN_IF_ERROR( - _page.data_decoder->seek_to_position_in_page(origin_index + this_run)); + DCHECK_EQ(nest_column->size(), null_map.size()); + total_read_count += this_read_count; + remaining -= this_read_count; + } else { + // Original logic for non-continue case + size_t already_read = 0; + while ((nrows_to_read - already_read) > 0) { + bool is_null = false; + size_t this_run = std::min(nrows_to_read - already_read, _page.remaining()); + if (UNLIKELY(this_run == 0)) { + break; + } + std::tie(is_null, this_run) = null_count(this_run); + size_t offset = total_read_count + already_read; + size_t this_read_count = 0; + rowid_t current_ordinal_in_page = + cast_set(_page.offset_in_page + _page.first_ordinal); + for (size_t i = 0; i < this_run; ++i) { + if (rowids[offset + i] - current_ordinal_in_page >= this_run) { + break; + } + this_read_count++; + } + + auto origin_index = _page.data_decoder->current_index(); + if (this_read_count > 0) { + if (is_null) { + null_col->insert_many_defaults(this_read_count); + } else { + size_t read_count = this_read_count; + + // ordinal in nullable columns' data buffer maybe be not continuously(the data doesn't contain null value), + // so we need use `page_start_off_in_decoder` to calculate the actual offset in `data_decoder` + size_t page_start_off_in_decoder = + _page.first_ordinal + _page.offset_in_page - origin_index; + RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( + &rowids[offset], page_start_off_in_decoder, &read_count, + nest_column)); + null_map.resize_fill(null_map.size() + read_count, 0); + DCHECK_EQ(read_count, this_read_count); + } + } + + if (!is_null) { + RETURN_IF_ERROR(_page.data_decoder->seek_to_position_in_page(origin_index + + this_run)); + } + + already_read += this_read_count; + _page.offset_in_page += this_run; + DCHECK(_page.offset_in_page <= _page.num_rows); } - already_read += this_read_count; - _page.offset_in_page += this_run; - DCHECK(_page.offset_in_page <= _page.num_rows); + nrows_to_read = already_read; + total_read_count += nrows_to_read; + remaining -= nrows_to_read; } - - nrows_to_read = already_read; - total_read_count += nrows_to_read; - remaining -= nrows_to_read; } else { RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( &rowids[total_read_count], _page.first_ordinal, &nrows_to_read, dst)); diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 03bb4bfffdbc2c..13b5829691804f 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -449,9 +449,9 @@ Status ColumnWriter::append_nullable(const uint8_t* is_null_bits, const void* da size_t this_run = 0; while ((this_run = null_iter.Next(&is_null)) > 0) { if (is_null) { - RETURN_IF_ERROR(append_nulls(this_run)); + RETURN_IF_ERROR(append_data(&ptr, this_run, true)); } else { - RETURN_IF_ERROR(append_data(&ptr, this_run)); + RETURN_IF_ERROR(append_data(&ptr, this_run, false)); } } return Status::OK(); @@ -475,13 +475,13 @@ Status ColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** pt do { auto step = next_run_step(); if (null_map[offset]) { - RETURN_IF_ERROR(append_nulls(step)); - *ptr += get_field()->size() * step; + RETURN_IF_ERROR(append_data(ptr, step, true)); + // *ptr += get_field()->size() * step; } else { // TODO: // 1. 
`*ptr += get_field()->size() * step;` should do in this function, not append_data; // 2. support array vectorized load and ptr offset add - RETURN_IF_ERROR(append_data(ptr, step)); + RETURN_IF_ERROR(append_data(ptr, step, false)); } offset += step; } while (offset < num_rows); @@ -495,7 +495,7 @@ Status ColumnWriter::append(const uint8_t* nullmap, const void* data, size_t num if (nullmap) { return append_nullable(nullmap, &ptr, num_rows); } else { - return append_data(&ptr, num_rows); + return append_data(&ptr, num_rows, false); } } @@ -641,11 +641,11 @@ Status ScalarColumnWriter::append_nulls(size_t num_rows) { // append data to page builder. this function will make sure that // num_rows must be written before return. And ptr will be modified // to next data should be written -Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { size_t remaining = num_rows; while (remaining > 0) { size_t num_written = remaining; - RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written)); + RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written, null)); remaining -= num_written; @@ -657,35 +657,41 @@ Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } Status ScalarColumnWriter::_internal_append_data_in_current_page(const uint8_t* data, - size_t* num_written) { + size_t* num_written, bool null) { RETURN_IF_ERROR(_page_builder->add(data, num_written)); - if (_opts.need_bitmap_index) { - _bitmap_index_builder->add_values(data, *num_written); - } - if (_opts.need_zone_map) { - _zone_map_index_builder->add_values(data, *num_written); - } - if (_opts.need_inverted_index) { - for (const auto& builder : _inverted_index_builders) { - RETURN_IF_ERROR(builder->add_values(get_field()->name(), data, *num_written)); + if (!null) { + if (_opts.need_bitmap_index) { + _bitmap_index_builder->add_values(data, *num_written); + } + if (_opts.need_zone_map) { + _zone_map_index_builder->add_values(data, *num_written); + } + if (_opts.need_inverted_index) { + for (const auto& builder : _inverted_index_builders) { + RETURN_IF_ERROR(builder->add_values(get_field()->name(), data, *num_written)); + } + } + if (_opts.need_bloom_filter) { + RETURN_IF_ERROR(_bloom_filter_index_builder->add_values(data, *num_written)); } - } - if (_opts.need_bloom_filter) { - RETURN_IF_ERROR(_bloom_filter_index_builder->add_values(data, *num_written)); - } - _next_rowid += *num_written; + _next_rowid += *num_written; + if (is_nullable()) { + _null_bitmap_builder->add_run(false, *num_written); + } + } else { + DCHECK(is_nullable()); + RETURN_IF_ERROR(append_nulls(*num_written)); + } // we must write null bits after write data, because we don't // know how many rows can be written into current page - if (is_nullable()) { - _null_bitmap_builder->add_run(false, *num_written); - } return Status::OK(); } -Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data, size_t* num_written) { - RETURN_IF_ERROR(append_data_in_current_page(*data, num_written)); +Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data, size_t* num_written, + bool null) { + RETURN_IF_ERROR(append_data_in_current_page(*data, num_written, null)); *data += get_field()->size() * (*num_written); return Status::OK(); } @@ -840,6 +846,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_num_values(_next_rowid - _first_rowid); 
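+    // new_null_map marks the LZ4-compressed plain null map; is_continue tells the
+    // reader that page ordinals line up with decoder positions, so it can memcpy
+    // the null map and skip per-run null counting (see FileColumnIterator::next_batch
+    // and read_by_rowids).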
data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); data_page_footer->set_new_null_map(true); + data_page_footer->set_is_continue(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } @@ -883,11 +890,11 @@ Status OffsetColumnWriter::init() { return Status::OK(); } -Status OffsetColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status OffsetColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { size_t remaining = num_rows; while (remaining > 0) { size_t num_written = remaining; - RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written)); + RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written, null)); // _next_offset after append_data_in_current_page is the offset of next data, which will used in finish_current_page() to set next_array_item_ordinal _next_offset = *(const uint64_t*)(*ptr); remaining -= num_written; @@ -940,12 +947,12 @@ Status StructColumnWriter::write_inverted_index() { Status StructColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); return Status::OK(); } -Status StructColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status StructColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { const auto* results = reinterpret_cast(*ptr); for (size_t i = 0; i < _num_sub_column_writers; ++i) { auto nullmap = *(results + _num_sub_column_writers + i); @@ -1004,7 +1011,7 @@ Status StructColumnWriter::append_nulls(size_t num_rows) { if (is_nullable()) { std::vector null_signs(num_rows, 1); const uint8_t* null_sign_ptr = null_signs.data(); - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows, false)); } return Status::OK(); } @@ -1066,7 +1073,7 @@ Status ArrayColumnWriter::write_ann_index() { } // batch append data for array -Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { // data_ptr contains // [size, offset_ptr, item_data_ptr, item_nullmap_ptr] auto data_ptr = reinterpret_cast(*ptr); @@ -1107,7 +1114,7 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } } - RETURN_IF_ERROR(_offset_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offset_writer->append_data(&offsets_ptr, num_rows, false)); return Status::OK(); } @@ -1119,12 +1126,12 @@ uint64_t ArrayColumnWriter::estimate_buffer_size() { Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); if (is_nullable()) { if (_opts.need_inverted_index) { RETURN_IF_ERROR(_inverted_index_writer->add_array_nulls(null_map, num_rows)); } - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); } return Status::OK(); } @@ -1165,7 +1172,7 @@ Status ArrayColumnWriter::append_nulls(size_t num_rows) { while (num_lengths > 0) { // TODO llj bulk write const auto* offset_ptr = reinterpret_cast(&offset); - 
RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1)); + RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1, false)); --num_lengths; } return write_null_column(num_rows, true); @@ -1176,7 +1183,7 @@ Status ArrayColumnWriter::write_null_column(size_t num_rows, bool is_null) { while (is_nullable() && num_rows > 0) { // TODO llj bulk write const uint8_t* null_sign_ptr = &null_sign; - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, 1)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, 1, false)); --num_rows; } return Status::OK(); @@ -1240,15 +1247,15 @@ Status MapColumnWriter::finish() { Status MapColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); if (is_nullable()) { - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); } return Status::OK(); } // write key value data with offsets -Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { // data_ptr contains // [size, offset_ptr, key_data_ptr, val_data_ptr, k_nullmap_ptr, v_nullmap_pr] // which converted results from olap_map_convertor and later will use a structure to replace it @@ -1269,7 +1276,7 @@ Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } // make sure the order : offset writer flush next_array_item_ordinal after kv_writers append_data // because we use _kv_writers[0]->get_next_rowid() to set next_array_item_ordinal in offset page footer - RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows, false)); return Status::OK(); } @@ -1304,12 +1311,12 @@ Status MapColumnWriter::append_nulls(size_t num_rows) { const ordinal_t offset = _kv_writers[0]->get_next_rowid(); std::vector offsets_data(num_rows, cast_set(offset)); const uint8_t* offsets_ptr = offsets_data.data(); - RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows, false)); if (is_nullable()) { std::vector null_signs(num_rows, 1); const uint8_t* null_sign_ptr = null_signs.data(); - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows, false)); } return Status::OK(); } @@ -1335,7 +1342,7 @@ Status VariantColumnWriter::init() { return _impl->init(); } -Status VariantColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status VariantColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { _next_rowid += num_rows; return _impl->append_data(ptr, num_rows); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 89d544ea2e918d..56b1f559bcd9cd 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -138,7 +138,7 @@ class ColumnWriter { return append_nullable(&nullmap, cell.cell_ptr(), 1); } else { auto* cel_ptr = cell.cell_ptr(); - return append_data((const uint8_t**)&cel_ptr, 1); + return append_data((const uint8_t**)&cel_ptr, 1, false); } } @@ -188,7 +188,7 @@ class ColumnWriter { virtual uint64_t get_total_compressed_data_pages_bytes() const = 0; // used for append not null data. 
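+    // The added 'null' flag marks whether the run being appended consists of null
+    // rows; when set, ScalarColumnWriter routes the run through append_nulls()
+    // instead of updating the zone map, bloom filter and other value indexes itself.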
- virtual Status append_data(const uint8_t** ptr, size_t num_rows) = 0; + virtual Status append_data(const uint8_t** ptr, size_t num_rows, bool null) = 0; bool is_nullable() const { return _is_nullable; } @@ -252,20 +252,21 @@ class ScalarColumnWriter : public ColumnWriter { void register_flush_page_callback(FlushPageCallback* flush_page_callback) { _new_page_callback = flush_page_callback; } - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; // used for append not null data. When page is full, will append data not reach num_rows. - Status append_data_in_current_page(const uint8_t** ptr, size_t* num_written); + Status append_data_in_current_page(const uint8_t** ptr, size_t* num_written, bool null); - Status append_data_in_current_page(const uint8_t* ptr, size_t* num_written) { + Status append_data_in_current_page(const uint8_t* ptr, size_t* num_written, bool null) { RETURN_IF_CATCH_EXCEPTION( - { return _internal_append_data_in_current_page(ptr, num_written); }); + { return _internal_append_data_in_current_page(ptr, num_written, null); }); } friend class ArrayColumnWriter; friend class OffsetColumnWriter; private: - Status _internal_append_data_in_current_page(const uint8_t* ptr, size_t* num_written); + Status _internal_append_data_in_current_page(const uint8_t* ptr, size_t* num_written, + bool null); private: std::unique_ptr _page_builder; @@ -338,7 +339,7 @@ class OffsetColumnWriter final : public ScalarColumnWriter, FlushPageCallback { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; private: void put_extra_info_in_page(DataPageFooterPB* footer) override; @@ -356,7 +357,7 @@ class StructColumnWriter final : public ColumnWriter { Status init() override; Status append_nullable(const uint8_t* null_map, const uint8_t** data, size_t num_rows) override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -428,7 +429,7 @@ class ArrayColumnWriter final : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -509,7 +510,7 @@ class MapColumnWriter final : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; Status append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) override; uint64_t estimate_buffer_size() override; @@ -588,7 +589,7 @@ class VariantSubcolumnWriter : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -649,7 +650,7 @@ class VariantColumnWriter : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 
6137ee5e1f2d53..fe2c39ad506dbe 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -50,6 +50,7 @@ struct ParsedPage { auto null_size = footer.nullmap_size(); auto null_bitmap = Slice(body.data + body.size - null_size, null_size); + page->is_continue = footer.has_is_continue() && footer.is_continue(); if (null_size > 0) { if (footer.has_new_null_map() && footer.new_null_map()) { @@ -63,6 +64,7 @@ struct ParsedPage { page->null_maps.resize(footer.num_values()); auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + // Set is_continue to true when using new null map format } } else { auto null_decoder = @@ -119,6 +121,7 @@ struct ParsedPage { ordinal_t offset_in_page = 0; bool is_dict_encoding = false; + bool is_continue = false; bool contains(ordinal_t ord) { return ord >= first_ordinal && ord < (first_ordinal + num_rows); diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp index 20096a8b0c7697..6319a23b741ecc 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp @@ -723,7 +723,7 @@ Status VariantSubcolumnWriter::init() { return Status::OK(); } -Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { const auto* column = reinterpret_cast(*ptr); const auto& src = *reinterpret_cast(column->column_data); auto* dst_ptr = assert_cast(_column.get()); @@ -837,7 +837,7 @@ Status VariantSubcolumnWriter::write_bloom_filter_index() { Status VariantSubcolumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { // the root contains the same nullable info - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); return Status::OK(); } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 535c270d40b811..be4ecc9f344b10 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -76,6 +76,8 @@ message DataPageFooterPB { optional uint64 next_array_item_ordinal = 4; optional bool new_null_map = 5; + + optional bool is_continue = 6; } message IndexPageFooterPB { diff --git a/tools/tpcds-tools/conf/doris-cluster.conf b/tools/tpcds-tools/conf/doris-cluster.conf index fd737356c2c103..3a3101ff663e3f 100644 --- a/tools/tpcds-tools/conf/doris-cluster.conf +++ b/tools/tpcds-tools/conf/doris-cluster.conf @@ -18,9 +18,9 @@ # Any of FE host export FE_HOST='127.0.0.1' # http_port in fe.conf -export FE_HTTP_PORT=8030 +export FE_HTTP_PORT=8137 # query_port in fe.conf -export FE_QUERY_PORT=9030 +export FE_QUERY_PORT=9137 # Doris username export USER='root' # Doris password From 9a980895355098dad04f05aa12b70f46b7678414 Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 18 Dec 2025 22:56:20 +0800 Subject: [PATCH 06/12] fix query error --- be/src/olap/rowset/segment_v2/column_reader.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index e490e8f306116f..08ed188f4a5078 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -521,6 +521,7 @@ Status ColumnReader::_parse_zone_map(const 
ZoneMapPB& zone_map, WrapperField* mi } } else { RETURN_IF_ERROR(min_value_container->from_string(zone_map.min())); + min_value_container->set_not_null(); } if (zone_map.has_nan()) { @@ -545,6 +546,7 @@ Status ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* mi } } else { RETURN_IF_ERROR(max_value_container->from_string(zone_map.max())); + max_value_container->set_not_null(); } } // for compatible original Cond eval logic From df02946fe26490ca4209a4dea0b102fe1068fdd2 Mon Sep 17 00:00:00 2001 From: happenlee Date: Tue, 23 Dec 2025 17:04:57 +0800 Subject: [PATCH 07/12] support stream agg topn --- .../exec/streaming_aggregation_operator.cpp | 301 ++++++++++++++++-- .../exec/streaming_aggregation_operator.h | 79 ++++- .../translator/PhysicalPlanTranslator.java | 3 +- 3 files changed, 344 insertions(+), 39 deletions(-) diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index 383c2e10079dfa..f1d1966dab7d15 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -99,6 +99,8 @@ Status StreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& info) { _insert_values_to_column_timer = ADD_TIMER(Base::custom_profile(), "InsertValuesToColumnTime"); _deserialize_data_timer = ADD_TIMER(Base::custom_profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::custom_profile(), "HashTableComputeTime"); + _hash_table_limit_compute_timer = + ADD_TIMER(Base::custom_profile(), "HashTableLimitComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::custom_profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::custom_profile(), "HashTableInputCount", TUnit::UNIT); @@ -152,16 +154,10 @@ Status StreamingAggLocalState::open(RuntimeState* state) { }}, _agg_data->method_variant); - if (p._is_merge || p._needs_finalize) { - return Status::InvalidArgument( - "StreamingAggLocalState only support no merge and no finalize, " - "but got is_merge={}, needs_finalize={}", - p._is_merge, p._needs_finalize); - } - - _should_limit_output = p._limit != -1 && // has limit - (!p._have_conjuncts) && // no having conjunct - p._needs_finalize; // agg's finalize step + limit = p._sort_limit; + do_sort_limit = p._do_sort_limit; + null_directions = p._null_directions; + order_directions = p._order_directions; return Status::OK(); } @@ -316,23 +312,22 @@ bool StreamingAggLocalState::_should_not_do_pre_agg(size_t rows) { const auto spill_streaming_agg_mem_limit = p._spill_streaming_agg_mem_limit; const bool used_too_much_memory = spill_streaming_agg_mem_limit > 0 && _memory_usage() > spill_streaming_agg_mem_limit; - std::visit( - vectorized::Overload { - [&](std::monostate& arg) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); - }, - [&](auto& agg_method) { - auto& hash_tbl = *agg_method.hash_table; - /// If too much memory is used during the pre-aggregation stage, - /// it is better to output the data directly without performing further aggregation. 
- // do not try to do agg, just init and serialize directly return the out_block - if (used_too_much_memory || (hash_tbl.add_elem_size_overflow(rows) && - !_should_expand_preagg_hash_tables())) { - SCOPED_TIMER(_streaming_agg_timer); - ret_flag = true; - } - }}, - _agg_data->method_variant); + std::visit(vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + }, + [&](auto& agg_method) { + auto& hash_tbl = *agg_method.hash_table; + /// If too much memory is used during the pre-aggregation stage, + /// it is better to output the data directly without performing further aggregation. + // do not try to do agg, just init and serialize directly return the out_block + if (used_too_much_memory || (hash_tbl.add_elem_size_overflow(rows) && + !_should_expand_preagg_hash_tables())) { + SCOPED_TIMER(_streaming_agg_timer); + ret_flag = true; + } + }}, + _agg_data->method_variant); return ret_flag; } @@ -363,6 +358,30 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B _places.resize(rows); if (_should_not_do_pre_agg(rows)) { + if (limit > 0) { + DCHECK(do_sort_limit); + if (need_do_sort_limit == -1) { + const size_t hash_table_size = _get_hash_table_size(); + need_do_sort_limit = hash_table_size >= limit ? 1 : 0; + if (need_do_sort_limit == 1) { + build_limit_heap(hash_table_size); + } + } + + if (need_do_sort_limit == 1) { + if (_do_limit_filter(rows, key_columns)) { + bool need_filter = std::find(need_computes.begin(), need_computes.end(), 1) != + need_computes.end(); + if (need_filter) { + _add_limit_heap_top(key_columns, rows); + vectorized::Block::filter_block_internal(in_block, need_computes); + rows = (uint32_t)in_block->rows(); + } else { + return Status::OK(); + } + } + } + } bool mem_reuse = p._make_nullable_keys.empty() && out_block->mem_reuse(); std::vector data_types; @@ -404,12 +423,23 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } } } else { - _emplace_into_hash_table(_places.data(), key_columns, rows); + bool need_agg = true; + if (need_do_sort_limit != 1) { + _emplace_into_hash_table(_places.data(), key_columns, rows); + } else { + need_agg = _emplace_into_hash_table_limit(_places.data(), in_block, key_columns, rows); + } - for (int i = 0; i < _aggregate_evaluators.size(); ++i) { - RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_batch_add( - in_block, p._offsets_of_aggregate_states[i], _places.data(), _agg_arena_pool, - _should_expand_hash_table)); + if (need_agg) { + for (int i = 0; i < _aggregate_evaluators.size(); ++i) { + RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_batch_add( + in_block, p._offsets_of_aggregate_states[i], _places.data(), + _agg_arena_pool, _should_expand_hash_table)); + } + if (limit > 0 && need_do_sort_limit == -1 && _get_hash_table_size() >= limit) { + need_do_sort_limit = 1; + build_limit_heap(_get_hash_table_size()); + } } } @@ -561,6 +591,183 @@ void StreamingAggLocalState::_destroy_agg_status(vectorized::AggregateDataPtr da } } +vectorized::MutableColumns StreamingAggLocalState::_get_keys_hash_table() { + return std::visit( + vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + return vectorized::MutableColumns(); + }, + [&](auto&& agg_method) -> vectorized::MutableColumns { + vectorized::MutableColumns key_columns; + for (int i = 0; i < _probe_expr_ctxs.size(); ++i) { + key_columns.emplace_back( + 
_probe_expr_ctxs[i]->root()->data_type()->create_column()); + } + auto& data = *agg_method.hash_table; + bool has_null_key = data.has_null_key_data(); + const auto size = data.size() - has_null_key; + using KeyType = std::decay_t::Key; + std::vector keys(size); + + uint32_t num_rows = 0; + auto iter = _aggregate_data_container->begin(); + { + while (iter != _aggregate_data_container->end()) { + keys[num_rows] = iter.get_key(); + ++iter; + ++num_rows; + } + } + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); + if (has_null_key) { + key_columns[0]->insert_data(nullptr, 0); + } + return key_columns; + }}, + _agg_data->method_variant); +} + +void StreamingAggLocalState::build_limit_heap(size_t hash_table_size) { + limit_columns = _get_keys_hash_table(); + for (size_t i = 0; i < hash_table_size; ++i) { + limit_heap.emplace(i, limit_columns, order_directions, null_directions); + } + while (hash_table_size > limit) { + limit_heap.pop(); + hash_table_size--; + } + limit_columns_min = limit_heap.top()._row_id; +} + +void StreamingAggLocalState::_add_limit_heap_top(vectorized::ColumnRawPtrs& key_columns, + size_t rows) { + for (int i = 0; i < rows; ++i) { + if (cmp_res[i] == 1 && need_computes[i]) { + for (int j = 0; j < key_columns.size(); ++j) { + limit_columns[j]->insert_from(*key_columns[j], i); + } + limit_heap.emplace(limit_columns[0]->size() - 1, limit_columns, order_directions, + null_directions); + limit_heap.pop(); + limit_columns_min = limit_heap.top()._row_id; + break; + } + } +} + +void StreamingAggLocalState::_refresh_limit_heap(size_t i, vectorized::ColumnRawPtrs& key_columns) { + for (int j = 0; j < key_columns.size(); ++j) { + limit_columns[j]->insert_from(*key_columns[j], i); + } + limit_heap.emplace(limit_columns[0]->size() - 1, limit_columns, order_directions, + null_directions); + limit_heap.pop(); + limit_columns_min = limit_heap.top()._row_id; +} + +bool StreamingAggLocalState::_emplace_into_hash_table_limit(vectorized::AggregateDataPtr* places, + vectorized::Block* block, + vectorized::ColumnRawPtrs& key_columns, + uint32_t num_rows) { + return std::visit( + vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + return true; + }, + [&](auto&& agg_method) -> bool { + SCOPED_TIMER(_hash_table_compute_timer); + using HashMethodType = std::decay_t; + using AggState = typename HashMethodType::State; + + bool need_filter = _do_limit_filter(num_rows, key_columns); + if (auto need_agg = + std::find(need_computes.begin(), need_computes.end(), 1); + need_agg != need_computes.end()) { + if (need_filter) { + vectorized::Block::filter_block_internal(block, need_computes); + num_rows = (uint32_t)block->rows(); + } + + AggState state(key_columns); + agg_method.init_serialized_keys(key_columns, num_rows); + size_t i = 0; + + auto creator = [&](const auto& ctor, auto& key, auto& origin) { + try { + HashMethodType::try_presis_key_and_origin(key, origin, + _agg_arena_pool); + auto mapped = _aggregate_data_container->append_data(origin); + auto st = _create_agg_status(mapped); + if (!st) { + throw Exception(st.code(), st.to_string()); + } + ctor(key, mapped); + _refresh_limit_heap(i, key_columns); + } catch (...) { + // Exception-safety - if it can not allocate memory or create status, + // the destructors will not be called. 
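// Registering the key with a null state and rethrowing follows the same pattern as the
// existing _emplace_into_hash_table path. Since _refresh_limit_heap only runs after the
// aggregate state was created successfully, a failed insert never leaves a stale row id in
// limit_heap.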
+ ctor(key, nullptr); + throw; + } + }; + + auto creator_for_null_key = [&](auto& mapped) { + mapped = _agg_arena_pool.aligned_alloc( + Base::_parent->template cast() + ._total_size_of_aggregate_states, + Base::_parent->template cast() + ._align_aggregate_states); + auto st = _create_agg_status(mapped); + if (!st) { + throw Exception(st.code(), st.to_string()); + } + _refresh_limit_heap(i, key_columns); + }; + + SCOPED_TIMER(_hash_table_emplace_timer); + for (i = 0; i < num_rows; ++i) { + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); + } + COUNTER_UPDATE(_hash_table_input_counter, num_rows); + return true; + } + return false; + }}, + _agg_data->method_variant); +} + +bool StreamingAggLocalState::_do_limit_filter(size_t num_rows, + vectorized::ColumnRawPtrs& key_columns) { + SCOPED_TIMER(_hash_table_limit_compute_timer); + if (num_rows) { + cmp_res.resize(num_rows); + need_computes.resize(num_rows); + memset(need_computes.data(), 0, need_computes.size()); + memset(cmp_res.data(), 0, cmp_res.size()); + + const auto key_size = null_directions.size(); + for (int i = 0; i < key_size; i++) { + key_columns[i]->compare_internal(limit_columns_min, *limit_columns[i], + null_directions[i], order_directions[i], cmp_res, + need_computes.data()); + } + + auto set_computes_arr = [](auto* __restrict res, auto* __restrict computes, size_t rows) { + for (size_t i = 0; i < rows; ++i) { + computes[i] = computes[i] == res[i]; + } + }; + set_computes_arr(cmp_res.data(), need_computes.data(), num_rows); + + return std::find(need_computes.begin(), need_computes.end(), 0) != need_computes.end(); + } + + return false; +} + void StreamingAggLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, const uint32_t num_rows) { @@ -616,7 +823,6 @@ StreamingAggOperatorX::StreamingAggOperatorX(ObjectPool* pool, int operator_id, _intermediate_tuple_id(tnode.agg_node.intermediate_tuple_id), _output_tuple_id(tnode.agg_node.output_tuple_id), _needs_finalize(tnode.agg_node.need_finalize), - _is_merge(false), _is_first_phase(tnode.agg_node.__isset.is_first_phase && tnode.agg_node.is_first_phase), _have_conjuncts(tnode.__isset.vconjunct && !tnode.vconjunct.nodes.empty()), _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), @@ -668,8 +874,33 @@ Status StreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* state) } const auto& agg_functions = tnode.agg_node.aggregate_functions; - _is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), - [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); + auto is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), + [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); + if (is_merge || _needs_finalize) { + return Status::InvalidArgument( + "StreamingAggLocalState only support no merge and no finalize, " + "but got is_merge={}, needs_finalize={}", + is_merge, _needs_finalize); + } + + // Handle sort limit + if (tnode.agg_node.__isset.agg_sort_info_by_group_key) { + _sort_limit = _limit; + _limit = -1; + _do_sort_limit = true; + const auto& agg_sort_info = tnode.agg_node.agg_sort_info_by_group_key; + DCHECK_EQ(agg_sort_info.nulls_first.size(), agg_sort_info.is_asc_order.size()); + + const size_t order_by_key_size = agg_sort_info.is_asc_order.size(); + _order_directions.resize(order_by_key_size); + _null_directions.resize(order_by_key_size); + for (int i = 0; i < order_by_key_size; ++i) { + _order_directions[i] = 
agg_sort_info.is_asc_order[i] ? 1 : -1; + _null_directions[i] = + agg_sort_info.nulls_first[i] ? -_order_directions[i] : _order_directions[i]; + } + } + _op_name = "STREAMING_AGGREGATION_OPERATOR"; return Status::OK(); } diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h b/be/src/pipeline/exec/streaming_aggregation_operator.h index d7fc56b6fe65f9..7f7ee9403decf8 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -48,6 +48,7 @@ class StreamingAggLocalState MOCK_REMOVE(final) : public PipelineXLocalState _aggregate_data_container = nullptr; - bool _should_limit_output = false; bool _reach_limit = false; size_t _input_num_rows = 0; + int64_t limit = -1; + int need_do_sort_limit = -1; + bool do_sort_limit = false; + vectorized::MutableColumns limit_columns; + int limit_columns_min = -1; + vectorized::PaddedPODArray need_computes; + std::vector cmp_res; + std::vector order_directions; + std::vector null_directions; + + struct HeapLimitCursor { + HeapLimitCursor(int row_id, vectorized::MutableColumns& limit_columns, + std::vector& order_directions, std::vector& null_directions) + : _row_id(row_id), + _limit_columns(limit_columns), + _order_directions(order_directions), + _null_directions(null_directions) {} + + HeapLimitCursor(const HeapLimitCursor& other) = default; + + HeapLimitCursor(HeapLimitCursor&& other) noexcept + : _row_id(other._row_id), + _limit_columns(other._limit_columns), + _order_directions(other._order_directions), + _null_directions(other._null_directions) {} + + HeapLimitCursor& operator=(const HeapLimitCursor& other) noexcept { + _row_id = other._row_id; + return *this; + } + + HeapLimitCursor& operator=(HeapLimitCursor&& other) noexcept { + _row_id = other._row_id; + return *this; + } + + bool operator<(const HeapLimitCursor& rhs) const { + for (int i = 0; i < _limit_columns.size(); ++i) { + const auto& _limit_column = _limit_columns[i]; + auto res = _limit_column->compare_at(_row_id, rhs._row_id, *_limit_column, + _null_directions[i]) * + _order_directions[i]; + if (res < 0) { + return true; + } else if (res > 0) { + return false; + } + } + return false; + } + + int _row_id; + vectorized::MutableColumns& _limit_columns; + std::vector& _order_directions; + std::vector& _null_directions; + }; + + std::priority_queue limit_heap; + + vectorized::MutableColumns _get_keys_hash_table(); + vectorized::PODArray _places; std::vector _deserialize_buffer; @@ -182,7 +251,6 @@ class StreamingAggOperatorX MOCK_REMOVE(final) : public StatefulOperatorX _make_nullable_keys; bool _have_conjuncts; RowDescriptor _agg_fn_output_row_descriptor; + + // For sort limit + bool _do_sort_limit = false; + int64_t _sort_limit = -1; + std::vector _order_directions; + std::vector _null_directions; + const std::vector _partition_exprs; }; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 01b10a316048b4..0f621c492f6c4c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -344,8 +344,7 @@ public PlanFragment visitPhysicalDistribute(PhysicalDistribute d if (upstreamFragment.getPlanRoot() instanceof AggregationNode && upstream instanceof PhysicalHashAggregate) { PhysicalHashAggregate hashAggregate = 
(PhysicalHashAggregate) upstream; if (hashAggregate.getAggPhase() == AggPhase.LOCAL - && hashAggregate.getAggMode() == AggMode.INPUT_TO_BUFFER - && hashAggregate.getTopnPushInfo() == null) { + && hashAggregate.getAggMode() == AggMode.INPUT_TO_BUFFER) { AggregationNode aggregationNode = (AggregationNode) upstreamFragment.getPlanRoot(); aggregationNode.setUseStreamingPreagg(hashAggregate.isMaybeUsingStream()); } From bbca9eacc3b6c9fdd089500a03904f7240cd8259 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 19 Dec 2025 10:53:50 +0800 Subject: [PATCH 08/12] Manifest cache for tpch1000 (#59178) --- be/src/clucene | 1 + fe/check/checkstyle/suppressions.xml | 3 + .../java/org/apache/doris/common/Config.java | 12 + .../datasource/ExternalMetaCacheMgr.java | 7 + .../iceberg/IcebergManifestCacheMgr.java | 35 + .../datasource/iceberg/IcebergUtils.java | 8 + .../iceberg/cache/ContentFileEstimater.java | 194 ++++ .../iceberg/cache/IcebergManifestCache.java | 96 ++ .../cache/IcebergManifestCacheLoader.java | 89 ++ .../iceberg/cache/ManifestCacheKey.java | 58 ++ .../iceberg/cache/ManifestCacheValue.java | 65 ++ .../iceberg/source/IcebergScanNode.java | 146 ++- .../metastore/AbstractIcebergProperties.java | 62 ++ .../org/apache/iceberg/DeleteFileIndex.java | 906 ++++++++++++++++++ 14 files changed, 1681 insertions(+), 1 deletion(-) create mode 160000 be/src/clucene create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java create mode 100644 fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java diff --git a/be/src/clucene b/be/src/clucene new file mode 160000 index 00000000000000..bb22247973e55d --- /dev/null +++ b/be/src/clucene @@ -0,0 +1 @@ +Subproject commit bb22247973e55dcac9a3eaafedc57cc6c36d2fc3 diff --git a/fe/check/checkstyle/suppressions.xml b/fe/check/checkstyle/suppressions.xml index 8f000bb7616ca9..7340c4c5bd5fe9 100644 --- a/fe/check/checkstyle/suppressions.xml +++ b/fe/check/checkstyle/suppressions.xml @@ -69,6 +69,9 @@ under the License. 
+ + + diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 350e34f8a90940..d4426c2d515136 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2298,6 +2298,18 @@ public class Config extends ConfigBase { }) public static long external_cache_refresh_time_minutes = 10; // 10 mins + @ConfField(description = {"是否启用 Iceberg Manifest DataFile/DeleteFile 缓存。", + "Whether to enable Iceberg manifest DataFile/DeleteFile cache."}) + public static boolean iceberg_manifest_cache_enable = true; + + @ConfField(description = {"Iceberg Manifest 缓存的容量上限,单位 MB。", + "Iceberg manifest cache capacity in MB."}) + public static long iceberg_manifest_cache_capacity_mb = 1024; + + @ConfField(description = {"Iceberg Manifest 缓存的访问过期时间(秒),0 或负数表示不过期。", + "Iceberg manifest cache expire after access in seconds. 0 or negative disables expiration."}) + public static long iceberg_manifest_cache_ttl_sec = 48 * 60 * 60; + /** * Github workflow test type, for setting some session variables * only for certain test type. E.g. only settting batch_size to small diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java index e777285a07f587..798a2170b1e53b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java @@ -29,6 +29,7 @@ import org.apache.doris.datasource.hudi.source.HudiCachedMetaClientProcessor; import org.apache.doris.datasource.hudi.source.HudiMetadataCacheMgr; import org.apache.doris.datasource.hudi.source.HudiPartitionProcessor; +import org.apache.doris.datasource.iceberg.IcebergManifestCacheMgr; import org.apache.doris.datasource.iceberg.IcebergMetadataCache; import org.apache.doris.datasource.iceberg.IcebergMetadataCacheMgr; import org.apache.doris.datasource.maxcompute.MaxComputeMetadataCache; @@ -97,6 +98,7 @@ public class ExternalMetaCacheMgr { private FileSystemCache fsCache; // all external table row count cache. 
private ExternalRowCountCache rowCountCache; + private final IcebergManifestCacheMgr icebergManifestCacheMgr; private final IcebergMetadataCacheMgr icebergMetadataCacheMgr; private final MaxComputeMetadataCacheMgr maxComputeMetadataCacheMgr; private final PaimonMetadataCacheMgr paimonMetadataCacheMgr; @@ -128,6 +130,7 @@ public ExternalMetaCacheMgr(boolean isCheckpointCatalog) { rowCountCache = new ExternalRowCountCache(rowCountRefreshExecutor); hudiMetadataCacheMgr = new HudiMetadataCacheMgr(commonRefreshExecutor); + icebergManifestCacheMgr = new IcebergManifestCacheMgr(); icebergMetadataCacheMgr = new IcebergMetadataCacheMgr(commonRefreshExecutor); maxComputeMetadataCacheMgr = new MaxComputeMetadataCacheMgr(); paimonMetadataCacheMgr = new PaimonMetadataCacheMgr(commonRefreshExecutor); @@ -199,6 +202,10 @@ public HudiMetadataCacheMgr getHudiMetadataCacheMgr() { return hudiMetadataCacheMgr; } + public IcebergManifestCacheMgr getIcebergManifestCacheMgr() { + return icebergManifestCacheMgr; + } + public IcebergMetadataCache getIcebergMetadataCache() { return icebergMetadataCacheMgr.getIcebergMetadataCache(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java new file mode 100644 index 00000000000000..ad95e151b98db5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg; + +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; + +/** + * Wrapper manager for Iceberg manifest cache. 
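+ * Kept as a thin holder: ExternalMetaCacheMgr constructs a single instance and exposes it via
+ * getIcebergManifestCacheMgr(), so scan planning reuses one shared IcebergManifestCache instead
+ * of building a new cache per query.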
+ */ +public class IcebergManifestCacheMgr { + private final IcebergManifestCache manifestCache; + + public IcebergManifestCacheMgr() { + this.manifestCache = new IcebergManifestCache(); + } + + public IcebergManifestCache getManifestCache() { + return manifestCache; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 9587ca4f8169ca..28ddf2817df40b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -56,6 +56,7 @@ import org.apache.doris.datasource.ExternalSchemaCache; import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.SchemaCacheValue; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; import org.apache.doris.datasource.iceberg.source.IcebergTableQueryInfo; import org.apache.doris.datasource.mvcc.MvccSnapshot; import org.apache.doris.datasource.mvcc.MvccUtil; @@ -1452,4 +1453,11 @@ public static String showCreateView(IcebergExternalTable icebergExternalTable) { icebergExternalTable.getViewText(); } + public static IcebergManifestCache getManifestCache() { + return Env.getCurrentEnv() + .getExtMetaCacheMgr() + .getIcebergManifestCacheMgr() + .getManifestCache(); + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java new file mode 100644 index 00000000000000..43f60096e31e4f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.StructLike; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; + +/** + * Utility to estimate the JVM weight of Iceberg {@link ContentFile} objects. 
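+ * The per-field constants below are coarse approximations of object header, reference and
+ * boxed-primitive overhead. The estimate does not need to be exact; it only has to scale with
+ * the real footprint so the Caffeine weigher in IcebergManifestCache can bound total cache
+ * memory at iceberg_manifest_cache_capacity_mb.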
+ */ +public final class ContentFileEstimater { + private static final long LIST_BASE_WEIGHT = 48L; + private static final long OBJECT_REFERENCE_WEIGHT = 8L; + private static final long CONTENT_FILE_BASE_WEIGHT = 256L; + private static final long STRING_BASE_WEIGHT = 40L; + private static final long CHAR_BYTES = 2L; + private static final long BYTE_BUFFER_BASE_WEIGHT = 16L; + private static final long MAP_BASE_WEIGHT = 48L; + private static final long MAP_ENTRY_OVERHEAD = 24L; + private static final long LONG_OBJECT_WEIGHT = 24L; + private static final long INT_OBJECT_WEIGHT = 16L; + private static final long PARTITION_BASE_WEIGHT = 48L; + private static final long PARTITION_VALUE_BASE_WEIGHT = 8L; + + private ContentFileEstimater() { + } + + public static long estimate(List> files) { + return listReferenceWeight(files) + estimateContentFilesWeight(files); + } + + private static long listReferenceWeight(List files) { + if (files == null || files.isEmpty()) { + return 0L; + } + return LIST_BASE_WEIGHT + (long) files.size() * OBJECT_REFERENCE_WEIGHT; + } + + private static long estimateContentFilesWeight(List> files) { + long total = 0L; + if (files == null) { + return 0L; + } + for (ContentFile file : files) { + total += estimateContentFileWeight(file); + } + return total; + } + + private static long estimateContentFileWeight(ContentFile file) { + if (file == null) { + return 0L; + } + + long weight = CONTENT_FILE_BASE_WEIGHT; + weight += charSequenceWeight(file.path()); + weight += stringWeight(file.manifestLocation()); + weight += byteBufferWeight(file.keyMetadata()); + weight += partitionWeight(file.partition()); + + weight += numericMapWeight(file.columnSizes()); + weight += numericMapWeight(file.valueCounts()); + weight += numericMapWeight(file.nullValueCounts()); + weight += numericMapWeight(file.nanValueCounts()); + weight += byteBufferMapWeight(file.lowerBounds()); + weight += byteBufferMapWeight(file.upperBounds()); + + weight += listWeight(file.splitOffsets(), LONG_OBJECT_WEIGHT); + weight += listWeight(file.equalityFieldIds(), INT_OBJECT_WEIGHT); + + weight += optionalLongWeight(file.pos()); + weight += optionalLongWeight(file.dataSequenceNumber()); + weight += optionalLongWeight(file.fileSequenceNumber()); + weight += optionalLongWeight(file.firstRowId()); + weight += optionalIntWeight(file.sortOrderId()); + + if (file instanceof DeleteFile) { + DeleteFile deleteFile = (DeleteFile) file; + weight += stringWeight(deleteFile.referencedDataFile()); + weight += optionalLongWeight(deleteFile.contentOffset()); + weight += optionalLongWeight(deleteFile.contentSizeInBytes()); + } + + return weight; + } + + private static long listWeight(List list, long elementWeight) { + if (list == null || list.isEmpty()) { + return 0L; + } + return LIST_BASE_WEIGHT + (long) list.size() * (OBJECT_REFERENCE_WEIGHT + elementWeight); + } + + private static long numericMapWeight(Map map) { + if (map == null || map.isEmpty()) { + return 0L; + } + return MAP_BASE_WEIGHT + (long) map.size() * (MAP_ENTRY_OVERHEAD + LONG_OBJECT_WEIGHT); + } + + private static long byteBufferMapWeight(Map map) { + if (map == null || map.isEmpty()) { + return 0L; + } + long weight = MAP_BASE_WEIGHT + (long) map.size() * MAP_ENTRY_OVERHEAD; + for (ByteBuffer buffer : map.values()) { + weight += byteBufferWeight(buffer); + } + return weight; + } + + private static long partitionWeight(StructLike partition) { + if (partition == null) { + return 0L; + } + long weight = PARTITION_BASE_WEIGHT + (long) partition.size() * 
PARTITION_VALUE_BASE_WEIGHT; + for (int i = 0; i < partition.size(); i++) { + Object value = partition.get(i, Object.class); + weight += estimateValueWeight(value); + } + return weight; + } + + private static long estimateValueWeight(Object value) { + if (value == null) { + return 0L; + } + if (value instanceof CharSequence) { + return charSequenceWeight((CharSequence) value); + } else if (value instanceof byte[]) { + return BYTE_BUFFER_BASE_WEIGHT + ((byte[]) value).length; + } else if (value instanceof ByteBuffer) { + return byteBufferWeight((ByteBuffer) value); + } else if (value instanceof Long || value instanceof Double) { + return LONG_OBJECT_WEIGHT; + } else if (value instanceof Integer || value instanceof Float) { + return INT_OBJECT_WEIGHT; + } else if (value instanceof Short || value instanceof Character) { + return 4L; + } else if (value instanceof Boolean) { + return 1L; + } + return OBJECT_REFERENCE_WEIGHT; + } + + private static long charSequenceWeight(CharSequence value) { + if (value == null) { + return 0L; + } + return STRING_BASE_WEIGHT + (long) value.length() * CHAR_BYTES; + } + + private static long stringWeight(String value) { + if (value == null) { + return 0L; + } + return STRING_BASE_WEIGHT + (long) value.length() * CHAR_BYTES; + } + + private static long byteBufferWeight(ByteBuffer buffer) { + if (buffer == null) { + return 0L; + } + return BYTE_BUFFER_BASE_WEIGHT + buffer.remaining(); + } + + private static long optionalLongWeight(Long value) { + return value == null ? 0L : LONG_OBJECT_WEIGHT; + } + + private static long optionalIntWeight(Integer value) { + return value == null ? 0L : INT_OBJECT_WEIGHT; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java new file mode 100644 index 00000000000000..be919c5d3134fb --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.doris.common.Config; +import org.apache.doris.datasource.CacheException; + +import com.github.benmanes.caffeine.cache.CacheLoader; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.time.Duration; +import java.util.Optional; +import java.util.concurrent.Callable; + +/** + * A lightweight manifest cache that stores parsed DataFile/DeleteFile lists per manifest. 
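+ * Entries are keyed by the manifest path (manifest files are immutable in Iceberg), weighed by
+ * the estimated size of the parsed files, bounded by Config.iceberg_manifest_cache_capacity_mb,
+ * and optionally expired after access via Config.iceberg_manifest_cache_ttl_sec.
+ *
+ * A minimal usage sketch with the classes added in this patch (IcebergScanNode goes through
+ * IcebergManifestCacheLoader, which wraps the same get-or-load call):
+ * {@code
+ *   IcebergManifestCache cache = IcebergUtils.getManifestCache();
+ *   ManifestCacheValue value =
+ *       IcebergManifestCacheLoader.loadDataFilesWithCache(cache, manifest, table);
+ *   List<DataFile> dataFiles = value.getDataFiles();
+ * }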
+ */ +public class IcebergManifestCache { + private static final Logger LOG = LogManager.getLogger(IcebergManifestCache.class); + + private final LoadingCache cache; + + public IcebergManifestCache() { + long capacityInBytes = Config.iceberg_manifest_cache_capacity_mb * 1024L * 1024L; + Weigher weigher = (key, value) -> { + long weight = Optional.ofNullable(value).map(ManifestCacheValue::getWeightBytes).orElse(0L); + if (weight > Integer.MAX_VALUE) { + return Integer.MAX_VALUE; + } + return (int) weight; + }; + Caffeine builder = Caffeine.newBuilder() + .maximumWeight(capacityInBytes) + .weigher(weigher); + if (Config.iceberg_manifest_cache_ttl_sec > 0) { + builder = builder.expireAfterAccess(Duration.ofSeconds(Config.iceberg_manifest_cache_ttl_sec)); + } + cache = builder.build(new CacheLoader() { + @Override + public ManifestCacheValue load(ManifestCacheKey key) { + throw new CacheException("Manifest cache loader should be provided explicitly for key %s", null, key); + } + }); + } + + public ManifestCacheValue get(ManifestCacheKey key, Callable loader) { + try { + return cache.get(key, ignored -> { + try { + return loader.call(); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } catch (Exception e) { + throw new CacheException("Failed to load manifest cache for key %s", e, key); + } + } + + public Optional peek(ManifestCacheKey key) { + return Optional.ofNullable(cache.getIfPresent(key)); + } + + public void invalidateByPath(String path) { + cache.asMap().keySet().stream() + .filter(key -> key.getPath().equals(path)) + .forEach(cache::invalidate); + } + + public void invalidateAll() { + cache.invalidateAll(); + } + + public ManifestCacheKey buildKey(String path) { + return new ManifestCacheKey(path); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java new file mode 100644 index 00000000000000..dc4d16da61b60a --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.doris.datasource.CacheException; + +import com.google.common.collect.Lists; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestReader; +import org.apache.iceberg.Table; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.util.List; + +/** + * Helper to load manifest content and populate the manifest cache. + */ +public class IcebergManifestCacheLoader { + private static final Logger LOG = LogManager.getLogger(IcebergManifestCacheLoader.class); + + private IcebergManifestCacheLoader() { + } + + public static ManifestCacheValue loadDataFilesWithCache(IcebergManifestCache cache, ManifestFile manifest, + Table table) { + ManifestCacheKey key = buildKey(cache, manifest); + return cache.get(key, () -> loadDataFiles(manifest, table)); + } + + public static ManifestCacheValue loadDeleteFilesWithCache(IcebergManifestCache cache, ManifestFile manifest, + Table table) { + ManifestCacheKey key = buildKey(cache, manifest); + return cache.get(key, () -> loadDeleteFiles(manifest, table)); + } + + private static ManifestCacheValue loadDataFiles(ManifestFile manifest, Table table) { + List dataFiles = Lists.newArrayList(); + try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { + // ManifestReader implements CloseableIterable, iterate directly + for (DataFile dataFile : reader) { + dataFiles.add(dataFile.copy()); + } + } catch (IOException e) { + LOG.warn("Failed to read data manifest {}", manifest.path(), e); + throw new CacheException("Failed to read data manifest %s", e, manifest.path()); + } + return ManifestCacheValue.forDataFiles(dataFiles); + } + + private static ManifestCacheValue loadDeleteFiles(ManifestFile manifest, Table table) { + List deleteFiles = Lists.newArrayList(); + try (ManifestReader reader = ManifestFiles.readDeleteManifest(manifest, table.io(), + table.specs())) { + // ManifestReader implements CloseableIterable, iterate directly + for (DeleteFile deleteFile : reader) { + deleteFiles.add(deleteFile.copy()); + } + } catch (IOException e) { + LOG.warn("Failed to read delete manifest {}", manifest.path(), e); + throw new CacheException("Failed to read delete manifest %s", e, manifest.path()); + } + return ManifestCacheValue.forDeleteFiles(deleteFiles); + } + + private static ManifestCacheKey buildKey(IcebergManifestCache cache, ManifestFile manifest) { + // Iceberg manifest files are immutable, so path uniquely identifies a manifest + return cache.buildKey(manifest.path()); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java new file mode 100644 index 00000000000000..41b52187aec3f5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import java.util.Objects; + +/** + * Cache key for a single Iceberg manifest file. + * Since Iceberg manifest files are immutable, path uniquely identifies a manifest. + */ +public class ManifestCacheKey { + private final String path; + + public ManifestCacheKey(String path) { + this.path = path; + } + + public String getPath() { + return path; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ManifestCacheKey)) { + return false; + } + ManifestCacheKey that = (ManifestCacheKey) o; + return Objects.equals(path, that.path); + } + + @Override + public int hashCode() { + return Objects.hash(path); + } + + @Override + public String toString() { + return "ManifestCacheKey{path='" + path + "'}"; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java new file mode 100644 index 00000000000000..0c7c9154639d6e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; + +import java.util.Collections; +import java.util.List; + +/** + * Cached manifest payload containing parsed files and an estimated weight. + */ +public class ManifestCacheValue { + private final List dataFiles; + private final List deleteFiles; + private final long weightBytes; + + private ManifestCacheValue(List dataFiles, List deleteFiles, long weightBytes) { + this.dataFiles = dataFiles == null ? Collections.emptyList() : dataFiles; + this.deleteFiles = deleteFiles == null ? 
Collections.emptyList() : deleteFiles; + this.weightBytes = weightBytes; + } + + public static ManifestCacheValue forDataFiles(List dataFiles) { + return new ManifestCacheValue(dataFiles, Collections.emptyList(), + estimateWeight(dataFiles, Collections.emptyList())); + } + + public static ManifestCacheValue forDeleteFiles(List deleteFiles) { + return new ManifestCacheValue(Collections.emptyList(), deleteFiles, + estimateWeight(Collections.emptyList(), deleteFiles)); + } + + public List getDataFiles() { + return dataFiles; + } + + public List getDeleteFiles() { + return deleteFiles; + } + + public long getWeightBytes() { + return weightBytes; + } + + private static long estimateWeight(List dataFiles, List deleteFiles) { + return ContentFileEstimater.estimate(dataFiles) + ContentFileEstimater.estimate(deleteFiles); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index f5208397a0f324..0ffe86edb315d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -23,6 +23,7 @@ import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.UserException; import org.apache.doris.common.security.authentication.ExecutionAuthenticator; @@ -38,6 +39,9 @@ import org.apache.doris.datasource.iceberg.IcebergExternalCatalog; import org.apache.doris.datasource.iceberg.IcebergExternalTable; import org.apache.doris.datasource.iceberg.IcebergUtils; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCacheLoader; +import org.apache.doris.datasource.iceberg.cache.ManifestCacheValue; import org.apache.doris.datasource.property.storage.StorageProperties; import org.apache.doris.nereids.exceptions.NotSupportedException; import org.apache.doris.planner.PlanNodeId; @@ -57,18 +61,27 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.iceberg.BaseFileScanTask; import org.apache.iceberg.BaseTable; import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.DeleteFileIndex; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestContent; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.PartitionData; import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.TableScan; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.InclusiveMetricsEvaluator; +import org.apache.iceberg.expressions.ManifestEvaluator; +import org.apache.iceberg.expressions.ResidualEvaluator; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.types.Conversions; @@ -78,9 +91,12 @@ import java.io.IOException; import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.OptionalLong; import java.util.concurrent.CompletableFuture; @@ -358,8 +374,136 @@ public TableScan createTableScan() throws UserException { } private CloseableIterable planFileScanTask(TableScan scan) { + if (!Config.iceberg_manifest_cache_enable) { + long targetSplitSize = getRealFileSplitSize(0); + return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + } + try { + return planFileScanTaskWithManifestCache(scan); + } catch (Exception e) { + LOG.warn("Plan with manifest cache failed, fallback to original scan: {}", e.getMessage()); + long targetSplitSize = getRealFileSplitSize(0); + return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + } + } + + private CloseableIterable planFileScanTaskWithManifestCache(TableScan scan) throws IOException { + // Get the snapshot from the scan; return empty if no snapshot exists + Snapshot snapshot = scan.snapshot(); + if (snapshot == null) { + return CloseableIterable.withNoopClose(Collections.emptyList()); + } + + // Initialize manifest cache for efficient manifest file access + IcebergManifestCache cache = IcebergUtils.getManifestCache(); + + // Convert query conjuncts to Iceberg filter expression + // This combines all predicates with AND logic for partition/file pruning + Expression filterExpr = conjuncts.stream() + .map(conjunct -> IcebergUtils.convertToIcebergExpr(conjunct, icebergTable.schema())) + .filter(Objects::nonNull) + .reduce(Expressions.alwaysTrue(), Expressions::and); + + // Get all partition specs by their IDs for later use + Map specsById = icebergTable.specs(); + boolean caseSensitive = true; + + // Create residual evaluators for each partition spec + // Residual evaluators compute the remaining filter expression after partition pruning + Map residualEvaluators = new HashMap<>(); + specsById.forEach((id, spec) -> residualEvaluators.put(id, + ResidualEvaluator.of(spec, filterExpr == null ? Expressions.alwaysTrue() : filterExpr, + caseSensitive))); + + // Create metrics evaluator for file-level pruning based on column statistics + InclusiveMetricsEvaluator metricsEvaluator = filterExpr == null ? null + : new InclusiveMetricsEvaluator(icebergTable.schema(), filterExpr, caseSensitive); + + // ========== Phase 1: Load delete files from delete manifests ========== + List deleteFiles = new ArrayList<>(); + List deleteManifests = snapshot.deleteManifests(icebergTable.io()); + for (ManifestFile manifest : deleteManifests) { + // Skip non-delete manifests + if (manifest.content() != ManifestContent.DELETES) { + continue; + } + // Get the partition spec for this manifest + PartitionSpec spec = specsById.get(manifest.partitionSpecId()); + if (spec == null) { + continue; + } + // Create manifest evaluator for partition-level pruning + ManifestEvaluator evaluator = filterExpr == null ? 
null + : ManifestEvaluator.forPartitionFilter(filterExpr, spec, caseSensitive); + // Skip manifest if it doesn't match the filter expression (partition pruning) + if (evaluator != null && !evaluator.eval(manifest)) { + continue; + } + // Load delete files from cache (or from storage if not cached) + ManifestCacheValue value = IcebergManifestCacheLoader.loadDeleteFilesWithCache(cache, manifest, + icebergTable); + deleteFiles.addAll(value.getDeleteFiles()); + } + + // Build delete file index for efficient lookup of deletes applicable to each data file + DeleteFileIndex deleteIndex = DeleteFileIndex.builderFor(deleteFiles) + .specsById(specsById) + .caseSensitive(caseSensitive) + .build(); + + // ========== Phase 2: Load data files and create scan tasks ========== + List tasks = new ArrayList<>(); + try (CloseableIterable dataManifests = + IcebergUtils.getMatchingManifest(snapshot.dataManifests(icebergTable.io()), + specsById, filterExpr)) { + for (ManifestFile manifest : dataManifests) { + // Skip non-data manifests + if (manifest.content() != ManifestContent.DATA) { + continue; + } + // Get the partition spec for this manifest + PartitionSpec spec = specsById.get(manifest.partitionSpecId()); + if (spec == null) { + continue; + } + // Get the residual evaluator for this partition spec + ResidualEvaluator residualEvaluator = residualEvaluators.get(manifest.partitionSpecId()); + + // Load data files from cache (or from storage if not cached) + ManifestCacheValue value = IcebergManifestCacheLoader.loadDataFilesWithCache(cache, manifest, + icebergTable); + + // Process each data file in the manifest + for (org.apache.iceberg.DataFile dataFile : value.getDataFiles()) { + // Skip file if column statistics indicate no matching rows (metrics-based pruning) + if (metricsEvaluator != null && !metricsEvaluator.eval(dataFile)) { + continue; + } + // Skip file if partition values don't match the residual filter + if (residualEvaluator != null) { + if (residualEvaluator.residualFor(dataFile.partition()).equals(Expressions.alwaysFalse())) { + continue; + } + } + // Find all delete files that apply to this data file based on sequence number + List deletes = Arrays.asList( + deleteIndex.forDataFile(dataFile.dataSequenceNumber(), dataFile)); + + // Create a FileScanTask containing the data file, associated deletes, and metadata + tasks.add(new BaseFileScanTask( + dataFile, + deletes.toArray(new DeleteFile[0]), + SchemaParser.toJson(icebergTable.schema()), + PartitionSpecParser.toJson(spec), + residualEvaluator == null ? 
ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()) + : residualEvaluator)); + } + } + } + + // Split tasks into smaller chunks based on target split size for parallel processing long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } private Split createIcebergSplit(FileScanTask fileScanTask) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java index 2cc829c87433f0..88def12d2a599c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java @@ -43,6 +43,43 @@ public abstract class AbstractIcebergProperties extends MetastoreProperties { ) protected String warehouse; + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_ENABLED}, + required = false, + description = "Controls whether to use caching during manifest reads or not. Default: false." + ) + protected String ioManifestCacheEnabled; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS}, + required = false, + description = "Controls the maximum duration for which an entry stays in the manifest cache. " + + "Must be a non-negative value. Zero means entries expire only due to memory pressure. " + + "Default: 60000 (60s)." + ) + protected String ioManifestCacheExpirationIntervalMs; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES}, + required = false, + description = "Controls the maximum total amount of bytes to cache in manifest cache. " + + "Must be a positive value. Default: 104857600 (100MB)." + ) + protected String ioManifestCacheMaxTotalBytes; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH}, + required = false, + description = "Controls the maximum length of file to be considered for caching. " + + "An InputFile will not be cached if the length is longer than this limit. " + + "Must be a positive value. Default: 8388608 (8MB)." + ) + protected String ioManifestCacheMaxContentLength; + @Getter protected ExecutionAuthenticator executionAuthenticator = new ExecutionAuthenticator(){}; @@ -80,6 +117,9 @@ public final Catalog initializeCatalog(String catalogName, List catalogProps) { + if (StringUtils.isNotBlank(ioManifestCacheEnabled)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_ENABLED, ioManifestCacheEnabled); + } + if (StringUtils.isNotBlank(ioManifestCacheExpirationIntervalMs)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS, + ioManifestCacheExpirationIntervalMs); + } + if (StringUtils.isNotBlank(ioManifestCacheMaxTotalBytes)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES, ioManifestCacheMaxTotalBytes); + } + if (StringUtils.isNotBlank(ioManifestCacheMaxContentLength)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH, ioManifestCacheMaxContentLength); + } + } + /** * Subclasses must implement this to create the concrete Catalog instance. 
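 * (As added in this patch, the io.manifest.cache.* options declared above are forwarded into
 * catalogProps when they are set, so concrete catalog implementations pick them up without any
 * extra wiring.)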
*/ diff --git a/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java b/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java new file mode 100644 index 00000000000000..5c9cdd93c45f93 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java @@ -0,0 +1,906 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ManifestEvaluator; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.metrics.ScanMetrics; +import org.apache.iceberg.metrics.ScanMetricsUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.ContentFileUtil; +import org.apache.iceberg.util.PartitionMap; +import org.apache.iceberg.util.PartitionSet; +import org.apache.iceberg.util.Tasks; + +/** + * An index of {@link DeleteFile delete files} by sequence number. + * + *
<p>
Use {@link #builderFor(FileIO, Iterable)} to construct an index, and {@link #forDataFile(long, + * DataFile)} or {@link #forEntry(ManifestEntry)} to get the delete files to apply to a given data + * file. + * + * Copyed from https://github.com/apache/iceberg/blob/apache-iceberg-1.9.1/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java + * Change DeleteFileIndex and some methods to public. + */ +public class DeleteFileIndex { + private static final DeleteFile[] EMPTY_DELETES = new DeleteFile[0]; + + private final EqualityDeletes globalDeletes; + private final PartitionMap eqDeletesByPartition; + private final PartitionMap posDeletesByPartition; + private final Map posDeletesByPath; + private final Map dvByPath; + private final boolean hasEqDeletes; + private final boolean hasPosDeletes; + private final boolean isEmpty; + + private DeleteFileIndex( + EqualityDeletes globalDeletes, + PartitionMap eqDeletesByPartition, + PartitionMap posDeletesByPartition, + Map posDeletesByPath, + Map dvByPath) { + this.globalDeletes = globalDeletes; + this.eqDeletesByPartition = eqDeletesByPartition; + this.posDeletesByPartition = posDeletesByPartition; + this.posDeletesByPath = posDeletesByPath; + this.dvByPath = dvByPath; + this.hasEqDeletes = globalDeletes != null || eqDeletesByPartition != null; + this.hasPosDeletes = + posDeletesByPartition != null || posDeletesByPath != null || dvByPath != null; + this.isEmpty = !hasEqDeletes && !hasPosDeletes; + } + + public boolean isEmpty() { + return isEmpty; + } + + public boolean hasEqualityDeletes() { + return hasEqDeletes; + } + + public boolean hasPositionDeletes() { + return hasPosDeletes; + } + + public Iterable referencedDeleteFiles() { + Iterable deleteFiles = Collections.emptyList(); + + if (globalDeletes != null) { + deleteFiles = Iterables.concat(deleteFiles, globalDeletes.referencedDeleteFiles()); + } + + if (eqDeletesByPartition != null) { + for (EqualityDeletes deletes : eqDeletesByPartition.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (posDeletesByPartition != null) { + for (PositionDeletes deletes : posDeletesByPartition.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (posDeletesByPath != null) { + for (PositionDeletes deletes : posDeletesByPath.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (dvByPath != null) { + deleteFiles = Iterables.concat(deleteFiles, dvByPath.values()); + } + + return deleteFiles; + } + + DeleteFile[] forEntry(ManifestEntry entry) { + return forDataFile(entry.dataSequenceNumber(), entry.file()); + } + + public DeleteFile[] forDataFile(DataFile file) { + return forDataFile(file.dataSequenceNumber(), file); + } + + public DeleteFile[] forDataFile(long sequenceNumber, DataFile file) { + if (isEmpty) { + return EMPTY_DELETES; + } + + DeleteFile[] global = findGlobalDeletes(sequenceNumber, file); + DeleteFile[] eqPartition = findEqPartitionDeletes(sequenceNumber, file); + DeleteFile dv = findDV(sequenceNumber, file); + if (dv != null && global == null && eqPartition == null) { + return new DeleteFile[] {dv}; + } else if (dv != null) { + return concat(global, eqPartition, new DeleteFile[] {dv}); + } else { + DeleteFile[] posPartition = findPosPartitionDeletes(sequenceNumber, file); + DeleteFile[] posPath = findPathDeletes(sequenceNumber, file); + return concat(global, eqPartition, posPartition, posPath); + } + } + + private DeleteFile[] 
findGlobalDeletes(long seq, DataFile dataFile) { + return globalDeletes == null ? EMPTY_DELETES : globalDeletes.filter(seq, dataFile); + } + + private DeleteFile[] findPosPartitionDeletes(long seq, DataFile dataFile) { + if (posDeletesByPartition == null) { + return EMPTY_DELETES; + } + + PositionDeletes deletes = posDeletesByPartition.get(dataFile.specId(), dataFile.partition()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq); + } + + private DeleteFile[] findEqPartitionDeletes(long seq, DataFile dataFile) { + if (eqDeletesByPartition == null) { + return EMPTY_DELETES; + } + + EqualityDeletes deletes = eqDeletesByPartition.get(dataFile.specId(), dataFile.partition()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq, dataFile); + } + + @SuppressWarnings("CollectionUndefinedEquality") + private DeleteFile[] findPathDeletes(long seq, DataFile dataFile) { + if (posDeletesByPath == null) { + return EMPTY_DELETES; + } + + PositionDeletes deletes = posDeletesByPath.get(dataFile.location()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq); + } + + private DeleteFile findDV(long seq, DataFile dataFile) { + if (dvByPath == null) { + return null; + } + + DeleteFile dv = dvByPath.get(dataFile.location()); + if (dv != null) { + ValidationException.check( + dv.dataSequenceNumber() >= seq, + "DV data sequence number (%s) must be greater than or equal to data file sequence number (%s)", + dv.dataSequenceNumber(), + seq); + } + return dv; + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private static boolean canContainEqDeletesForFile( + DataFile dataFile, EqualityDeleteFile deleteFile) { + Map dataLowers = dataFile.lowerBounds(); + Map dataUppers = dataFile.upperBounds(); + + // whether to check data ranges or to assume that the ranges match + // if upper/lower bounds are missing, null counts may still be used to determine delete files + // can be skipped + boolean checkRanges = + dataLowers != null && dataUppers != null && deleteFile.hasLowerAndUpperBounds(); + + Map dataNullCounts = dataFile.nullValueCounts(); + Map dataValueCounts = dataFile.valueCounts(); + Map deleteNullCounts = deleteFile.nullValueCounts(); + Map deleteValueCounts = deleteFile.valueCounts(); + + for (Types.NestedField field : deleteFile.equalityFields()) { + if (!field.type().isPrimitiveType()) { + // stats are not kept for nested types. 
assume that the delete file may match + continue; + } + + if (containsNull(dataNullCounts, field) && containsNull(deleteNullCounts, field)) { + // the data has null values and null has been deleted, so the deletes must be applied + continue; + } + + if (allNull(dataNullCounts, dataValueCounts, field) && allNonNull(deleteNullCounts, field)) { + // the data file contains only null values for this field, but there are no deletes for null + // values + return false; + } + + if (allNull(deleteNullCounts, deleteValueCounts, field) + && allNonNull(dataNullCounts, field)) { + // the delete file removes only null rows with null for this field, but there are no data + // rows with null + return false; + } + + if (!checkRanges) { + // some upper and lower bounds are missing, assume they match + continue; + } + + int id = field.fieldId(); + ByteBuffer dataLower = dataLowers.get(id); + ByteBuffer dataUpper = dataUppers.get(id); + Object deleteLower = deleteFile.lowerBound(id); + Object deleteUpper = deleteFile.upperBound(id); + if (dataLower == null || dataUpper == null || deleteLower == null || deleteUpper == null) { + // at least one bound is not known, assume the delete file may match + continue; + } + + if (!rangesOverlap(field, dataLower, dataUpper, deleteLower, deleteUpper)) { + // no values overlap between the data file and the deletes + return false; + } + } + + return true; + } + + private static boolean rangesOverlap( + Types.NestedField field, + ByteBuffer dataLowerBuf, + ByteBuffer dataUpperBuf, + T deleteLower, + T deleteUpper) { + Type.PrimitiveType type = field.type().asPrimitiveType(); + Comparator comparator = Comparators.forType(type); + + T dataLower = Conversions.fromByteBuffer(type, dataLowerBuf); + if (comparator.compare(dataLower, deleteUpper) > 0) { + return false; + } + + T dataUpper = Conversions.fromByteBuffer(type, dataUpperBuf); + if (comparator.compare(deleteLower, dataUpper) > 0) { + return false; + } + + return true; + } + + private static boolean allNonNull(Map nullValueCounts, Types.NestedField field) { + if (field.isRequired()) { + return true; + } + + if (nullValueCounts == null) { + return false; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + if (nullValueCount == null) { + return false; + } + + return nullValueCount <= 0; + } + + private static boolean allNull( + Map nullValueCounts, Map valueCounts, Types.NestedField field) { + if (field.isRequired()) { + return false; + } + + if (nullValueCounts == null || valueCounts == null) { + return false; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + Long valueCount = valueCounts.get(field.fieldId()); + if (nullValueCount == null || valueCount == null) { + return false; + } + + return nullValueCount.equals(valueCount); + } + + private static boolean containsNull(Map nullValueCounts, Types.NestedField field) { + if (field.isRequired()) { + return false; + } + + if (nullValueCounts == null) { + return true; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + if (nullValueCount == null) { + return true; + } + + return nullValueCount > 0; + } + + static Builder builderFor(FileIO io, Iterable deleteManifests) { + return new Builder(io, Sets.newHashSet(deleteManifests)); + } + + // changed to public method. + public static Builder builderFor(Iterable deleteFiles) { + return new Builder(deleteFiles); + } + + // changed to public class. 
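// A minimal usage sketch of the now-public builder API (the deleteFiles, specsById and dataFile
// variables are assumed to be available in the caller; this mirrors the class-level javadoc):
//
//     DeleteFileIndex index = DeleteFileIndex.builderFor(deleteFiles)
//             .specsById(specsById)
//             .caseSensitive(true)
//             .build();
//     DeleteFile[] deletes = index.forDataFile(dataFile);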
+ public static class Builder { + private final FileIO io; + private final Set deleteManifests; + private final Iterable deleteFiles; + private long minSequenceNumber = 0L; + private Map specsById = null; + private Expression dataFilter = Expressions.alwaysTrue(); + private Expression partitionFilter = Expressions.alwaysTrue(); + private PartitionSet partitionSet = null; + private boolean caseSensitive = true; + private ExecutorService executorService = null; + private ScanMetrics scanMetrics = ScanMetrics.noop(); + private boolean ignoreResiduals = false; + + Builder(FileIO io, Set deleteManifests) { + this.io = io; + this.deleteManifests = Sets.newHashSet(deleteManifests); + this.deleteFiles = null; + } + + Builder(Iterable deleteFiles) { + this.io = null; + this.deleteManifests = null; + this.deleteFiles = deleteFiles; + } + + Builder afterSequenceNumber(long seq) { + this.minSequenceNumber = seq; + return this; + } + + public Builder specsById(Map newSpecsById) { + this.specsById = newSpecsById; + return this; + } + + Builder filterData(Expression newDataFilter) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support data filters"); + this.dataFilter = Expressions.and(dataFilter, newDataFilter); + return this; + } + + Builder filterPartitions(Expression newPartitionFilter) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support partition filters"); + this.partitionFilter = Expressions.and(partitionFilter, newPartitionFilter); + return this; + } + + Builder filterPartitions(PartitionSet newPartitionSet) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support partition filters"); + this.partitionSet = newPartitionSet; + return this; + } + + public Builder caseSensitive(boolean newCaseSensitive) { + this.caseSensitive = newCaseSensitive; + return this; + } + + Builder planWith(ExecutorService newExecutorService) { + this.executorService = newExecutorService; + return this; + } + + Builder scanMetrics(ScanMetrics newScanMetrics) { + this.scanMetrics = newScanMetrics; + return this; + } + + Builder ignoreResiduals() { + this.ignoreResiduals = true; + return this; + } + + private Iterable filterDeleteFiles() { + return Iterables.filter(deleteFiles, file -> file.dataSequenceNumber() > minSequenceNumber); + } + + private Collection loadDeleteFiles() { + // read all of the matching delete manifests in parallel and accumulate the matching files in + // a queue + Queue files = new ConcurrentLinkedQueue<>(); + Tasks.foreach(deleteManifestReaders()) + .stopOnFailure() + .throwFailureWhenFinished() + .executeWith(executorService) + .run( + deleteFile -> { + try (CloseableIterable> reader = deleteFile) { + for (ManifestEntry entry : reader) { + if (entry.dataSequenceNumber() > minSequenceNumber) { + // copy with stats for better filtering against data file stats + files.add(entry.file().copy()); + } + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to close"); + } + }); + return files; + } + + public DeleteFileIndex build() { + Iterable files = deleteFiles != null ? 
filterDeleteFiles() : loadDeleteFiles(); + + EqualityDeletes globalDeletes = new EqualityDeletes(); + PartitionMap eqDeletesByPartition = PartitionMap.create(specsById); + PartitionMap posDeletesByPartition = PartitionMap.create(specsById); + Map posDeletesByPath = Maps.newHashMap(); + Map dvByPath = Maps.newHashMap(); + + for (DeleteFile file : files) { + switch (file.content()) { + case POSITION_DELETES: + if (ContentFileUtil.isDV(file)) { + add(dvByPath, file); + } else { + add(posDeletesByPath, posDeletesByPartition, file); + } + break; + case EQUALITY_DELETES: + add(globalDeletes, eqDeletesByPartition, file); + break; + default: + throw new UnsupportedOperationException("Unsupported content: " + file.content()); + } + ScanMetricsUtil.indexedDeleteFile(scanMetrics, file); + } + + return new DeleteFileIndex( + globalDeletes.isEmpty() ? null : globalDeletes, + eqDeletesByPartition.isEmpty() ? null : eqDeletesByPartition, + posDeletesByPartition.isEmpty() ? null : posDeletesByPartition, + posDeletesByPath.isEmpty() ? null : posDeletesByPath, + dvByPath.isEmpty() ? null : dvByPath); + } + + private void add(Map dvByPath, DeleteFile dv) { + String path = dv.referencedDataFile(); + DeleteFile existingDV = dvByPath.putIfAbsent(path, dv); + if (existingDV != null) { + throw new ValidationException( + "Can't index multiple DVs for %s: %s and %s", + path, ContentFileUtil.dvDesc(dv), ContentFileUtil.dvDesc(existingDV)); + } + } + + private void add( + Map deletesByPath, + PartitionMap deletesByPartition, + DeleteFile file) { + String path = ContentFileUtil.referencedDataFileLocation(file); + + PositionDeletes deletes; + if (path != null) { + deletes = deletesByPath.computeIfAbsent(path, ignored -> new PositionDeletes()); + } else { + int specId = file.specId(); + StructLike partition = file.partition(); + deletes = deletesByPartition.computeIfAbsent(specId, partition, PositionDeletes::new); + } + + deletes.add(file); + } + + private void add( + EqualityDeletes globalDeletes, + PartitionMap deletesByPartition, + DeleteFile file) { + PartitionSpec spec = specsById.get(file.specId()); + + EqualityDeletes deletes; + if (spec.isUnpartitioned()) { + deletes = globalDeletes; + } else { + int specId = spec.specId(); + StructLike partition = file.partition(); + deletes = deletesByPartition.computeIfAbsent(specId, partition, EqualityDeletes::new); + } + + deletes.add(spec, file); + } + + private Iterable>> deleteManifestReaders() { + Expression entryFilter = ignoreResiduals ? Expressions.alwaysTrue() : dataFilter; + + LoadingCache partExprCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return Projections.inclusive(spec, caseSensitive).project(dataFilter); + }); + + LoadingCache evalCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return ManifestEvaluator.forPartitionFilter( + Expressions.and(partitionFilter, partExprCache.get(specId)), + spec, + caseSensitive); + }); + + CloseableIterable closeableDeleteManifests = + CloseableIterable.withNoopClose(deleteManifests); + CloseableIterable matchingManifests = + evalCache == null + ? 
closeableDeleteManifests + : CloseableIterable.filter( + scanMetrics.skippedDeleteManifests(), + closeableDeleteManifests, + manifest -> + manifest.content() == ManifestContent.DELETES + && (manifest.hasAddedFiles() || manifest.hasExistingFiles()) + && evalCache.get(manifest.partitionSpecId()).eval(manifest)); + + matchingManifests = + CloseableIterable.count(scanMetrics.scannedDeleteManifests(), matchingManifests); + return Iterables.transform( + matchingManifests, + manifest -> + ManifestFiles.readDeleteManifest(manifest, io, specsById) + .filterRows(entryFilter) + .filterPartitions( + Expressions.and( + partitionFilter, partExprCache.get(manifest.partitionSpecId()))) + .filterPartitions(partitionSet) + .caseSensitive(caseSensitive) + .scanMetrics(scanMetrics) + .liveEntries()); + } + } + + /** + * Finds an index in the sorted array of sequence numbers where the given sequence number should + * be inserted or is found. + * + *
<p>
If the sequence number is present in the array, this method returns the index of the first + * occurrence of the sequence number. If the sequence number is not present, the method returns + * the index where the sequence number would be inserted while maintaining the sorted order of the + * array. This returned index ranges from 0 (inclusive) to the length of the array (inclusive). + * + *
<p>
This method is used to determine the subset of delete files that apply to a given data file. + * + * @param seqs an array of sequence numbers sorted in ascending order + * @param seq the sequence number to search for + * @return the index of the first occurrence or the insertion point + */ + private static int findStartIndex(long[] seqs, long seq) { + int pos = Arrays.binarySearch(seqs, seq); + int start; + if (pos < 0) { + // the sequence number was not found, where it would be inserted is -(pos + 1) + start = -(pos + 1); + } else { + // the sequence number was found, but may not be the first + // find the first delete file with the given sequence number by decrementing the position + start = pos; + while (start > 0 && seqs[start - 1] >= seq) { + start -= 1; + } + } + + return start; + } + + private static DeleteFile[] concat(DeleteFile[]... deletes) { + return ArrayUtil.concat(DeleteFile.class, deletes); + } + + // a group of position delete files sorted by the sequence number they apply to + static class PositionDeletes { + private static final Comparator SEQ_COMPARATOR = + Comparator.comparingLong(DeleteFile::dataSequenceNumber); + + // indexed state + private long[] seqs = null; + private DeleteFile[] files = null; + + // a buffer that is used to hold files before indexing + private volatile List buffer = Lists.newArrayList(); + + public void add(DeleteFile file) { + Preconditions.checkState(buffer != null, "Can't add files upon indexing"); + buffer.add(file); + } + + public DeleteFile[] filter(long seq) { + indexIfNeeded(); + + int start = findStartIndex(seqs, seq); + + if (start >= files.length) { + return EMPTY_DELETES; + } + + if (start == 0) { + return files; + } + + int matchingFilesCount = files.length - start; + DeleteFile[] matchingFiles = new DeleteFile[matchingFilesCount]; + System.arraycopy(files, start, matchingFiles, 0, matchingFilesCount); + return matchingFiles; + } + + public Iterable referencedDeleteFiles() { + indexIfNeeded(); + return Arrays.asList(files); + } + + public boolean isEmpty() { + indexIfNeeded(); + return files.length == 0; + } + + private void indexIfNeeded() { + if (buffer != null) { + synchronized (this) { + if (buffer != null) { + this.files = indexFiles(buffer); + this.seqs = indexSeqs(files); + this.buffer = null; + } + } + } + } + + private static DeleteFile[] indexFiles(List list) { + DeleteFile[] array = list.toArray(EMPTY_DELETES); + Arrays.sort(array, SEQ_COMPARATOR); + return array; + } + + private static long[] indexSeqs(DeleteFile[] files) { + long[] seqs = new long[files.length]; + + for (int index = 0; index < files.length; index++) { + seqs[index] = files[index].dataSequenceNumber(); + } + + return seqs; + } + } + + // a group of equality delete files sorted by the sequence number they apply to + static class EqualityDeletes { + private static final Comparator SEQ_COMPARATOR = + Comparator.comparingLong(EqualityDeleteFile::applySequenceNumber); + private static final EqualityDeleteFile[] EMPTY_EQUALITY_DELETES = new EqualityDeleteFile[0]; + + // indexed state + private long[] seqs = null; + private EqualityDeleteFile[] files = null; + + // a buffer that is used to hold files before indexing + private volatile List buffer = Lists.newArrayList(); + + public void add(PartitionSpec spec, DeleteFile file) { + Preconditions.checkState(buffer != null, "Can't add files upon indexing"); + buffer.add(new EqualityDeleteFile(spec, file)); + } + + public DeleteFile[] filter(long seq, DataFile dataFile) { + indexIfNeeded(); + + int start = 
findStartIndex(seqs, seq); + + if (start >= files.length) { + return EMPTY_DELETES; + } + + List matchingFiles = Lists.newArrayList(); + + for (int index = start; index < files.length; index++) { + EqualityDeleteFile file = files[index]; + if (canContainEqDeletesForFile(dataFile, file)) { + matchingFiles.add(file.wrapped()); + } + } + + return matchingFiles.toArray(EMPTY_DELETES); + } + + public Iterable referencedDeleteFiles() { + indexIfNeeded(); + return Iterables.transform(Arrays.asList(files), EqualityDeleteFile::wrapped); + } + + public boolean isEmpty() { + indexIfNeeded(); + return files.length == 0; + } + + private void indexIfNeeded() { + if (buffer != null) { + synchronized (this) { + if (buffer != null) { + this.files = indexFiles(buffer); + this.seqs = indexSeqs(files); + this.buffer = null; + } + } + } + } + + private static EqualityDeleteFile[] indexFiles(List list) { + EqualityDeleteFile[] array = list.toArray(EMPTY_EQUALITY_DELETES); + Arrays.sort(array, SEQ_COMPARATOR); + return array; + } + + private static long[] indexSeqs(EqualityDeleteFile[] files) { + long[] seqs = new long[files.length]; + + for (int index = 0; index < files.length; index++) { + seqs[index] = files[index].applySequenceNumber(); + } + + return seqs; + } + } + + // an equality delete file wrapper that caches the converted boundaries for faster boundary checks + // this class is not meant to be exposed beyond the delete file index + private static class EqualityDeleteFile { + private final PartitionSpec spec; + private final DeleteFile wrapped; + private final long applySequenceNumber; + private volatile List equalityFields = null; + private volatile Map convertedLowerBounds = null; + private volatile Map convertedUpperBounds = null; + + EqualityDeleteFile(PartitionSpec spec, DeleteFile file) { + this.spec = spec; + this.wrapped = file; + this.applySequenceNumber = wrapped.dataSequenceNumber() - 1; + } + + public DeleteFile wrapped() { + return wrapped; + } + + public long applySequenceNumber() { + return applySequenceNumber; + } + + public List equalityFields() { + if (equalityFields == null) { + synchronized (this) { + if (equalityFields == null) { + List fields = Lists.newArrayList(); + for (int id : wrapped.equalityFieldIds()) { + Types.NestedField field = spec.schema().findField(id); + fields.add(field); + } + this.equalityFields = fields; + } + } + } + + return equalityFields; + } + + public Map valueCounts() { + return wrapped.valueCounts(); + } + + public Map nullValueCounts() { + return wrapped.nullValueCounts(); + } + + public boolean hasLowerAndUpperBounds() { + return wrapped.lowerBounds() != null && wrapped.upperBounds() != null; + } + + @SuppressWarnings("unchecked") + public T lowerBound(int id) { + return (T) lowerBounds().get(id); + } + + private Map lowerBounds() { + if (convertedLowerBounds == null) { + synchronized (this) { + if (convertedLowerBounds == null) { + this.convertedLowerBounds = convertBounds(wrapped.lowerBounds()); + } + } + } + + return convertedLowerBounds; + } + + @SuppressWarnings("unchecked") + public T upperBound(int id) { + return (T) upperBounds().get(id); + } + + private Map upperBounds() { + if (convertedUpperBounds == null) { + synchronized (this) { + if (convertedUpperBounds == null) { + this.convertedUpperBounds = convertBounds(wrapped.upperBounds()); + } + } + } + + return convertedUpperBounds; + } + + private Map convertBounds(Map bounds) { + Map converted = Maps.newHashMap(); + + if (bounds != null) { + for (Types.NestedField field : equalityFields()) { 
+ int id = field.fieldId(); + Type type = spec.schema().findField(id).type(); + if (type.isPrimitiveType()) { + ByteBuffer bound = bounds.get(id); + if (bound != null) { + converted.put(id, Conversions.fromByteBuffer(type, bound)); + } + } + } + } + + return converted; + } + } +} From bdaf820278473d0e3a58faca1761822bfd376bb3 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Fri, 19 Dec 2025 11:45:28 +0800 Subject: [PATCH 09/12] [opt](multi-catalog) Optimize file split size. (#59175) --- .../datasource/FederationBackendPolicy.java | 12 +- .../doris/datasource/FileQueryScanNode.java | 19 +- .../apache/doris/datasource/FileScanNode.java | 10 +- .../apache/doris/datasource/FileSplitter.java | 230 ++++++++++++++++-- .../doris/datasource/SplitGenerator.java | 2 +- .../datasource/hive/source/HiveScanNode.java | 78 ++++-- .../iceberg/source/IcebergScanNode.java | 80 +++++- .../paimon/source/PaimonScanNode.java | 47 +++- .../datasource/tvf/source/TVFScanNode.java | 31 ++- .../org/apache/doris/qe/SessionVariable.java | 39 +++ .../doris/datasource/FileSplitterTest.java | 216 ++++++++++++++++ .../paimon/source/PaimonScanNodeTest.java | 17 ++ .../planner/FederationBackendPolicyTest.java | 4 +- .../hive/test_hive_compress_type.groovy | 2 +- 14 files changed, 691 insertions(+), 96 deletions(-) create mode 100644 fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java index 813d1892642167..a8927d86a946e6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java @@ -63,12 +63,16 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Random; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; public class FederationBackendPolicy { private static final Logger LOG = LogManager.getLogger(FederationBackendPolicy.class); + + private static final long FIXED_SHUFFLE_SEED = 123456789L; + protected final List backends = Lists.newArrayList(); private final Map> backendMap = Maps.newHashMap(); @@ -220,6 +224,7 @@ public void setEnableSplitsRedistribution(boolean enableSplitsRedistribution) { public Multimap computeScanRangeAssignment(List splits) throws UserException { ListMultimap assignment = ArrayListMultimap.create(); + Collections.shuffle(splits, new Random(FIXED_SHUFFLE_SEED)); List remainingSplits; List backends = new ArrayList<>(); @@ -228,8 +233,6 @@ public Multimap computeScanRangeAssignment(List splits) t } ResettableRandomizedIterator randomCandidates = new ResettableRandomizedIterator<>(backends); - boolean splitsToBeRedistributed = false; - // optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain // locality information if (Config.split_assigner_optimized_local_scheduling) { @@ -246,7 +249,6 @@ public Multimap computeScanRangeAssignment(List splits) t assignment.put(selectedBackend, split); assignedWeightPerBackend.put(selectedBackend, assignedWeightPerBackend.get(selectedBackend) + split.getSplitWeight().getRawValue()); - splitsToBeRedistributed = true; continue; } } @@ -276,7 +278,6 @@ public Multimap computeScanRangeAssignment(List splits) t case CONSISTENT_HASHING: { candidateNodes = consistentHash.getNode(split, 
Config.split_assigner_min_consistent_hash_candidate_num); - splitsToBeRedistributed = true; break; } default: { @@ -302,7 +303,7 @@ public Multimap computeScanRangeAssignment(List splits) t assignedWeightPerBackend.get(selectedBackend) + split.getSplitWeight().getRawValue()); } - if (enableSplitsRedistribution && splitsToBeRedistributed) { + if (enableSplitsRedistribution) { equateDistribution(assignment); } return assignment; @@ -499,3 +500,4 @@ public void funnel(Split split, PrimitiveSink primitiveSink) { } } } + diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 93bec2d1849b06..3ae32170e4bc76 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -94,6 +94,8 @@ public abstract class FileQueryScanNode extends FileScanNode { protected TableScanParams scanParams; + protected FileSplitter fileSplitter; + /** * External file scan node for Query hms table * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv @@ -134,6 +136,8 @@ protected void doInitialize() throws UserException { } initBackendPolicy(); initSchemaParams(); + fileSplitter = new FileSplitter(sessionVariable.maxInitialSplitSize, sessionVariable.maxSplitSize, + sessionVariable.maxInitialSplitNum); } // Init schema (Tuple/Slot) related params. @@ -592,19 +596,4 @@ public TableScanParams getScanParams() { } return this.scanParams; } - - /** - * The real file split size is determined by: - * 1. If user specify the split size in session variable `file_split_size`, use user specified value. - * 2. Otherwise, use the max value of DEFAULT_SPLIT_SIZE and block size. - * @param blockSize, got from file system, eg, hdfs - * @return the real file split size - */ - protected long getRealFileSplitSize(long blockSize) { - long realSplitSize = sessionVariable.getFileSplitSize(); - if (realSplitSize <= 0) { - realSplitSize = Math.max(DEFAULT_SPLIT_SIZE, blockSize); - } - return realSplitSize; - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java index c3e06999bba297..a7aa0f607ac504 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java @@ -62,9 +62,6 @@ * Base class for External File Scan, including external query and load. 
*/ public abstract class FileScanNode extends ExternalScanNode { - - public static final long DEFAULT_SPLIT_SIZE = 64 * 1024 * 1024; // 64MB - // For explain protected long totalFileSize = 0; protected long totalPartitionNum = 0; @@ -115,12 +112,7 @@ public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { } output.append(prefix); - boolean isBatch; - try { - isBatch = isBatchMode(); - } catch (UserException e) { - throw new RuntimeException(e); - } + boolean isBatch = isBatchMode(); if (isBatch) { output.append("(approximate)"); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java index 33b2d70bfb16a9..5fe8444197181c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java @@ -22,13 +22,19 @@ import org.apache.doris.spi.Split; import org.apache.doris.thrift.TFileCompressType; +import com.google.common.base.Preconditions; +import com.google.common.base.Verify; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; +import java.util.Arrays; import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; public class FileSplitter { private static final Logger LOG = LogManager.getLogger(FileSplitter.class); @@ -40,18 +46,66 @@ public static boolean needSplitForCountPushdown(int parallelism, int numBackends return totalFileNum < parallelism * numBackends; } - public static List splitFile( - LocationPath path, - long fileSplitSize, - BlockLocation[] blockLocations, - long length, - long modificationTime, - boolean splittable, - List partitionValues, - SplitCreator splitCreator) - throws IOException { + private long maxInitialSplitSize; + + private long maxSplitSize; + + private int maxInitialSplitNum; + private final AtomicInteger remainingInitialSplitNum; + + private long currentMaxSplitSize; + + public long getMaxInitialSplitSize() { + return maxInitialSplitSize; + } + + public void setMaxInitialSplitSize(long maxInitialSplitSize) { + this.maxInitialSplitSize = maxInitialSplitSize; + } + + public long getMaxSplitSize() { + return maxSplitSize; + } + + public void setMaxSplitSize(long maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + public int maxInitialSplitNum() { + return maxInitialSplitNum; + } + + public void setMaxInitialSplits(int maxInitialSplitNum) { + this.maxInitialSplitNum = maxInitialSplitNum; + } + + public long getRemainingInitialSplitNum() { + return remainingInitialSplitNum.get(); + } + + public FileSplitter(long maxInitialSplitSize, long maxSplitSize, int maxInitialSplitNum) { + this.maxInitialSplitSize = maxInitialSplitSize; + this.maxSplitSize = maxSplitSize; + this.maxInitialSplitNum = maxInitialSplitNum; + currentMaxSplitSize = maxInitialSplitSize; + remainingInitialSplitNum = new AtomicInteger(maxInitialSplitNum); + } + + public List splitFile( + LocationPath path, + long specifiedFileSplitSize, + BlockLocation[] blockLocations, + long length, + long modificationTime, + boolean splittable, + List partitionValues, + SplitCreator splitCreator) + throws IOException { + // Pass splitCreator.create() to set target file split size to calculate split weight. + long targetFileSplitSize = specifiedFileSplitSize > 0 ? 
specifiedFileSplitSize : maxSplitSize; if (blockLocations == null) { - blockLocations = new BlockLocation[0]; + blockLocations = new BlockLocation[1]; + blockLocations[0] = new BlockLocation(null, null, 0L, length); } List result = Lists.newArrayList(); TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.getNormalizedLocation()); @@ -60,23 +114,83 @@ public static List splitFile( LOG.debug("Path {} is not splittable.", path); } String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts(); - result.add(splitCreator.create(path, 0, length, length, fileSplitSize, + result.add(splitCreator.create(path, 0, length, length, + targetFileSplitSize, modificationTime, hosts, partitionValues)); + updateCurrentMaxSplitSize(); + return result; + } + + // if specified split size is not zero, split file by specified size + if (specifiedFileSplitSize > 0) { + long bytesRemaining; + for (bytesRemaining = length; (double) bytesRemaining / (double) specifiedFileSplitSize > 1.1D; + bytesRemaining -= specifiedFileSplitSize) { + int location = getBlockIndex(blockLocations, length - bytesRemaining); + String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); + result.add(splitCreator.create(path, length - bytesRemaining, specifiedFileSplitSize, + length, specifiedFileSplitSize, modificationTime, hosts, partitionValues)); + } + if (bytesRemaining != 0L) { + int location = getBlockIndex(blockLocations, length - bytesRemaining); + String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); + result.add(splitCreator.create(path, length - bytesRemaining, bytesRemaining, + length, specifiedFileSplitSize, modificationTime, hosts, partitionValues)); + } return result; } - long bytesRemaining; - for (bytesRemaining = length; (double) bytesRemaining / (double) fileSplitSize > 1.1D; - bytesRemaining -= fileSplitSize) { - int location = getBlockIndex(blockLocations, length - bytesRemaining); - String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); - result.add(splitCreator.create(path, length - bytesRemaining, fileSplitSize, - length, fileSplitSize, modificationTime, hosts, partitionValues)); + + // split file by block + long start = 0; + ImmutableList.Builder blockBuilder = ImmutableList.builder(); + for (BlockLocation blockLocation : blockLocations) { + // clamp the block range + long blockStart = Math.max(start, blockLocation.getOffset()); + long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength()); + if (blockStart > blockEnd) { + // block is outside split range + continue; + } + if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) { + // skip zero-width block, except in the special circumstance: + // slice is empty, and the block covers the empty slice interval. + continue; + } + blockBuilder.add(new InternalBlock(blockStart, blockEnd, blockLocation.getHosts())); + } + List blocks = blockBuilder.build(); + if (blocks.isEmpty()) { + result.add(splitCreator.create(path, 0, length, length, + targetFileSplitSize, modificationTime, null, + partitionValues)); + updateCurrentMaxSplitSize(); + return result; } - if (bytesRemaining != 0L) { - int location = getBlockIndex(blockLocations, length - bytesRemaining); - String[] hosts = location == -1 ? 
null : blockLocations[location].getHosts(); - result.add(splitCreator.create(path, length - bytesRemaining, bytesRemaining, - length, fileSplitSize, modificationTime, hosts, partitionValues)); + + long splitStart = start; + int currentBlockIdx = 0; + while (splitStart < start + length) { + updateCurrentMaxSplitSize(); + long splitBytes; + long remainingBlockBytes = blocks.get(currentBlockIdx).getEnd() - splitStart; + if (remainingBlockBytes <= currentMaxSplitSize) { + splitBytes = remainingBlockBytes; + } else if (currentMaxSplitSize * 2 >= remainingBlockBytes) { + // Second to last split in this block, generate two evenly sized splits + splitBytes = remainingBlockBytes / 2; + } else { + splitBytes = currentMaxSplitSize; + } + result.add(splitCreator.create(path, splitStart, splitBytes, + length, targetFileSplitSize, modificationTime, blocks.get(currentBlockIdx).getHosts(), + partitionValues)); + splitStart += splitBytes; + if (splitStart == blocks.get(currentBlockIdx).getEnd()) { + currentBlockIdx++; + if (currentBlockIdx != blocks.size()) { + Verify.verify(splitStart == blocks.get(currentBlockIdx).getStart()); + } + } } if (LOG.isDebugEnabled()) { @@ -85,7 +199,19 @@ public static List splitFile( return result; } - private static int getBlockIndex(BlockLocation[] blkLocations, long offset) { + private void updateCurrentMaxSplitSize() { + currentMaxSplitSize = maxSplitSize; + int cur = remainingInitialSplitNum.get(); + while (cur > 0) { + if (remainingInitialSplitNum.compareAndSet(cur, cur - 1)) { + currentMaxSplitSize = maxInitialSplitSize; + break; + } + cur = remainingInitialSplitNum.get(); + } + } + + private int getBlockIndex(BlockLocation[] blkLocations, long offset) { if (blkLocations == null || blkLocations.length == 0) { return -1; } @@ -100,5 +226,59 @@ private static int getBlockIndex(BlockLocation[] blkLocations, long offset) { throw new IllegalArgumentException(String.format("Offset %d is outside of file (0..%d)", offset, fileLength)); } + private static class InternalBlock { + private final long start; + private final long end; + private final String[] hosts; + + public InternalBlock(long start, long end, String[] hosts) { + Preconditions.checkArgument(start <= end, "block end cannot be before block start"); + this.start = start; + this.end = end; + this.hosts = hosts; + } + + public long getStart() { + return start; + } + + public long getEnd() { + return end; + } + + public String[] getHosts() { + return hosts; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + InternalBlock that = (InternalBlock) o; + return start == that.start && end == that.end && Arrays.equals(hosts, that.hosts); + } + + @Override + public int hashCode() { + int result = Objects.hash(start, end); + result = 31 * result + Arrays.hashCode(hosts); + return result; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("InternalBlock{"); + sb.append("start=").append(start); + sb.append(", end=").append(end); + sb.append(", hosts=").append(Arrays.toString(hosts)); + sb.append('}'); + return sb.toString(); + } + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java index 0b8a1022d5a50d..391552a5106a83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java @@ -40,7 +40,7 @@ default List getSplits(int numBackends) throws UserException { /** * Whether the producer(e.g. ScanNode) support batch mode. */ - default boolean isBatchMode() throws UserException { + default boolean isBatchMode() { return false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 744423f622cce8..5bcf2f5546a51f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -186,7 +186,7 @@ public List getSplits(int numBackends) throws UserException { .getMetaStoreCache((HMSExternalCatalog) hmsTable.getCatalog()); String bindBrokerName = hmsTable.getCatalog().bindBrokerName(); List allFiles = Lists.newArrayList(); - getFileSplitByPartitions(cache, prunedPartitions, allFiles, bindBrokerName, numBackends); + getFileSplitByPartitions(cache, prunedPartitions, allFiles, bindBrokerName, numBackends, false); if (ConnectContext.get().getExecutor() != null) { ConnectContext.get().getExecutor().getSummaryProfile().setGetPartitionFilesFinishTime(); } @@ -226,7 +226,8 @@ public void startSplit(int numBackends) { try { List allFiles = Lists.newArrayList(); getFileSplitByPartitions( - cache, Collections.singletonList(partition), allFiles, bindBrokerName, numBackends); + cache, Collections.singletonList(partition), allFiles, bindBrokerName, + numBackends, true); if (allFiles.size() > numSplitsPerPartition.get()) { numSplitsPerPartition.set(allFiles.size()); } @@ -277,7 +278,8 @@ public int numApproximateSplits() { } private void getFileSplitByPartitions(HiveMetaStoreCache cache, List partitions, - List allFiles, String bindBrokerName, int numBackends) throws IOException, UserException { + List allFiles, String bindBrokerName, int numBackends, + boolean isBatchMode) throws IOException, UserException { List fileCaches; if (hiveTransaction != null) { try { @@ -293,9 +295,11 @@ private void getFileSplitByPartitions(HiveMetaStoreCache cache, List 1, directoryLister, hmsTable); } + + long targetFileSplitSize = determineTargetFileSplitSize(fileCaches, isBatchMode); if (tableSample != null) { List hiveFileStatuses = selectFiles(fileCaches); - splitAllFiles(allFiles, hiveFileStatuses); + splitAllFiles(allFiles, hiveFileStatuses, targetFileSplitSize); return; } @@ -319,27 +323,67 @@ private void getFileSplitByPartitions(HiveMetaStoreCache cache, List fileCaches, + boolean isBatchMode) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + /** Hive batch split mode will return 0. and FileSplitter + * will determine file split size. 
+ */ + if (isBatchMode) { + return 0; + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (HiveMetaStoreCache.FileCacheValue fileCacheValue : fileCaches) { + if (fileCacheValue.getFiles() == null) { + continue; + } + for (HiveMetaStoreCache.HiveFileStatus status : fileCacheValue.getFiles()) { + totalFileSize += status.getLength(); + if (totalFileSize >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; } } } + return result; } private void splitAllFiles(List allFiles, - List hiveFileStatuses) throws IOException { + List hiveFileStatuses, + long realFileSplitSize) throws IOException { for (HiveMetaStoreCache.HiveFileStatus status : hiveFileStatuses) { - allFiles.addAll(FileSplitter.splitFile(status.getPath(), getRealFileSplitSize(status.getBlockSize()), - status.getBlockLocations(), status.getLength(), status.getModificationTime(), - status.isSplittable(), status.getPartitionValues(), + allFiles.addAll(fileSplitter.splitFile( + status.getPath(), + realFileSplitSize, + status.getBlockLocations(), + status.getLength(), + status.getModificationTime(), + status.isSplittable(), + status.getPartitionValues(), new HiveSplitCreator(status.getAcidInfo()))); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 0ffe86edb315d6..133ac0676448c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -85,6 +85,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.util.ScanTaskUtil; import org.apache.iceberg.util.TableScanUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -119,7 +120,7 @@ public class IcebergScanNode extends FileQueryScanNode { private boolean tableLevelPushDownCount = false; private long countFromSnapshot; private static final long COUNT_WITH_PARALLEL_SPLITS = 10000; - private long targetSplitSize; + private long targetSplitSize = 0; // This is used to avoid repeatedly calculating partition info map for the same partition data. 
private Map> partitionMapInfos; private boolean isPartitionedTable; @@ -131,6 +132,8 @@ public class IcebergScanNode extends FileQueryScanNode { private Map storagePropertiesMap; private Map backendStorageProperties; + private Boolean isBatchMode = null; + // for test @VisibleForTesting public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, SessionVariable sv) { @@ -171,7 +174,6 @@ public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckCol @Override protected void doInitialize() throws UserException { icebergTable = source.getIcebergTable(); - targetSplitSize = getRealFileSplitSize(0); partitionMapInfos = new HashMap<>(); isPartitionedTable = icebergTable.spec().isPartitioned(); formatVersion = ((BaseTable) icebergTable).operations().current().formatVersion(); @@ -375,18 +377,57 @@ public TableScan createTableScan() throws UserException { private CloseableIterable planFileScanTask(TableScan scan) { if (!Config.iceberg_manifest_cache_enable) { - long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return splitFiles(scan); } try { return planFileScanTaskWithManifestCache(scan); } catch (Exception e) { LOG.warn("Plan with manifest cache failed, fallback to original scan: {}", e.getMessage()); - long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return splitFiles(scan); } } + private CloseableIterable splitFiles(TableScan scan) { + if (sessionVariable.getFileSplitSize() > 0) { + return TableScanUtil.splitFiles(scan.planFiles(), + sessionVariable.getFileSplitSize()); + } + if (isBatchMode()) { + // Currently iceberg batch split mode will use max split size. + // TODO: dynamic split size in batch split mode need to customize iceberg splitter. + return TableScanUtil.splitFiles(scan.planFiles(), sessionVariable.getMaxSplitSize()); + } + + // Non Batch Mode + // Materialize planFiles() into a list to avoid iterating the CloseableIterable twice. + // RISK: It will cost memory if the table is large. 
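// Decision order for the Iceberg target split size (restating splitFiles() above and
// determineTargetFileSplitSize() below):
//   - file_split_size > 0 -> use it directly;
//   - batch split mode    -> use max_file_split_size;
//   - otherwise           -> start from max_initial_file_split_size and switch to
//     max_file_split_size once the accumulated content size of the planned tasks reaches
//     max_file_split_size * max_initial_file_split_num.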
+ List fileScanTaskList = new ArrayList<>(); + try (CloseableIterable scanTasksIter = scan.planFiles()) { + for (FileScanTask task : scanTasksIter) { + fileScanTaskList.add(task); + } + } catch (Exception e) { + throw new RuntimeException("Failed to materialize file scan tasks", e); + } + + targetSplitSize = determineTargetFileSplitSize(fileScanTaskList); + return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTaskList), targetSplitSize); + } + + private long determineTargetFileSplitSize(Iterable tasks) { + long result = sessionVariable.getMaxInitialSplitSize(); + long accumulatedTotalFileSize = 0; + for (FileScanTask task : tasks) { + accumulatedTotalFileSize += ScanTaskUtil.contentSizeInBytes(task.file()); + if (accumulatedTotalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + return result; + } + private CloseableIterable planFileScanTaskWithManifestCache(TableScan scan) throws IOException { // Get the snapshot from the scan; return empty if no snapshot exists Snapshot snapshot = scan.snapshot(); @@ -502,7 +543,7 @@ private CloseableIterable planFileScanTaskWithManifestCache(TableS } // Split tasks into smaller chunks based on target split size for parallel processing - long targetSplitSize = getRealFileSplitSize(0); + targetSplitSize = determineTargetFileSplitSize(tasks); return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } @@ -592,21 +633,36 @@ private List doGetSplits(int numBackends) throws UserException { } @Override - public boolean isBatchMode() throws UserException { + public boolean isBatchMode() { + Boolean cached = isBatchMode; + if (cached != null) { + return cached; + } TPushAggOp aggOp = getPushDownAggNoGroupingOp(); if (aggOp.equals(TPushAggOp.COUNT)) { - countFromSnapshot = getCountFromSnapshot(); + try { + countFromSnapshot = getCountFromSnapshot(); + } catch (UserException e) { + throw new RuntimeException(e); + } if (countFromSnapshot >= 0) { tableLevelPushDownCount = true; + isBatchMode = false; return false; } } - if (createTableScan().snapshot() == null) { - return false; + try { + if (createTableScan().snapshot() == null) { + isBatchMode = false; + return false; + } + } catch (UserException e) { + throw new RuntimeException(e); } if (!sessionVariable.getEnableExternalTableBatchMode()) { + isBatchMode = false; return false; } @@ -622,10 +678,12 @@ public boolean isBatchMode() throws UserException { ManifestFile next = matchingManifest.next(); cnt += next.addedFilesCount() + next.existingFilesCount(); if (cnt >= sessionVariable.getNumFilesInBatchMode()) { + isBatchMode = true; return true; } } } + isBatchMode = false; return false; }); } catch (Exception e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index 402bf3d0ef6625..1671ce0f17a336 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -28,7 +28,6 @@ import org.apache.doris.common.util.LocationPath; import org.apache.doris.datasource.ExternalUtil; import org.apache.doris.datasource.FileQueryScanNode; -import org.apache.doris.datasource.FileSplitter; import org.apache.doris.datasource.credentials.CredentialUtils; import 
org.apache.doris.datasource.credentials.VendedCredentialsFactory; import org.apache.doris.datasource.paimon.PaimonExternalCatalog; @@ -293,7 +292,8 @@ public List getSplits(int numBackends) throws UserException { // And for counting the number of selected partitions for this paimon table. Map> partitionInfoMaps = new HashMap<>(); // if applyCountPushdown is true, we can't split the DataSplit - long realFileSplitSize = getRealFileSplitSize(applyCountPushdown ? Long.MAX_VALUE : 0); + boolean hasDeterminedTargetFileSplitSize = false; + long targetFileSplitSize = 0; for (DataSplit dataSplit : dataSplits) { SplitStat splitStat = new SplitStat(); splitStat.setRowCount(dataSplit.rowCount()); @@ -325,6 +325,10 @@ public List getSplits(int numBackends) throws UserException { if (ignoreSplitType == SessionVariable.IgnoreSplitType.IGNORE_NATIVE) { continue; } + if (!hasDeterminedTargetFileSplitSize) { + targetFileSplitSize = determineTargetFileSplitSize(dataSplits, isBatchMode()); + hasDeterminedTargetFileSplitSize = true; + } splitStat.setType(SplitReadType.NATIVE); splitStat.setRawFileConvertable(true); List rawFiles = optRawFiles.get(); @@ -332,13 +336,13 @@ public List getSplits(int numBackends) throws UserException { RawFile file = rawFiles.get(i); LocationPath locationPath = LocationPath.of(file.path(), storagePropertiesMap); try { - List dorisSplits = FileSplitter.splitFile( + List dorisSplits = fileSplitter.splitFile( locationPath, - realFileSplitSize, + targetFileSplitSize, null, file.length(), -1, - true, + !applyCountPushdown, null, PaimonSplit.PaimonSplitCreator.DEFAULT); for (Split dorisSplit : dorisSplits) { @@ -383,12 +387,43 @@ public List getSplits(int numBackends) throws UserException { // We need to set the target size for all splits so that we can calculate the // proportion of each split later. - splits.forEach(s -> s.setTargetSplitSize(realFileSplitSize)); + splits.forEach(s -> s.setTargetSplitSize(sessionVariable.getFileSplitSize() > 0 + ? sessionVariable.getFileSplitSize() : sessionVariable.getMaxSplitSize())); this.selectedPartitionNum = partitionInfoMaps.size(); return splits; } + private long determineTargetFileSplitSize(List dataSplits, + boolean isBatchMode) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + /** Paimon batch split mode will return 0. and FileSplitter + * will determine file split size. 
+ */ + if (isBatchMode) { + return 0; + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (DataSplit dataSplit : dataSplits) { + Optional> rawFiles = dataSplit.convertToRawFiles(); + if (!supportNativeReader(rawFiles)) { + continue; + } + for (RawFile rawFile : rawFiles.get()) { + totalFileSize += rawFile.fileSize(); + if (totalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + } + return result; + } + @VisibleForTesting public Map getIncrReadParams() throws UserException { Map paimonScanParams = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java index e75675597622d3..c3b0e3e8b6d04a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java @@ -146,12 +146,18 @@ public List getSplits(int numBackends) throws UserException { needSplit = FileSplitter.needSplitForCountPushdown(parallelNum, numBackends, totalFileNum); } + long targetFileSplitSize = determineTargetFileSplitSize(fileStatuses); + for (TBrokerFileStatus fileStatus : fileStatuses) { try { - splits.addAll(FileSplitter.splitFile(LocationPath.of(fileStatus.getPath()), - getRealFileSplitSize(needSplit ? fileStatus.getBlockSize() : Long.MAX_VALUE), - null, fileStatus.getSize(), - fileStatus.getModificationTime(), fileStatus.isSplitable, null, + splits.addAll(fileSplitter.splitFile( + LocationPath.of(fileStatus.getPath()), + targetFileSplitSize, + null, + fileStatus.getSize(), + fileStatus.getModificationTime(), + fileStatus.isSplitable && needSplit, + null, FileSplitCreator.DEFAULT)); } catch (IOException e) { LOG.warn("get file split failed for TVF: {}", fileStatus.getPath(), e); @@ -161,6 +167,23 @@ public List getSplits(int numBackends) throws UserException { return splits; } + private long determineTargetFileSplitSize(List fileStatuses) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (TBrokerFileStatus fileStatus : fileStatuses) { + totalFileSize += fileStatus.getSize(); + if (totalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + return result; + } + @Override protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof FileSplit) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 4fddfd0332d952..14a25a992b8025 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -513,6 +513,12 @@ public class SessionVariable implements Serializable, Writable { // Split size for ExternalFileScanNode. Default value 0 means use the block size of HDFS/S3. 
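// The three max_* variables below only take effect when file_split_size is 0. With the defaults
// (32MB initial size, 64MB max size, 200 initial splits), a scan switches its target from 32MB to
// 64MB splits once its accumulated input reaches 64MB * 200 = 12800MB; the Hive, Iceberg, Paimon
// and TVF scan nodes all apply this same threshold in their determineTargetFileSplitSize().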
public static final String FILE_SPLIT_SIZE = "file_split_size"; + public static final String MAX_INITIAL_FILE_SPLIT_SIZE = "max_initial_file_split_size"; + + public static final String MAX_FILE_SPLIT_SIZE = "max_file_split_size"; + + public static final String MAX_INITIAL_FILE_SPLIT_NUM = "max_initial_file_split_num"; + // Target file size in bytes for Iceberg write operations public static final String ICEBERG_WRITE_TARGET_FILE_SIZE_BYTES = "iceberg_write_target_file_size_bytes"; @@ -2161,6 +2167,15 @@ public boolean isEnableHboNonStrictMatchingMode() { @VariableMgr.VarAttr(name = FILE_SPLIT_SIZE, needForward = true) public long fileSplitSize = 0; + @VariableMgr.VarAttr(name = MAX_INITIAL_FILE_SPLIT_SIZE, needForward = true) + public long maxInitialSplitSize = 32L * 1024L * 1024L; + + @VariableMgr.VarAttr(name = MAX_FILE_SPLIT_SIZE, needForward = true) + public long maxSplitSize = 64L * 1024L * 1024L; + + @VariableMgr.VarAttr(name = MAX_INITIAL_FILE_SPLIT_NUM, needForward = true) + public int maxInitialSplitNum = 200; + // Target file size for Iceberg write operations // Default 0 means use config::iceberg_sink_max_file_size @VariableMgr.VarAttr(name = ICEBERG_WRITE_TARGET_FILE_SIZE_BYTES, needForward = true) @@ -4181,6 +4196,30 @@ public void setFileSplitSize(long fileSplitSize) { this.fileSplitSize = fileSplitSize; } + public long getMaxInitialSplitSize() { + return maxInitialSplitSize; + } + + public void setMaxInitialSplitSize(long maxInitialSplitSize) { + this.maxInitialSplitSize = maxInitialSplitSize; + } + + public long getMaxSplitSize() { + return maxSplitSize; + } + + public void setMaxSplitSize(long maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + public int getMaxInitialSplitNum() { + return maxInitialSplitNum; + } + + public void setMaxInitialSplitNum(int maxInitialSplitNum) { + this.maxInitialSplitNum = maxInitialSplitNum; + } + public long getIcebergWriteTargetFileSizeBytes() { return icebergWriteTargetFileSizeBytes; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java new file mode 100644 index 00000000000000..a455923da4d91e --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.datasource; + +import org.apache.doris.common.util.LocationPath; +import org.apache.doris.spi.Split; + +import org.apache.hadoop.fs.BlockLocation; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collections; +import java.util.List; + +public class FileSplitterTest { + + private static final long MB = 1024L * 1024L; + + private static final int DEFAULT_INITIAL_SPLITS = 200; + + @Test + public void testNonSplittableCompressedFileProducesSingleSplit() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file.gz"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + Split s = splits.get(0); + Assert.assertEquals(10 * MB, ((org.apache.doris.datasource.FileSplit) s).getLength()); + // host should be preserved + Assert.assertArrayEquals(new String[]{"h1"}, ((org.apache.doris.datasource.FileSplit) s).getHosts()); + Assert.assertEquals(DEFAULT_INITIAL_SPLITS - 1, fileSplitter.getRemainingInitialSplitNum()); + } + + @Test + public void testEmptyBlockLocationsProducesSingleSplitAndNullHosts() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file"); + BlockLocation[] locations = new BlockLocation[0]; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 5 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + org.apache.doris.datasource.FileSplit s = (org.apache.doris.datasource.FileSplit) splits.get(0); + Assert.assertEquals(5 * MB, s.getLength()); + // hosts should be empty array when passing null + Assert.assertNotNull(s.getHosts()); + Assert.assertEquals(0, s.getHosts().length); + } + + @Test + public void testSplittableSingleBigBlockProducesExpectedSplitsWithInitialSmallChunks() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/bigfile"); + long length = 200 * MB; + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, length)}; + // set maxInitialSplits to 2 to force the first two splits to be small. 
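// Expected sizes for the 200MB single-block file: two 32MB initial splits, then one 64MB split,
// and the remaining 72MB is at most 2 * 64MB, so splitFile() halves it into two 36MB splits
// (matching the `expected` array below).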
+ FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 2); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + length, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + + // expect splits sizes: 32MB, 32MB, 64MB, 36MB, 36MB -> sum is 200MB + long[] expected = new long[]{32 * MB, 32 * MB, 64 * MB, 36 * MB, 36 * MB}; + Assert.assertEquals(expected.length, splits.size()); + long sum = 0L; + for (int i = 0; i < expected.length; i++) { + org.apache.doris.datasource.FileSplit s = (org.apache.doris.datasource.FileSplit) splits.get(i); + Assert.assertEquals(expected[i], s.getLength()); + sum += s.getLength(); + // ensure host preserved + Assert.assertArrayEquals(new String[]{"h1"}, s.getHosts()); + } + Assert.assertEquals(length, sum); + // ensure the initial small-split counter is consumed for the two initial small splits + Assert.assertEquals(0, fileSplitter.getRemainingInitialSplitNum()); + } + + @Test + public void testMultiBlockSplitsAndHostPreservation() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/twoblocks"); + long len = 96 * MB; + BlockLocation[] locations = new BlockLocation[]{ + new BlockLocation(null, new String[]{"h1"}, 0L, 48 * MB), + new BlockLocation(null, new String[]{"h2"}, 48 * MB, 48 * MB) + }; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 0); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + len, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(2, splits.size()); + FileSplit s0 = (FileSplit) splits.get(0); + FileSplit s1 = (FileSplit) splits.get(1); + Assert.assertEquals(48 * MB, s0.getLength()); + Assert.assertEquals(48 * MB, s1.getLength()); + Assert.assertArrayEquals(new String[]{"h1"}, s0.getHosts()); + Assert.assertArrayEquals(new String[]{"h2"}, s1.getHosts()); + } + + @Test + public void testZeroLengthBlockIsSkipped() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/zeroblock"); + long length = 10 * MB; + BlockLocation[] locations = new BlockLocation[]{ + new BlockLocation(null, new String[]{"h1"}, 0L, 0L), + new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB) + }; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + length, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + FileSplit s = (FileSplit) splits.get(0); + Assert.assertEquals(10 * MB, s.getLength()); + Assert.assertArrayEquals(new String[]{"h1"}, s.getHosts()); + } + + @Test + public void testNonSplittableFlagDecrementsCounter() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file.gz"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 2); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + false, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + } + + @Test + public void testNullRemainingInitialSplitIsAllowed() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/somefile"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 
DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + } + + @Test + public void testSmallFileNoSplit() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/small"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 2 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 2 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + FileSplit s = (FileSplit) splits.get(0); + Assert.assertEquals(2 * MB, s.getLength()); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java index 93afa390530e6e..692a0db12caa63 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java @@ -21,6 +21,8 @@ import org.apache.doris.analysis.TupleId; import org.apache.doris.common.ExceptionChecker; import org.apache.doris.common.UserException; +import org.apache.doris.datasource.FileQueryScanNode; +import org.apache.doris.datasource.FileSplitter; import org.apache.doris.datasource.paimon.PaimonFileExternalCatalog; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.qe.SessionVariable; @@ -92,11 +94,26 @@ public void testSplitWeight() throws UserException { } }).when(spyPaimonScanNode).getPaimonSplitFromAPI(); + long maxInitialSplitSize = 32L * 1024L * 1024L; + long maxSplitSize = 64L * 1024L * 1024L; + // Ensure fileSplitter is initialized on the spy as doInitialize() is not called in this unit test + FileSplitter fileSplitter = new FileSplitter(maxInitialSplitSize, maxSplitSize, + 0); + try { + java.lang.reflect.Field field = FileQueryScanNode.class.getDeclaredField("fileSplitter"); + field.setAccessible(true); + field.set(spyPaimonScanNode, fileSplitter); + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException("Failed to inject FileSplitter into PaimonScanNode test", e); + } + // Note: The original PaimonSource is sufficient for this test // No need to mock catalog properties since doInitialize() is not called in this test // Mock SessionVariable behavior Mockito.when(sv.isForceJniScanner()).thenReturn(false); Mockito.when(sv.getIgnoreSplitType()).thenReturn("NONE"); + Mockito.when(sv.getMaxInitialSplitSize()).thenReturn(maxInitialSplitSize); + Mockito.when(sv.getMaxSplitSize()).thenReturn(maxSplitSize); // native mockNativeReader(spyPaimonScanNode); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java index 30582224f7603f..f6e8efd5294583 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java @@ -769,9 +769,9 @@ public ComputeGroupMgr getComputeGroupMgr() { Map> backendListMap2 = mergeAssignment(assignment2); backendListMap2.forEach((k, v) -> { if (k.getId() == 1) { - Assert.assertEquals(900000L, 
v.stream().mapToLong(Split::getLength).sum()); + Assert.assertEquals(1000000L, v.stream().mapToLong(Split::getLength).sum()); } else if (k.getId() == 2) { - Assert.assertEquals(500000L, v.stream().mapToLong(Split::getLength).sum()); + Assert.assertEquals(400000L, v.stream().mapToLong(Split::getLength).sum()); } else if (k.getId() == 3) { Assert.assertEquals(1000000L, v.stream().mapToLong(Split::getLength).sum()); } diff --git a/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy b/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy index 4117501eff2c3c..44dd9104411196 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy @@ -48,7 +48,7 @@ suite("test_hive_compress_type", "p0,external,hive,external_docker,external_dock sql """set file_split_size=8388608""" explain { sql("select count(*) from test_compress_partitioned") - contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82" + contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16" contains "partition=8/8" } From 8147ffe47402258e912ba54f4bb11605892be54f Mon Sep 17 00:00:00 2001 From: daidai Date: Fri, 19 Dec 2025 12:01:42 +0800 Subject: [PATCH 10/12] [Enhancement](parquet)update runtime filter when read next parquet row group.(#59053) (#59181) bp #59053 --- .../runtime_filter_consumer_helper.h | 2 + .../format/parquet/vparquet_group_reader.h | 7 + .../exec/format/parquet/vparquet_reader.cpp | 38 +++- .../vec/exec/format/parquet/vparquet_reader.h | 16 ++ be/src/vec/exec/scan/file_scanner.cpp | 24 ++- be/src/vec/exec/scan/file_scanner.h | 3 +- be/src/vec/exec/scan/scanner.cpp | 1 + .../create_preinstalled_scripts/run84.hql | 20 ++ .../dim_small.parquet | Bin 0 -> 4230 bytes .../runtime_filter_fact_big/fact_big.parquet | Bin 0 -> 129338 bytes .../test_parquet_join_runtime_filter.groovy | 174 ++++++++++++++++++ 11 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet create mode 100644 regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy diff --git a/be/src/runtime_filter/runtime_filter_consumer_helper.h b/be/src/runtime_filter/runtime_filter_consumer_helper.h index 212df4338cbdd8..36da3cd10c0167 100644 --- a/be/src/runtime_filter/runtime_filter_consumer_helper.h +++ b/be/src/runtime_filter/runtime_filter_consumer_helper.h @@ -52,6 +52,8 @@ class RuntimeFilterConsumerHelper { // parent_operator_profile is owned by LocalState so update it is safe at here. void collect_realtime_profile(RuntimeProfile* parent_operator_profile); + size_t runtime_filter_nums() const { return _runtime_filter_descs.size(); } + private: // Append late-arrival runtime filters to the vconjunct_ctx. 
Status _append_rf_into_conjuncts(RuntimeState* state, diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index 265a95f4470537..f81d660734931c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -79,7 +79,14 @@ class RowGroupReader : public ProfileCollector { // table name struct LazyReadContext { + // all conjuncts: in sql, join runtime filter, topn runtime filter. VExprContextSPtrs conjuncts; + + // ParquetReader::set_fill_columns(xxx, xxx) will set these two members + std::unordered_map> + fill_partition_columns; + std::unordered_map fill_missing_columns; + bool can_lazy_read = false; // block->rows() returns the number of rows of the first column, // so we should check and resize the first column diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index fb30e5d4a613bf..45cf3e2c5edde1 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -383,11 +383,17 @@ bool ParquetReader::_type_matches(const VSlotRef* slot_ref) const { !is_complex_type(table_col_type->get_primitive_type()); } -Status ParquetReader::set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) { - SCOPED_RAW_TIMER(&_statistics.parse_meta_time); +Status ParquetReader::_update_lazy_read_ctx(const VExprContextSPtrs& new_conjuncts) { + RowGroupReader::LazyReadContext new_lazy_read_ctx; + new_lazy_read_ctx.conjuncts = new_conjuncts; + new_lazy_read_ctx.fill_partition_columns = std::move(_lazy_read_ctx.fill_partition_columns); + new_lazy_read_ctx.fill_missing_columns = std::move(_lazy_read_ctx.fill_missing_columns); + _lazy_read_ctx = std::move(new_lazy_read_ctx); + + _top_runtime_vexprs.clear(); + _push_down_predicates.clear(); + _useless_predicates.clear(); + // std::unordered_map> std::unordered_map> predicate_columns; // visit_slot for lazy mat. 
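The rebuild in _update_lazy_read_ctx above swaps in the new conjuncts while moving the cached fill_partition_columns and fill_missing_columns into the fresh context, then clears _top_runtime_vexprs and the push-down/useless predicate lists so they can be re-derived, so a late-arriving runtime filter can refresh the conjuncts without the caller passing the partition and missing column maps again. A minimal standalone sketch of that move-and-rebuild pattern, with simplified stand-in types rather than the real Doris classes:

// Sketch of the rebuild pattern used by _update_lazy_read_ctx: keep the
// expensive-to-recompute members, swap in the new conjuncts. Simplified types.
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct LazyReadContextSketch {
    std::vector<std::string> conjuncts; // stands in for VExprContextSPtrs
    std::unordered_map<std::string, std::string> fill_partition_columns;
    std::unordered_map<std::string, bool> fill_missing_columns;
};

void update_lazy_read_ctx(LazyReadContextSketch& ctx, std::vector<std::string> new_conjuncts) {
    LazyReadContextSketch fresh;
    fresh.conjuncts = std::move(new_conjuncts);
    // Carry over what set_fill_columns() already stored instead of redoing it.
    fresh.fill_partition_columns = std::move(ctx.fill_partition_columns);
    fresh.fill_missing_columns = std::move(ctx.fill_missing_columns);
    ctx = std::move(fresh);
    // The real reader then re-collects predicate columns and push-down
    // predicates from ctx.conjuncts before opening the next row group.
}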
@@ -494,7 +500,7 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.all_predicate_col_ids.emplace_back(_row_id_column_iterator_pair.second); } - for (auto& kv : partition_columns) { + for (auto& kv : _lazy_read_ctx.fill_partition_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second); @@ -504,7 +510,7 @@ Status ParquetReader::set_fill_columns( } } - for (auto& kv : missing_columns) { + for (auto& kv : _lazy_read_ctx.fill_missing_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second); @@ -536,6 +542,17 @@ Status ParquetReader::set_fill_columns( } } + return Status::OK(); +} + +Status ParquetReader::set_fill_columns( + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns) { + _lazy_read_ctx.fill_partition_columns = partition_columns; + _lazy_read_ctx.fill_missing_columns = missing_columns; + RETURN_IF_ERROR(_update_lazy_read_ctx(_lazy_read_ctx.conjuncts)); + if (_filter_groups && (_total_groups == 0 || _t_metadata->num_rows == 0 || _range_size < 0)) { return Status::EndOfFile("No row group to read"); } @@ -673,6 +690,13 @@ Status ParquetReader::_next_row_group_reader() { continue; } + bool has_late_rf_cond = false; + VExprContextSPtrs new_push_down_conjuncts; + RETURN_IF_ERROR(_call_late_rf_func(&has_late_rf_cond, new_push_down_conjuncts)); + if (has_late_rf_cond) { + RETURN_IF_ERROR(_update_lazy_read_ctx(new_push_down_conjuncts)); + } + size_t before_predicate_size = _push_down_predicates.size(); _push_down_predicates.reserve(before_predicate_size + _top_runtime_vexprs.size()); for (const auto& vexpr : _top_runtime_vexprs) { diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index c3c73d98bea398..e2ba5d82a706b2 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -160,6 +160,10 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { bool count_read_rows() override { return true; } + void set_update_late_rf_func(std::function&& func) { + _call_late_rf_func = std::move(func); + } + protected: void _collect_profile_before_close() override; @@ -252,6 +256,9 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { bool _exists_in_file(const VSlotRef* slot) const override; bool _type_matches(const VSlotRef*) const override; + // update lazy read context when runtime filter changed + Status _update_lazy_read_ctx(const VExprContextSPtrs& new_conjuncts); + RuntimeProfile* _profile = nullptr; const TFileScanRangeParams& _scan_params; const TFileRangeDesc& _scan_range; @@ -337,6 +344,15 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { std::vector> _push_down_predicates; std::vector> _useless_predicates; Arena _arena; + + // when creating a new row group reader, call this function to get the latest runtime filter conjuncts. + // The default implementation does nothing, sets 'changed' to false, and returns OK. + // This is used when iceberg read position delete file ... 
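// (Illustrative aside; simplified stand-in types, not the real Doris classes.)
// The callback declared just below is how FileScanner feeds late-arriving
// runtime filters to the reader: the scanner registers a std::function via
// set_update_late_rf_func(), and _next_row_group_reader() invokes it before
// opening each row group so push-down conjuncts are refreshed only when a
// filter actually arrived.
#include <functional>
#include <string>
#include <utility>
#include <vector>

using ConjunctsSketch = std::vector<std::string>; // stands in for VExprContextSPtrs

struct RowGroupPlannerSketch {
    // Default mirrors default_late_rf_func: report "no change" and succeed.
    std::function<bool(bool*, ConjunctsSketch&)> call_late_rf =
            [](bool* changed, ConjunctsSketch&) { *changed = false; return true; };

    void set_update_late_rf_func(std::function<bool(bool*, ConjunctsSketch&)> func) {
        call_late_rf = std::move(func);
    }

    void next_row_group() {
        bool changed = false;
        ConjunctsSketch latest;
        if (call_late_rf(&changed, latest) && changed) {
            // rebuild the lazy-read context / push-down predicates from `latest`
        }
    }
};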
+ static Status default_late_rf_func(bool* changed, VExprContextSPtrs&) { + *changed = false; + return Status::OK(); + } + std::function _call_late_rf_func = default_late_rf_func; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index c7d10c89dc0144..8629737a320214 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -357,8 +357,11 @@ Status FileScanner::_process_conjuncts() { return Status::OK(); } -Status FileScanner::_process_late_arrival_conjuncts() { +Status FileScanner::_process_late_arrival_conjuncts(bool* changed, + VExprContextSPtrs& new_push_down_conjuncts) { + *changed = false; if (_push_down_conjuncts.size() < _conjuncts.size()) { + *changed = true; _push_down_conjuncts.clear(); _push_down_conjuncts.resize(_conjuncts.size()); for (size_t i = 0; i != _conjuncts.size(); ++i) { @@ -366,6 +369,7 @@ Status FileScanner::_process_late_arrival_conjuncts() { } RETURN_IF_ERROR(_process_conjuncts()); _discard_conjuncts(); + new_push_down_conjuncts = _push_down_conjuncts; } if (_applied_rf_num == _total_rf_num) { _local_state->scanner_profile()->add_info_string("ApplyAllRuntimeFilters", "True"); @@ -1045,9 +1049,17 @@ Status FileScanner::_get_next_reader() { // ATTN: the push down agg type may be set back to NONE, // see IcebergTableReader::init_row_filters for example. parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); - if (push_down_predicates) { - RETURN_IF_ERROR(_process_late_arrival_conjuncts()); - } + + std::function update_late_rf = + [&](bool* changed, VExprContextSPtrs& new_push_down_conjuncts) -> Status { + if (!_is_load) { + RETURN_IF_ERROR(try_append_late_arrival_runtime_filter()); + RETURN_IF_ERROR( + _process_late_arrival_conjuncts(changed, new_push_down_conjuncts)); + } + return Status::OK(); + }; + parquet_reader->set_update_late_rf_func(std::move(update_late_rf)); RETURN_IF_ERROR(_init_parquet_reader(std::move(parquet_reader), file_meta_cache_ptr)); need_to_get_parsed_schema = true; @@ -1068,7 +1080,9 @@ Status FileScanner::_get_next_reader() { orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { - RETURN_IF_ERROR(_process_late_arrival_conjuncts()); + bool changed = false; + VExprContextSPtrs new_push_down_conjuncts; + RETURN_IF_ERROR(_process_late_arrival_conjuncts(&changed, new_push_down_conjuncts)); } RETURN_IF_ERROR(_init_orc_reader(std::move(orc_reader), file_meta_cache_ptr)); diff --git a/be/src/vec/exec/scan/file_scanner.h b/be/src/vec/exec/scan/file_scanner.h index d26186eeef621b..1cbe9c1bbcf12a 100644 --- a/be/src/vec/exec/scan/file_scanner.h +++ b/be/src/vec/exec/scan/file_scanner.h @@ -251,7 +251,8 @@ class FileScanner : public Scanner { void _init_runtime_filter_partition_prune_block(); Status _process_runtime_filters_partition_prune(bool& is_partition_pruned); Status _process_conjuncts(); - Status _process_late_arrival_conjuncts(); + Status _process_late_arrival_conjuncts(bool* changed, + VExprContextSPtrs& new_push_down_conjuncts); void _get_slot_ids(VExpr* expr, std::vector* slot_ids); Status _generate_truncate_columns(bool need_to_get_parsed_schema); Status _set_fill_or_truncate_columns(bool need_to_get_parsed_schema); diff --git a/be/src/vec/exec/scan/scanner.cpp b/be/src/vec/exec/scan/scanner.cpp index 5dced63feb6507..2857738297fd09 100644 --- a/be/src/vec/exec/scan/scanner.cpp +++ b/be/src/vec/exec/scan/scanner.cpp @@ -41,6 +41,7 @@ Scanner::Scanner(RuntimeState* state, 
pipeline::ScanLocalStateBase* local_state, _output_tuple_desc(_local_state->output_tuple_desc()), _output_row_descriptor(_local_state->_parent->output_row_descriptor()), _has_prepared(false) { + _total_rf_num = cast_set(_local_state->_helper.runtime_filter_nums()); DorisMetrics::instance()->scanner_cnt->increment(1); } diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql new file mode 100644 index 00000000000000..4b4e7b6e549b29 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql @@ -0,0 +1,20 @@ +use `default`; + +create table fact_big ( + k INT, + c1 INT, + c2 BIGINT, + c3 DOUBLE, + c4 STRING +)stored as parquet +LOCATION '/user/doris/preinstalled_data/parquet_table/runtime_filter_fact_big'; + +create table dim_small ( + k INT, + c1 INT, + c2 BIGINT +)stored as parquet +LOCATION '/user/doris/preinstalled_data/parquet_table/runtime_filter_dim_small'; + +msck repair table fact_big; +msck repair table dim_small; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e998f3c817a3748ef8c2feaba3718937f2123a0f GIT binary patch literal 4230 zcmcIoPe>F|7=JU4>tsuA<{RJ2mTTq1gPID`AMEf}jQ*&p6%lk15zQ!Vt&GkcC8U#d z3h5*f5fPn2y5u1uIz)$%P98dhM5l=8`(}Lm?aa=!1hbEL^L^i&?>E2s{nmMpM$}k0 zx46xdgWTk-o-r$FFvd9J_1tNPT8bxlA|ouUdb7jYiZUqg$eJ*lhG8=K$jERr-OCcG zv@ou6(=nYT!&x?Dm$d(dl5PhYjGOr66kLEpoV4Co5iN6~yNK6@DPNlZ%`C+mv|x6o z7TDq_5zW>sq7Z?2g_x?)TrA>hBEzk15zUDnm+dRVSu^CrYWdB|P$NYVz~yyq2xD$x zRk$8RSbp~+C6LWtACBH+g3JYfhq%1(}*-j$nH+-#DF4qDx zY7GE|GV>XZFNLGjaJ?dI9RYYDrYbZS8@{+J%JAPw`7Q~DYdDG`fJ52){=VD7m|IvC z?u>F!7GU}{MY{S7->Q@Abs#Hs25`y~s|qECRbkJ8MOw0 zLYest-vodKNk#H>(aSQQq|2uN9g*v}>+<~RIall+nfX4D!03T5Uq z9Dl_)N)6X5!qyRh7hqo7;_7&!ktkL$^uLu zXGvF|;V+uyVl&7}odKNk#HvGyVO3Z-BOqk~VxRUCF~8xPE%J2>Fr(H0P$)B>;XUv$ z9HoZq6=CZLzzZ=|p}E-b=dDqOuN;&=4uIhrj-m+QF!t%eUM3Rr6VsE|#(4q`POT=K zx4bKe1E(h_z}bO0wdu?0z8KGCGS;({tms+C`CR3g(ss70VE+Ua;ykX%SdSR8qDKHv zhs_!k!bBnB+{R_BTd%C>*60bbUWY)DJRw#aQHTTz5$9)L#`>w2 z75$WYLM%3;5H^EjLsE^6oxOO>`?QKD<{=zMrxc7)oM4<$YC+|u)Cr}gRKL5Vayg}j zaIs&hQ+S13Um?S#To=}dadM&hXAv5boSJrv7jT9?Jpf9nOLqb0^bFj21k;>LV)*=a zKF~ivRX4=cdvtDQ2yVVJ0=F(;ZNOy;H^FnJkpD@14;Z8LP>ts`;G#DWI+w>7NWsfM z|AO~^?)o`*KF&mTZajbE*5u8OiMhFs@qB)EA$xl=KYwNRdbay;XIE!ex-Y|+_m90z KFg61J1MwFiVwOk% literal 0 HcmV?d00001 diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b3ad736022e91b9462bd4c6f32b302873536cca5 GIT binary patch literal 129338 zcmcG%eOzVLnI_6P9NT1`!bv5fKrwM8pyiOGHFOL_{nR5s@GwVu^@|h=_*HBjv9q%1vo~w+ z$M^30D*4Xdo4YsnydVZ|c-)uu743}N`R7OJ7>z`tS)wU&-rnfB2eWd|J~xQ`=e^N= z(S6mCef1G>fr$Tqp`80wqLH;XikZ)*KZwHrpR~rZ6|Lxx_iManqoO;SF|)Jxo+A?$ zKaa>JK2Fl&awJGul)RL!y!7|OqdV)Vxc5^Q@S`vxMSAxA97&QljE%LB~j z^`FbWn4P59V&9T~-rhYa_5~XIy0}=(Q67uFPK>OXB070IYZ}(#`7{}X7t`LWdR?%ctKo_C<`s(dg)?a zIW_0|lvDCKG4WaIJr}|Q%;wc#AGYyhUgbRYP1)*?eG`+&zpuO&`>HScl(X}BG4}bC za?SvlWzQ<-fSvrJq;d{~$~j<>oc^*;Ion?nBVS4>Cv`bX#ZXQ**8(wkgT#@Nr=3}R 
zHh)PqsG*$lUlMbI`qPW^3ntd~D~gz@oZep+HNTuOGo$yuiYF<3?BWG+Iif7|p6jKH zb>)=(nol_!zbb})m3q&G@Bp)UHP{yx;PVut961uirG-hPt-Kcdk_&vwS^t_C{2H-$ zAw0k=dsaCGcJhmo$|(qyQ(%!CJ>XN$+J4c$Kc$@1dzL0npB5os&0mBy_jKIJSQ z7QKh5_gn}MFq>C{efSJ{mQjuzkx!RrwP((=sVC91-?Z3oeBGy<#a|cQzn*&L=n!D< zPM5{9XO&avQ%+&1oI;D_z&Cx$S@?$N{69OH zP(wLuN5$|_>Q8m0h4Tw0R`V@I%v4VEZ;7qn%$S+cd#An1kt0D|T4)hzE3cKt)!*?c zXZE*6`)^b4xey*;Hm?SInwvgzjB@0NT!xrSD981ZM}J~2|7)LeX1*=jzD?|12(Wj( z87SwV-Lpl>XU@S;IR`D0UElF3XZm+V%kQR?le(Nuq@kQ#n!;ug#NZ8&6r*-#@tOR6 z)u4uQ7Jg6k{T}tF7v~pDtn3dIF;h8p-xaI>He+T+?|mQN-st0`EiOlbl!X>?y>zkO zd-H$jQ_jTqMDzEk_gn}MFq>C{efaA15~CbBB44jwN+`$mTI^^4#HXCGKNgLDOzd3< zuy?%~D5uCyeo<06MWJ$vERwDN&ZnHw?~8`-r<9YroK2*moII`tV(J}kNu@jIRk$os{SISoYdtk6+=0AXxc1-7`)+;V${woK7D_!8q`qE z=wFHBe?|T2#rXvjTm4Upn5mqdzY$Y^lQA=+_s++<2YsBh#pOtlvd|)~moCD+3@3{~jU^cG?`|#_Q8m0h4Tw0HviugF;h9~{~$*GA!BAn@0EI$BS(U`w9w$vR$eQO z{r|(Koc8}NcK(ri&xP;+vw1bxKQW5W3Zoo362zs2Nu;g37W>}+Ux%oA)P%mhfO1h!5bbaNbSrb z)KcQxL(qLuL{wcAp&s=@rRPSoG2GxigY6Bor5ZO=KC2f;#L&ec=fz^l87;Wv6_6YW z;?hE=NLzVna{bXWa48Yy_Y!oJMMQBK5oS<4*y#g6>{(fBjE3Zh{Bpmhtre`%(?^ce zpLzg;6%+u!ouKowh$y)X0my5Wg#cVH|E1@gy@A|Nu;)wo7L9!a`?(zjx#38V{L`XZ ze+5ysl~ftoR~r#Ml@U=|8G)8;3^u2Rnm$DHxN3qoh~Y@FCjJc1EK-A4Y8`7li6&^X zLt7OZ)(e%M8y2=)Zt$MLriZPoG;XGvDz1u%`D&2!VzJ~5HGL-`MTTHpjsz(S4dZ&P zWS*)e!n~SBua1agSECtS6c2X#0KylmHyJg_5&81rHC!QNujBuC`(*<0Fb z+*_n3Ic^28b({j=)wFzbMD*T_0OYmGLIAEelV=mR^B`2yfux%3y$9pB5>;MJYb_Dc z*P^LOVX!$h)bwGtW8n>AI6?yI%+B9!TE`k{s<@4A|E*|Pb+HANo*Uk!Ts6Ua2Ad-~ zPiWjsH4WZQ8!>MOIWHDV&QQ}i_=hg~I0WNzBuH6k7}rY|>n}>RcM@ST;SZ%!xFN+U^UQK0pMZ{DG0&r0ffa}dfP26sTP)!9% zHQDM4TSb)TA)joEmESDBH2{1~D8V0d;2Q@1)kT zhMEQ+rrZC+XjpZz1(lu~`qfpFck_d(rsW}xo2jO)Mp@vE&Rjo${(ljs$UO zp<$%0yjC(tA0xuNn)-$#qH-9`=%RSA(+5D)(`S=WlN^yN1)B*qxnA<<57)6#3V>Hr z|KqeM^*91>Q4oOZ%|uPyu7yxd2a{^D_bnWMlBn`(Iyn{*)nl5P6b749LrrH>Aa2LP z8^myg1k@QhjQssv>sUigv*UF8A4kKgi!G@1+;+9GqHI#*W~!;~sfbve0J+RIR~TwK zififPBrh&Uf|P}ZalLf0p3J2wBFw94`ss*heHzW^qIj^`>>2@t>jCdFYLX*z&EVaH znq03H!1A*c0I#N*X@|X|2!I2U2H+6=eDcO7R@he+)Oof&P7DU z9LQz1;lgOf<6bq%ksvNDG>o*B*GgvT0ukobbb3A_PR^qlT@(*Cn_VM-@SFd8jGE+# zTq$@jp(fXB1yJ@11;DFm<7HZudKm$@Cq6@Z7)T{ z@RFt`g~8_3P!s*XRrmH9gEvSVApvz}=Wj*pSVK*9uhH#)84as0wxH5;!wZ0O>-TS* zFmLt!s~R^`O=GV|M9b?ShZ)$-94tAb8RJkQeH?;uITEBSG>q$|i}hzy)0;$?S5x&` zM9jT`W^_?J*y#fZzxi)7YLX*z&0ssBCf92P(0rN#;MG*~7A;DxBLEi#0l3~w)Wq$0 z2-S2rsU~~R!{!E2<<(Tb5fKY-Yid#$Y)%a|eS&L<7`#E^2nnb&JAdzL9c!qmbCYiW zn`l^du?3Z$+pacc%)F;@Gu5=b6%m75AeY(33`0#n@v2FV1aWDhVWh3RRx*);+PzxnSlYLX*z&0r^?Cf92P(0@_Xr>6GesMz@c0k|j#!1ZRL zCTrRPSoG2GxigS`+NlSIP_=J^yNgDSJQAsRMb}xVFty6&1Tn76Y@x(qfYgcYXVtE<(fg(QA`s4b=^xI zJ%E`i3V?q$ja(5G4Obul7sZ3k2EhL7?E&vzhyxddYC3R%MYZE9qROjjygDk5RVy`V z7%U}2O?zn`C!h3_L(s7}QmopUMQY}1tz(U6({fEz4A!7wy-?}7?P@PXO`XQgR8#Xc zQL$AEa=X|IQJ8EDCwXx>LSGV+e+LaCZRNF+dAgnm^J-eSHY&QVMKii69&9$dhMK~+ zD^5*vM821WYI40+02|j+0KA$O8|bIPG#~&M1p&C;44zE~yn7)IB=<2K*l$rCZ6vC^ znpSRzik=%ZH7N`>r-qvL(L7E=#NZ7QN2p1iS)?|aw2n2@RMted{~OV;>S7BjJ-1!$ zg=oK7<7TRw8s=S(x_e90oy_%X72AfkuO##j0stMj8h9e}P&MZ>x_iG(%sHwk~ZvVY# zSaq=lm7d$K_ChQ^pm8(Rw9yw8V|^gEi@gv(!U_g`oaDvjNRYD7Fs_#_*1vWP4-jEq zO+61rMfrnhMi<3{&1Tn7Q@ARf&8SI^$o1^(gqmEh6~M^D6afEh>K%-Via`Y6q96d* zn~9pddm#=a_c0tOw5T=>5mjDI1CK;So*B*GlHX z6GWI-)8uGW93Mq9x+orOHoJzJXnOj@7&Xa}Ao=)>CDi14$)n#67RM<7UQJU^(xTLp z2*5=_0IoL^HF@_!97yhCIB?LS+V>Pu<<)d*A}U%ZG&Lyp@!Lrvi~e@;zuM6MZxYI40+041{& z0I#O?Q?w{`3IVt%2*CAbq9*TNhy%%e3c|3sG_M1x-x~gUzX-rcZMX z5ra2K93cU9W|1m+N$XfcO*Qj$`+pG)t1h;n(sRQaw9~Bq{R?Ju_P(rfGu1S*5Eab} zAcq;)A7a6hGn%o|t0p-T#HEFXk+$+$$vn11gn2bpz7iFu7SW6@iU&J=0O2?P9Ol_1 zN93A8PU6|*daVE&U!wqcHB~LsqSUJhz(qj-t~V1kdG|sbNbX}eaLA&1`gNkptEu+& 
zsF+*P)TA)joEmDvkE3=w7TzF+BP5{CEK-eYTE`k{YJY=n|8JmS)x{Q6dTw|DaJj+D z?q4v~G_|gAGu5>CW>oaQ336U6mYkucHLsfFND!A68b;d6YbCSiZ6eI8sr7VJY`%qN zbWuFm=>rJA`R6ifk|T1>AUB~V*J}mPyGa4?YHE9j7Ns^2fQy0vTyG|7^6rH=kle>` z;IKut`aPn`tEpovDz@L%)TA)joEmES49(+qEWAMsM@T@OS)_X3*E-fv)5td6{S7BjJ-1!$g;@VU<7TR<)5d6E%7FLL5l$V>occqI#m7sPbwWy);|YUz(lT3!!1KIW^SuS*{^s@CJz^B%sbL zQd1Q~irovbcv-gSzl_qSU2H+6=SH)+&5C6>;)Wvt)friOb5>oUaWmD_SeY$0E(bYv zwS~o!Gn%o-t0p-T#HB^xDO-6h!fRI&VO~vhRoSAW3e6}HmF2-s9{^&{%8D~;k|T1> zAf8Z@>m`q#@%0)CfLGJ}RoSBRDg@x7AOP2!ftm`udm##vdm#$!y%59K5LI4HOSRde zt5&H=!(el2sOfVQh;w_5!5buw=rVOiZXTk;yORZ=| z7sZ3kX4gHTCsni}F5AO$vj}siCGXP)b}i!5hSIgap)?oxcaQjy2RY z)lawoel)DQ*n&#W4PR?qZt$`{%fWnZ?mVP%Gu2e}P_|ec068xfOU`J$eE&z&T~yqd-z&K6A%qZwTk4|e(h!f*Z`V$>u@390}smLc>T~d97q_|C|W(YFZi37JcJrMi<3{&1ToQ9nkdjIg3$~9Fc1V zXC>6+ddZ{T4t6Ff0A5Y2Ptl^(QwYFCK>)5d6E%7FLKGzTF%;N)Ats+8s=S&`Pi2dN zDNRiZgUzX-rY}(-&h0e@Z;&`b0_u$1M*g1FI@VBA)w6W_pGL!~i!G@1+;+7WqWhG_ z%~aFSbJ?QtIgs1MUWl{t! z*)`M@e)H$lBuC_$L8vCzYX#6SPXX|1s(6tWrRETTi-G`LZzgK;?u95w?qevh_d={K z5LI4H)eG6;)JvM06b749Lrq`i8X^X7kT^mD>demHD_X}IYHC}g+y5dOR$XjCrRTP* zy$}18ydi{im% zvumg+{O12*=Gi1iE(!v0y_u-VyBDG$xsRd1 z-V0H=PE>g{op>`_Y_4f)QW$Jb4K)c$iF136!5buwkbpX~^LJY7SVK)iZ_(}lEi|mU z*n&#W4KDyLH+aurFU0DG#?4gI&fD2y>TQtoVzK0mW}Nn_Nsa_@X`x}Ht-Mw;$KNHw zyqX3!vqkkLn$bn^V5bit{O11=Mon@=t{HqJp(fXB1u(Hq0q|-Xe2*5T-a`N`3IcGw znW)LT7os4!kD;G+pOxn3)P)ypUVUQM&* zG0|R*09+IV;CeGrlXovfL2@rdfxQ>vWF=AM)wFPVOmtixOYMcwFxZ?LYWfP-5HWaz z#1RruXLkOow2n2NO*>b_#MBjNSaq=lm7W{T<~C~#-ZR(>(Q=i>%~Vr&bxc%MgPa$O zC1*5alUGf0B#2834I^#krO9_&W^_?J*y#g6>{(f7Gis6}a?RlE zgqmD0dGr7(uB8BYHEmrJ6C>9k02c)TxZVuZROsCcQJCBdQE2aln7fXs@@gu+E+$6n zm6|jRHm8P~ewhMsZm%(TgTxUMP-o;e@^?(@SVK(>*VFC40S&7zwxH5;i?H0_J%ito zG1#baGu1S4Lrk>Y0CHX|mYkuc#E;$N2*%||kh0J)uGdQD@tcS+ucq3jm{_demHZCb|~YU*yK+kYz>R$XjCrRNr9 zxxpLmVgR5zv$)OPu5mNfwAvOELv0|ZuC}mPa)z4z8b8FAKF+NuE=PitMc^r0dFf*P z)-=#ggn2b}+z}JScc2;17(0Cc;W`?pCOINk;6gRIUMqmXyC?u&O`RPvQPP0`Tofe7 z^=6_b?_P+)qRrVC?4$e0Td)YA7kG1&7R_EP}7n#LZ8 ziN*&IfQ#b6W&>b<0$8DUFGOK-A48$N7ozJSqROjjav&y}1~fG(43?6irmwOc3vUp^ zkz!5!8J^ksdsypOLrtrLbo(Df!+N38bHmpf=hpAvzhK_#>xMLLrkYwFiHV()Acq;) zeHAP@LroXpM?%xbAsCkQCn6JcIWOOM7x&!cEY7sZ2}K7eplnp2Y; zk?YwXOMGu~y;cBQk5d4=nwCdsQECJMxF`s~^=9&H^6rHwOzvYSwD&@cj}cW~O>0lY zMBfvdniK|`Q$tO^#x+C?-XL*=n$(${zj3W&4K-Da)9wFBG_1PVf=bUVhvf$E8SI7V zd`jbHs%dZ{CK@I{&Wpv8Gt^Y;Rg)YE;?hFHNLzWWWY$g*VO~w8PshaMB%0Ah@nEM9 zfTpL<$C+o79Fc1VAJ`vs6wS6f&tIYUjGUNy;)ATBKe zPua?A5$<@22=i)cnvaRK7txFoQCS}B^Z|t5{5dtr5xHg%s>$_Q0dy`>0KA%-U#3N= z1q9%tAOP2!$+OA37osq^kD<`s3sL?mQRUUtx)c+qU(wX0FxZ?LYWf$H5?4*|1~DAb zW$Mh%-)maO8fqF`rrZBA8dhCwL8a%G$8v+0{aFs?opSkgjhm^at(BOVSOGaN7E8`h z(}nnT1@v)}7ndVJ%0k1qUb^pS(e+Lb#F1Dc3bHi(a%MIQ$*b7nip2p2o zQ{7fftiB6!UM!ZJp{4<^n&e0jmlhgE+RAGsbLo8|%&Td7J0@DU(TpyN2RnTL;lG%E z;;571DNlnr1$rMX3)EfQ#b6W&_~g^t^i^3X}U73hliR z0~ZrjUQKf)IpRb~PU>H8H4K)Lp(d=vJNb;k8zhbtt9EAR?-C-#{!jL;(i|~SnnMli zg-XvYpXCN`xQhYY)S1Ptxm@FBs;TqR98qy8$f>I>ES8+1rZ3@_f6&KCUR;g@DGLqb zdg)^Q*;INt5$4r&`m!8x@-iaKpm?y;2M~Vq=hP%e@b#L||2Lvx)x{Q6 zdTzVg3o+BIaWmDld{d4Xyb0uXu@_?H6O;`4ILV94ksxKEVO%d=tXH`279z~6>BR9I zv3)a|(M9oKv)MIn2jMq=PEB$|t{H@Ca=lgn{jC%Lucr1}bHq*y0&r0ffa}dfP2Rl_ z2b2344&nt#&votXM3q-lS6hxKZqwAHFxZ?LYQjpq(-1LugTxUMP-hmY{yVgeHPkeA zf^PpO(6H)a3o1RgUG0U~xKrb1s;TVG95LGta=X|I@!fMM8T4_I7ndVJ%0k1qUb*8fpr^`EzQLBXZ3kRFmtq0+_jn0^rp&(v>3`x)6Yi zf&g4^CTjBTg*cep$8ZoYNP4b2dWb5orty1o#IbudH7N`>r-qua67Mub4BjAdgap)? 
zMQWy3>sUig%lFak|2{OVy4ZqB&uv$GA!;7bxS48d?#mHd_k-Ln_Cj=d)g(uPxU|qP z(pFw8nWrBl!n~Rm`g25AKbp}+@nEyrHPjTY2kbrSR6n^=us5{|u{ZIiC;z@&N!WYT z3SeW90^rrO_)w1Meh2}$CS7BjJ-1!$g=inqxS49|e=JAT zJqB{S*b5QtrDV{@NnTuz1Sty*<9g|0y~5Q`5Mf?T#iKc5{Bbm+i{im%vumg+{N~T8 zNsh>sf>2Ga*9xF!oC4t0RPrP(N{t}^7X<;h-b~cw-3xIrxsTx>UXb)$FHI0tUQOi_ zIb!nXnwk^_n^QwgSc!KUA_i}eI6?yI%pz6uwAQhPnwlr+_Wu+bR$XjCrRTP*y$~bM zXxvOS%}nKp&MA=F#a@Vi@~TOW1aWDhVWh3RRx(e_5Mf?T$DYj*E7NF37sZ3kX4g;? zO;4YF%(F?3$d!V9+A74p)cV1`qxARm7Hxl?0^rruc#0OKop@!Lrvi~e@;zuM6MZxYI40+03*v30I#OrS7}k|RRrLoAOP2!$+OA3 z7vf-YAHzYsAn9sqdY!29Y8qI{5tS>NniK|`Q$tNyiFefmZxF)~5>RIrsgXCdjy2RY zvr4!BRWz)+*n&#WZC86CO4c=QrkZNr%n{3LAh(OX5Z~HI$)Jyuyto_*QWhG<_0q+9 zg%?f}VO~v>Z{>*NZ=o4o6c09=T|-UbH-An|azw5fglclVRsf6dPyoD|rZ#9%Y6Ahd zC7Q_|5;5N1Y5O*91P9T7~$e zwhEzptpG|cp#XR_tzVog1}{bca#m#_0N0y|nhx6`6lHxS_wRr9(;ps8?u9sr7bIOx z)0YxeUQL^2x#DD5ZfY-thQa34P}8r|JT{pi25)$zShX{YR7nMqV)sJST$U@A%PD=@ z#THb0ZoAqG(Oao;Gu1S5d9G-_9OQPf7vii>QZne{Brh&Uf|P}ZalLf0Ug2X`5@G(? zR9TfPPF;a!bWuFmY<7*?LHN!8lZ=|=h+H!W)#Q4u02*s30A5X1SLKS?Y6Re-AOP2! ziJH87Ar2<@F&xATlAi0+wM3OyQ*CXon7dk2lfqzgYN+Y!Ttmd*4H8F4K%H5n8n4wl z)_69x*X4?`Iy9`h*n&#Wjb?M3)xUqiOxVb-bexPYHDlD6Kro|4r{db^Y)x{Q6dTzVg3sKdjaWmD_ zcz3SY=mfc4?1eZSP%`M_Brh&Uf|P}ZalLf0ersRrCc?a$=I+TA9rvIaT@(*Cn_WXq z;WvLyO>#u88H8$by;cD0_fY`6n&x|QMQ0BJa8VF|>&-+>-n|e-$$bn(_FjnLKBCI2 zY3cr4(RIJ3CWXP~)KJqmxrT_r8zhd9fI74D*ROS~p{A04y8S6fdXxg-)wJ_St{8g+0h}>58vy^N=iLiYl-$Qq zWbcJoc#Np>YASs!SBwv9YEl?1B|}X}iF136!5buwkbpWPw|+aV8r3@1P*daMbo(Dc z!>WrdsPx=+wHIP&Oyg#%Y3hkw(f$O;?P4!P*141n`Z&pp%MtpLko-I7cxfvyU92aw z_2)#GS5y6Xu2^~!&FG?du-WVyY6`#kb83sUigz0-92pGL!~i!G@1+;+7WV*WXeo2jPtnOre419H3A3-RY(HOY}6E-f^Sw3XLN z=E+$i%&V#E`CL)@Jetu(@nEyrHPl4Y)8|vnvq_G~HG@xSs}P?`twMb2DE&SCb};lJ z1;DGRdyW>R<`BRcW3vJ9&nE9)h@#{^h9Y|}#IXgU%B!jGrCd?|lBOnw!BR5R^qUl( zb9;@!8zhd9fI74Dx2Scjp{A*q>GuCJ8dhCwL8a%G%W{MF41O-e&XUH>R8!Sbu2_5p zwQb839^R9g*S-d2nnb&JAZF!9c!p*eVuOq>u6YYu?3Z$+phLPG;Ciu|Zbfl95~M6NjO(R~^;`S)CK2Yh4^%872?xJtpFM>qX2j{Rg~w6nM)DC8Dp~n@NatFy%0soy%0tA zUWm2Ji7KzA>dW)QsfxVRUI-0?rDUk-w<#sg?KK8(kT^mD>dekxmDaJwv#IThJW+B5 z8dhCwL8a%mtGy5t)fzWbP4idgiQX$gZWntYPJ7iPM}oMt(DBk%UMrbhR}*1gO~-5U z#ObThj4p}?o6WAFrtq6TrzSZf*9<~6xn3)P?m7y9S5wP1d19j$0k|j#!1ZSGZ1V1f zC`#^QC^}+Mt-Ow?@@hI!pC>l2)zqXg*qj<_`W>3bRTI2H3`a;nomr&1uh%-(P}5KY z-ToWUuE@lYEm}iq5k!uF$X{!+DrB)%%J8A_m zaWe(Lt7)(~Pt-IcfHTHs17M#`hrD|s4khA@5#@L&<#%hwQx&C+{YzyqXp| z^F&9drY42K=G0Krx7m(`H;CZ~38*tWfA?q|Yp7|bi*Em2XjpZz1(lv#l;sBR8T^in zmL83psiy9G^F-CXAm_zm$r)-o|2*PBALmvSmm@*SLc_RTx>&z8mETWQ49_;i1gx~x*HOUdVW)P~$^;!W`^iu%5nzs7##7G|ka8VF|>&-+>-n|fqlKU7A z*?S@828b%Jrs9D-G5VmUCWXP~)KJsE<{BafZ;&`b0_x1p-@{tR8ft17q}%^PXjpZz z1(lv#Hp>m(Gx!}DgO6z3Of^lM%oA-VLC%ZCk~7p)>Q$2*3F6X1!$@0stz;e_Cc?a$ zY9GxL3qxo|7sZ2}K7epN;4{pdo*a>D2A@g1>A7AjfR@K80A5XXBeW>>7y@un5P<8= zL`~kk5Qmcc7!KKcA-10&s=S(xJ&`AtMm04l3^u2Rn*I&jvG4{l93cU9X6J8Q>sUig z-A~f(|4B5gy4ZqB&kbK|oLj$t|AKj|pPkUSnQB`7d7c>hImls#f9CJVSn{e#js$UO zp<$%0yjC&?o+iS)nmQ))MDZk=(M9oKv)MIn2jMq=PEB$|t{H@Ca=lgngVPiMucppt zXi@4J1mL0|0N0y|n!I}<4kh<79J2R9)IUd5c{TOSL-t;Xt|g+%t7-C;Jkj)u zrY42K=G0KrcPJ32A!6_bi6bPS&d9Ad!0fWtv4)yfU!~jst7uqtu?3Z$TQ18D-ZR(> zQTMvW%~Vs%N}kwx4dlF7EIC6>9bPrbksvNDG>o*B*GlH*8$_5_)6#05=vhTGx+os( z^Z|t5{5dtr5xHg%s>$_Q0c@>P0KA%(-=syUHxYn~f&g4^CTjBTg*cSl$8gBr3o-sS zQRUUNb~;b=oz~Q(FxZ?LYWh94W8n>AI6?yI%+B9CTE`k{s(6QP{~Ksnb+HANo?9Nv z4c;^O9T}Zl8aGo-gYV{vhIc{Ei^Y;N)O5~gi3fe0TTxt&1Sty*<9g|0{nk|bJ`v{C zRJxrfCf`Fdx+os(^Z|t5{5dtr5xHg%s>$_Q0n`=8eQGNEfEJ~85P*w<09Zu~I@*c{Np*#KrVQ@zlTGY8Y%z4K@8f*AOvygTxUMP-k}jE+JCv|733| zjf*8n=V+oB!vSH$6Eb*9<Pyqb1 zskt&P)-Oi@E(!v0y_u-VyBFe6axcUodoM(JHBsf&)OuxHoUYQ;q%hc=8fyBtY{$YI 
z#BhWJ)R~>X8m(iEXVc(SanW!U8dhCwL8a%G&vJwJ41Pz(a;?VARMXbgaWQc<$a%3? zaz-43nvQ-B7c&IoawJGuXc*T^7wd0&OUH>Yucqmn7Qf_>Kc z%(F?3$TfrW6VE2sYXz`;D+R!-X{IGE+FB5Ri-G`LZw6{Q?A;4-IJp<%@C6ptfi|Mb zt7-1GxHxf}Qj>HpRbEZo-ElG8t*J?2usJo<^gWu#RTI2H3`a;nomr&HdbN%<)KqsL-Tr&fubmH;2*d9SMx+orOHoJzJ!tW}an&gQ5mJ_PU^;!Y+KS=@bYHA;&MX4tcfQy0v zTyG|7^6rH=oZQE7xWJ-XJ3&-=HFf!iNOA8Gn zZRMqlbu~>tON4nf4Nu2K{WO}*)`M@u1a%ik|T0GJ5-bFwE~zqMFH??8hMTu zrJh3oE(!v0y_u-VyBFecav#ItLW^q03q+My)A(#$9GlhDq%hc=8fyAut|4OZ28kmi zpw28(GcRf#Yp7{?j&A>RXjpZz1(lxLuJ%IIENI+JH8sB!7hCfnw~M_Hg`cNn(8pDi z9HHglS7BjJ-1!$g=l|M z<7TR$_Q0o1%h0q|-n*`P(Kw-JDgf&g4^CeJ4CUWmiVeGG?-EUHWI5>;MJ z5v4)zO-=o|A78+JvY(b^xwyV7m zBRd*5Q%y7P$3^G+Ah(OX5Zhig$&nx~Ei{a@mDftnd)5d6E%7FLL5%+g*bf3qFQ<> zQRUTiyewa=U6P;L3!!1KIW^SueVWHr6TCqTM@T@OS)|%8BU0>Mi2m|?QCCjs(=N84 z(sRRWfy)hEcK?F;;U>dowWcP8!RFLZ)4$^yA_i}eI6?yI%px^X zr**9HY?`?yUvyrBhE*3^Q0cklv)tf4gS`+X*J<2LHPzJTi{)!U&Wpv8Gt_j_t0p-T z#HEFXk+$+$$y~Ue2=i*1Y{(bK8_ouI(M9oKrw<^(KI;odo$BL( zO`k8M)*HT{twQKt`b&BMB_}8VUQO$5`C_mQ0VJT$1e*C6|+ognAMV#yh5s`RQ!js$UOq2r~kyjC)gbrWG;O_lfL zi&I@_Mi<3{oj!o@n?I)}IU?5#LN&QwD}cuPC;(nfRXzD)_Fe?wq96d*o58c`h<7i< zk>oyxBl|6?r|&1Kyqaq7&lhvOnwk^_n^Qwg7t%bgn&1s$I6?yI%p%p;uXU`UruGNu z_WuAHR$XjCrRTP*y%19a8aGo-ix1|D{s%#B7keSrylRpoL0npB7-=i7mCT-pi7>CG z*1>$S`4F1XMe$&>*)?tl;WvLyO>#u88H8$by;cCdLlgk7rnX1&#nwp#;G!S^*PDr& zyn7*zB=<2KvG+n$KSoq}HFXTH7N`>r-qvTlv3iV3Em)vBP5{C?EF2hb*!PL zkrBH6kDy`I#THb0ZrLn1c+cSHLaaZbaWmCa@sUigi_g&Q{~0u_y4ZqB&uv$GA*!C!xS48doXHm(&w|`8_CoY{)g(uP zxU|qP(pFw8nQPAzVO~vhr}9O|DKw*t;=yLKYupaPZ~mN`|lAb>N*W&_|?A-sDbjwJUn9I^L8EUXb#UQMNI z`C|MHO-%}erDUk-M--lu<Qx;s^<-Gji(cL2eg&A&R|fk|RM}TIhIbE3cKz)=eVJtEv8-e6h5FW^_?J*lczU zHHH7*l~a=(k^fURRFmtq0%&`W0^rruutkef?;-#f1p&C;OrA~Ny%0x|`xuVcdm)N< zh$^q9ruXSz%C|K&DGWBJhMN8(rNmVeyg>{{NI;$0`77Sz`PbXt59s#)0UB0aY(b^x zmdA2~_Y8h6#C(ay%~aF+MSH}^MSF;x7mFolG~!iNOA8GnZRNF+dGZn>%&V!Z zbdM-4CBh7f2RnTL;WvLyO>#u88H8$by;cB2W>~f;YtEsPIk0`I$liCZRVX!$h)bwX;$HE)LaD)WZnVr8Yw2n2NO;eS7 zM0+I~R$XjCrRRp%0$1tqp21#-ohvnNrkbj*+#?pNK+cQBk~7rwZ9zQf;}DF?ksxKE zVO%d=tUu`IYKSnertz!xh^DL1j4p}?JADA*H-An|azw5fglclVRsi$YPyoD|CTjPH z=2`^cq96d*n~9pddm)Y__c0u?_d@j46IEVK)7S11$FJ4Yq%hc=8fyA6*AOvygTxUM zP-k}j8nliz)UWrdsPx=+wHKn{292AkrnX~yM9DFb+r?gp6J9mRksvND zG>o*B*GlI0jYODN(@NtW(btG(bWuFmY<7*?LHNy|QIsB8lE^BX$9MK;i$lALnVH9NTJu6`pemUzq=TQdt4M)!1 z{_}G$6ipG&d)@^CTT1D#k|c42j?@{GC^43;pv=~vjf(DQhG=cx-z(OTBbH-SRItHE zStx4g)4d#5Y_i~+hJF~zB7V%3gHz0+7jx)qA9Cwl7TH%75w#x@^NA|w{Qb{<`ojx? z@DB!KUtR9LsEF_D&pjtAi|SPU5tUryp4aqYv3VBdHiAqfj;NG6W3a{G*$T|8)WkSPr_W3GRu~*ETL+If{@xE^=3+S$wLthAd zl`0R!)F<(-m?+Wyuu}UOjIdIhXQ5P4l&bX8D!WFhYCk1b&ZXo=6Q#ltrBY`Mxaj_j z0y9fBbe?EDFGIAN^L?c%`J9;eEG2U|RVryAnbKC{9M`HZ`bxF)c`^3+RH+_ImFlr? 
zs#57*4t@H|zEW*}NsN3cRjLDFsSYq0VX0QmMX6#a)#fj$>>8yi|0OXeD7o2*QsIbF zsWS##wEc<#GfUO`%cAC&Gej%gG z(8Fbrr>IoY0=l%7mqQ;t;49VIe$l@_RjPupR0Rx1SgJXJQstslD}^e%Mya+7#AE>_ z7f+`%{F&dP@sI*DOVwT^%8D{X+dASa)$(D{dzg}0ohp^IkW6W-aV8OBVONR_HEEL9VBb^`wf*{qf~20 z#qd!|ZU+BwAZos)z|2xL|CZSL%?#02f5%s<+20oJzfH+Johp^IP%3GwaV8OB`Ct1= zHS=xJ_H9B>2sUeuN+m6zOIvw4^sevtN;UnvqUCo}r8*dv>L7y=mTLGYO0@^2n*4p0 zU87VBzbE>BkCL0gs7y(p)Acfm z5VL>cE7jN^i^e}D^zc^qQtAfyQbJX(mqTy;cfL}MeqS_vKUJ!tuvA41Mp&x8-$SX+ zLaB!Ty~?gps_7qyjvr8RGg$HzTYsv+%u-eSshGboL$s+M`ARkPLs9obN@icGRMJAJ zq^-u8M2LwW`${$VXQJlM2t8b6T%=M-3+U2TUJm`(U;0Wl@E4-$FH)sC6qf1`gAtah z;|D0!M^LK1zgF2bN;Udd;`m=ta-n#3ZO;9&CszNH0y9gs^EYDZZ!$z1`7gdw_5MUu z{DhKuKHhS`FJ&Q_u9r!K82US3sk;AGl>IHChp*+Yrk+)=YR@X&%b{2Qy{}YVKNTfE zO_l0!SgOMeMp&xje}z(=jZ$^|SCw6(R0BT~^*^KJX0TWz=Kq@lGfTDp4`SpWGDPeD zAHGtx|97$TkCe>PRH>weQb}8lGl>wr|I1gZwto^^|C7+e<&hOCm9&5^ZRO?AON(h; z!CvYJzXG@ZAF=VjQ?)u0*6IiY64t8zXQwlQP@Qpr1D6sYznIoh77@i|1Q=gR6-`=5tF+ZPH^E>9 zO~9{PbY2z_C6{3WaLmeDqq<286G&Tmxe4m8pb6}{RYvyJMnq3#M3h!WP`O+zUhGf) zCx!hNFhKi`VflhevhQT>Ip?5wgI8+FHk39&>mqGc$n}SaWOzib)fqE^*t$x?GmBSo zRYc5J15ho408pKAfTwB+kT2fo)e&**YUK7ii4MTcQCi5Yw3U~eVD?&?fG^%yT|_k2 zVS@1f_-5+<_@;J$)Vo`ro7jOCIi0Hi;6NE2^Z&C52g$bmsyxauix6%ZB@zz=*qOV0O zUUK{Nes+x`Q9LRMiWi`GTeoS+Hi}nq8wsTqx&CmXcsL^0>WpLBc|yZ8i#K?CL^RwE zK($l?Ky}6e*4{~geDO-#BVzIntaQ-?eDNx~B4YY(t$4}Zy8GExk3{jPBq-jeP`tW(wPYK`Yw0GT+=E4!!wIF+Y=GpJphDG&U7iIqmKaj;x*kL5o^83?LVf9CoL3D+G?C* z-1#6)z!$H%KO)v2zy#sD{st9KT9`oE%F9hqK1dVr#cO>iB2Euz#Y=7u-OsLrB#K8R zLGjK*@j6dx$u^2N_%I3OVdVOwiCp7|;;A!^>GF_8B&2^=ahxC=kIiWg)k&mt(yAEKR@{Z)Td_*`_f;_`>oo z6;E22K-$X7O)zkZCg6)V_gqArcup%`a_ifEcC92)JSqu_cRq@@JgX(!DBjldB$Vfo z>yIHVi6e@q&N!ycFKT#Z@jB-sqGAq!nJmDRE)XDJywmd$adIBHJ)SC_w2)hAt8tET z*()>wU%ZW%>7DImOc1`Byhp{87ABCk@^TZLdX*;Ni?_WL5yMMb@shi>_Oq)eiQ-X7 zP`odocx5YEvW?=^y+%S=My@}eC?1X|o;u^0_OEJqX7R>ekBFAn0f_s}nJ&pRy-9$4 z@v7G%V(ty(HjZ`xXv#uvT`$MD`7}+y7q8|mdS_e51mUa6HWg1=m_XXf%T2JkK@;%B ztKW!-g}1fhB{%l$XV+5_#iNp-cwa*Cn%~uuZ4|F_lZ3K~Tz>*-NgPo;b;dEBc~8SL zi?_TL5rbO*%w&GLDNVo^uj}HdD84wVzN9Djk?dy|SQ5pflAw43#p^F4WcG(Pja?EI zEtg;u2kl3o;^BzmsWXo0M!AM(7O$*4DrPUubXUmqMIB^KAtL`w2)hA zt8tF;OchPQD{thAsA#wX6Hx5<990%32%Wl@o1o(=nt(6fcy&}9t5(H3kX&Io;9p^( zlAw5BLGfm;){<=~Z@DHa25XS(y+krRqIl|zV_H+E;hDv2z9uTRY5}O$Q~;>XIKb2O z1jrX};o7L^x)!-DOy*Hp$gS(;CfK;1Cg6*=*g$)b8Zbe4gG8C6g$bmsyxat%jWhvY zypx_z(yO7)MRPm&R z+)7)Ga}%`F{ytCf8t;jU)hMny(E-gqJFe)krF+upHAe)LOEleP7<>e-58lnmK z;tf0!6_t-@#Y?WR9PqEOP)SfcdMI8fMuxRy8^xP>l!Wpqa(yn+k~pGx>WpJr^0F$;t$4{5mIMA37AgsfNB7kW#p2UivW?0A{knQv4hN^2J+wHYx_5MQ*=>e{!UcvXEQX z%P}sQr3v`rt)HTIwo{lOd^JJwq=gBjt-Ra>({nTdU%bs1qT=KWTJe%AEC>86EL0K{ zkED5_D0xXswo$yAc@oNt$n~d@mc$XoQ)e90-j_8zvv?y5QPI2rz)V(Hjx7-&U%bj! 
zqTDOrjzIe5- zN5$NVR=ng2%K`ri3zY=Lqv~EL8rQUB8^vpXgM{)1a*h8c5FSxHb;dEBTG#N*;w`=z z75#4l5ciuiU18~Yn*jOZwVsZO&9{)-wN&w>h1^P8jdP59H)#UCcx~^{JKF{(2wzQd zsd&=D1kzSsZi4FfXac@?9a~Yc{jOHL60%#B5aXA1Jc}^Sw zsLnXRwJQmbFWy{Lw&Pzb zHPHlo@pc-s#aJUI2wzQ5JZWJ9X)7-`!9p`lz!$HyIa`e1q!ll@!cyR0VWETq)fvZh=vEESEZ$U0wrFnwpjuM_pgQ9KTiXbbFJAp^ z*#T_&OU%aL} zv&BlgR=ng2OM!odg-U|r9YyilI<;gQ#p}I`gmM>h{rN=ka73=v8OLWl+Cc`pI-#p~+M7Nye+g_5e-57q71`Ta@={#Y?WR6!=$Is3a)fZ=!fZ4{FIaiZ|6yLg`1Y zzmO;%j>xq-e+=Tc8Q};#DtXi&HOY#Y?WR6!=$Is3a)fw^6)?SF~gs#cNw6p)4ZTUq)II zM-)$;aZD#()$q*X%`at(-X#ELvcl4}LV$enj=z>IPA?<3KT8!)TF9-m)i}qv`wg0a zFJ8+ky|cZJ3Bp&C4^#1^g$bmsyxatp>ofshyc2I`i_JByc*zx(0{;pNl?286Hz;2B zX)W1C@rK?ap}d7$i$w8oMDf%a$8>c=!!wJw^LDnFdK-Ya-<;_R%lNwl$QN&5Gh0+| zBDd42;zwL0UN4mN6dX7MI&h>5lv02oOC;s`)> z#sMC`i2(WH)i%Y%!i~snVka;_Qx!cyp8VWEnZU*Ek~A>WpJL zd%K2b7H_pJCWhJo7)=1;2takl0S>eiAYZ(WJ7S{v4&?T)v5lKP%0g~kFE_#9T{Hn- zyv~l8DCxii;d>>DCoN1MZRO=AsPCc)_~P~49TTN@YsE{huoU`NSg0f@-XEcOgWX!P zjp9w*LqfR+x&Bq8C2>Tq)fvZht4G5#i&xPT6Z7{1P_3x|P@Qprr|u^}zIda(F>$OH zxgAXvPg=;WwADB_!E8TGz!z`qftYA~02742qvm5&JZWJ9X)7-`LDxex0bjhyftY9- z(2AE_VJY;luuw@*ygxzlW*^p)Z4_^Hkc2XbTz@rDJRFg0b;dER8`AL1;y6FVmX zh{s%b1fV+O05^vTkT2fSqcPF*C~|uN_TU3FWg)k&mz!YgahiZH-tq{&vyEVa@Jj)T zCoN1MZRO=A7$2hv_~NZS5fgn+XvIseuoU`NSg0f@-uF?wt#K{cM)4}fNhnVu*T06e zB#y|nI^&pjKBeKA#T%T6iG~RP<|F`d1fV+O0BffRkS|{8(=jnQiQLwviYG1PR@!Qu zo1pGlnt(4}*)+YgJ%b5|Dt$gq#gi5$khb!26RbQ(6Y#~Wd@d%YXSCuaS6B-DD=btJ z6z>NpUfrygY@>KB&y!G2A=h6clHn1#R%aa3u{jOTEZ*!3G12`305e%(>3E3%`QkOr z$Hdx;$n9pTc+x^{rLD#}#+{2a0bji4m+75t0TYC;CMce?FoCp{mz$vcRhob=Uh7g! zoPI?sUUG${(7(b$B|-5nMDaRb(~@lzZ*ZA}f}b)N`xlAg;fUg?Gmh!<>l&U}ysed( zm{4V}kJ21jUmU zCXlxBauXcipb7ZmoqRhcs^8X%mt0{f^slf`Nl?81fZ~m9YRNW=H~S6=>Sy zzT(Y%K<{iHV1n@L_$R1%(!vDNR$gv`fs1JZzIbybIpRb~PU=f~VRD6~(7(b$B|-6i zgyJnU8Y2r&L7{MRM=C=0oDy&U7RDw=>--bQ7P7^=hs;j0OXCoN1MZRO=AI8{v(@WtD{ zGDi$wsTD7|!cyp8VWEe;Wyn!a*i&uX`j#xORigz%%!gA2R!a^lM@%|jeYi`n#Z4|Hb zMiR=6$aMjc43EgQI^&qmG;4Tf@s@AO5ra1YP_3x|P@QpreYX%GU%V5?bHw(|$n8q1 zc+x^{rLD%f3Hn=U0={_dx8{hQ7EBPnnxJ^n!UWP*UT%Wg+i3#6cwKEdqPR^fUUG%y zpnrvhN`m73C5qR7hn8%kcw;9>C?}BX!bI_KM6T5t$8_UP4bLoI*_}CJwjF?KO$C7J zj02qRBtX7+!*}I~`n!WA|#sORlgS^slf`Nl?7MM)78PwPYK`TfUEkavySyU$qn-k!y9v zF|Bz(!!wK5+?OM^?gyY+Qvskl;{Z=TNPv9t7W#8US3h#wl`5XJkXvc1ac+W*L7IRs z-r_?!qWd9C5Wbr1rQ%5o6G&Tmxd}!ep$Yint(?peJtwu|C0AGu`d3(}Bq-j0Lh&{p z)sk%#uk2A0$`Ep0lqepK$hA7-n6{5-cxLhXAIlMSj{#7vsQ^%&ae&oN5FlT?;?W#2 z{y1_Q;D3D5M_I_N>*Xe>8K(*O;*~r}?`&h3Abd4J@uY-l!!ZQ2{0rJJ`S_&LLGh%838byO+ys4FGyz|{Q}5=8)_1ky zC0AGu`d3(}Bq-ibQM|=%E!jr#Hr^wlyoX%>%S7>TMDf%a$F%VS4bLoI`%aE1+W{c% zH)p!SQe5KwB_V6YxniI=mvY;hDxS2ETWPCtj&aE)Gyz|{^^0@G;Ki6Ad^Pzb6;E22 zK-$X7O)z~aO~4m#vn*GfEXz%;upCUTupIQSuuw@*yq}?XB^89suCUZxmMfOaiFxeT z6S>9_#ZzY-)80xA&n(`^<+-ByasXzs!gB0N0_2rfS(PhJU4h)5h5sl|A7vr8u9sun zSVI%=#jCn1SIkypg7DP@#gi5$khb!26P&K43HajG*5-=2tF_`KS6B}ES6HYdDBeGy zc#YR;$u^YNUY9G%>X7TNBQ1#|il@#vrc>8xcxLey>vKhaJpdDVP86mR5a63WfU^}hn|#1XkxXB^Y@77foV zUP()?n7IXjYE1=z>Wl-Nyo~_);+<^G6}7F%?PRKW(n4;ft;V?trcTfVeDQ{E&lPpI zV}kJ21jUmUCXlxBaub}mlP2JcH`<;n>f5#AC0AIA{3|R}5)|)$pmo@D=bC+6&5N9iub=!ysA+x*+%giA19%VAlF9|#lsP~R%aa3 zp)n25EZ)=;xuX3E0ID?=0ID+%u=VEz$QQ4EJXb6|iQH!4=ZnxsS;(#H#m~?LeDRv5a>dHiTJe%AEJgkm7AgsfR~$k0+Md<2 zZB(y!nv^n)Wd8>8k~kvS>Wrg0|D1+rR&RYKSB%U6P%WweP@QprCua$euU^;lxuW!W zB=^r#)sq&ID{VE-O)&H#O~6;LdyZb(<}g8`H?lrO)sq$`khb!26C7Kh3Ha*uy_74; zU(%|VTw*EmFR@TbP`#2w^@bL;Y#Y^^dYPp1GLrq9iR$5qWUDid>dumeXI8IjDOW7M z0>DhxSms_MK)!n8%ekUy8Oa?^RZm(-uC&!S$9R5~Cg7_#@jAV=y^aaOmlIS^T9`oE z%F9jA^CnHeS8sYPR~%o{s+U}2De|weP)Sg|(nR&<-_o*eRBwHqq_U1=|5l=UIHG#$ zjHBAHq2Za;YkNCal)MeVOcq(THwloh-pV_Y 
zx9GKP3loHIC#asZFoCp{mz!X6hbG{wclv!=X?S0&UUHG8$iK)!B|-Jd64l!&&hvax zuPV+Hiyt7_-$GszM^sOpaa6l6*6_^g4VC1H#uEDNrWviW)Rz$;U%m26^2Bs0eqCyH zs(R8wa{u4T&i|>d;tb=NX=lzkclrm|scli)o&KWL8AqTqo#`bee*YG=YHM55j1%kF zYGd0t{!)TSkPt!$A>>L3xgmrg5=xK|f&>u}kr0A}AVGpi2qB0Bkq{)L@3ZIZ-95v5 zX2Kn_d3O&y&*u5Ku)HEswSLa+IPw-5@bog$Ez!3T2JZFbCqz$GFi^E#)y|T@gNZE|AS4esoK~K-Pt#)M_dVYFC zTP#tx1%gXyvs7;*L7rZ5jwPnIBHZa1JyjuGRqN;6j%)JBfTvfo-4ZjoFmUfDprs8L6w1ffZ(qLK`FdIJY7QE?Ck?)?PxR0RW7>s8L6`4AcK^oFY}QCVfs^BOJj!HpI| z0(x1J-o$%G+CF*<)pS&<5%#^39!7+%XWUjxY7IRx|e7q4;#84YB>)0?ZeL|Z)!-1`aWsR{~WuxgT3%xCP;+#^|XE;i_6c=L||dB?F#bUK?H9K7oOIKlv%q zQxyzUtyejN`Qv23(#}I&}HcP=}nxpM9WDC;#1xA{{KRar$~^eSJ`ce8SK0><47n7(q|ZxUJ5eHT3-SGR|6JXaIt`y8=Ny;{^MMNRX%3{-q@fzC^eW;-?bQ zOIHYY@hWF9aE=UkdL6@baT|t#EB`a1rz#kzTCZ{j)uUv<)9XHOiK6ocJ+IdiAKYsp zB%qfk=?#n-Y5V9+T%e6aYr#Cub ziP{N-TN$ILDuk>OME$fu z&+E0s2lrYC3FzfZdUKbJw0-n4F4Iw&LD&yTdKf`Z&$z8t%^7-rdQDd>5xD}vrSw{I zuaY27FLmA$-QOcz`Wvp~OIHYY@hZ3Dykt9wUfMOfxGliIy`O-ds$igMy~-Jkr;q_p zFEhm!y&LS@#ndJk zxc8Hv6FpVIK-GGcGf2-S1D;;#7F$eb8T7nfOMGyzg^++=fuvWJW2Egndrezy5!s5c zACX~W1U)_DwmOz;==tf*ZL>w!HV7`I*V4X&1bKS(dA7*RL%97hda6RWs@BiB9d|^? zfT!26(-v7fVc_0RKu=XLP_s8L6v5X9OdV>dSQGP(vOYnLv z3BkPg1Wl`K|SLH zQ>#djr#D?`i{?s%TO6aODukdL7GbZF^e`f9J>$08P;cn@>2(~n zMafYJ>h1~z^^6lNI!1y#z3c{C48D(W?Ks*Ty>x|e7q4;#(IztB>E(Q2i=jpsxVKBt zQxyzUtyejN*=91}>E$=uV)#RYp4V$h2=28I63{D_^r9adY5V9^wa`&%LD-KY7dawq zJ>$08_lcqBr#IGWi>6ix>h1~z^^6m&Zzn;XUiqiCm~TV4FU5f!=&1_fs#>pd1`Q|3 zfTvf{K^M2hplZF!8RT}60Z*^`q%D$98uYwgOG0q3g^++=iKN%iZKUm^ z*YO!0mCq3N6Uar52wTs%tG)1;M5CT6#YxL7rY~uPyR>5pHK3*nys^ z5U#5AD!1dlelpo+SgGUgv-<3I+^%Uaut~xYt5R zK(AEN>-*A3+edF~kdDeA!hRCD$Px7PjN59?u%YLt7yZf>b3+i+-4zJx87DYBLV`TK z;d8dAK8JAcyh*|%da6RWs@AKV!OR6R;OUKw(#35Q2Cn=~L{C*PP_8~JJyjuGRqN;6j&m-P0Z%VwhAwX3!N9$rfS#&gplZF! z8H`>b1D;;`6c2=6{%#v(<^+#5fdp68MyZo z&{Gu*RIOJzgVc>=z|$*EbHwDEPOR6G;PqM(f_p851oSE-y^6O;n(ehTq&p%nowokN zrzJg%pr>ctR!24&dVYE{8II`4fFM5PTyL+XHH!rKvsb&>5$T%|?sGSjB1B^q!d<+| z?YM0#8SwP#vK^6;4FmUn0(z=~fvWW?XHb+&20XpSZH~y?X3+C`EeXNB7D57gm6Bdt zo{_fi?DcJTMAdeL{S0!EBk1WFx7C!LhMu2ZPQD|?c0h0`y_Vs(Nsy=49dSf)1mQNv z=&1_fs#-tic05u@20XorFJ-_`UIy9tikB3cDO-Ccp8o^gT&H6+N>%Y4rf{nZFJGe%EU2v^nmIcHFKgba9kS+$NBI1B^# zegb-`f`O{_DrYc#lni)!xknu_SZC1ldM$~;y%s_OdNq<>VS|yjk6y+5bX49)*w0FO z7!kIfaa-+aH1z!RMvgh6;TQyUcLjoa#tGJbNP;}Q(k4gDet>W%WAs#oa8<3Ja|U%S zWWdua`^XV<%`kB9C!nV)7^qsWat7I-kO5Dxvegmu9~<<%UQ1$duZ56+Uah28_o2KyaR*^h8*WAs#oa8<3Jb32|IBmYS0bk6y|-Ix52m`!6zVjG(7y+*Zp*4Lv`-y7P|6 zIS)bIU4fvUae|o_Nsy;Ef58#$7ZC1^zogC4OIHYY@hWGKH9-bEy@heQxQ)ZWmA{4P zsR{etJFAj;NT1;8J=mrL!c+)62i?h|w8@n;fI3DukUPj6~NNVIJTk>FB#EzNI|AWyG4H6&6~NzfjS(Nh(|RkeQ3 z?YJeK40w7q8$%*(BMjX83FxT`2CCMpoI$}RGT`aeXM{w0Mkv;6N%VRxiNU=VLIQeC zl3vSZBW>T=>&Xm>icEz4S4j^e=;;}^)rBpFo}XUUmXH|9f*?NRTyL*sFoy(rdYxNC zqG&6^y;qPTL}L}gUA)TecxXEr@btQJLn4|B1NVLcda8nfs`V;oP@7K%JiXo>AyK@; zpy%~k5`%j!gaq`OCB31Dk+zTC)J{4oI}!F8TOOmQDukqG1mV-1`aWsR{d>?+Nx%!=EPtvcuF@#^jaF3K)j}pS+ zm3Na#0u?J(uE3ncF~Q)zAeHpu!x-f{-aqM1Y)vk`Mq@a<27}Z?A-56^Kl8fW79>gJ z8Bdy?u3i3e?EPPmZ!h<^`2FeK?thWq6#EM&$(=v^gg0NY+}*ESU9{haX!*5A=}qOI zbcOmukT(<1d@Txoh E53Q5_%m4rY literal 0 HcmV?d00001 diff --git a/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy b/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy new file mode 100644 index 00000000000000..8c0b1516459a57 --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper + +suite("test_parquet_join_runtime_filter", "p0,external,hive,external_docker,external_docker_hive") { + + def getProfileList = { + def dst = 'http://' + context.config.feHttpAddress + def conn = new URL(dst + "/rest/v1/query_profile").openConnection() + conn.setRequestMethod("GET") + def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" + + (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword)).getBytes("UTF-8")) + conn.setRequestProperty("Authorization", "Basic ${encoding}") + return conn.getInputStream().getText() + } + + def getProfile = { id -> + def dst = 'http://' + context.config.feHttpAddress + def conn = new URL(dst + "/api/profile/text/?query_id=$id").openConnection() + conn.setRequestMethod("GET") + def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" + + (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword)).getBytes("UTF-8")) + conn.setRequestProperty("Authorization", "Basic ${encoding}") + return conn.getInputStream().getText() + } + + + def extractFilteredGroupsValue = { String profileText -> + def values = (profileText =~ /FilteredGroups:\s*(\d+)/).collect { it[1].toLong() } + return values.sort { a, b -> b <=> a } + } + + def getProfileWithToken = { token -> + String profileId = "" + int attempts = 0 + while (attempts < 10 && (profileId == null || profileId == "")) { + List profileData = new JsonSlurper().parseText(getProfileList()).data.rows + for (def profileItem in profileData) { + if (profileItem["Sql Statement"].toString().contains(token)) { + profileId = profileItem["Profile ID"].toString() + break + } + } + if (profileId == null || profileId == "") { + Thread.sleep(300) + } + attempts++ + } + assertTrue(profileId != null && profileId != "") + Thread.sleep(800) + return getProfile(profileId).toString() + } + // session vars + sql "unset variable all;" + sql "set profile_level=2;" + sql "set enable_profile=true;" + + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (!"true".equalsIgnoreCase(enabled)) { + return; + } + for (String hivePrefix : ["hive2"]) { + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String hmsPort = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String catalog_name = "test_parquet_join_runtime_filter" + + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """ use `default` """ + + + for (int wait_time : [0, 10, 100]) { 
+ sql """ set runtime_filter_wait_time_ms = ${wait_time}; """ + + def f1 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 = 5 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 40) + } + + + + def f2 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 in (1,2) + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 30) + } + + + + + def f3 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 < 3 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 30) + } + + + + def f4 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c2 >= 50 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 40) + } + + + f1() + f2() + f3() + f4() + } + + sql """drop catalog ${catalog_name};""" + } + + + + + +} From c7572648fb955873e24cadc5cbbdcd2bb816b758 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sat, 20 Dec 2025 22:35:38 +0800 Subject: [PATCH 11/12] Optimize location for tpch1000 (#59218) --- .../doris/common/util/LocationPath.java | 89 +++++++++++++++---- .../iceberg/source/IcebergScanNode.java | 87 +++++++++++++++++- .../property/storage/S3PropertyUtils.java | 48 +++++++++- 3 files changed, 205 insertions(+), 19 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java index e4b9aa0b25c121..cbe2b01d912584 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java @@ -96,27 +96,25 @@ private LocationPath(String schema, } private static String parseScheme(String finalLocation) { - String scheme = ""; - String[] schemeSplit = finalLocation.split(SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } else { - schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } + // Use indexOf instead of split for better performance + int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM); + if (schemeDelimIndex > 0) { + return finalLocation.substring(0, schemeDelimIndex); + } + + int nonstandardDelimIndex = finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM); + if (nonstandardDelimIndex > 0) { + return finalLocation.substring(0, nonstandardDelimIndex); } // if not get scheme, need consider /path/to/local to no scheme - if (scheme.isEmpty()) { - 
try { - Paths.get(finalLocation); - } catch (InvalidPathException exception) { - throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation); - } + try { + Paths.get(finalLocation); + } catch (InvalidPathException exception) { + throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation); } - return scheme; + return ""; } /** @@ -201,6 +199,65 @@ public static LocationPath of(String location, } } + /** + * Ultra-fast factory method that directly constructs LocationPath without any parsing. + * This is used when the normalized location is already known (e.g., from prefix transformation). + * + * @param normalizedLocation the already-normalized location string + * @param schema pre-computed schema + * @param fsIdentifier pre-computed filesystem identifier + * @param storageProperties the storage properties (can be null) + * @return a new LocationPath instance + */ + public static LocationPath ofDirect(String normalizedLocation, + String schema, + String fsIdentifier, + StorageProperties storageProperties) { + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } + + /** + * Fast factory method that reuses pre-computed schema and fsIdentifier. + * This is optimized for batch processing where many files share the same bucket/prefix. + * + * @param location the input URI location string + * @param storageProperties pre-computed storage properties for normalization + * @param cachedSchema pre-computed schema (can be null to compute) + * @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://" (can be null to compute) + * @return a new LocationPath instance + */ + public static LocationPath ofWithCache(String location, + StorageProperties storageProperties, + String cachedSchema, + String cachedFsIdPrefix) { + try { + String normalizedLocation = storageProperties.validateAndNormalizeUri(location); + + String fsIdentifier; + if (cachedFsIdPrefix != null && normalizedLocation.startsWith(cachedFsIdPrefix)) { + // Fast path: extract authority from normalized location without full URI parsing + int authorityStart = cachedFsIdPrefix.length(); + int authorityEnd = normalizedLocation.indexOf('/', authorityStart); + if (authorityEnd == -1) { + authorityEnd = normalizedLocation.length(); + } + String authority = normalizedLocation.substring(authorityStart, authorityEnd); + fsIdentifier = cachedFsIdPrefix + authority; + } else { + // Fallback to full URI parsing + String encodedLocation = encodedLocation(normalizedLocation); + URI uri = URI.create(encodedLocation); + fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://" + + Strings.nullToEmpty(uri.getAuthority()); + } + + String schema = cachedSchema != null ? cachedSchema : extractScheme(location); + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } catch (UserException e) { + throw new StoragePropertiesException("Failed to create LocationPath for location: " + location, e); + } + } + /** * Extracts the URI scheme (e.g., "s3", "hdfs") from the location string. 
* diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 133ac0676448c7..698a6a380f0c18 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -133,6 +133,17 @@ public class IcebergScanNode extends FileQueryScanNode { private Map backendStorageProperties; private Boolean isBatchMode = null; + // Cached values for LocationPath creation optimization + // These are lazily initialized on first use to avoid parsing overhead for each file + private volatile StorageProperties cachedStorageProperties; + private volatile String cachedSchema; + private volatile String cachedFsIdPrefix; + private volatile boolean locationPathCacheInitialized = false; + // Cache for path prefix transformation to avoid repeated S3URI parsing + // Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to normalized prefix (e.g., "s3://bucket/") + private volatile String cachedOriginalPathPrefix; + private volatile String cachedNormalizedPathPrefix; + private volatile String cachedFsIdentifier; // for test @VisibleForTesting @@ -547,9 +558,83 @@ private CloseableIterable planFileScanTaskWithManifestCache(TableS return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } + /** + * Initialize cached values for LocationPath creation on first use. + * This avoids repeated StorageProperties lookup, scheme parsing, and S3URI regex parsing for each file. + */ + private void initLocationPathCache(String samplePath) { + if (locationPathCacheInitialized) { + return; + } + synchronized (this) { + if (locationPathCacheInitialized) { + return; + } + try { + // Create a LocationPath using the full method to get all cached values + LocationPath sampleLocationPath = LocationPath.of(samplePath, storagePropertiesMap); + cachedStorageProperties = sampleLocationPath.getStorageProperties(); + cachedSchema = sampleLocationPath.getSchema(); + cachedFsIdentifier = sampleLocationPath.getFsIdentifier(); + + // Extract fsIdPrefix like "s3://" from fsIdentifier like "s3://bucket" + int schemeEnd = cachedFsIdentifier.indexOf("://"); + if (schemeEnd > 0) { + cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd + 3); + } + + // Cache path prefix mapping for fast transformation + // This allows subsequent files to skip S3URI regex parsing entirely + String normalizedPath = sampleLocationPath.getNormalizedLocation(); + + // Find the common prefix by looking for the last '/' before the filename + int lastSlashInOriginal = samplePath.lastIndexOf('/'); + int lastSlashInNormalized = normalizedPath.lastIndexOf('/'); + + if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) { + cachedOriginalPathPrefix = samplePath.substring(0, lastSlashInOriginal + 1); + cachedNormalizedPathPrefix = normalizedPath.substring(0, lastSlashInNormalized + 1); + } + + locationPathCacheInitialized = true; + } catch (Exception e) { + // If caching fails, we'll fall back to the full method each time + LOG.warn("Failed to initialize LocationPath cache, will use full parsing", e); + locationPathCacheInitialized = true; + } + } + } + + /** + * Create a LocationPath with cached values for better performance. + * Uses cached path prefix mapping to completely bypass S3URI regex parsing for most files. 
+ * Falls back to full parsing if cache is not available or path doesn't match cached prefix. + */ + private LocationPath createLocationPathWithCache(String path) { + // Initialize cache on first call + if (!locationPathCacheInitialized) { + initLocationPathCache(path); + } + + // Fast path: if path starts with cached original prefix, directly transform without any parsing + if (cachedOriginalPathPrefix != null && path.startsWith(cachedOriginalPathPrefix)) { + // Transform: replace original prefix with normalized prefix + String normalizedPath = cachedNormalizedPathPrefix + path.substring(cachedOriginalPathPrefix.length()); + return LocationPath.ofDirect(normalizedPath, cachedSchema, cachedFsIdentifier, cachedStorageProperties); + } + + // Medium path: use cached StorageProperties but still need validateAndNormalizeUri + if (cachedStorageProperties != null) { + return LocationPath.ofWithCache(path, cachedStorageProperties, cachedSchema, cachedFsIdPrefix); + } + + // Fallback to full parsing + return LocationPath.of(path, storagePropertiesMap); + } + private Split createIcebergSplit(FileScanTask fileScanTask) { String originalPath = fileScanTask.file().path().toString(); - LocationPath locationPath = LocationPath.of(originalPath, storagePropertiesMap); + LocationPath locationPath = createLocationPathWithCache(originalPath); IcebergSplit split = new IcebergSplit( locationPath, fileScanTask.start(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java index 99064b4e2e2d3b..71360fc47996e4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java @@ -33,6 +33,15 @@ public class S3PropertyUtils { private static final Logger LOG = LogManager.getLogger(S3PropertyUtils.class); + private static final String SCHEME_DELIM = "://"; + private static final String S3_SCHEME_PREFIX = "s3://"; + + // S3-compatible schemes that can be converted to s3:// with simple string replacement + // Format: scheme://bucket/key -> s3://bucket/key + private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = { + "s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs" + }; + /** * Constructs the S3 endpoint from a given URI in the props map. * @@ -113,7 +122,8 @@ public static String constructRegionFromUrl(Map props, /** * Validates and normalizes the given path into a standard S3 URI. - * If the input already starts with "s3://", it is returned as-is. + * If the input already starts with a known S3-compatible scheme (s3://, s3a://, oss://, etc.), + * it is returned as-is to avoid expensive regex parsing. * Otherwise, it is parsed and converted into an S3-compatible URI format. * * @param path the raw S3-style path or full URI @@ -132,16 +142,50 @@ public static String validateAndNormalizeUri(String path, if (StringUtils.isBlank(path)) { throw new StoragePropertiesException("path is null"); } - if (path.startsWith("s3://")) { + + // Fast path 1: s3:// paths are already in the normalized format expected by BE + if (path.startsWith(S3_SCHEME_PREFIX)) { return path; } + // Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, etc.) 
+ // can be converted with simple string replacement: scheme://bucket/key -> s3://bucket/key + String normalized = trySimpleSchemeConversion(path); + if (normalized != null) { + return normalized; + } + + // Full parsing path: for HTTP URLs and other complex formats boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle); boolean forceParsingByStandardUri = Boolean.parseBoolean(stringForceParsingByStandardUri); S3URI s3uri = S3URI.create(path, usePathStyle, forceParsingByStandardUri); return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey(); } + /** + * Try to convert simple S3-compatible scheme URIs to s3:// format using string replacement. + * This avoids expensive regex parsing for common cases like oss://bucket/key, s3a://bucket/key, etc. + * + * @param path the input path + * @return converted s3:// path if successful, null if the path doesn't match simple pattern + */ + private static String trySimpleSchemeConversion(String path) { + int delimIndex = path.indexOf(SCHEME_DELIM); + if (delimIndex <= 0) { + return null; + } + + String scheme = path.substring(0, delimIndex).toLowerCase(); + for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) { + if (compatibleScheme.equals(scheme)) { + // Simple conversion: replace scheme with "s3" + // e.g., "oss://bucket/key" -> "s3://bucket/key" + return S3_SCHEME_PREFIX + path.substring(delimIndex + SCHEME_DELIM.length()); + } + } + return null; + } + /** * Extracts and returns the raw URI string from the given props map. * From 3e58ab24b707ee00e6b2f6d2f8c3b51a17408757 Mon Sep 17 00:00:00 2001 From: Mryange Date: Fri, 26 Dec 2025 17:58:12 +0800 Subject: [PATCH 12/12] pick 58636 --- be/src/common/config.cpp | 2 +- be/src/olap/column_predicate.h | 5 +- .../runtime_filter_selectivity.h | 96 ++++++++ be/src/vec/exprs/vexpr.cpp | 55 +++++ be/src/vec/exprs/vexpr.h | 10 +- be/src/vec/exprs/vexpr_context.cpp | 90 +------ be/src/vec/exprs/vexpr_context.h | 9 + be/src/vec/exprs/vruntimefilter_wrapper.cpp | 117 +++++++-- be/src/vec/exprs/vruntimefilter_wrapper.h | 46 +--- .../runtime_filter_selectivity_test.cpp | 222 ++++++++++++++++++ 10 files changed, 495 insertions(+), 157 deletions(-) create mode 100644 be/src/runtime_filter/runtime_filter_selectivity.h create mode 100644 be/test/runtime_filter/runtime_filter_selectivity_test.cpp diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 29f84798fcf006..03a25a6f891583 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1037,7 +1037,7 @@ DEFINE_mInt64(big_column_size_buffer, "65535"); DEFINE_mInt64(small_column_size_buffer, "100"); // Perform the always_true check at intervals determined by runtime_filter_sampling_frequency -DEFINE_mInt32(runtime_filter_sampling_frequency, "64"); +DEFINE_mInt32(runtime_filter_sampling_frequency, "32"); DEFINE_mInt32(execution_max_rpc_timeout_sec, "3600"); DEFINE_mBool(execution_ignore_eovercrowded, "true"); // cooldown task configs diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 6e6671ff33766c..7162a96399da72 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -25,6 +25,7 @@ #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/inverted_index_iterator.h" #include "runtime/define_primitive_type.h" +#include "runtime_filter/runtime_filter_selectivity.h" #include "util/defer_op.h" #include "util/runtime_profile.h" #include "vec/columns/column.h" @@ -372,8 +373,8 @@ class ColumnPredicate { if 
(!_always_true) { _judge_filter_rows += filter_rows; _judge_input_rows += input_rows; - vectorized::VRuntimeFilterWrapper::judge_selectivity( - get_ignore_threshold(), _judge_filter_rows, _judge_input_rows, _always_true); + RuntimeFilterSelectivity::judge_selectivity(get_ignore_threshold(), _judge_filter_rows, + _judge_input_rows, _always_true); } } diff --git a/be/src/runtime_filter/runtime_filter_selectivity.h b/be/src/runtime_filter/runtime_filter_selectivity.h new file mode 100644 index 00000000000000..1b0a82143de57a --- /dev/null +++ b/be/src/runtime_filter/runtime_filter_selectivity.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/config.h" +#include "common/logging.h" + +namespace doris { + +// Used to track the selectivity of runtime filters +// If the selectivity of a runtime filter is very low, it is considered ineffective and can be ignored +// Considering that the selectivity of runtime filters may change with data variations +// A dynamic selectivity tracking mechanism is needed +// Note: this is not a thread-safe class + +class RuntimeFilterSelectivity { +public: + RuntimeFilterSelectivity() = default; + + RuntimeFilterSelectivity(const RuntimeFilterSelectivity&) = delete; + void update_judge_counter() { + if ((_judge_counter++) >= config::runtime_filter_sampling_frequency) { + reset_judge_selectivity(); + } + } + + void update_judge_selectivity(int filter_id, uint64_t filter_rows, uint64_t input_rows, + double ignore_thredhold) { + if (!_always_true) { + _judge_filter_rows += filter_rows; + _judge_input_rows += input_rows; + judge_selectivity(ignore_thredhold, _judge_filter_rows, _judge_input_rows, + _always_true); + } + + VLOG_ROW << fmt::format( + "Runtime filter[{}] selectivity update: filter_rows: {}, input_rows: {}, filter " + "rate: {}, " + "ignore_thredhold: {}, counter: {} , always_true: {}", + filter_id, _judge_filter_rows, _judge_input_rows, + static_cast(_judge_filter_rows) / static_cast(_judge_input_rows), + ignore_thredhold, _judge_counter, _always_true); + } + + bool maybe_always_true_can_ignore() const { + /// TODO: maybe we can use session variable to control this behavior ? 
+ if (config::runtime_filter_sampling_frequency <= 0) { + return false; + } else { + return _always_true; + } + } + + static void judge_selectivity(double ignore_threshold, int64_t filter_rows, int64_t input_rows, + bool& always_true) { + // if the judged input rows is too small, we think the selectivity is not reliable + if (input_rows > min_judge_input_rows) { + always_true = (static_cast(filter_rows) / static_cast(input_rows)) < + ignore_threshold; + } + } + +private: + void reset_judge_selectivity() { + _always_true = false; + _judge_counter = 0; + _judge_input_rows = 0; + _judge_filter_rows = 0; + } + + int64_t _judge_input_rows = 0; + int64_t _judge_filter_rows = 0; + int _judge_counter = 0; + bool _always_true = false; + + constexpr static int64_t min_judge_input_rows = 4096 * 10; +}; + +} // namespace doris diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp index 1bafe01ad710de..52d4ca01eac856 100644 --- a/be/src/vec/exprs/vexpr.cpp +++ b/be/src/vec/exprs/vexpr.cpp @@ -1015,5 +1015,60 @@ bool VExpr::ann_dist_is_fulfilled() const { return _virtual_column_is_fulfilled; } +Status VExpr::execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, bool accept_null, + bool* can_filter_all) const { + ColumnPtr filter_column; + RETURN_IF_ERROR(execute_column(context, block, filter_column)); + if (const auto* const_column = check_and_get_column(*filter_column)) { + // const(nullable) or const(bool) + const bool result = accept_null + ? (const_column->is_null_at(0) || const_column->get_bool(0)) + : (!const_column->is_null_at(0) && const_column->get_bool(0)); + if (!result) { + // filter all + *can_filter_all = true; + memset(result_filter_data, 0, rows); + return Status::OK(); + } + } else if (const auto* nullable_column = check_and_get_column(*filter_column)) { + // nullable(bool) + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const IColumn::Filter& filter = assert_cast(*nested_column).get_data(); + const auto* __restrict filter_data = filter.data(); + const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); + + if (accept_null) { + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (null_map_data[i]) || filter_data[i]; + } + } else { + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; + } + } + + if ((memchr(result_filter_data, 0x1, rows) == nullptr)) { + *can_filter_all = true; + return Status::OK(); + } + } else { + // bool + const IColumn::Filter& filter = assert_cast(*filter_column).get_data(); + const auto* __restrict filter_data = filter.data(); + + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= filter_data[i]; + } + + if (memchr(result_filter_data, 0x1, rows) == nullptr) { + *can_filter_all = true; + return Status::OK(); + } + } + + return Status::OK(); +} + #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h index 35a0d3733b094f..2a0abe439f9778 100644 --- a/be/src/vec/exprs/vexpr.h +++ b/be/src/vec/exprs/vexpr.h @@ -147,6 +147,10 @@ class VExpr { // Therefore we need a function like this to return the actual type produced by execution. 
virtual DataTypePtr execute_type(const Block* block) const { return _data_type; } + virtual Status execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all) const; + // `is_blockable` means this expr will be blocked in `execute` (e.g. AI Function, Remote Function) [[nodiscard]] virtual bool is_blockable() const { return std::any_of(_children.begin(), _children.end(), @@ -204,12 +208,6 @@ class VExpr { [](VExprSPtr child) { return child->is_rf_wrapper(); }); } - virtual void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) { - for (auto child : _children) { - child->do_judge_selectivity(filter_rows, input_rows); - } - } - static Status create_expr_tree(const TExpr& texpr, VExprContextSPtr& ctx); static Status create_expr_trees(const std::vector& texprs, VExprContextSPtrs& ctxs); diff --git a/be/src/vec/exprs/vexpr_context.cpp b/be/src/vec/exprs/vexpr_context.cpp index a7b71b77646435..2a9c049e303b77 100644 --- a/be/src/vec/exprs/vexpr_context.cpp +++ b/be/src/vec/exprs/vexpr_context.cpp @@ -199,7 +199,12 @@ Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, return execute_conjuncts(ctxs, filters, false, block, result_filter, can_filter_all); } -// TODO: Performance Optimization +Status VExprContext::execute_filter(const Block* block, uint8_t* __restrict result_filter_data, + size_t rows, bool accept_null, bool* can_filter_all) { + return _root->execute_filter(this, block, result_filter_data, rows, accept_null, + can_filter_all); +} + Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, const std::vector* filters, bool accept_null, const Block* block, @@ -209,85 +214,10 @@ Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, *can_filter_all = false; auto* __restrict result_filter_data = result_filter->data(); for (const auto& ctx : ctxs) { - // Statistics are only required when an rf wrapper exists in the expr. - bool is_rf_wrapper = ctx->root()->is_rf_wrapper(); - ColumnPtr filter_column; - RETURN_IF_ERROR(ctx->execute(block, filter_column)); - if (const auto* nullable_column = check_and_get_column(*filter_column)) { - size_t column_size = nullable_column->size(); - if (column_size == 0) { - *can_filter_all = true; - return Status::OK(); - } else { - const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); - const IColumn::Filter& filter = - assert_cast(*nested_column).get_data(); - const auto* __restrict filter_data = filter.data(); - const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); - - size_t input_rows = - rows - (is_rf_wrapper - ? simd::count_zero_num((int8_t*)result_filter_data, rows) - : 0); - - if (accept_null) { - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= (null_map_data[i]) || filter_data[i]; - } - } else { - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; - } - } - - size_t output_rows = - rows - (is_rf_wrapper - ? 
simd::count_zero_num((int8_t*)result_filter_data, rows) - : 0); - - if (is_rf_wrapper) { - ctx->root()->do_judge_selectivity(input_rows - output_rows, input_rows); - } - - if ((is_rf_wrapper && output_rows == 0) || - (!is_rf_wrapper && memchr(result_filter_data, 0x1, rows) == nullptr)) { - *can_filter_all = true; - return Status::OK(); - } - } - } else if (const auto* const_column = check_and_get_column(*filter_column)) { - // filter all - if (!const_column->get_bool(0)) { - *can_filter_all = true; - memset(result_filter_data, 0, result_filter->size()); - return Status::OK(); - } - } else { - const IColumn::Filter& filter = - assert_cast(*filter_column).get_data(); - const auto* __restrict filter_data = filter.data(); - - size_t input_rows = - rows - - (is_rf_wrapper ? simd::count_zero_num((int8_t*)result_filter_data, rows) : 0); - - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= filter_data[i]; - } - - size_t output_rows = - rows - - (is_rf_wrapper ? simd::count_zero_num((int8_t*)result_filter_data, rows) : 0); - - if (is_rf_wrapper) { - ctx->root()->do_judge_selectivity(input_rows - output_rows, input_rows); - } - - if ((is_rf_wrapper && output_rows == 0) || - (!is_rf_wrapper && memchr(result_filter_data, 0x1, rows) == nullptr)) { - *can_filter_all = true; - return Status::OK(); - } + RETURN_IF_ERROR( + ctx->execute_filter(block, result_filter_data, rows, accept_null, can_filter_all)); + if (*can_filter_all) { + return Status::OK(); } } if (filters != nullptr) { diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h index 3179526ec546d6..349f199af234b6 100644 --- a/be/src/vec/exprs/vexpr_context.h +++ b/be/src/vec/exprs/vexpr_context.h @@ -33,6 +33,7 @@ #include "olap/rowset/segment_v2/inverted_index_reader.h" #include "runtime/runtime_state.h" #include "runtime/types.h" +#include "runtime_filter/runtime_filter_selectivity.h" #include "udf/udf.h" #include "vec/columns/column.h" #include "vec/core/block.h" @@ -210,6 +211,9 @@ class VExprContext { bool all_expr_inverted_index_evaluated(); + Status execute_filter(const Block* block, uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all); + [[nodiscard]] static Status filter_block(VExprContext* vexpr_ctx, Block* block); [[nodiscard]] static Status filter_block(const VExprContextSPtrs& expr_contexts, Block* block, @@ -246,6 +250,8 @@ class VExprContext { return _last_result_column_id; } + RuntimeFilterSelectivity& get_runtime_filter_selectivity() { return *_rf_selectivity; } + FunctionContext::FunctionStateScope get_function_state_scope() const { return _is_clone ? 
FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL; } @@ -337,5 +343,8 @@ class VExprContext { segment_v2::AnnRangeSearchRuntime _ann_range_search_runtime; bool _suitable_for_ann_index = true; + + std::unique_ptr _rf_selectivity = + std::make_unique(); }; } // namespace doris::vectorized diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.cpp b/be/src/vec/exprs/vruntimefilter_wrapper.cpp index 8e915ffff675f0..b24df4860dae1b 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.cpp +++ b/be/src/vec/exprs/vruntimefilter_wrapper.cpp @@ -62,9 +62,7 @@ VRuntimeFilterWrapper::VRuntimeFilterWrapper(const TExprNode& node, VExprSPtr im _impl(std::move(impl)), _ignore_thredhold(ignore_thredhold), _null_aware(null_aware), - _filter_id(filter_id) { - reset_judge_selectivity(); -} + _filter_id(filter_id) {} Status VRuntimeFilterWrapper::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { @@ -89,38 +87,105 @@ void VRuntimeFilterWrapper::close(VExprContext* context, Status VRuntimeFilterWrapper::execute_column(VExprContext* context, const Block* block, ColumnPtr& result_column) const { - DCHECK(_open_finished || _getting_const_col); - if (_judge_counter.fetch_sub(1) == 0) { - reset_judge_selectivity(); + return Status::InternalError("Not implement VRuntimeFilterWrapper::execute_column"); +} + +const std::string& VRuntimeFilterWrapper::expr_name() const { + return _expr_name; +} + +Status VRuntimeFilterWrapper::execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all) const { + DCHECK(_open_finished); + if (accept_null) { + return Status::InternalError( + "Runtime filter does not support accept_null in execute_filter"); } - if (_always_true) { - size_t size = block->rows(); - result_column = create_always_true_column(size, _data_type->is_nullable()); - COUNTER_UPDATE(_always_true_filter_rows, size); + + auto& rf_selectivity = context->get_runtime_filter_selectivity(); + Defer auto_update_judge_counter = [&]() { rf_selectivity.update_judge_counter(); }; + + // if always true, skip evaluate runtime filter + if (rf_selectivity.maybe_always_true_can_ignore()) { + COUNTER_UPDATE(_always_true_filter_rows, rows); return Status::OK(); - } else { - if (_getting_const_col) { - _impl->set_getting_const_col(true); + } + + ColumnPtr filter_column; + ColumnPtr arg_column = nullptr; + RETURN_IF_ERROR(_impl->execute_runtime_filter(context, block, filter_column, &arg_column)); + + // bloom filter will handle null aware inside itself + if (_null_aware && TExprNodeType::BLOOM_PRED != node_type()) { + DCHECK(arg_column); + change_null_to_true(filter_column->assume_mutable(), arg_column); + } + + if (const auto* const_column = check_and_get_column(*filter_column)) { + // const(nullable) or const(bool) + if (!const_column->get_bool(0)) { + // filter all + COUNTER_UPDATE(_rf_filter_rows, rows); + COUNTER_UPDATE(_rf_input_rows, rows); + rf_selectivity.update_judge_selectivity(_filter_id, rows, rows, _ignore_thredhold); + *can_filter_all = true; + memset(result_filter_data, 0, rows); + return Status::OK(); + } else { + // filter none + COUNTER_UPDATE(_rf_input_rows, rows); + rf_selectivity.update_judge_selectivity(_filter_id, 0, rows, _ignore_thredhold); + return Status::OK(); } + } else if (const auto* nullable_column = check_and_get_column(*filter_column)) { + // nullable(bool) + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const IColumn::Filter& filter = 
assert_cast(*nested_column).get_data(); + const auto* __restrict filter_data = filter.data(); + const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); + + const size_t input_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); - ColumnPtr arg_column = nullptr; - RETURN_IF_ERROR(_impl->execute_runtime_filter(context, block, result_column, &arg_column)); - if (_getting_const_col) { - _impl->set_getting_const_col(false); + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; } - // bloom filter will handle null aware inside itself - if (_null_aware && TExprNodeType::BLOOM_PRED != node_type()) { - DCHECK(arg_column); - change_null_to_true(result_column->assume_mutable(), arg_column); + const size_t output_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); + + COUNTER_UPDATE(_rf_filter_rows, input_rows - output_rows); + COUNTER_UPDATE(_rf_input_rows, input_rows); + rf_selectivity.update_judge_selectivity(_filter_id, input_rows - output_rows, input_rows, + _ignore_thredhold); + + if (output_rows == 0) { + *can_filter_all = true; + return Status::OK(); } + } else { + // bool + const IColumn::Filter& filter = assert_cast(*filter_column).get_data(); + const auto* __restrict filter_data = filter.data(); - return Status::OK(); - } -} + const size_t input_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); -const std::string& VRuntimeFilterWrapper::expr_name() const { - return _expr_name; + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= filter_data[i]; + } + + const size_t output_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); + + COUNTER_UPDATE(_rf_filter_rows, input_rows - output_rows); + COUNTER_UPDATE(_rf_input_rows, input_rows); + rf_selectivity.update_judge_selectivity(_filter_id, input_rows - output_rows, input_rows, + _ignore_thredhold); + + if (output_rows == 0) { + *can_filter_all = true; + return Status::OK(); + } + } + return Status::OK(); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.h b/be/src/vec/exprs/vruntimefilter_wrapper.h index 3535898915b2ba..09bc8a815c7d6d 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.h +++ b/be/src/vec/exprs/vruntimefilter_wrapper.h @@ -63,6 +63,10 @@ class VRuntimeFilterWrapper final : public VExpr { const std::string& expr_name() const override; const VExprSPtrs& children() const override { return _impl->children(); } + Status execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, bool accept_null, + bool* can_filter_all) const override; + uint64_t get_digest(uint64_t seed) const override { seed = _impl->get_digest(seed); if (seed) { @@ -91,33 +95,10 @@ class VRuntimeFilterWrapper final : public VExpr { } } - void update_counters(int64_t filter_rows, int64_t input_rows) { - COUNTER_UPDATE(_rf_filter_rows, filter_rows); - COUNTER_UPDATE(_rf_input_rows, input_rows); - } - - template - static void judge_selectivity(double ignore_threshold, int64_t filter_rows, int64_t input_rows, - T& always_true) { - always_true = static_cast(filter_rows) / static_cast(input_rows) < - ignore_threshold; - } - bool is_rf_wrapper() const override { return true; } int filter_id() const { return _filter_id; } - void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) override { - update_counters(filter_rows, input_rows); - - if (!_always_true) { - _judge_filter_rows += filter_rows; - 
_judge_input_rows += input_rows; - judge_selectivity(_ignore_thredhold, _judge_filter_rows, _judge_input_rows, - _always_true); - } - } - std::shared_ptr predicate_filtered_rows_counter() const { return _rf_filter_rows; } @@ -129,26 +110,7 @@ class VRuntimeFilterWrapper final : public VExpr { } private: - void reset_judge_selectivity() const { - _always_true = false; - _judge_counter = config::runtime_filter_sampling_frequency; - _judge_input_rows = 0; - _judge_filter_rows = 0; - } - VExprSPtr _impl; - // VRuntimeFilterWrapper and ColumnPredicate share the same logic, - // but it's challenging to unify them, so the code is duplicated. - // _judge_counter, _judge_input_rows, _judge_filter_rows, and _always_true - // are variables used to implement the _always_true logic, calculated periodically - // based on runtime_filter_sampling_frequency. During each period, if _always_true - // is evaluated as true, the logic for always_true is applied for the rest of that period - // without recalculating. At the beginning of the next period, - // reset_judge_selectivity is used to reset these variables. - mutable std::atomic_int _judge_counter = 0; - mutable std::atomic_uint64_t _judge_input_rows = 0; - mutable std::atomic_uint64_t _judge_filter_rows = 0; - mutable std::atomic_int _always_true = false; std::shared_ptr _rf_input_rows = std::make_shared(TUnit::UNIT, 0); diff --git a/be/test/runtime_filter/runtime_filter_selectivity_test.cpp b/be/test/runtime_filter/runtime_filter_selectivity_test.cpp new file mode 100644 index 00000000000000..b8504f950c21d1 --- /dev/null +++ b/be/test/runtime_filter/runtime_filter_selectivity_test.cpp @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "runtime_filter/runtime_filter_selectivity.h" + +#include +#include + +namespace doris { + +class RuntimeFilterSelectivityTest : public testing::Test { +protected: + void SetUp() override { + // Save original config value + _original_sampling_frequency = config::runtime_filter_sampling_frequency; + } + + void TearDown() override { + // Restore original config value + config::runtime_filter_sampling_frequency = _original_sampling_frequency; + } + + int _original_sampling_frequency; +}; + +TEST_F(RuntimeFilterSelectivityTest, basic_initialization) { + RuntimeFilterSelectivity selectivity; + // Initially should be false (not always_true) + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, disabled_sampling_frequency) { + RuntimeFilterSelectivity selectivity; + config::runtime_filter_sampling_frequency = 0; + + // Even if conditions are met, should return false when sampling is disabled + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, negative_sampling_frequency) { + RuntimeFilterSelectivity selectivity; + config::runtime_filter_sampling_frequency = -1; + + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_below_threshold) { + bool always_true = false; + // filter_rows/input_rows = 5/50000 = 0.0001 < 0.1 + // input_rows (50000) > min_judge_input_rows (40960) + RuntimeFilterSelectivity::judge_selectivity(0.1, 5, 50000, always_true); + EXPECT_TRUE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_above_threshold) { + bool always_true = false; + // filter_rows/input_rows = 25000/50000 = 0.5 >= 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 25000, 50000, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_insufficient_input_rows) { + bool always_true = false; + // Even though 5/100 = 0.05 < 0.1, input_rows (100) < min_judge_input_rows (40960) + RuntimeFilterSelectivity::judge_selectivity(0.1, 5, 100, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, update_with_low_selectivity) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // filter_rows/input_rows = 2000/50000 = 0.04 < 0.1 + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, update_with_high_selectivity) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // filter_rows/input_rows = 45000/50000 = 0.9 >= 0.1 + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, once_always_true_stays_true) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: low selectivity + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Second update: high selectivity, but should be ignored + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, accumulated_selectivity_low) { + 
config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: 1000/50000 = 0.02 + selectivity.update_judge_selectivity(-1, 1000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, accumulated_selectivity_high) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: 20000/50000 = 0.4 + selectivity.update_judge_selectivity(-1, 20000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + // Second update: accumulated (20000+20000)/(50000+50000) = 0.4 + selectivity.update_judge_selectivity(-1, 20000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, counter_triggers_reset) { + config::runtime_filter_sampling_frequency = 3; + RuntimeFilterSelectivity selectivity; + + // Mark as always_true + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Update counter to trigger reset + selectivity.update_judge_counter(); // counter = 1 + selectivity.update_judge_counter(); // counter = 2 + selectivity.update_judge_counter(); // counter = 3, triggers reset + + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, reset_allows_reevaluation) { + config::runtime_filter_sampling_frequency = 2; + RuntimeFilterSelectivity selectivity; + + // First cycle: mark as always_true + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Trigger reset + selectivity.update_judge_counter(); // counter = 1 + selectivity.update_judge_counter(); // counter = 2, triggers reset + + // Second cycle: now with high selectivity + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, edge_case_zero_rows) { + bool always_true = false; + RuntimeFilterSelectivity::judge_selectivity(0.1, 0, 0, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, edge_case_exact_threshold) { + bool always_true = false; + // Exactly at threshold: 5000/50000 = 0.1, NOT less than 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 5000, 50000, always_true); + EXPECT_FALSE(always_true); + + // Just below threshold: 4999/50000 = 0.09998 < 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 4999, 50000, always_true); + EXPECT_TRUE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, multiple_updates_before_threshold) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // Multiple updates with insufficient rows each time + selectivity.update_judge_selectivity(-1, 100, 1000, 0.1); // 100/1000, insufficient + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + selectivity.update_judge_selectivity(-1, 200, 2000, 0.1); // 300/3000, insufficient + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + // Now accumulated rows are sufficient: 300+2000 = 2300, 3000+40000 = 43000 + selectivity.update_judge_selectivity(-1, 2000, 40000, 0.1); // 2300/43000 = 0.053 < 0.1 + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, different_thresholds) { + config::runtime_filter_sampling_frequency = 100; + + // Test with threshold 0.05 + { + RuntimeFilterSelectivity selectivity; 
+ selectivity.update_judge_selectivity(-1, 2000, 50000, 0.05); // 0.04 < 0.05 + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + } + + // Test with threshold 0.03 + { + RuntimeFilterSelectivity selectivity; + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.03); // 0.04 >= 0.03 + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + } +} + +} // namespace doris