From 3cc5165d3d30a3fd32b867e632431ffcc190c48b Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 10 Dec 2025 16:49:37 +0800 Subject: [PATCH 01/12] change the null map encode --- .../olap/rowset/segment_v2/column_writer.cpp | 87 +++++++++++++++++-- be/src/olap/rowset/segment_v2/column_writer.h | 4 +- be/src/olap/rowset/segment_v2/parsed_page.h | 19 ++-- gensrc/proto/segment_v2.proto | 2 + 4 files changed, 98 insertions(+), 14 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index b165b2b766a6d1..19409824612706 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "common/config.h" #include "common/logging.h" @@ -52,7 +53,28 @@ namespace doris::segment_v2 { #include "common/compile_check_begin.h" -class NullBitmapBuilder { +// Abstract base class for null bitmap builders +class NullBitmapBuilderBase { +public: + virtual ~NullBitmapBuilderBase() = default; + + // Add a run of 'run' values, all equal to 'value' + virtual void add_run(bool value, size_t run) = 0; + + // Returns whether the building nullmap contains any null values + virtual bool has_null() const = 0; + + // Finish building the null bitmap and write the result to 'slice' + virtual Status finish(OwnedSlice* slice) = 0; + + // Reset the builder to its initial state + virtual void reset() = 0; + + // Return the current size of the buffer in bytes + virtual uint64_t size() = 0; +}; + +class NullBitmapBuilder : public NullBitmapBuilderBase { public: NullBitmapBuilder() : _has_null(false), _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) {} @@ -61,26 +83,26 @@ class NullBitmapBuilder { _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) {} - void add_run(bool value, size_t run) { + void add_run(bool value, size_t run) override { _has_null |= value; _rle_encoder.Put(value, run); } // Returns whether the building nullmap contains nullptr - bool has_null() const { return _has_null; } + bool has_null() const override { return _has_null; } - Status finish(OwnedSlice* slice) { + Status finish(OwnedSlice* slice) override { _rle_encoder.Flush(); RETURN_IF_CATCH_EXCEPTION({ *slice = _bitmap_buf.build(); }); return Status::OK(); } - void reset() { + void reset() override { _has_null = false; _rle_encoder.Clear(); } - uint64_t size() { return _bitmap_buf.size(); } + uint64_t size() override { return _bitmap_buf.size(); } private: bool _has_null; @@ -88,6 +110,56 @@ class NullBitmapBuilder { RleEncoder _rle_encoder; }; +// PlainNullBitmapBuilder uses std::vector to store null values directly without RLE encoding +// Each uint8_t represents a single null value: 0 = non-null, 1 = null +class PlainNullBitmapBuilder : public NullBitmapBuilderBase { +public: + PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} + + explicit PlainNullBitmapBuilder(size_t reserve_bits) + : _has_null(false), + _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits + + void add_run(bool value, size_t run) override { + _has_null |= value; + const uint8_t val = value ? 
1 : 0; + + // Ensure the buffer has enough bytes to hold all values + const size_t current_size = _bitmap_buf.size(); + _bitmap_buf.resize(current_size + run, 0); + + if (val) { + // Fill the new bytes with the value + std::fill(_bitmap_buf.begin() + current_size, _bitmap_buf.end(), val); + } + } + + // Returns whether the building nullmap contains nullptr + bool has_null() const override { return _has_null; } + + Status finish(OwnedSlice* slice) override { + // No need to flush, just build the slice from the buffer + RETURN_IF_CATCH_EXCEPTION({ + // Create a new OwnedSlice and copy the data + OwnedSlice result(_bitmap_buf.size()); + memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + *slice = std::move(result); + }); + return Status::OK(); + } + + void reset() override { + _has_null = false; + _bitmap_buf.clear(); + } + + uint64_t size() override { return _bitmap_buf.size(); } + +private: + bool _has_null; + std::vector _bitmap_buf; +}; + inline ScalarColumnWriter* get_null_writer(const ColumnWriterOptions& opts, io::FileWriter* file_writer, uint32_t id) { if (!opts.meta->is_nullable()) { @@ -458,7 +530,7 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - _null_bitmap_builder = std::make_unique(); + _null_bitmap_builder = std::make_unique(); } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -743,6 +815,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); + data_page_footer->set_new_null_map(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 9e39ef45bb4c00..89d544ea2e918d 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -92,7 +92,9 @@ struct ColumnWriterOptions { class BitmapIndexWriter; class EncodingInfo; +class NullBitmapBuilderBase; class NullBitmapBuilder; +class PlainNullBitmapBuilder; class OrdinalIndexWriter; class PageBuilder; class BloomFilterIndexWriter; @@ -268,7 +270,7 @@ class ScalarColumnWriter : public ColumnWriter { private: std::unique_ptr _page_builder; - std::unique_ptr _null_bitmap_builder; + std::unique_ptr _null_bitmap_builder; ColumnWriterOptions _opts; diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 7ef20adecfed64..b654b73b10d9ef 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -51,11 +51,17 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); if (null_size > 0) { - auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); - // Decode all null values into null_maps in advance - auto num_rows = footer.num_values(); - page->null_maps.resize(num_rows); - null_decoder.get_values((bool*)page->null_maps.data(), num_rows); + if (footer.has_new_null_map() && footer.new_null_map()) { + page->null_maps = std::span((uint8_t*)null_bitmap.data, null_size); + } else { + auto null_decoder = + RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); + // Decode all null values into null_maps in advance + auto num_rows = 
footer.num_values(); + page->null_bitmap.resize(num_rows); + null_decoder.get_values((bool*)page->null_bitmap.data(), num_rows); + page->null_maps = std::span(page->null_bitmap.data(), num_rows); + } } Slice data_slice(body.data, body.size - null_size); @@ -84,7 +90,8 @@ struct ParsedPage { PageHandle page_handle; - std::vector null_maps; + std::span null_maps; + std::vector null_bitmap; std::unique_ptr data_decoder; // ordinal of the first value in this page diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 653d565d546973..535c270d40b811 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -74,6 +74,8 @@ message DataPageFooterPB { // only for array column // Save the offset of next page optional uint64 next_array_item_ordinal = 4; + + optional bool new_null_map = 5; } message IndexPageFooterPB { From 69297e7631205e957e07aec0f6e948210d18473c Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 10 Dec 2025 19:13:32 +0800 Subject: [PATCH 02/12] plain lz4 null page encode --- .../olap/rowset/segment_v2/column_writer.cpp | 32 ++++++++++++++++--- be/src/olap/rowset/segment_v2/parsed_page.h | 21 ++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 19409824612706..03bb4bfffdbc2c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -140,10 +140,34 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { Status finish(OwnedSlice* slice) override { // No need to flush, just build the slice from the buffer RETURN_IF_CATCH_EXCEPTION({ - // Create a new OwnedSlice and copy the data - OwnedSlice result(_bitmap_buf.size()); - memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); - *slice = std::move(result); + // Check if we should compress the data + if (!_bitmap_buf.empty()) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + // Use compressed data if compression is successful and reduces size + // if (compressed_buf.size() < _bitmap_buf.size()) { + // Directly build OwnedSlice from compressed_buf to avoid memory copy + *slice = compressed_buf.build(); + return Status::OK(); + // } + } else { + return status; + } + } + } + // // Fallback to uncompressed data if compression fails or doesn't reduce size + // // Create OwnedSlice directly from _bitmap_buf data + // OwnedSlice result(_bitmap_buf.size()); + // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + // *slice = std::move(result); }); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index b654b73b10d9ef..6137ee5e1f2d53 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -29,6 +29,7 @@ #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/page_decoder.h" #include "olap/rowset/segment_v2/page_handle.h" +#include "util/block_compression.h" #include "util/rle_encoding.h" #include "util/slice.h" @@ -52,15 +53,24 @@ struct ParsedPage { if (null_size > 0) { if (footer.has_new_null_map() 
&& footer.new_null_map()) { - page->null_maps = std::span((uint8_t*)null_bitmap.data, null_size); + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + page->null_maps.resize(footer.num_values()); + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); - page->null_bitmap.resize(num_rows); - null_decoder.get_values((bool*)page->null_bitmap.data(), num_rows); - page->null_maps = std::span(page->null_bitmap.data(), num_rows); + page->null_maps.resize(num_rows); + null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } @@ -90,8 +100,7 @@ struct ParsedPage { PageHandle page_handle; - std::span null_maps; - std::vector null_bitmap; + std::vector null_maps; std::unique_ptr data_decoder; // ordinal of the first value in this page From ae0e410294be3fe138da5d4864b88c3358024fc2 Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 11 Dec 2025 10:56:51 +0800 Subject: [PATCH 03/12] use bitshuffle to decoding --- .../olap/rowset/segment_v2/column_writer.cpp | 78 ++++++++++++------- be/src/olap/rowset/segment_v2/parsed_page.h | 32 +++++--- gensrc/proto/segment_v2.proto | 4 +- 3 files changed, 72 insertions(+), 42 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 03bb4bfffdbc2c..0a21bea7a50721 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -29,6 +29,7 @@ #include "io/fs/file_writer.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" +#include "olap/rowset/segment_v2/bitshuffle_page.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -72,6 +73,11 @@ class NullBitmapBuilderBase { // Return the current size of the buffer in bytes virtual uint64_t size() = 0; + + EncodingTypePB encoding() const { return _encoding_type; } + +protected: + EncodingTypePB _encoding_type = RLE; }; class NullBitmapBuilder : public NullBitmapBuilderBase { @@ -114,11 +120,9 @@ class NullBitmapBuilder : public NullBitmapBuilderBase { // Each uint8_t represents a single null value: 0 = non-null, 1 = null class PlainNullBitmapBuilder : public NullBitmapBuilderBase { public: - PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} - - explicit PlainNullBitmapBuilder(size_t reserve_bits) - : _has_null(false), - _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits + PlainNullBitmapBuilder(EncodingTypePB encoding_type) : _has_null(false), _bitmap_buf() { + _encoding_type = encoding_type; + } void add_run(bool value, size_t run) override { _has_null |= value; @@ -142,32 +146,40 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { RETURN_IF_CATCH_EXCEPTION({ // Check if we should compress the data if (!_bitmap_buf.empty()) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR( - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); - if (codec != nullptr) { - // 
Compress the data + if (_encoding_type == EncodingTypePB::BIT_SHUFFLE) { faststring compressed_buf; - Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); - Status status = codec->compress(raw_slice, &compressed_buf); - if (status.ok()) { - // Use compressed data if compression is successful and reduces size - // if (compressed_buf.size() < _bitmap_buf.size()) { - // Directly build OwnedSlice from compressed_buf to avoid memory copy - *slice = compressed_buf.build(); - return Status::OK(); - // } - } else { - return status; + compressed_buf.resize( + bitshuffle::compress_lz4_bound(_bitmap_buf.size(), sizeof(uint8_t), 0)); + int64_t r = bitshuffle::compress_lz4(_bitmap_buf.data(), compressed_buf.data(), + _bitmap_buf.size(), sizeof(uint8_t), 0); + if (UNLIKELY(r < 0)) { + return Status::InternalError("bitshuffle compress failed"); + } + // before build(), update buffer length to the actual compressed size + compressed_buf.resize(r); + *slice = compressed_buf.build(); + return Status::OK(); + } else if (_encoding_type == EncodingTypePB::PLAIN_ENCODING) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, + &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + *slice = compressed_buf.build(); + return Status::OK(); + } else { + return status; + } } + } else { + return Status::Corruption("unsupported null map encoding"); } } - // // Fallback to uncompressed data if compression fails or doesn't reduce size - // // Create OwnedSlice directly from _bitmap_buf data - // OwnedSlice result(_bitmap_buf.size()); - // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); - // *slice = std::move(result); }); return Status::OK(); } @@ -554,7 +566,15 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - _null_bitmap_builder = std::make_unique(); + if (_opts.meta->has_null_map_encoding()) { + if (config::cooldown_thread_num < 10) { + _null_bitmap_builder = std::make_unique(BIT_SHUFFLE); + } else { + _null_bitmap_builder = std::make_unique(PLAIN_ENCODING); + } + } else { + _null_bitmap_builder = std::make_unique(); + } } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -839,7 +859,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); - data_page_footer->set_new_null_map(true); + data_page_footer->set_null_map_encoding(_null_bitmap_builder->encoding()); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 6137ee5e1f2d53..69783f9e26f0f0 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -22,6 +22,7 @@ #include #include +#include "bitshuffle_wrapper.h" #include "common/status.h" #include "olap/rowset/segment_v2/binary_dict_page.h" #include "olap/rowset/segment_v2/common.h" @@ -52,24 +53,31 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); 
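+        // Null-map decoding below is selected by the footer's null_map_encoding:
+        // BIT_SHUFFLE pages are decoded with bitshuffle::decompress_lz4,
+        // PLAIN_ENCODING pages with the LZ4 block codec, and pages without the
+        // field fall back to the legacy RLE-encoded bitmap.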
if (null_size > 0) { - if (footer.has_new_null_map() && footer.new_null_map()) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR( - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - page->null_maps.resize(footer.num_values()); - auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); - RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + page->null_maps.resize(footer.num_values()); + if (footer.has_null_map_encoding()) { + if (footer.null_map_encoding() == BIT_SHUFFLE) { + int64_t r = bitshuffle::decompress_lz4(null_bitmap.data, page->null_maps.data(), + null_size, sizeof(uint8_t), 0); + if (UNLIKELY(r < 0)) { + return Status::Corruption("bitshuffle decompress failed"); + } + } else if (footer.null_map_encoding() == PLAIN_ENCODING) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, + &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + } } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); - page->null_maps.resize(num_rows); null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 535c270d40b811..933d9a56b5d377 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -75,7 +75,7 @@ message DataPageFooterPB { // Save the offset of next page optional uint64 next_array_item_ordinal = 4; - optional bool new_null_map = 5; + optional EncodingTypePB null_map_encoding = 5; } message IndexPageFooterPB { @@ -220,6 +220,8 @@ message ColumnMetaPB { optional uint64 compressed_data_bytes = 24; optional uint64 uncompressed_data_bytes = 25; optional uint64 raw_data_bytes = 26; + + optional EncodingTypePB null_map_encoding = 27; } message PrimaryKeyIndexMetaPB { From 9a33e3ec66d98ad77587d3b7650b22a0eac6912a Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 11 Dec 2025 23:04:13 +0800 Subject: [PATCH 04/12] Revert "use bitshuffle to decoding" This reverts commit 5a54f831420141840afd12883d00ee97a237a09c. 
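
The layout kept after this revert is the one from PATCH 02: the null map is
written as one uint8_t per row (0 = non-null, 1 = null) and LZ4-compressed as
a single block, and the footer's new_null_map flag tells the reader to
LZ4-decompress it instead of running the RLE decoder. A minimal sketch of that
round trip, for illustration only, using the raw LZ4 C API rather than Doris's
BlockCompressionCodec wrapper (helper names here are hypothetical):

    #include <lz4.h>

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // One byte per row: 0 = non-null, 1 = null.
    std::vector<char> encode_null_map(const std::vector<uint8_t>& null_map) {
        std::vector<char> out(LZ4_compressBound(static_cast<int>(null_map.size())));
        int n = LZ4_compress_default(reinterpret_cast<const char*>(null_map.data()),
                                     out.data(), static_cast<int>(null_map.size()),
                                     static_cast<int>(out.size()));
        if (n <= 0) throw std::runtime_error("LZ4 compress failed");
        out.resize(n); // only compressed bytes are stored; the footer keeps num_values
        return out;
    }

    std::vector<uint8_t> decode_null_map(const std::vector<char>& compressed,
                                         size_t num_values) {
        std::vector<uint8_t> null_map(num_values); // decompress back to one byte per row
        int n = LZ4_decompress_safe(compressed.data(),
                                    reinterpret_cast<char*>(null_map.data()),
                                    static_cast<int>(compressed.size()),
                                    static_cast<int>(num_values));
        if (n < 0 || static_cast<size_t>(n) != num_values)
            throw std::runtime_error("LZ4 decompress failed");
        return null_map;
    }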
--- .../olap/rowset/segment_v2/column_writer.cpp | 78 +++++++------------ be/src/olap/rowset/segment_v2/parsed_page.h | 32 +++----- gensrc/proto/segment_v2.proto | 4 +- 3 files changed, 42 insertions(+), 72 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 0a21bea7a50721..03bb4bfffdbc2c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -29,7 +29,6 @@ #include "io/fs/file_writer.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" -#include "olap/rowset/segment_v2/bitshuffle_page.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -73,11 +72,6 @@ class NullBitmapBuilderBase { // Return the current size of the buffer in bytes virtual uint64_t size() = 0; - - EncodingTypePB encoding() const { return _encoding_type; } - -protected: - EncodingTypePB _encoding_type = RLE; }; class NullBitmapBuilder : public NullBitmapBuilderBase { @@ -120,9 +114,11 @@ class NullBitmapBuilder : public NullBitmapBuilderBase { // Each uint8_t represents a single null value: 0 = non-null, 1 = null class PlainNullBitmapBuilder : public NullBitmapBuilderBase { public: - PlainNullBitmapBuilder(EncodingTypePB encoding_type) : _has_null(false), _bitmap_buf() { - _encoding_type = encoding_type; - } + PlainNullBitmapBuilder() : _has_null(false), _bitmap_buf() {} + + explicit PlainNullBitmapBuilder(size_t reserve_bits) + : _has_null(false), + _bitmap_buf(reserve_bits, 0) {} // Reserve enough bytes for the given number of bits void add_run(bool value, size_t run) override { _has_null |= value; @@ -146,40 +142,32 @@ class PlainNullBitmapBuilder : public NullBitmapBuilderBase { RETURN_IF_CATCH_EXCEPTION({ // Check if we should compress the data if (!_bitmap_buf.empty()) { - if (_encoding_type == EncodingTypePB::BIT_SHUFFLE) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data faststring compressed_buf; - compressed_buf.resize( - bitshuffle::compress_lz4_bound(_bitmap_buf.size(), sizeof(uint8_t), 0)); - int64_t r = bitshuffle::compress_lz4(_bitmap_buf.data(), compressed_buf.data(), - _bitmap_buf.size(), sizeof(uint8_t), 0); - if (UNLIKELY(r < 0)) { - return Status::InternalError("bitshuffle compress failed"); - } - // before build(), update buffer length to the actual compressed size - compressed_buf.resize(r); - *slice = compressed_buf.build(); - return Status::OK(); - } else if (_encoding_type == EncodingTypePB::PLAIN_ENCODING) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, - &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); - Status status = codec->compress(raw_slice, &compressed_buf); - if (status.ok()) { - *slice = compressed_buf.build(); - return Status::OK(); - } else { - return status; - } + Slice raw_slice(_bitmap_buf.data(), _bitmap_buf.size()); + Status status = codec->compress(raw_slice, &compressed_buf); + if (status.ok()) { + // Use compressed data if compression is successful and reduces size + // if (compressed_buf.size() < _bitmap_buf.size()) 
{ + // Directly build OwnedSlice from compressed_buf to avoid memory copy + *slice = compressed_buf.build(); + return Status::OK(); + // } + } else { + return status; } - } else { - return Status::Corruption("unsupported null map encoding"); } } + // // Fallback to uncompressed data if compression fails or doesn't reduce size + // // Create OwnedSlice directly from _bitmap_buf data + // OwnedSlice result(_bitmap_buf.size()); + // memcpy(result.data(), _bitmap_buf.data(), _bitmap_buf.size()); + // *slice = std::move(result); }); return Status::OK(); } @@ -566,15 +554,7 @@ Status ScalarColumnWriter::init() { _ordinal_index_builder = std::make_unique(); // create null bitmap builder if (is_nullable()) { - if (_opts.meta->has_null_map_encoding()) { - if (config::cooldown_thread_num < 10) { - _null_bitmap_builder = std::make_unique(BIT_SHUFFLE); - } else { - _null_bitmap_builder = std::make_unique(PLAIN_ENCODING); - } - } else { - _null_bitmap_builder = std::make_unique(); - } + _null_bitmap_builder = std::make_unique(); } if (_opts.need_zone_map) { RETURN_IF_ERROR(ZoneMapIndexWriter::create(get_field(), _zone_map_index_builder)); @@ -859,7 +839,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_first_ordinal(_first_rowid); data_page_footer->set_num_values(_next_rowid - _first_rowid); data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); - data_page_footer->set_null_map_encoding(_null_bitmap_builder->encoding()); + data_page_footer->set_new_null_map(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 69783f9e26f0f0..6137ee5e1f2d53 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -22,7 +22,6 @@ #include #include -#include "bitshuffle_wrapper.h" #include "common/status.h" #include "olap/rowset/segment_v2/binary_dict_page.h" #include "olap/rowset/segment_v2/common.h" @@ -53,31 +52,24 @@ struct ParsedPage { auto null_bitmap = Slice(body.data + body.size - null_size, null_size); if (null_size > 0) { - page->null_maps.resize(footer.num_values()); - if (footer.has_null_map_encoding()) { - if (footer.null_map_encoding() == BIT_SHUFFLE) { - int64_t r = bitshuffle::decompress_lz4(null_bitmap.data, page->null_maps.data(), - null_size, sizeof(uint8_t), 0); - if (UNLIKELY(r < 0)) { - return Status::Corruption("bitshuffle decompress failed"); - } - } else if (footer.null_map_encoding() == PLAIN_ENCODING) { - // Get LZ4 compression codec - BlockCompressionCodec* codec = nullptr; - RETURN_IF_ERROR(get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, - &codec)); - if (codec != nullptr) { - // Compress the data - faststring compressed_buf; - auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); - RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); - } + if (footer.has_new_null_map() && footer.new_null_map()) { + // Get LZ4 compression codec + BlockCompressionCodec* codec = nullptr; + RETURN_IF_ERROR( + get_block_compression_codec(segment_v2::CompressionTypePB::LZ4, &codec)); + if (codec != nullptr) { + // Compress the data + faststring compressed_buf; + page->null_maps.resize(footer.num_values()); + auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); + RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); } } else { auto null_decoder = RleDecoder((const uint8_t*)null_bitmap.data, 
null_size, 1); // Decode all null values into null_maps in advance auto num_rows = footer.num_values(); + page->null_maps.resize(num_rows); null_decoder.get_values((bool*)page->null_maps.data(), num_rows); } } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 933d9a56b5d377..535c270d40b811 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -75,7 +75,7 @@ message DataPageFooterPB { // Save the offset of next page optional uint64 next_array_item_ordinal = 4; - optional EncodingTypePB null_map_encoding = 5; + optional bool new_null_map = 5; } message IndexPageFooterPB { @@ -220,8 +220,6 @@ message ColumnMetaPB { optional uint64 compressed_data_bytes = 24; optional uint64 uncompressed_data_bytes = 25; optional uint64 raw_data_bytes = 26; - - optional EncodingTypePB null_map_encoding = 27; } message PrimaryKeyIndexMetaPB { From 1432172e29bfee2bc80a4ecb4123142e4110a3cf Mon Sep 17 00:00:00 2001 From: happenlee Date: Tue, 16 Dec 2025 02:21:59 +0800 Subject: [PATCH 05/12] contine page --- .../olap/rowset/segment_v2/column_reader.cpp | 180 ++++++++++++------ .../olap/rowset/segment_v2/column_writer.cpp | 101 +++++----- be/src/olap/rowset/segment_v2/column_writer.h | 27 +-- be/src/olap/rowset/segment_v2/parsed_page.h | 3 + .../variant/variant_column_writer_impl.cpp | 4 +- gensrc/proto/segment_v2.proto | 2 + tools/tpcds-tools/conf/doris-cluster.conf | 4 +- 7 files changed, 198 insertions(+), 123 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index e9154e920cb888..e490e8f306116f 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -1636,6 +1636,9 @@ Status FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t offs } auto num_nulls = [this](ordinal_t start, ordinal_t end) { + if (_page.is_continue) { + return 0; + } auto null_count = 0; for (auto i = start; i < end; i++) { null_count += _page.null_maps[i]; @@ -1712,21 +1715,38 @@ Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& d auto& null_map = null_col->get_null_map_data(); auto nest_column = null_col->get_nested_column_ptr(); - while (nrows_to_read > 0) { - bool is_null; - int i; - std::tie(is_null, i) = null_count(nrows_to_read); - if (is_null) { - null_col->insert_many_defaults(i); - } else { - null_map.resize_fill(null_map.size() + i, 0); - size_t num_rows = i; - RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); - DCHECK_EQ(i, num_rows); + // Optimization: if is_continue is true, skip null_count check and read directly + if (_page.is_continue) { + // Copy null values from page's null_maps + size_t start_offset = _page.offset_in_page; + // Reserve space for new null values + null_map.resize(null_map.size() + nrows_to_read); + // Copy from page's null_maps to column's null_map using memcpy for better performance + memcpy(null_map.data() + null_map.size() - nrows_to_read, + _page.null_maps.data() + start_offset, nrows_to_read); + + size_t num_rows = nrows_to_read; + RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); + DCHECK_EQ(nrows_to_read, num_rows); + _page.offset_in_page += nrows_to_read; + _current_ordinal += nrows_to_read; + } else { + while (nrows_to_read > 0) { + bool is_null; + int i; + std::tie(is_null, i) = null_count(nrows_to_read); + if (is_null) { + null_col->insert_many_defaults(i); + } else { + null_map.resize_fill(null_map.size() + i, 0); + 
size_t num_rows = i; + RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, nest_column)); + DCHECK_EQ(i, num_rows); + } + nrows_to_read -= i; + _page.offset_in_page += i; + _current_ordinal += i; } - nrows_to_read -= i; - _page.offset_in_page += i; - _current_ordinal += i; } } else { RETURN_IF_ERROR(_page.data_decoder->next_batch(&nrows_to_read, dst)); @@ -1760,65 +1780,107 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co nrows_to_read = std::min(remaining, _page.remaining()); if (!_page.null_maps.empty()) { - size_t already_read = 0; - while ((nrows_to_read - already_read) > 0) { - bool is_null = false; - size_t this_run = std::min(nrows_to_read - already_read, _page.remaining()); - if (UNLIKELY(this_run == 0)) { - break; - } - std::tie(is_null, this_run) = null_count(this_run); - size_t offset = total_read_count + already_read; + auto* null_col = + vectorized::check_and_get_column(dst.get()); + if (UNLIKELY(null_col == nullptr)) { + return Status::InternalError("unexpected column type in column reader"); + } + auto& null_map = null_col->get_null_map_data(); + auto nest_column = null_col->get_nested_column_ptr(); + + // Optimization: if is_continue is true, skip null_count check and read directly + if (_page.is_continue) { + size_t offset = total_read_count; size_t this_read_count = 0; - rowid_t current_ordinal_in_page = - cast_set(_page.offset_in_page + _page.first_ordinal); - for (size_t i = 0; i < this_run; ++i) { - if (rowids[offset + i] - current_ordinal_in_page >= this_run) { + // rowid_t current_ordinal_in_page = + // cast_set(_page.offset_in_page + _page.first_ordinal); + + // Calculate how many rowids in this batch belong to the current page + for (size_t i = 0; i < nrows_to_read; ++i) { + // Check if this rowid is within the current page's range + if (rowids[offset + i] >= _page.first_ordinal + _page.num_rows) { break; } this_read_count++; } - auto origin_index = _page.data_decoder->current_index(); if (this_read_count > 0) { - auto* null_col = - vectorized::check_and_get_column(dst.get()); - if (UNLIKELY(null_col == nullptr)) { - return Status::InternalError("unexpected column type in column reader"); - } - auto& null_map = null_col->get_null_map_data(); - auto nest_column = null_col->get_nested_column_ptr(); + // Read data for the rows + size_t read_count = this_read_count; - if (is_null) { - null_col->insert_many_defaults(this_read_count); - } else { - size_t read_count = this_read_count; - - // ordinal in nullable columns' data buffer maybe be not continuously(the data doesn't contain null value), - // so we need use `page_start_off_in_decoder` to calculate the actual offset in `data_decoder` - size_t page_start_off_in_decoder = - _page.first_ordinal + _page.offset_in_page - origin_index; - RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( - &rowids[offset], page_start_off_in_decoder, &read_count, - nest_column)); - null_map.resize_fill(null_map.size() + read_count, 0); - DCHECK_EQ(read_count, this_read_count); + // Read data using data_decoder's read_by_rowids + RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( + &rowids[offset], _page.first_ordinal, &read_count, nest_column)); + + // Update null map with data from page's null_maps + size_t null_map_start_offset = null_map.size(); + null_map.resize(null_map.size() + read_count); + + // Copy null flags from page's null_maps for the rows we just read + for (size_t i = 0; i < read_count; ++i) { + size_t idx_in_page = rowids[offset + i] - _page.first_ordinal; + 
null_map[null_map_start_offset + i] = _page.null_maps[idx_in_page]; } + DCHECK_EQ(read_count, this_read_count); } - if (!is_null) { - RETURN_IF_ERROR( - _page.data_decoder->seek_to_position_in_page(origin_index + this_run)); + DCHECK_EQ(nest_column->size(), null_map.size()); + total_read_count += this_read_count; + remaining -= this_read_count; + } else { + // Original logic for non-continue case + size_t already_read = 0; + while ((nrows_to_read - already_read) > 0) { + bool is_null = false; + size_t this_run = std::min(nrows_to_read - already_read, _page.remaining()); + if (UNLIKELY(this_run == 0)) { + break; + } + std::tie(is_null, this_run) = null_count(this_run); + size_t offset = total_read_count + already_read; + size_t this_read_count = 0; + rowid_t current_ordinal_in_page = + cast_set(_page.offset_in_page + _page.first_ordinal); + for (size_t i = 0; i < this_run; ++i) { + if (rowids[offset + i] - current_ordinal_in_page >= this_run) { + break; + } + this_read_count++; + } + + auto origin_index = _page.data_decoder->current_index(); + if (this_read_count > 0) { + if (is_null) { + null_col->insert_many_defaults(this_read_count); + } else { + size_t read_count = this_read_count; + + // ordinal in nullable columns' data buffer maybe be not continuously(the data doesn't contain null value), + // so we need use `page_start_off_in_decoder` to calculate the actual offset in `data_decoder` + size_t page_start_off_in_decoder = + _page.first_ordinal + _page.offset_in_page - origin_index; + RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( + &rowids[offset], page_start_off_in_decoder, &read_count, + nest_column)); + null_map.resize_fill(null_map.size() + read_count, 0); + DCHECK_EQ(read_count, this_read_count); + } + } + + if (!is_null) { + RETURN_IF_ERROR(_page.data_decoder->seek_to_position_in_page(origin_index + + this_run)); + } + + already_read += this_read_count; + _page.offset_in_page += this_run; + DCHECK(_page.offset_in_page <= _page.num_rows); } - already_read += this_read_count; - _page.offset_in_page += this_run; - DCHECK(_page.offset_in_page <= _page.num_rows); + nrows_to_read = already_read; + total_read_count += nrows_to_read; + remaining -= nrows_to_read; } - - nrows_to_read = already_read; - total_read_count += nrows_to_read; - remaining -= nrows_to_read; } else { RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( &rowids[total_read_count], _page.first_ordinal, &nrows_to_read, dst)); diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 03bb4bfffdbc2c..13b5829691804f 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -449,9 +449,9 @@ Status ColumnWriter::append_nullable(const uint8_t* is_null_bits, const void* da size_t this_run = 0; while ((this_run = null_iter.Next(&is_null)) > 0) { if (is_null) { - RETURN_IF_ERROR(append_nulls(this_run)); + RETURN_IF_ERROR(append_data(&ptr, this_run, true)); } else { - RETURN_IF_ERROR(append_data(&ptr, this_run)); + RETURN_IF_ERROR(append_data(&ptr, this_run, false)); } } return Status::OK(); @@ -475,13 +475,13 @@ Status ColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** pt do { auto step = next_run_step(); if (null_map[offset]) { - RETURN_IF_ERROR(append_nulls(step)); - *ptr += get_field()->size() * step; + RETURN_IF_ERROR(append_data(ptr, step, true)); + // *ptr += get_field()->size() * step; } else { // TODO: // 1. 
`*ptr += get_field()->size() * step;` should do in this function, not append_data; // 2. support array vectorized load and ptr offset add - RETURN_IF_ERROR(append_data(ptr, step)); + RETURN_IF_ERROR(append_data(ptr, step, false)); } offset += step; } while (offset < num_rows); @@ -495,7 +495,7 @@ Status ColumnWriter::append(const uint8_t* nullmap, const void* data, size_t num if (nullmap) { return append_nullable(nullmap, &ptr, num_rows); } else { - return append_data(&ptr, num_rows); + return append_data(&ptr, num_rows, false); } } @@ -641,11 +641,11 @@ Status ScalarColumnWriter::append_nulls(size_t num_rows) { // append data to page builder. this function will make sure that // num_rows must be written before return. And ptr will be modified // to next data should be written -Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { size_t remaining = num_rows; while (remaining > 0) { size_t num_written = remaining; - RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written)); + RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written, null)); remaining -= num_written; @@ -657,35 +657,41 @@ Status ScalarColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } Status ScalarColumnWriter::_internal_append_data_in_current_page(const uint8_t* data, - size_t* num_written) { + size_t* num_written, bool null) { RETURN_IF_ERROR(_page_builder->add(data, num_written)); - if (_opts.need_bitmap_index) { - _bitmap_index_builder->add_values(data, *num_written); - } - if (_opts.need_zone_map) { - _zone_map_index_builder->add_values(data, *num_written); - } - if (_opts.need_inverted_index) { - for (const auto& builder : _inverted_index_builders) { - RETURN_IF_ERROR(builder->add_values(get_field()->name(), data, *num_written)); + if (!null) { + if (_opts.need_bitmap_index) { + _bitmap_index_builder->add_values(data, *num_written); + } + if (_opts.need_zone_map) { + _zone_map_index_builder->add_values(data, *num_written); + } + if (_opts.need_inverted_index) { + for (const auto& builder : _inverted_index_builders) { + RETURN_IF_ERROR(builder->add_values(get_field()->name(), data, *num_written)); + } + } + if (_opts.need_bloom_filter) { + RETURN_IF_ERROR(_bloom_filter_index_builder->add_values(data, *num_written)); } - } - if (_opts.need_bloom_filter) { - RETURN_IF_ERROR(_bloom_filter_index_builder->add_values(data, *num_written)); - } - _next_rowid += *num_written; + _next_rowid += *num_written; + if (is_nullable()) { + _null_bitmap_builder->add_run(false, *num_written); + } + } else { + DCHECK(is_nullable()); + RETURN_IF_ERROR(append_nulls(*num_written)); + } // we must write null bits after write data, because we don't // know how many rows can be written into current page - if (is_nullable()) { - _null_bitmap_builder->add_run(false, *num_written); - } return Status::OK(); } -Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data, size_t* num_written) { - RETURN_IF_ERROR(append_data_in_current_page(*data, num_written)); +Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data, size_t* num_written, + bool null) { + RETURN_IF_ERROR(append_data_in_current_page(*data, num_written, null)); *data += get_field()->size() * (*num_written); return Status::OK(); } @@ -840,6 +846,7 @@ Status ScalarColumnWriter::finish_current_page() { data_page_footer->set_num_values(_next_rowid - _first_rowid); 
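+    // new_null_map marks the LZ4-compressed plain null map; is_continue tells the
+    // reader that page ordinals line up with decoder positions, so it can memcpy
+    // the null map and skip per-run null counting (see FileColumnIterator::next_batch
+    // and read_by_rowids).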
data_page_footer->set_nullmap_size(cast_set(nullmap.slice().size)); data_page_footer->set_new_null_map(true); + data_page_footer->set_is_continue(true); if (_new_page_callback != nullptr) { _new_page_callback->put_extra_info_in_page(data_page_footer); } @@ -883,11 +890,11 @@ Status OffsetColumnWriter::init() { return Status::OK(); } -Status OffsetColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status OffsetColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { size_t remaining = num_rows; while (remaining > 0) { size_t num_written = remaining; - RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written)); + RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written, null)); // _next_offset after append_data_in_current_page is the offset of next data, which will used in finish_current_page() to set next_array_item_ordinal _next_offset = *(const uint64_t*)(*ptr); remaining -= num_written; @@ -940,12 +947,12 @@ Status StructColumnWriter::write_inverted_index() { Status StructColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); return Status::OK(); } -Status StructColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status StructColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { const auto* results = reinterpret_cast(*ptr); for (size_t i = 0; i < _num_sub_column_writers; ++i) { auto nullmap = *(results + _num_sub_column_writers + i); @@ -1004,7 +1011,7 @@ Status StructColumnWriter::append_nulls(size_t num_rows) { if (is_nullable()) { std::vector null_signs(num_rows, 1); const uint8_t* null_sign_ptr = null_signs.data(); - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows, false)); } return Status::OK(); } @@ -1066,7 +1073,7 @@ Status ArrayColumnWriter::write_ann_index() { } // batch append data for array -Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { // data_ptr contains // [size, offset_ptr, item_data_ptr, item_nullmap_ptr] auto data_ptr = reinterpret_cast(*ptr); @@ -1107,7 +1114,7 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } } - RETURN_IF_ERROR(_offset_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offset_writer->append_data(&offsets_ptr, num_rows, false)); return Status::OK(); } @@ -1119,12 +1126,12 @@ uint64_t ArrayColumnWriter::estimate_buffer_size() { Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); if (is_nullable()) { if (_opts.need_inverted_index) { RETURN_IF_ERROR(_inverted_index_writer->add_array_nulls(null_map, num_rows)); } - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); } return Status::OK(); } @@ -1165,7 +1172,7 @@ Status ArrayColumnWriter::append_nulls(size_t num_rows) { while (num_lengths > 0) { // TODO llj bulk write const auto* offset_ptr = reinterpret_cast(&offset); - 
RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1)); + RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1, false)); --num_lengths; } return write_null_column(num_rows, true); @@ -1176,7 +1183,7 @@ Status ArrayColumnWriter::write_null_column(size_t num_rows, bool is_null) { while (is_nullable() && num_rows > 0) { // TODO llj bulk write const uint8_t* null_sign_ptr = &null_sign; - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, 1)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, 1, false)); --num_rows; } return Status::OK(); @@ -1240,15 +1247,15 @@ Status MapColumnWriter::finish() { Status MapColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); if (is_nullable()) { - RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows, false)); } return Status::OK(); } // write key value data with offsets -Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { // data_ptr contains // [size, offset_ptr, key_data_ptr, val_data_ptr, k_nullmap_ptr, v_nullmap_pr] // which converted results from olap_map_convertor and later will use a structure to replace it @@ -1269,7 +1276,7 @@ Status MapColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } // make sure the order : offset writer flush next_array_item_ordinal after kv_writers append_data // because we use _kv_writers[0]->get_next_rowid() to set next_array_item_ordinal in offset page footer - RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows, false)); return Status::OK(); } @@ -1304,12 +1311,12 @@ Status MapColumnWriter::append_nulls(size_t num_rows) { const ordinal_t offset = _kv_writers[0]->get_next_rowid(); std::vector offsets_data(num_rows, cast_set(offset)); const uint8_t* offsets_ptr = offsets_data.data(); - RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows)); + RETURN_IF_ERROR(_offsets_writer->append_data(&offsets_ptr, num_rows, false)); if (is_nullable()) { std::vector null_signs(num_rows, 1); const uint8_t* null_sign_ptr = null_signs.data(); - RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows)); + RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, num_rows, false)); } return Status::OK(); } @@ -1335,7 +1342,7 @@ Status VariantColumnWriter::init() { return _impl->init(); } -Status VariantColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status VariantColumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { _next_rowid += num_rows; return _impl->append_data(ptr, num_rows); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 89d544ea2e918d..56b1f559bcd9cd 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -138,7 +138,7 @@ class ColumnWriter { return append_nullable(&nullmap, cell.cell_ptr(), 1); } else { auto* cel_ptr = cell.cell_ptr(); - return append_data((const uint8_t**)&cel_ptr, 1); + return append_data((const uint8_t**)&cel_ptr, 1, false); } } @@ -188,7 +188,7 @@ class ColumnWriter { virtual uint64_t get_total_compressed_data_pages_bytes() const = 0; // used for append not null data. 
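+    // The added 'null' flag marks whether the run being appended consists of null
+    // rows; when set, ScalarColumnWriter routes the run through append_nulls()
+    // instead of updating the zone map, bloom filter and other value indexes itself.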
- virtual Status append_data(const uint8_t** ptr, size_t num_rows) = 0; + virtual Status append_data(const uint8_t** ptr, size_t num_rows, bool null) = 0; bool is_nullable() const { return _is_nullable; } @@ -252,20 +252,21 @@ class ScalarColumnWriter : public ColumnWriter { void register_flush_page_callback(FlushPageCallback* flush_page_callback) { _new_page_callback = flush_page_callback; } - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; // used for append not null data. When page is full, will append data not reach num_rows. - Status append_data_in_current_page(const uint8_t** ptr, size_t* num_written); + Status append_data_in_current_page(const uint8_t** ptr, size_t* num_written, bool null); - Status append_data_in_current_page(const uint8_t* ptr, size_t* num_written) { + Status append_data_in_current_page(const uint8_t* ptr, size_t* num_written, bool null) { RETURN_IF_CATCH_EXCEPTION( - { return _internal_append_data_in_current_page(ptr, num_written); }); + { return _internal_append_data_in_current_page(ptr, num_written, null); }); } friend class ArrayColumnWriter; friend class OffsetColumnWriter; private: - Status _internal_append_data_in_current_page(const uint8_t* ptr, size_t* num_written); + Status _internal_append_data_in_current_page(const uint8_t* ptr, size_t* num_written, + bool null); private: std::unique_ptr _page_builder; @@ -338,7 +339,7 @@ class OffsetColumnWriter final : public ScalarColumnWriter, FlushPageCallback { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; private: void put_extra_info_in_page(DataPageFooterPB* footer) override; @@ -356,7 +357,7 @@ class StructColumnWriter final : public ColumnWriter { Status init() override; Status append_nullable(const uint8_t* null_map, const uint8_t** data, size_t num_rows) override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -428,7 +429,7 @@ class ArrayColumnWriter final : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -509,7 +510,7 @@ class MapColumnWriter final : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; Status append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) override; uint64_t estimate_buffer_size() override; @@ -588,7 +589,7 @@ class VariantSubcolumnWriter : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; @@ -649,7 +650,7 @@ class VariantColumnWriter : public ColumnWriter { Status init() override; - Status append_data(const uint8_t** ptr, size_t num_rows) override; + Status append_data(const uint8_t** ptr, size_t num_rows, bool null) override; uint64_t estimate_buffer_size() override; diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 
6137ee5e1f2d53..fe2c39ad506dbe 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -50,6 +50,7 @@ struct ParsedPage { auto null_size = footer.nullmap_size(); auto null_bitmap = Slice(body.data + body.size - null_size, null_size); + page->is_continue = footer.has_is_continue() && footer.is_continue(); if (null_size > 0) { if (footer.has_new_null_map() && footer.new_null_map()) { @@ -63,6 +64,7 @@ struct ParsedPage { page->null_maps.resize(footer.num_values()); auto tmp_slice = Slice(page->null_maps.data(), page->null_maps.size()); RETURN_IF_ERROR(codec->decompress(null_bitmap, &tmp_slice)); + // Set is_continue to true when using new null map format } } else { auto null_decoder = @@ -119,6 +121,7 @@ struct ParsedPage { ordinal_t offset_in_page = 0; bool is_dict_encoding = false; + bool is_continue = false; bool contains(ordinal_t ord) { return ord >= first_ordinal && ord < (first_ordinal + num_rows); diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp index 20096a8b0c7697..6319a23b741ecc 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp @@ -723,7 +723,7 @@ Status VariantSubcolumnWriter::init() { return Status::OK(); } -Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { +Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows, bool null) { const auto* column = reinterpret_cast(*ptr); const auto& src = *reinterpret_cast(column->column_data); auto* dst_ptr = assert_cast(_column.get()); @@ -837,7 +837,7 @@ Status VariantSubcolumnWriter::write_bloom_filter_index() { Status VariantSubcolumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { // the root contains the same nullable info - RETURN_IF_ERROR(append_data(ptr, num_rows)); + RETURN_IF_ERROR(append_data(ptr, num_rows, false)); return Status::OK(); } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 535c270d40b811..be4ecc9f344b10 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -76,6 +76,8 @@ message DataPageFooterPB { optional uint64 next_array_item_ordinal = 4; optional bool new_null_map = 5; + + optional bool is_continue = 6; } message IndexPageFooterPB { diff --git a/tools/tpcds-tools/conf/doris-cluster.conf b/tools/tpcds-tools/conf/doris-cluster.conf index fd737356c2c103..3a3101ff663e3f 100644 --- a/tools/tpcds-tools/conf/doris-cluster.conf +++ b/tools/tpcds-tools/conf/doris-cluster.conf @@ -18,9 +18,9 @@ # Any of FE host export FE_HOST='127.0.0.1' # http_port in fe.conf -export FE_HTTP_PORT=8030 +export FE_HTTP_PORT=8137 # query_port in fe.conf -export FE_QUERY_PORT=9030 +export FE_QUERY_PORT=9137 # Doris username export USER='root' # Doris password From 9a980895355098dad04f05aa12b70f46b7678414 Mon Sep 17 00:00:00 2001 From: happenlee Date: Thu, 18 Dec 2025 22:56:20 +0800 Subject: [PATCH 06/12] fix query error --- be/src/olap/rowset/segment_v2/column_reader.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index e490e8f306116f..08ed188f4a5078 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -521,6 +521,7 @@ Status ColumnReader::_parse_zone_map(const 
ZoneMapPB& zone_map, WrapperField* mi } } else { RETURN_IF_ERROR(min_value_container->from_string(zone_map.min())); + min_value_container->set_not_null(); } if (zone_map.has_nan()) { @@ -545,6 +546,7 @@ Status ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* mi } } else { RETURN_IF_ERROR(max_value_container->from_string(zone_map.max())); + max_value_container->set_not_null(); } } // for compatible original Cond eval logic From df02946fe26490ca4209a4dea0b102fe1068fdd2 Mon Sep 17 00:00:00 2001 From: happenlee Date: Tue, 23 Dec 2025 17:04:57 +0800 Subject: [PATCH 07/12] support stream agg topn --- .../exec/streaming_aggregation_operator.cpp | 301 ++++++++++++++++-- .../exec/streaming_aggregation_operator.h | 79 ++++- .../translator/PhysicalPlanTranslator.java | 3 +- 3 files changed, 344 insertions(+), 39 deletions(-) diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index 383c2e10079dfa..f1d1966dab7d15 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -99,6 +99,8 @@ Status StreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& info) { _insert_values_to_column_timer = ADD_TIMER(Base::custom_profile(), "InsertValuesToColumnTime"); _deserialize_data_timer = ADD_TIMER(Base::custom_profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::custom_profile(), "HashTableComputeTime"); + _hash_table_limit_compute_timer = + ADD_TIMER(Base::custom_profile(), "HashTableLimitComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::custom_profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::custom_profile(), "HashTableInputCount", TUnit::UNIT); @@ -152,16 +154,10 @@ Status StreamingAggLocalState::open(RuntimeState* state) { }}, _agg_data->method_variant); - if (p._is_merge || p._needs_finalize) { - return Status::InvalidArgument( - "StreamingAggLocalState only support no merge and no finalize, " - "but got is_merge={}, needs_finalize={}", - p._is_merge, p._needs_finalize); - } - - _should_limit_output = p._limit != -1 && // has limit - (!p._have_conjuncts) && // no having conjunct - p._needs_finalize; // agg's finalize step + limit = p._sort_limit; + do_sort_limit = p._do_sort_limit; + null_directions = p._null_directions; + order_directions = p._order_directions; return Status::OK(); } @@ -316,23 +312,22 @@ bool StreamingAggLocalState::_should_not_do_pre_agg(size_t rows) { const auto spill_streaming_agg_mem_limit = p._spill_streaming_agg_mem_limit; const bool used_too_much_memory = spill_streaming_agg_mem_limit > 0 && _memory_usage() > spill_streaming_agg_mem_limit; - std::visit( - vectorized::Overload { - [&](std::monostate& arg) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); - }, - [&](auto& agg_method) { - auto& hash_tbl = *agg_method.hash_table; - /// If too much memory is used during the pre-aggregation stage, - /// it is better to output the data directly without performing further aggregation. 
- // do not try to do agg, just init and serialize directly return the out_block - if (used_too_much_memory || (hash_tbl.add_elem_size_overflow(rows) && - !_should_expand_preagg_hash_tables())) { - SCOPED_TIMER(_streaming_agg_timer); - ret_flag = true; - } - }}, - _agg_data->method_variant); + std::visit(vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + }, + [&](auto& agg_method) { + auto& hash_tbl = *agg_method.hash_table; + /// If too much memory is used during the pre-aggregation stage, + /// it is better to output the data directly without performing further aggregation. + // do not try to do agg, just init and serialize directly return the out_block + if (used_too_much_memory || (hash_tbl.add_elem_size_overflow(rows) && + !_should_expand_preagg_hash_tables())) { + SCOPED_TIMER(_streaming_agg_timer); + ret_flag = true; + } + }}, + _agg_data->method_variant); return ret_flag; } @@ -363,6 +358,30 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B _places.resize(rows); if (_should_not_do_pre_agg(rows)) { + if (limit > 0) { + DCHECK(do_sort_limit); + if (need_do_sort_limit == -1) { + const size_t hash_table_size = _get_hash_table_size(); + need_do_sort_limit = hash_table_size >= limit ? 1 : 0; + if (need_do_sort_limit == 1) { + build_limit_heap(hash_table_size); + } + } + + if (need_do_sort_limit == 1) { + if (_do_limit_filter(rows, key_columns)) { + bool need_filter = std::find(need_computes.begin(), need_computes.end(), 1) != + need_computes.end(); + if (need_filter) { + _add_limit_heap_top(key_columns, rows); + vectorized::Block::filter_block_internal(in_block, need_computes); + rows = (uint32_t)in_block->rows(); + } else { + return Status::OK(); + } + } + } + } bool mem_reuse = p._make_nullable_keys.empty() && out_block->mem_reuse(); std::vector data_types; @@ -404,12 +423,23 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } } } else { - _emplace_into_hash_table(_places.data(), key_columns, rows); + bool need_agg = true; + if (need_do_sort_limit != 1) { + _emplace_into_hash_table(_places.data(), key_columns, rows); + } else { + need_agg = _emplace_into_hash_table_limit(_places.data(), in_block, key_columns, rows); + } - for (int i = 0; i < _aggregate_evaluators.size(); ++i) { - RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_batch_add( - in_block, p._offsets_of_aggregate_states[i], _places.data(), _agg_arena_pool, - _should_expand_hash_table)); + if (need_agg) { + for (int i = 0; i < _aggregate_evaluators.size(); ++i) { + RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_batch_add( + in_block, p._offsets_of_aggregate_states[i], _places.data(), + _agg_arena_pool, _should_expand_hash_table)); + } + if (limit > 0 && need_do_sort_limit == -1 && _get_hash_table_size() >= limit) { + need_do_sort_limit = 1; + build_limit_heap(_get_hash_table_size()); + } } } @@ -561,6 +591,183 @@ void StreamingAggLocalState::_destroy_agg_status(vectorized::AggregateDataPtr da } } +vectorized::MutableColumns StreamingAggLocalState::_get_keys_hash_table() { + return std::visit( + vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + return vectorized::MutableColumns(); + }, + [&](auto&& agg_method) -> vectorized::MutableColumns { + vectorized::MutableColumns key_columns; + for (int i = 0; i < _probe_expr_ctxs.size(); ++i) { + key_columns.emplace_back( + 
_probe_expr_ctxs[i]->root()->data_type()->create_column()); + } + auto& data = *agg_method.hash_table; + bool has_null_key = data.has_null_key_data(); + const auto size = data.size() - has_null_key; + using KeyType = std::decay_t::Key; + std::vector keys(size); + + uint32_t num_rows = 0; + auto iter = _aggregate_data_container->begin(); + { + while (iter != _aggregate_data_container->end()) { + keys[num_rows] = iter.get_key(); + ++iter; + ++num_rows; + } + } + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); + if (has_null_key) { + key_columns[0]->insert_data(nullptr, 0); + } + return key_columns; + }}, + _agg_data->method_variant); +} + +void StreamingAggLocalState::build_limit_heap(size_t hash_table_size) { + limit_columns = _get_keys_hash_table(); + for (size_t i = 0; i < hash_table_size; ++i) { + limit_heap.emplace(i, limit_columns, order_directions, null_directions); + } + while (hash_table_size > limit) { + limit_heap.pop(); + hash_table_size--; + } + limit_columns_min = limit_heap.top()._row_id; +} + +void StreamingAggLocalState::_add_limit_heap_top(vectorized::ColumnRawPtrs& key_columns, + size_t rows) { + for (int i = 0; i < rows; ++i) { + if (cmp_res[i] == 1 && need_computes[i]) { + for (int j = 0; j < key_columns.size(); ++j) { + limit_columns[j]->insert_from(*key_columns[j], i); + } + limit_heap.emplace(limit_columns[0]->size() - 1, limit_columns, order_directions, + null_directions); + limit_heap.pop(); + limit_columns_min = limit_heap.top()._row_id; + break; + } + } +} + +void StreamingAggLocalState::_refresh_limit_heap(size_t i, vectorized::ColumnRawPtrs& key_columns) { + for (int j = 0; j < key_columns.size(); ++j) { + limit_columns[j]->insert_from(*key_columns[j], i); + } + limit_heap.emplace(limit_columns[0]->size() - 1, limit_columns, order_directions, + null_directions); + limit_heap.pop(); + limit_columns_min = limit_heap.top()._row_id; +} + +bool StreamingAggLocalState::_emplace_into_hash_table_limit(vectorized::AggregateDataPtr* places, + vectorized::Block* block, + vectorized::ColumnRawPtrs& key_columns, + uint32_t num_rows) { + return std::visit( + vectorized::Overload { + [&](std::monostate& arg) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + return true; + }, + [&](auto&& agg_method) -> bool { + SCOPED_TIMER(_hash_table_compute_timer); + using HashMethodType = std::decay_t; + using AggState = typename HashMethodType::State; + + bool need_filter = _do_limit_filter(num_rows, key_columns); + if (auto need_agg = + std::find(need_computes.begin(), need_computes.end(), 1); + need_agg != need_computes.end()) { + if (need_filter) { + vectorized::Block::filter_block_internal(block, need_computes); + num_rows = (uint32_t)block->rows(); + } + + AggState state(key_columns); + agg_method.init_serialized_keys(key_columns, num_rows); + size_t i = 0; + + auto creator = [&](const auto& ctor, auto& key, auto& origin) { + try { + HashMethodType::try_presis_key_and_origin(key, origin, + _agg_arena_pool); + auto mapped = _aggregate_data_container->append_data(origin); + auto st = _create_agg_status(mapped); + if (!st) { + throw Exception(st.code(), st.to_string()); + } + ctor(key, mapped); + _refresh_limit_heap(i, key_columns); + } catch (...) { + // Exception-safety - if it can not allocate memory or create status, + // the destructors will not be called. 
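// Registering the key with a null state and rethrowing follows the same pattern as the
// existing _emplace_into_hash_table path. Since _refresh_limit_heap only runs after the
// aggregate state was created successfully, a failed insert never leaves a stale row id in
// limit_heap.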
+ ctor(key, nullptr); + throw; + } + }; + + auto creator_for_null_key = [&](auto& mapped) { + mapped = _agg_arena_pool.aligned_alloc( + Base::_parent->template cast() + ._total_size_of_aggregate_states, + Base::_parent->template cast() + ._align_aggregate_states); + auto st = _create_agg_status(mapped); + if (!st) { + throw Exception(st.code(), st.to_string()); + } + _refresh_limit_heap(i, key_columns); + }; + + SCOPED_TIMER(_hash_table_emplace_timer); + for (i = 0; i < num_rows; ++i) { + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); + } + COUNTER_UPDATE(_hash_table_input_counter, num_rows); + return true; + } + return false; + }}, + _agg_data->method_variant); +} + +bool StreamingAggLocalState::_do_limit_filter(size_t num_rows, + vectorized::ColumnRawPtrs& key_columns) { + SCOPED_TIMER(_hash_table_limit_compute_timer); + if (num_rows) { + cmp_res.resize(num_rows); + need_computes.resize(num_rows); + memset(need_computes.data(), 0, need_computes.size()); + memset(cmp_res.data(), 0, cmp_res.size()); + + const auto key_size = null_directions.size(); + for (int i = 0; i < key_size; i++) { + key_columns[i]->compare_internal(limit_columns_min, *limit_columns[i], + null_directions[i], order_directions[i], cmp_res, + need_computes.data()); + } + + auto set_computes_arr = [](auto* __restrict res, auto* __restrict computes, size_t rows) { + for (size_t i = 0; i < rows; ++i) { + computes[i] = computes[i] == res[i]; + } + }; + set_computes_arr(cmp_res.data(), need_computes.data(), num_rows); + + return std::find(need_computes.begin(), need_computes.end(), 0) != need_computes.end(); + } + + return false; +} + void StreamingAggLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, const uint32_t num_rows) { @@ -616,7 +823,6 @@ StreamingAggOperatorX::StreamingAggOperatorX(ObjectPool* pool, int operator_id, _intermediate_tuple_id(tnode.agg_node.intermediate_tuple_id), _output_tuple_id(tnode.agg_node.output_tuple_id), _needs_finalize(tnode.agg_node.need_finalize), - _is_merge(false), _is_first_phase(tnode.agg_node.__isset.is_first_phase && tnode.agg_node.is_first_phase), _have_conjuncts(tnode.__isset.vconjunct && !tnode.vconjunct.nodes.empty()), _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), @@ -668,8 +874,33 @@ Status StreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* state) } const auto& agg_functions = tnode.agg_node.aggregate_functions; - _is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), - [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); + auto is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), + [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); + if (is_merge || _needs_finalize) { + return Status::InvalidArgument( + "StreamingAggLocalState only support no merge and no finalize, " + "but got is_merge={}, needs_finalize={}", + is_merge, _needs_finalize); + } + + // Handle sort limit + if (tnode.agg_node.__isset.agg_sort_info_by_group_key) { + _sort_limit = _limit; + _limit = -1; + _do_sort_limit = true; + const auto& agg_sort_info = tnode.agg_node.agg_sort_info_by_group_key; + DCHECK_EQ(agg_sort_info.nulls_first.size(), agg_sort_info.is_asc_order.size()); + + const size_t order_by_key_size = agg_sort_info.is_asc_order.size(); + _order_directions.resize(order_by_key_size); + _null_directions.resize(order_by_key_size); + for (int i = 0; i < order_by_key_size; ++i) { + _order_directions[i] = 
agg_sort_info.is_asc_order[i] ? 1 : -1; + _null_directions[i] = + agg_sort_info.nulls_first[i] ? -_order_directions[i] : _order_directions[i]; + } + } + _op_name = "STREAMING_AGGREGATION_OPERATOR"; return Status::OK(); } diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h b/be/src/pipeline/exec/streaming_aggregation_operator.h index d7fc56b6fe65f9..7f7ee9403decf8 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -48,6 +48,7 @@ class StreamingAggLocalState MOCK_REMOVE(final) : public PipelineXLocalState _aggregate_data_container = nullptr; - bool _should_limit_output = false; bool _reach_limit = false; size_t _input_num_rows = 0; + int64_t limit = -1; + int need_do_sort_limit = -1; + bool do_sort_limit = false; + vectorized::MutableColumns limit_columns; + int limit_columns_min = -1; + vectorized::PaddedPODArray need_computes; + std::vector cmp_res; + std::vector order_directions; + std::vector null_directions; + + struct HeapLimitCursor { + HeapLimitCursor(int row_id, vectorized::MutableColumns& limit_columns, + std::vector& order_directions, std::vector& null_directions) + : _row_id(row_id), + _limit_columns(limit_columns), + _order_directions(order_directions), + _null_directions(null_directions) {} + + HeapLimitCursor(const HeapLimitCursor& other) = default; + + HeapLimitCursor(HeapLimitCursor&& other) noexcept + : _row_id(other._row_id), + _limit_columns(other._limit_columns), + _order_directions(other._order_directions), + _null_directions(other._null_directions) {} + + HeapLimitCursor& operator=(const HeapLimitCursor& other) noexcept { + _row_id = other._row_id; + return *this; + } + + HeapLimitCursor& operator=(HeapLimitCursor&& other) noexcept { + _row_id = other._row_id; + return *this; + } + + bool operator<(const HeapLimitCursor& rhs) const { + for (int i = 0; i < _limit_columns.size(); ++i) { + const auto& _limit_column = _limit_columns[i]; + auto res = _limit_column->compare_at(_row_id, rhs._row_id, *_limit_column, + _null_directions[i]) * + _order_directions[i]; + if (res < 0) { + return true; + } else if (res > 0) { + return false; + } + } + return false; + } + + int _row_id; + vectorized::MutableColumns& _limit_columns; + std::vector& _order_directions; + std::vector& _null_directions; + }; + + std::priority_queue limit_heap; + + vectorized::MutableColumns _get_keys_hash_table(); + vectorized::PODArray _places; std::vector _deserialize_buffer; @@ -182,7 +251,6 @@ class StreamingAggOperatorX MOCK_REMOVE(final) : public StatefulOperatorX _make_nullable_keys; bool _have_conjuncts; RowDescriptor _agg_fn_output_row_descriptor; + + // For sort limit + bool _do_sort_limit = false; + int64_t _sort_limit = -1; + std::vector _order_directions; + std::vector _null_directions; + const std::vector _partition_exprs; }; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 01b10a316048b4..0f621c492f6c4c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -344,8 +344,7 @@ public PlanFragment visitPhysicalDistribute(PhysicalDistribute d if (upstreamFragment.getPlanRoot() instanceof AggregationNode && upstream instanceof PhysicalHashAggregate) { PhysicalHashAggregate hashAggregate = 
(PhysicalHashAggregate) upstream; if (hashAggregate.getAggPhase() == AggPhase.LOCAL - && hashAggregate.getAggMode() == AggMode.INPUT_TO_BUFFER - && hashAggregate.getTopnPushInfo() == null) { + && hashAggregate.getAggMode() == AggMode.INPUT_TO_BUFFER) { AggregationNode aggregationNode = (AggregationNode) upstreamFragment.getPlanRoot(); aggregationNode.setUseStreamingPreagg(hashAggregate.isMaybeUsingStream()); } From bbca9eacc3b6c9fdd089500a03904f7240cd8259 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 19 Dec 2025 10:53:50 +0800 Subject: [PATCH 08/12] Manifest cache for tpch1000 (#59178) --- be/src/clucene | 1 + fe/check/checkstyle/suppressions.xml | 3 + .../java/org/apache/doris/common/Config.java | 12 + .../datasource/ExternalMetaCacheMgr.java | 7 + .../iceberg/IcebergManifestCacheMgr.java | 35 + .../datasource/iceberg/IcebergUtils.java | 8 + .../iceberg/cache/ContentFileEstimater.java | 194 ++++ .../iceberg/cache/IcebergManifestCache.java | 96 ++ .../cache/IcebergManifestCacheLoader.java | 89 ++ .../iceberg/cache/ManifestCacheKey.java | 58 ++ .../iceberg/cache/ManifestCacheValue.java | 65 ++ .../iceberg/source/IcebergScanNode.java | 146 ++- .../metastore/AbstractIcebergProperties.java | 62 ++ .../org/apache/iceberg/DeleteFileIndex.java | 906 ++++++++++++++++++ 14 files changed, 1681 insertions(+), 1 deletion(-) create mode 160000 be/src/clucene create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java create mode 100644 fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java diff --git a/be/src/clucene b/be/src/clucene new file mode 160000 index 00000000000000..bb22247973e55d --- /dev/null +++ b/be/src/clucene @@ -0,0 +1 @@ +Subproject commit bb22247973e55dcac9a3eaafedc57cc6c36d2fc3 diff --git a/fe/check/checkstyle/suppressions.xml b/fe/check/checkstyle/suppressions.xml index 8f000bb7616ca9..7340c4c5bd5fe9 100644 --- a/fe/check/checkstyle/suppressions.xml +++ b/fe/check/checkstyle/suppressions.xml @@ -69,6 +69,9 @@ under the License. 
+ + + diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 350e34f8a90940..d4426c2d515136 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2298,6 +2298,18 @@ public class Config extends ConfigBase { }) public static long external_cache_refresh_time_minutes = 10; // 10 mins + @ConfField(description = {"是否启用 Iceberg Manifest DataFile/DeleteFile 缓存。", + "Whether to enable Iceberg manifest DataFile/DeleteFile cache."}) + public static boolean iceberg_manifest_cache_enable = true; + + @ConfField(description = {"Iceberg Manifest 缓存的容量上限,单位 MB。", + "Iceberg manifest cache capacity in MB."}) + public static long iceberg_manifest_cache_capacity_mb = 1024; + + @ConfField(description = {"Iceberg Manifest 缓存的访问过期时间(秒),0 或负数表示不过期。", + "Iceberg manifest cache expire after access in seconds. 0 or negative disables expiration."}) + public static long iceberg_manifest_cache_ttl_sec = 48 * 60 * 60; + /** * Github workflow test type, for setting some session variables * only for certain test type. E.g. only settting batch_size to small diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java index e777285a07f587..798a2170b1e53b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalMetaCacheMgr.java @@ -29,6 +29,7 @@ import org.apache.doris.datasource.hudi.source.HudiCachedMetaClientProcessor; import org.apache.doris.datasource.hudi.source.HudiMetadataCacheMgr; import org.apache.doris.datasource.hudi.source.HudiPartitionProcessor; +import org.apache.doris.datasource.iceberg.IcebergManifestCacheMgr; import org.apache.doris.datasource.iceberg.IcebergMetadataCache; import org.apache.doris.datasource.iceberg.IcebergMetadataCacheMgr; import org.apache.doris.datasource.maxcompute.MaxComputeMetadataCache; @@ -97,6 +98,7 @@ public class ExternalMetaCacheMgr { private FileSystemCache fsCache; // all external table row count cache. 
private ExternalRowCountCache rowCountCache; + private final IcebergManifestCacheMgr icebergManifestCacheMgr; private final IcebergMetadataCacheMgr icebergMetadataCacheMgr; private final MaxComputeMetadataCacheMgr maxComputeMetadataCacheMgr; private final PaimonMetadataCacheMgr paimonMetadataCacheMgr; @@ -128,6 +130,7 @@ public ExternalMetaCacheMgr(boolean isCheckpointCatalog) { rowCountCache = new ExternalRowCountCache(rowCountRefreshExecutor); hudiMetadataCacheMgr = new HudiMetadataCacheMgr(commonRefreshExecutor); + icebergManifestCacheMgr = new IcebergManifestCacheMgr(); icebergMetadataCacheMgr = new IcebergMetadataCacheMgr(commonRefreshExecutor); maxComputeMetadataCacheMgr = new MaxComputeMetadataCacheMgr(); paimonMetadataCacheMgr = new PaimonMetadataCacheMgr(commonRefreshExecutor); @@ -199,6 +202,10 @@ public HudiMetadataCacheMgr getHudiMetadataCacheMgr() { return hudiMetadataCacheMgr; } + public IcebergManifestCacheMgr getIcebergManifestCacheMgr() { + return icebergManifestCacheMgr; + } + public IcebergMetadataCache getIcebergMetadataCache() { return icebergMetadataCacheMgr.getIcebergMetadataCache(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java new file mode 100644 index 00000000000000..ad95e151b98db5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergManifestCacheMgr.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg; + +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; + +/** + * Wrapper manager for Iceberg manifest cache. 
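+ * Kept as a thin holder: ExternalMetaCacheMgr constructs a single instance and exposes it via
+ * getIcebergManifestCacheMgr(), so scan planning reuses one shared IcebergManifestCache instead
+ * of building a new cache per query.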
+ */ +public class IcebergManifestCacheMgr { + private final IcebergManifestCache manifestCache; + + public IcebergManifestCacheMgr() { + this.manifestCache = new IcebergManifestCache(); + } + + public IcebergManifestCache getManifestCache() { + return manifestCache; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 9587ca4f8169ca..28ddf2817df40b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -56,6 +56,7 @@ import org.apache.doris.datasource.ExternalSchemaCache; import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.SchemaCacheValue; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; import org.apache.doris.datasource.iceberg.source.IcebergTableQueryInfo; import org.apache.doris.datasource.mvcc.MvccSnapshot; import org.apache.doris.datasource.mvcc.MvccUtil; @@ -1452,4 +1453,11 @@ public static String showCreateView(IcebergExternalTable icebergExternalTable) { icebergExternalTable.getViewText(); } + public static IcebergManifestCache getManifestCache() { + return Env.getCurrentEnv() + .getExtMetaCacheMgr() + .getIcebergManifestCacheMgr() + .getManifestCache(); + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java new file mode 100644 index 00000000000000..43f60096e31e4f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ContentFileEstimater.java @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.StructLike; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; + +/** + * Utility to estimate the JVM weight of Iceberg {@link ContentFile} objects. 
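+ * The per-field constants below are coarse approximations of object header, reference and
+ * boxed-primitive overhead. The estimate does not need to be exact; it only has to scale with
+ * the real footprint so the Caffeine weigher in IcebergManifestCache can bound total cache
+ * memory at iceberg_manifest_cache_capacity_mb.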
+ */ +public final class ContentFileEstimater { + private static final long LIST_BASE_WEIGHT = 48L; + private static final long OBJECT_REFERENCE_WEIGHT = 8L; + private static final long CONTENT_FILE_BASE_WEIGHT = 256L; + private static final long STRING_BASE_WEIGHT = 40L; + private static final long CHAR_BYTES = 2L; + private static final long BYTE_BUFFER_BASE_WEIGHT = 16L; + private static final long MAP_BASE_WEIGHT = 48L; + private static final long MAP_ENTRY_OVERHEAD = 24L; + private static final long LONG_OBJECT_WEIGHT = 24L; + private static final long INT_OBJECT_WEIGHT = 16L; + private static final long PARTITION_BASE_WEIGHT = 48L; + private static final long PARTITION_VALUE_BASE_WEIGHT = 8L; + + private ContentFileEstimater() { + } + + public static long estimate(List> files) { + return listReferenceWeight(files) + estimateContentFilesWeight(files); + } + + private static long listReferenceWeight(List files) { + if (files == null || files.isEmpty()) { + return 0L; + } + return LIST_BASE_WEIGHT + (long) files.size() * OBJECT_REFERENCE_WEIGHT; + } + + private static long estimateContentFilesWeight(List> files) { + long total = 0L; + if (files == null) { + return 0L; + } + for (ContentFile file : files) { + total += estimateContentFileWeight(file); + } + return total; + } + + private static long estimateContentFileWeight(ContentFile file) { + if (file == null) { + return 0L; + } + + long weight = CONTENT_FILE_BASE_WEIGHT; + weight += charSequenceWeight(file.path()); + weight += stringWeight(file.manifestLocation()); + weight += byteBufferWeight(file.keyMetadata()); + weight += partitionWeight(file.partition()); + + weight += numericMapWeight(file.columnSizes()); + weight += numericMapWeight(file.valueCounts()); + weight += numericMapWeight(file.nullValueCounts()); + weight += numericMapWeight(file.nanValueCounts()); + weight += byteBufferMapWeight(file.lowerBounds()); + weight += byteBufferMapWeight(file.upperBounds()); + + weight += listWeight(file.splitOffsets(), LONG_OBJECT_WEIGHT); + weight += listWeight(file.equalityFieldIds(), INT_OBJECT_WEIGHT); + + weight += optionalLongWeight(file.pos()); + weight += optionalLongWeight(file.dataSequenceNumber()); + weight += optionalLongWeight(file.fileSequenceNumber()); + weight += optionalLongWeight(file.firstRowId()); + weight += optionalIntWeight(file.sortOrderId()); + + if (file instanceof DeleteFile) { + DeleteFile deleteFile = (DeleteFile) file; + weight += stringWeight(deleteFile.referencedDataFile()); + weight += optionalLongWeight(deleteFile.contentOffset()); + weight += optionalLongWeight(deleteFile.contentSizeInBytes()); + } + + return weight; + } + + private static long listWeight(List list, long elementWeight) { + if (list == null || list.isEmpty()) { + return 0L; + } + return LIST_BASE_WEIGHT + (long) list.size() * (OBJECT_REFERENCE_WEIGHT + elementWeight); + } + + private static long numericMapWeight(Map map) { + if (map == null || map.isEmpty()) { + return 0L; + } + return MAP_BASE_WEIGHT + (long) map.size() * (MAP_ENTRY_OVERHEAD + LONG_OBJECT_WEIGHT); + } + + private static long byteBufferMapWeight(Map map) { + if (map == null || map.isEmpty()) { + return 0L; + } + long weight = MAP_BASE_WEIGHT + (long) map.size() * MAP_ENTRY_OVERHEAD; + for (ByteBuffer buffer : map.values()) { + weight += byteBufferWeight(buffer); + } + return weight; + } + + private static long partitionWeight(StructLike partition) { + if (partition == null) { + return 0L; + } + long weight = PARTITION_BASE_WEIGHT + (long) partition.size() * 
PARTITION_VALUE_BASE_WEIGHT; + for (int i = 0; i < partition.size(); i++) { + Object value = partition.get(i, Object.class); + weight += estimateValueWeight(value); + } + return weight; + } + + private static long estimateValueWeight(Object value) { + if (value == null) { + return 0L; + } + if (value instanceof CharSequence) { + return charSequenceWeight((CharSequence) value); + } else if (value instanceof byte[]) { + return BYTE_BUFFER_BASE_WEIGHT + ((byte[]) value).length; + } else if (value instanceof ByteBuffer) { + return byteBufferWeight((ByteBuffer) value); + } else if (value instanceof Long || value instanceof Double) { + return LONG_OBJECT_WEIGHT; + } else if (value instanceof Integer || value instanceof Float) { + return INT_OBJECT_WEIGHT; + } else if (value instanceof Short || value instanceof Character) { + return 4L; + } else if (value instanceof Boolean) { + return 1L; + } + return OBJECT_REFERENCE_WEIGHT; + } + + private static long charSequenceWeight(CharSequence value) { + if (value == null) { + return 0L; + } + return STRING_BASE_WEIGHT + (long) value.length() * CHAR_BYTES; + } + + private static long stringWeight(String value) { + if (value == null) { + return 0L; + } + return STRING_BASE_WEIGHT + (long) value.length() * CHAR_BYTES; + } + + private static long byteBufferWeight(ByteBuffer buffer) { + if (buffer == null) { + return 0L; + } + return BYTE_BUFFER_BASE_WEIGHT + buffer.remaining(); + } + + private static long optionalLongWeight(Long value) { + return value == null ? 0L : LONG_OBJECT_WEIGHT; + } + + private static long optionalIntWeight(Integer value) { + return value == null ? 0L : INT_OBJECT_WEIGHT; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java new file mode 100644 index 00000000000000..be919c5d3134fb --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCache.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.doris.common.Config; +import org.apache.doris.datasource.CacheException; + +import com.github.benmanes.caffeine.cache.CacheLoader; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.time.Duration; +import java.util.Optional; +import java.util.concurrent.Callable; + +/** + * A lightweight manifest cache that stores parsed DataFile/DeleteFile lists per manifest. 
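+ * Entries are keyed by the manifest path (manifest files are immutable in Iceberg), weighed by
+ * the estimated size of the parsed files, bounded by Config.iceberg_manifest_cache_capacity_mb,
+ * and optionally expired after access via Config.iceberg_manifest_cache_ttl_sec.
+ *
+ * A minimal usage sketch with the classes added in this patch (IcebergScanNode goes through
+ * IcebergManifestCacheLoader, which wraps the same get-or-load call):
+ * {@code
+ *   IcebergManifestCache cache = IcebergUtils.getManifestCache();
+ *   ManifestCacheValue value =
+ *       IcebergManifestCacheLoader.loadDataFilesWithCache(cache, manifest, table);
+ *   List<DataFile> dataFiles = value.getDataFiles();
+ * }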
+ */ +public class IcebergManifestCache { + private static final Logger LOG = LogManager.getLogger(IcebergManifestCache.class); + + private final LoadingCache cache; + + public IcebergManifestCache() { + long capacityInBytes = Config.iceberg_manifest_cache_capacity_mb * 1024L * 1024L; + Weigher weigher = (key, value) -> { + long weight = Optional.ofNullable(value).map(ManifestCacheValue::getWeightBytes).orElse(0L); + if (weight > Integer.MAX_VALUE) { + return Integer.MAX_VALUE; + } + return (int) weight; + }; + Caffeine builder = Caffeine.newBuilder() + .maximumWeight(capacityInBytes) + .weigher(weigher); + if (Config.iceberg_manifest_cache_ttl_sec > 0) { + builder = builder.expireAfterAccess(Duration.ofSeconds(Config.iceberg_manifest_cache_ttl_sec)); + } + cache = builder.build(new CacheLoader() { + @Override + public ManifestCacheValue load(ManifestCacheKey key) { + throw new CacheException("Manifest cache loader should be provided explicitly for key %s", null, key); + } + }); + } + + public ManifestCacheValue get(ManifestCacheKey key, Callable loader) { + try { + return cache.get(key, ignored -> { + try { + return loader.call(); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } catch (Exception e) { + throw new CacheException("Failed to load manifest cache for key %s", e, key); + } + } + + public Optional peek(ManifestCacheKey key) { + return Optional.ofNullable(cache.getIfPresent(key)); + } + + public void invalidateByPath(String path) { + cache.asMap().keySet().stream() + .filter(key -> key.getPath().equals(path)) + .forEach(cache::invalidate); + } + + public void invalidateAll() { + cache.invalidateAll(); + } + + public ManifestCacheKey buildKey(String path) { + return new ManifestCacheKey(path); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java new file mode 100644 index 00000000000000..dc4d16da61b60a --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/IcebergManifestCacheLoader.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.doris.datasource.CacheException; + +import com.google.common.collect.Lists; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestReader; +import org.apache.iceberg.Table; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.util.List; + +/** + * Helper to load manifest content and populate the manifest cache. + */ +public class IcebergManifestCacheLoader { + private static final Logger LOG = LogManager.getLogger(IcebergManifestCacheLoader.class); + + private IcebergManifestCacheLoader() { + } + + public static ManifestCacheValue loadDataFilesWithCache(IcebergManifestCache cache, ManifestFile manifest, + Table table) { + ManifestCacheKey key = buildKey(cache, manifest); + return cache.get(key, () -> loadDataFiles(manifest, table)); + } + + public static ManifestCacheValue loadDeleteFilesWithCache(IcebergManifestCache cache, ManifestFile manifest, + Table table) { + ManifestCacheKey key = buildKey(cache, manifest); + return cache.get(key, () -> loadDeleteFiles(manifest, table)); + } + + private static ManifestCacheValue loadDataFiles(ManifestFile manifest, Table table) { + List dataFiles = Lists.newArrayList(); + try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { + // ManifestReader implements CloseableIterable, iterate directly + for (DataFile dataFile : reader) { + dataFiles.add(dataFile.copy()); + } + } catch (IOException e) { + LOG.warn("Failed to read data manifest {}", manifest.path(), e); + throw new CacheException("Failed to read data manifest %s", e, manifest.path()); + } + return ManifestCacheValue.forDataFiles(dataFiles); + } + + private static ManifestCacheValue loadDeleteFiles(ManifestFile manifest, Table table) { + List deleteFiles = Lists.newArrayList(); + try (ManifestReader reader = ManifestFiles.readDeleteManifest(manifest, table.io(), + table.specs())) { + // ManifestReader implements CloseableIterable, iterate directly + for (DeleteFile deleteFile : reader) { + deleteFiles.add(deleteFile.copy()); + } + } catch (IOException e) { + LOG.warn("Failed to read delete manifest {}", manifest.path(), e); + throw new CacheException("Failed to read delete manifest %s", e, manifest.path()); + } + return ManifestCacheValue.forDeleteFiles(deleteFiles); + } + + private static ManifestCacheKey buildKey(IcebergManifestCache cache, ManifestFile manifest) { + // Iceberg manifest files are immutable, so path uniquely identifies a manifest + return cache.buildKey(manifest.path()); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java new file mode 100644 index 00000000000000..41b52187aec3f5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheKey.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import java.util.Objects; + +/** + * Cache key for a single Iceberg manifest file. + * Since Iceberg manifest files are immutable, path uniquely identifies a manifest. + */ +public class ManifestCacheKey { + private final String path; + + public ManifestCacheKey(String path) { + this.path = path; + } + + public String getPath() { + return path; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ManifestCacheKey)) { + return false; + } + ManifestCacheKey that = (ManifestCacheKey) o; + return Objects.equals(path, that.path); + } + + @Override + public int hashCode() { + return Objects.hash(path); + } + + @Override + public String toString() { + return "ManifestCacheKey{path='" + path + "'}"; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java new file mode 100644 index 00000000000000..0c7c9154639d6e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/cache/ManifestCacheValue.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.datasource.iceberg.cache; + +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; + +import java.util.Collections; +import java.util.List; + +/** + * Cached manifest payload containing parsed files and an estimated weight. + */ +public class ManifestCacheValue { + private final List dataFiles; + private final List deleteFiles; + private final long weightBytes; + + private ManifestCacheValue(List dataFiles, List deleteFiles, long weightBytes) { + this.dataFiles = dataFiles == null ? Collections.emptyList() : dataFiles; + this.deleteFiles = deleteFiles == null ? 
Collections.emptyList() : deleteFiles; + this.weightBytes = weightBytes; + } + + public static ManifestCacheValue forDataFiles(List dataFiles) { + return new ManifestCacheValue(dataFiles, Collections.emptyList(), + estimateWeight(dataFiles, Collections.emptyList())); + } + + public static ManifestCacheValue forDeleteFiles(List deleteFiles) { + return new ManifestCacheValue(Collections.emptyList(), deleteFiles, + estimateWeight(Collections.emptyList(), deleteFiles)); + } + + public List getDataFiles() { + return dataFiles; + } + + public List getDeleteFiles() { + return deleteFiles; + } + + public long getWeightBytes() { + return weightBytes; + } + + private static long estimateWeight(List dataFiles, List deleteFiles) { + return ContentFileEstimater.estimate(dataFiles) + ContentFileEstimater.estimate(deleteFiles); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index f5208397a0f324..0ffe86edb315d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -23,6 +23,7 @@ import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.UserException; import org.apache.doris.common.security.authentication.ExecutionAuthenticator; @@ -38,6 +39,9 @@ import org.apache.doris.datasource.iceberg.IcebergExternalCatalog; import org.apache.doris.datasource.iceberg.IcebergExternalTable; import org.apache.doris.datasource.iceberg.IcebergUtils; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCache; +import org.apache.doris.datasource.iceberg.cache.IcebergManifestCacheLoader; +import org.apache.doris.datasource.iceberg.cache.ManifestCacheValue; import org.apache.doris.datasource.property.storage.StorageProperties; import org.apache.doris.nereids.exceptions.NotSupportedException; import org.apache.doris.planner.PlanNodeId; @@ -57,18 +61,27 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.iceberg.BaseFileScanTask; import org.apache.iceberg.BaseTable; import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.DeleteFileIndex; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestContent; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.PartitionData; import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.TableScan; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.InclusiveMetricsEvaluator; +import org.apache.iceberg.expressions.ManifestEvaluator; +import org.apache.iceberg.expressions.ResidualEvaluator; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.types.Conversions; @@ -78,9 +91,12 @@ import java.io.IOException; import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.OptionalLong; import java.util.concurrent.CompletableFuture; @@ -358,8 +374,136 @@ public TableScan createTableScan() throws UserException { } private CloseableIterable planFileScanTask(TableScan scan) { + if (!Config.iceberg_manifest_cache_enable) { + long targetSplitSize = getRealFileSplitSize(0); + return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + } + try { + return planFileScanTaskWithManifestCache(scan); + } catch (Exception e) { + LOG.warn("Plan with manifest cache failed, fallback to original scan: {}", e.getMessage()); + long targetSplitSize = getRealFileSplitSize(0); + return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + } + } + + private CloseableIterable planFileScanTaskWithManifestCache(TableScan scan) throws IOException { + // Get the snapshot from the scan; return empty if no snapshot exists + Snapshot snapshot = scan.snapshot(); + if (snapshot == null) { + return CloseableIterable.withNoopClose(Collections.emptyList()); + } + + // Initialize manifest cache for efficient manifest file access + IcebergManifestCache cache = IcebergUtils.getManifestCache(); + + // Convert query conjuncts to Iceberg filter expression + // This combines all predicates with AND logic for partition/file pruning + Expression filterExpr = conjuncts.stream() + .map(conjunct -> IcebergUtils.convertToIcebergExpr(conjunct, icebergTable.schema())) + .filter(Objects::nonNull) + .reduce(Expressions.alwaysTrue(), Expressions::and); + + // Get all partition specs by their IDs for later use + Map specsById = icebergTable.specs(); + boolean caseSensitive = true; + + // Create residual evaluators for each partition spec + // Residual evaluators compute the remaining filter expression after partition pruning + Map residualEvaluators = new HashMap<>(); + specsById.forEach((id, spec) -> residualEvaluators.put(id, + ResidualEvaluator.of(spec, filterExpr == null ? Expressions.alwaysTrue() : filterExpr, + caseSensitive))); + + // Create metrics evaluator for file-level pruning based on column statistics + InclusiveMetricsEvaluator metricsEvaluator = filterExpr == null ? null + : new InclusiveMetricsEvaluator(icebergTable.schema(), filterExpr, caseSensitive); + + // ========== Phase 1: Load delete files from delete manifests ========== + List deleteFiles = new ArrayList<>(); + List deleteManifests = snapshot.deleteManifests(icebergTable.io()); + for (ManifestFile manifest : deleteManifests) { + // Skip non-delete manifests + if (manifest.content() != ManifestContent.DELETES) { + continue; + } + // Get the partition spec for this manifest + PartitionSpec spec = specsById.get(manifest.partitionSpecId()); + if (spec == null) { + continue; + } + // Create manifest evaluator for partition-level pruning + ManifestEvaluator evaluator = filterExpr == null ? 
null + : ManifestEvaluator.forPartitionFilter(filterExpr, spec, caseSensitive); + // Skip manifest if it doesn't match the filter expression (partition pruning) + if (evaluator != null && !evaluator.eval(manifest)) { + continue; + } + // Load delete files from cache (or from storage if not cached) + ManifestCacheValue value = IcebergManifestCacheLoader.loadDeleteFilesWithCache(cache, manifest, + icebergTable); + deleteFiles.addAll(value.getDeleteFiles()); + } + + // Build delete file index for efficient lookup of deletes applicable to each data file + DeleteFileIndex deleteIndex = DeleteFileIndex.builderFor(deleteFiles) + .specsById(specsById) + .caseSensitive(caseSensitive) + .build(); + + // ========== Phase 2: Load data files and create scan tasks ========== + List tasks = new ArrayList<>(); + try (CloseableIterable dataManifests = + IcebergUtils.getMatchingManifest(snapshot.dataManifests(icebergTable.io()), + specsById, filterExpr)) { + for (ManifestFile manifest : dataManifests) { + // Skip non-data manifests + if (manifest.content() != ManifestContent.DATA) { + continue; + } + // Get the partition spec for this manifest + PartitionSpec spec = specsById.get(manifest.partitionSpecId()); + if (spec == null) { + continue; + } + // Get the residual evaluator for this partition spec + ResidualEvaluator residualEvaluator = residualEvaluators.get(manifest.partitionSpecId()); + + // Load data files from cache (or from storage if not cached) + ManifestCacheValue value = IcebergManifestCacheLoader.loadDataFilesWithCache(cache, manifest, + icebergTable); + + // Process each data file in the manifest + for (org.apache.iceberg.DataFile dataFile : value.getDataFiles()) { + // Skip file if column statistics indicate no matching rows (metrics-based pruning) + if (metricsEvaluator != null && !metricsEvaluator.eval(dataFile)) { + continue; + } + // Skip file if partition values don't match the residual filter + if (residualEvaluator != null) { + if (residualEvaluator.residualFor(dataFile.partition()).equals(Expressions.alwaysFalse())) { + continue; + } + } + // Find all delete files that apply to this data file based on sequence number + List deletes = Arrays.asList( + deleteIndex.forDataFile(dataFile.dataSequenceNumber(), dataFile)); + + // Create a FileScanTask containing the data file, associated deletes, and metadata + tasks.add(new BaseFileScanTask( + dataFile, + deletes.toArray(new DeleteFile[0]), + SchemaParser.toJson(icebergTable.schema()), + PartitionSpecParser.toJson(spec), + residualEvaluator == null ? 
ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()) + : residualEvaluator)); + } + } + } + + // Split tasks into smaller chunks based on target split size for parallel processing long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } private Split createIcebergSplit(FileScanTask fileScanTask) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java index 2cc829c87433f0..88def12d2a599c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/metastore/AbstractIcebergProperties.java @@ -43,6 +43,43 @@ public abstract class AbstractIcebergProperties extends MetastoreProperties { ) protected String warehouse; + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_ENABLED}, + required = false, + description = "Controls whether to use caching during manifest reads or not. Default: false." + ) + protected String ioManifestCacheEnabled; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS}, + required = false, + description = "Controls the maximum duration for which an entry stays in the manifest cache. " + + "Must be a non-negative value. Zero means entries expire only due to memory pressure. " + + "Default: 60000 (60s)." + ) + protected String ioManifestCacheExpirationIntervalMs; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES}, + required = false, + description = "Controls the maximum total amount of bytes to cache in manifest cache. " + + "Must be a positive value. Default: 104857600 (100MB)." + ) + protected String ioManifestCacheMaxTotalBytes; + + @Getter + @ConnectorProperty( + names = {CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH}, + required = false, + description = "Controls the maximum length of file to be considered for caching. " + + "An InputFile will not be cached if the length is longer than this limit. " + + "Must be a positive value. Default: 8388608 (8MB)." + ) + protected String ioManifestCacheMaxContentLength; + @Getter protected ExecutionAuthenticator executionAuthenticator = new ExecutionAuthenticator(){}; @@ -80,6 +117,9 @@ public final Catalog initializeCatalog(String catalogName, List catalogProps) { + if (StringUtils.isNotBlank(ioManifestCacheEnabled)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_ENABLED, ioManifestCacheEnabled); + } + if (StringUtils.isNotBlank(ioManifestCacheExpirationIntervalMs)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS, + ioManifestCacheExpirationIntervalMs); + } + if (StringUtils.isNotBlank(ioManifestCacheMaxTotalBytes)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES, ioManifestCacheMaxTotalBytes); + } + if (StringUtils.isNotBlank(ioManifestCacheMaxContentLength)) { + catalogProps.put(CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH, ioManifestCacheMaxContentLength); + } + } + /** * Subclasses must implement this to create the concrete Catalog instance. 
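 * (As added in this patch, the io.manifest.cache.* options declared above are forwarded into
 * catalogProps when they are set, so concrete catalog implementations pick them up without any
 * extra wiring.)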
*/ diff --git a/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java b/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java new file mode 100644 index 00000000000000..5c9cdd93c45f93 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/iceberg/DeleteFileIndex.java @@ -0,0 +1,906 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ManifestEvaluator; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.metrics.ScanMetrics; +import org.apache.iceberg.metrics.ScanMetricsUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.ContentFileUtil; +import org.apache.iceberg.util.PartitionMap; +import org.apache.iceberg.util.PartitionSet; +import org.apache.iceberg.util.Tasks; + +/** + * An index of {@link DeleteFile delete files} by sequence number. + * + *
<p>
Use {@link #builderFor(FileIO, Iterable)} to construct an index, and {@link #forDataFile(long, + * DataFile)} or {@link #forEntry(ManifestEntry)} to get the delete files to apply to a given data + * file. + * + * Copyed from https://github.com/apache/iceberg/blob/apache-iceberg-1.9.1/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java + * Change DeleteFileIndex and some methods to public. + */ +public class DeleteFileIndex { + private static final DeleteFile[] EMPTY_DELETES = new DeleteFile[0]; + + private final EqualityDeletes globalDeletes; + private final PartitionMap eqDeletesByPartition; + private final PartitionMap posDeletesByPartition; + private final Map posDeletesByPath; + private final Map dvByPath; + private final boolean hasEqDeletes; + private final boolean hasPosDeletes; + private final boolean isEmpty; + + private DeleteFileIndex( + EqualityDeletes globalDeletes, + PartitionMap eqDeletesByPartition, + PartitionMap posDeletesByPartition, + Map posDeletesByPath, + Map dvByPath) { + this.globalDeletes = globalDeletes; + this.eqDeletesByPartition = eqDeletesByPartition; + this.posDeletesByPartition = posDeletesByPartition; + this.posDeletesByPath = posDeletesByPath; + this.dvByPath = dvByPath; + this.hasEqDeletes = globalDeletes != null || eqDeletesByPartition != null; + this.hasPosDeletes = + posDeletesByPartition != null || posDeletesByPath != null || dvByPath != null; + this.isEmpty = !hasEqDeletes && !hasPosDeletes; + } + + public boolean isEmpty() { + return isEmpty; + } + + public boolean hasEqualityDeletes() { + return hasEqDeletes; + } + + public boolean hasPositionDeletes() { + return hasPosDeletes; + } + + public Iterable referencedDeleteFiles() { + Iterable deleteFiles = Collections.emptyList(); + + if (globalDeletes != null) { + deleteFiles = Iterables.concat(deleteFiles, globalDeletes.referencedDeleteFiles()); + } + + if (eqDeletesByPartition != null) { + for (EqualityDeletes deletes : eqDeletesByPartition.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (posDeletesByPartition != null) { + for (PositionDeletes deletes : posDeletesByPartition.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (posDeletesByPath != null) { + for (PositionDeletes deletes : posDeletesByPath.values()) { + deleteFiles = Iterables.concat(deleteFiles, deletes.referencedDeleteFiles()); + } + } + + if (dvByPath != null) { + deleteFiles = Iterables.concat(deleteFiles, dvByPath.values()); + } + + return deleteFiles; + } + + DeleteFile[] forEntry(ManifestEntry entry) { + return forDataFile(entry.dataSequenceNumber(), entry.file()); + } + + public DeleteFile[] forDataFile(DataFile file) { + return forDataFile(file.dataSequenceNumber(), file); + } + + public DeleteFile[] forDataFile(long sequenceNumber, DataFile file) { + if (isEmpty) { + return EMPTY_DELETES; + } + + DeleteFile[] global = findGlobalDeletes(sequenceNumber, file); + DeleteFile[] eqPartition = findEqPartitionDeletes(sequenceNumber, file); + DeleteFile dv = findDV(sequenceNumber, file); + if (dv != null && global == null && eqPartition == null) { + return new DeleteFile[] {dv}; + } else if (dv != null) { + return concat(global, eqPartition, new DeleteFile[] {dv}); + } else { + DeleteFile[] posPartition = findPosPartitionDeletes(sequenceNumber, file); + DeleteFile[] posPath = findPathDeletes(sequenceNumber, file); + return concat(global, eqPartition, posPartition, posPath); + } + } + + private DeleteFile[] 
findGlobalDeletes(long seq, DataFile dataFile) { + return globalDeletes == null ? EMPTY_DELETES : globalDeletes.filter(seq, dataFile); + } + + private DeleteFile[] findPosPartitionDeletes(long seq, DataFile dataFile) { + if (posDeletesByPartition == null) { + return EMPTY_DELETES; + } + + PositionDeletes deletes = posDeletesByPartition.get(dataFile.specId(), dataFile.partition()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq); + } + + private DeleteFile[] findEqPartitionDeletes(long seq, DataFile dataFile) { + if (eqDeletesByPartition == null) { + return EMPTY_DELETES; + } + + EqualityDeletes deletes = eqDeletesByPartition.get(dataFile.specId(), dataFile.partition()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq, dataFile); + } + + @SuppressWarnings("CollectionUndefinedEquality") + private DeleteFile[] findPathDeletes(long seq, DataFile dataFile) { + if (posDeletesByPath == null) { + return EMPTY_DELETES; + } + + PositionDeletes deletes = posDeletesByPath.get(dataFile.location()); + return deletes == null ? EMPTY_DELETES : deletes.filter(seq); + } + + private DeleteFile findDV(long seq, DataFile dataFile) { + if (dvByPath == null) { + return null; + } + + DeleteFile dv = dvByPath.get(dataFile.location()); + if (dv != null) { + ValidationException.check( + dv.dataSequenceNumber() >= seq, + "DV data sequence number (%s) must be greater than or equal to data file sequence number (%s)", + dv.dataSequenceNumber(), + seq); + } + return dv; + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private static boolean canContainEqDeletesForFile( + DataFile dataFile, EqualityDeleteFile deleteFile) { + Map dataLowers = dataFile.lowerBounds(); + Map dataUppers = dataFile.upperBounds(); + + // whether to check data ranges or to assume that the ranges match + // if upper/lower bounds are missing, null counts may still be used to determine delete files + // can be skipped + boolean checkRanges = + dataLowers != null && dataUppers != null && deleteFile.hasLowerAndUpperBounds(); + + Map dataNullCounts = dataFile.nullValueCounts(); + Map dataValueCounts = dataFile.valueCounts(); + Map deleteNullCounts = deleteFile.nullValueCounts(); + Map deleteValueCounts = deleteFile.valueCounts(); + + for (Types.NestedField field : deleteFile.equalityFields()) { + if (!field.type().isPrimitiveType()) { + // stats are not kept for nested types. 
assume that the delete file may match + continue; + } + + if (containsNull(dataNullCounts, field) && containsNull(deleteNullCounts, field)) { + // the data has null values and null has been deleted, so the deletes must be applied + continue; + } + + if (allNull(dataNullCounts, dataValueCounts, field) && allNonNull(deleteNullCounts, field)) { + // the data file contains only null values for this field, but there are no deletes for null + // values + return false; + } + + if (allNull(deleteNullCounts, deleteValueCounts, field) + && allNonNull(dataNullCounts, field)) { + // the delete file removes only null rows with null for this field, but there are no data + // rows with null + return false; + } + + if (!checkRanges) { + // some upper and lower bounds are missing, assume they match + continue; + } + + int id = field.fieldId(); + ByteBuffer dataLower = dataLowers.get(id); + ByteBuffer dataUpper = dataUppers.get(id); + Object deleteLower = deleteFile.lowerBound(id); + Object deleteUpper = deleteFile.upperBound(id); + if (dataLower == null || dataUpper == null || deleteLower == null || deleteUpper == null) { + // at least one bound is not known, assume the delete file may match + continue; + } + + if (!rangesOverlap(field, dataLower, dataUpper, deleteLower, deleteUpper)) { + // no values overlap between the data file and the deletes + return false; + } + } + + return true; + } + + private static boolean rangesOverlap( + Types.NestedField field, + ByteBuffer dataLowerBuf, + ByteBuffer dataUpperBuf, + T deleteLower, + T deleteUpper) { + Type.PrimitiveType type = field.type().asPrimitiveType(); + Comparator comparator = Comparators.forType(type); + + T dataLower = Conversions.fromByteBuffer(type, dataLowerBuf); + if (comparator.compare(dataLower, deleteUpper) > 0) { + return false; + } + + T dataUpper = Conversions.fromByteBuffer(type, dataUpperBuf); + if (comparator.compare(deleteLower, dataUpper) > 0) { + return false; + } + + return true; + } + + private static boolean allNonNull(Map nullValueCounts, Types.NestedField field) { + if (field.isRequired()) { + return true; + } + + if (nullValueCounts == null) { + return false; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + if (nullValueCount == null) { + return false; + } + + return nullValueCount <= 0; + } + + private static boolean allNull( + Map nullValueCounts, Map valueCounts, Types.NestedField field) { + if (field.isRequired()) { + return false; + } + + if (nullValueCounts == null || valueCounts == null) { + return false; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + Long valueCount = valueCounts.get(field.fieldId()); + if (nullValueCount == null || valueCount == null) { + return false; + } + + return nullValueCount.equals(valueCount); + } + + private static boolean containsNull(Map nullValueCounts, Types.NestedField field) { + if (field.isRequired()) { + return false; + } + + if (nullValueCounts == null) { + return true; + } + + Long nullValueCount = nullValueCounts.get(field.fieldId()); + if (nullValueCount == null) { + return true; + } + + return nullValueCount > 0; + } + + static Builder builderFor(FileIO io, Iterable deleteManifests) { + return new Builder(io, Sets.newHashSet(deleteManifests)); + } + + // changed to public method. + public static Builder builderFor(Iterable deleteFiles) { + return new Builder(deleteFiles); + } + + // changed to public class. 
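// A minimal usage sketch of the now-public builder API (the deleteFiles, specsById and dataFile
// variables are assumed to be available in the caller; this mirrors the class-level javadoc):
//
//     DeleteFileIndex index = DeleteFileIndex.builderFor(deleteFiles)
//             .specsById(specsById)
//             .caseSensitive(true)
//             .build();
//     DeleteFile[] deletes = index.forDataFile(dataFile);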
+ public static class Builder { + private final FileIO io; + private final Set deleteManifests; + private final Iterable deleteFiles; + private long minSequenceNumber = 0L; + private Map specsById = null; + private Expression dataFilter = Expressions.alwaysTrue(); + private Expression partitionFilter = Expressions.alwaysTrue(); + private PartitionSet partitionSet = null; + private boolean caseSensitive = true; + private ExecutorService executorService = null; + private ScanMetrics scanMetrics = ScanMetrics.noop(); + private boolean ignoreResiduals = false; + + Builder(FileIO io, Set deleteManifests) { + this.io = io; + this.deleteManifests = Sets.newHashSet(deleteManifests); + this.deleteFiles = null; + } + + Builder(Iterable deleteFiles) { + this.io = null; + this.deleteManifests = null; + this.deleteFiles = deleteFiles; + } + + Builder afterSequenceNumber(long seq) { + this.minSequenceNumber = seq; + return this; + } + + public Builder specsById(Map newSpecsById) { + this.specsById = newSpecsById; + return this; + } + + Builder filterData(Expression newDataFilter) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support data filters"); + this.dataFilter = Expressions.and(dataFilter, newDataFilter); + return this; + } + + Builder filterPartitions(Expression newPartitionFilter) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support partition filters"); + this.partitionFilter = Expressions.and(partitionFilter, newPartitionFilter); + return this; + } + + Builder filterPartitions(PartitionSet newPartitionSet) { + Preconditions.checkArgument( + deleteFiles == null, "Index constructed from files does not support partition filters"); + this.partitionSet = newPartitionSet; + return this; + } + + public Builder caseSensitive(boolean newCaseSensitive) { + this.caseSensitive = newCaseSensitive; + return this; + } + + Builder planWith(ExecutorService newExecutorService) { + this.executorService = newExecutorService; + return this; + } + + Builder scanMetrics(ScanMetrics newScanMetrics) { + this.scanMetrics = newScanMetrics; + return this; + } + + Builder ignoreResiduals() { + this.ignoreResiduals = true; + return this; + } + + private Iterable filterDeleteFiles() { + return Iterables.filter(deleteFiles, file -> file.dataSequenceNumber() > minSequenceNumber); + } + + private Collection loadDeleteFiles() { + // read all of the matching delete manifests in parallel and accumulate the matching files in + // a queue + Queue files = new ConcurrentLinkedQueue<>(); + Tasks.foreach(deleteManifestReaders()) + .stopOnFailure() + .throwFailureWhenFinished() + .executeWith(executorService) + .run( + deleteFile -> { + try (CloseableIterable> reader = deleteFile) { + for (ManifestEntry entry : reader) { + if (entry.dataSequenceNumber() > minSequenceNumber) { + // copy with stats for better filtering against data file stats + files.add(entry.file().copy()); + } + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to close"); + } + }); + return files; + } + + public DeleteFileIndex build() { + Iterable files = deleteFiles != null ? 
filterDeleteFiles() : loadDeleteFiles(); + + EqualityDeletes globalDeletes = new EqualityDeletes(); + PartitionMap eqDeletesByPartition = PartitionMap.create(specsById); + PartitionMap posDeletesByPartition = PartitionMap.create(specsById); + Map posDeletesByPath = Maps.newHashMap(); + Map dvByPath = Maps.newHashMap(); + + for (DeleteFile file : files) { + switch (file.content()) { + case POSITION_DELETES: + if (ContentFileUtil.isDV(file)) { + add(dvByPath, file); + } else { + add(posDeletesByPath, posDeletesByPartition, file); + } + break; + case EQUALITY_DELETES: + add(globalDeletes, eqDeletesByPartition, file); + break; + default: + throw new UnsupportedOperationException("Unsupported content: " + file.content()); + } + ScanMetricsUtil.indexedDeleteFile(scanMetrics, file); + } + + return new DeleteFileIndex( + globalDeletes.isEmpty() ? null : globalDeletes, + eqDeletesByPartition.isEmpty() ? null : eqDeletesByPartition, + posDeletesByPartition.isEmpty() ? null : posDeletesByPartition, + posDeletesByPath.isEmpty() ? null : posDeletesByPath, + dvByPath.isEmpty() ? null : dvByPath); + } + + private void add(Map dvByPath, DeleteFile dv) { + String path = dv.referencedDataFile(); + DeleteFile existingDV = dvByPath.putIfAbsent(path, dv); + if (existingDV != null) { + throw new ValidationException( + "Can't index multiple DVs for %s: %s and %s", + path, ContentFileUtil.dvDesc(dv), ContentFileUtil.dvDesc(existingDV)); + } + } + + private void add( + Map deletesByPath, + PartitionMap deletesByPartition, + DeleteFile file) { + String path = ContentFileUtil.referencedDataFileLocation(file); + + PositionDeletes deletes; + if (path != null) { + deletes = deletesByPath.computeIfAbsent(path, ignored -> new PositionDeletes()); + } else { + int specId = file.specId(); + StructLike partition = file.partition(); + deletes = deletesByPartition.computeIfAbsent(specId, partition, PositionDeletes::new); + } + + deletes.add(file); + } + + private void add( + EqualityDeletes globalDeletes, + PartitionMap deletesByPartition, + DeleteFile file) { + PartitionSpec spec = specsById.get(file.specId()); + + EqualityDeletes deletes; + if (spec.isUnpartitioned()) { + deletes = globalDeletes; + } else { + int specId = spec.specId(); + StructLike partition = file.partition(); + deletes = deletesByPartition.computeIfAbsent(specId, partition, EqualityDeletes::new); + } + + deletes.add(spec, file); + } + + private Iterable>> deleteManifestReaders() { + Expression entryFilter = ignoreResiduals ? Expressions.alwaysTrue() : dataFilter; + + LoadingCache partExprCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return Projections.inclusive(spec, caseSensitive).project(dataFilter); + }); + + LoadingCache evalCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return ManifestEvaluator.forPartitionFilter( + Expressions.and(partitionFilter, partExprCache.get(specId)), + spec, + caseSensitive); + }); + + CloseableIterable closeableDeleteManifests = + CloseableIterable.withNoopClose(deleteManifests); + CloseableIterable matchingManifests = + evalCache == null + ? 
closeableDeleteManifests + : CloseableIterable.filter( + scanMetrics.skippedDeleteManifests(), + closeableDeleteManifests, + manifest -> + manifest.content() == ManifestContent.DELETES + && (manifest.hasAddedFiles() || manifest.hasExistingFiles()) + && evalCache.get(manifest.partitionSpecId()).eval(manifest)); + + matchingManifests = + CloseableIterable.count(scanMetrics.scannedDeleteManifests(), matchingManifests); + return Iterables.transform( + matchingManifests, + manifest -> + ManifestFiles.readDeleteManifest(manifest, io, specsById) + .filterRows(entryFilter) + .filterPartitions( + Expressions.and( + partitionFilter, partExprCache.get(manifest.partitionSpecId()))) + .filterPartitions(partitionSet) + .caseSensitive(caseSensitive) + .scanMetrics(scanMetrics) + .liveEntries()); + } + } + + /** + * Finds an index in the sorted array of sequence numbers where the given sequence number should + * be inserted or is found. + * + *
<p>
If the sequence number is present in the array, this method returns the index of the first + * occurrence of the sequence number. If the sequence number is not present, the method returns + * the index where the sequence number would be inserted while maintaining the sorted order of the + * array. This returned index ranges from 0 (inclusive) to the length of the array (inclusive). + * + *
<p>
This method is used to determine the subset of delete files that apply to a given data file. + * + * @param seqs an array of sequence numbers sorted in ascending order + * @param seq the sequence number to search for + * @return the index of the first occurrence or the insertion point + */ + private static int findStartIndex(long[] seqs, long seq) { + int pos = Arrays.binarySearch(seqs, seq); + int start; + if (pos < 0) { + // the sequence number was not found, where it would be inserted is -(pos + 1) + start = -(pos + 1); + } else { + // the sequence number was found, but may not be the first + // find the first delete file with the given sequence number by decrementing the position + start = pos; + while (start > 0 && seqs[start - 1] >= seq) { + start -= 1; + } + } + + return start; + } + + private static DeleteFile[] concat(DeleteFile[]... deletes) { + return ArrayUtil.concat(DeleteFile.class, deletes); + } + + // a group of position delete files sorted by the sequence number they apply to + static class PositionDeletes { + private static final Comparator SEQ_COMPARATOR = + Comparator.comparingLong(DeleteFile::dataSequenceNumber); + + // indexed state + private long[] seqs = null; + private DeleteFile[] files = null; + + // a buffer that is used to hold files before indexing + private volatile List buffer = Lists.newArrayList(); + + public void add(DeleteFile file) { + Preconditions.checkState(buffer != null, "Can't add files upon indexing"); + buffer.add(file); + } + + public DeleteFile[] filter(long seq) { + indexIfNeeded(); + + int start = findStartIndex(seqs, seq); + + if (start >= files.length) { + return EMPTY_DELETES; + } + + if (start == 0) { + return files; + } + + int matchingFilesCount = files.length - start; + DeleteFile[] matchingFiles = new DeleteFile[matchingFilesCount]; + System.arraycopy(files, start, matchingFiles, 0, matchingFilesCount); + return matchingFiles; + } + + public Iterable referencedDeleteFiles() { + indexIfNeeded(); + return Arrays.asList(files); + } + + public boolean isEmpty() { + indexIfNeeded(); + return files.length == 0; + } + + private void indexIfNeeded() { + if (buffer != null) { + synchronized (this) { + if (buffer != null) { + this.files = indexFiles(buffer); + this.seqs = indexSeqs(files); + this.buffer = null; + } + } + } + } + + private static DeleteFile[] indexFiles(List list) { + DeleteFile[] array = list.toArray(EMPTY_DELETES); + Arrays.sort(array, SEQ_COMPARATOR); + return array; + } + + private static long[] indexSeqs(DeleteFile[] files) { + long[] seqs = new long[files.length]; + + for (int index = 0; index < files.length; index++) { + seqs[index] = files[index].dataSequenceNumber(); + } + + return seqs; + } + } + + // a group of equality delete files sorted by the sequence number they apply to + static class EqualityDeletes { + private static final Comparator SEQ_COMPARATOR = + Comparator.comparingLong(EqualityDeleteFile::applySequenceNumber); + private static final EqualityDeleteFile[] EMPTY_EQUALITY_DELETES = new EqualityDeleteFile[0]; + + // indexed state + private long[] seqs = null; + private EqualityDeleteFile[] files = null; + + // a buffer that is used to hold files before indexing + private volatile List buffer = Lists.newArrayList(); + + public void add(PartitionSpec spec, DeleteFile file) { + Preconditions.checkState(buffer != null, "Can't add files upon indexing"); + buffer.add(new EqualityDeleteFile(spec, file)); + } + + public DeleteFile[] filter(long seq, DataFile dataFile) { + indexIfNeeded(); + + int start = 
findStartIndex(seqs, seq); + + if (start >= files.length) { + return EMPTY_DELETES; + } + + List matchingFiles = Lists.newArrayList(); + + for (int index = start; index < files.length; index++) { + EqualityDeleteFile file = files[index]; + if (canContainEqDeletesForFile(dataFile, file)) { + matchingFiles.add(file.wrapped()); + } + } + + return matchingFiles.toArray(EMPTY_DELETES); + } + + public Iterable referencedDeleteFiles() { + indexIfNeeded(); + return Iterables.transform(Arrays.asList(files), EqualityDeleteFile::wrapped); + } + + public boolean isEmpty() { + indexIfNeeded(); + return files.length == 0; + } + + private void indexIfNeeded() { + if (buffer != null) { + synchronized (this) { + if (buffer != null) { + this.files = indexFiles(buffer); + this.seqs = indexSeqs(files); + this.buffer = null; + } + } + } + } + + private static EqualityDeleteFile[] indexFiles(List list) { + EqualityDeleteFile[] array = list.toArray(EMPTY_EQUALITY_DELETES); + Arrays.sort(array, SEQ_COMPARATOR); + return array; + } + + private static long[] indexSeqs(EqualityDeleteFile[] files) { + long[] seqs = new long[files.length]; + + for (int index = 0; index < files.length; index++) { + seqs[index] = files[index].applySequenceNumber(); + } + + return seqs; + } + } + + // an equality delete file wrapper that caches the converted boundaries for faster boundary checks + // this class is not meant to be exposed beyond the delete file index + private static class EqualityDeleteFile { + private final PartitionSpec spec; + private final DeleteFile wrapped; + private final long applySequenceNumber; + private volatile List equalityFields = null; + private volatile Map convertedLowerBounds = null; + private volatile Map convertedUpperBounds = null; + + EqualityDeleteFile(PartitionSpec spec, DeleteFile file) { + this.spec = spec; + this.wrapped = file; + this.applySequenceNumber = wrapped.dataSequenceNumber() - 1; + } + + public DeleteFile wrapped() { + return wrapped; + } + + public long applySequenceNumber() { + return applySequenceNumber; + } + + public List equalityFields() { + if (equalityFields == null) { + synchronized (this) { + if (equalityFields == null) { + List fields = Lists.newArrayList(); + for (int id : wrapped.equalityFieldIds()) { + Types.NestedField field = spec.schema().findField(id); + fields.add(field); + } + this.equalityFields = fields; + } + } + } + + return equalityFields; + } + + public Map valueCounts() { + return wrapped.valueCounts(); + } + + public Map nullValueCounts() { + return wrapped.nullValueCounts(); + } + + public boolean hasLowerAndUpperBounds() { + return wrapped.lowerBounds() != null && wrapped.upperBounds() != null; + } + + @SuppressWarnings("unchecked") + public T lowerBound(int id) { + return (T) lowerBounds().get(id); + } + + private Map lowerBounds() { + if (convertedLowerBounds == null) { + synchronized (this) { + if (convertedLowerBounds == null) { + this.convertedLowerBounds = convertBounds(wrapped.lowerBounds()); + } + } + } + + return convertedLowerBounds; + } + + @SuppressWarnings("unchecked") + public T upperBound(int id) { + return (T) upperBounds().get(id); + } + + private Map upperBounds() { + if (convertedUpperBounds == null) { + synchronized (this) { + if (convertedUpperBounds == null) { + this.convertedUpperBounds = convertBounds(wrapped.upperBounds()); + } + } + } + + return convertedUpperBounds; + } + + private Map convertBounds(Map bounds) { + Map converted = Maps.newHashMap(); + + if (bounds != null) { + for (Types.NestedField field : equalityFields()) { 
+ int id = field.fieldId(); + Type type = spec.schema().findField(id).type(); + if (type.isPrimitiveType()) { + ByteBuffer bound = bounds.get(id); + if (bound != null) { + converted.put(id, Conversions.fromByteBuffer(type, bound)); + } + } + } + } + + return converted; + } + } +} From bdaf820278473d0e3a58faca1761822bfd376bb3 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Fri, 19 Dec 2025 11:45:28 +0800 Subject: [PATCH 09/12] [opt](multi-catalog) Optimize file split size. (#59175) --- .../datasource/FederationBackendPolicy.java | 12 +- .../doris/datasource/FileQueryScanNode.java | 19 +- .../apache/doris/datasource/FileScanNode.java | 10 +- .../apache/doris/datasource/FileSplitter.java | 230 ++++++++++++++++-- .../doris/datasource/SplitGenerator.java | 2 +- .../datasource/hive/source/HiveScanNode.java | 78 ++++-- .../iceberg/source/IcebergScanNode.java | 80 +++++- .../paimon/source/PaimonScanNode.java | 47 +++- .../datasource/tvf/source/TVFScanNode.java | 31 ++- .../org/apache/doris/qe/SessionVariable.java | 39 +++ .../doris/datasource/FileSplitterTest.java | 216 ++++++++++++++++ .../paimon/source/PaimonScanNodeTest.java | 17 ++ .../planner/FederationBackendPolicyTest.java | 4 +- .../hive/test_hive_compress_type.groovy | 2 +- 14 files changed, 691 insertions(+), 96 deletions(-) create mode 100644 fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java index 813d1892642167..a8927d86a946e6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java @@ -63,12 +63,16 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Random; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; public class FederationBackendPolicy { private static final Logger LOG = LogManager.getLogger(FederationBackendPolicy.class); + + private static final long FIXED_SHUFFLE_SEED = 123456789L; + protected final List backends = Lists.newArrayList(); private final Map> backendMap = Maps.newHashMap(); @@ -220,6 +224,7 @@ public void setEnableSplitsRedistribution(boolean enableSplitsRedistribution) { public Multimap computeScanRangeAssignment(List splits) throws UserException { ListMultimap assignment = ArrayListMultimap.create(); + Collections.shuffle(splits, new Random(FIXED_SHUFFLE_SEED)); List remainingSplits; List backends = new ArrayList<>(); @@ -228,8 +233,6 @@ public Multimap computeScanRangeAssignment(List splits) t } ResettableRandomizedIterator randomCandidates = new ResettableRandomizedIterator<>(backends); - boolean splitsToBeRedistributed = false; - // optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain // locality information if (Config.split_assigner_optimized_local_scheduling) { @@ -246,7 +249,6 @@ public Multimap computeScanRangeAssignment(List splits) t assignment.put(selectedBackend, split); assignedWeightPerBackend.put(selectedBackend, assignedWeightPerBackend.get(selectedBackend) + split.getSplitWeight().getRawValue()); - splitsToBeRedistributed = true; continue; } } @@ -276,7 +278,6 @@ public Multimap computeScanRangeAssignment(List splits) t case CONSISTENT_HASHING: { candidateNodes = consistentHash.getNode(split, 
Config.split_assigner_min_consistent_hash_candidate_num); - splitsToBeRedistributed = true; break; } default: { @@ -302,7 +303,7 @@ public Multimap computeScanRangeAssignment(List splits) t assignedWeightPerBackend.get(selectedBackend) + split.getSplitWeight().getRawValue()); } - if (enableSplitsRedistribution && splitsToBeRedistributed) { + if (enableSplitsRedistribution) { equateDistribution(assignment); } return assignment; @@ -499,3 +500,4 @@ public void funnel(Split split, PrimitiveSink primitiveSink) { } } } + diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 93bec2d1849b06..3ae32170e4bc76 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -94,6 +94,8 @@ public abstract class FileQueryScanNode extends FileScanNode { protected TableScanParams scanParams; + protected FileSplitter fileSplitter; + /** * External file scan node for Query hms table * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv @@ -134,6 +136,8 @@ protected void doInitialize() throws UserException { } initBackendPolicy(); initSchemaParams(); + fileSplitter = new FileSplitter(sessionVariable.maxInitialSplitSize, sessionVariable.maxSplitSize, + sessionVariable.maxInitialSplitNum); } // Init schema (Tuple/Slot) related params. @@ -592,19 +596,4 @@ public TableScanParams getScanParams() { } return this.scanParams; } - - /** - * The real file split size is determined by: - * 1. If user specify the split size in session variable `file_split_size`, use user specified value. - * 2. Otherwise, use the max value of DEFAULT_SPLIT_SIZE and block size. - * @param blockSize, got from file system, eg, hdfs - * @return the real file split size - */ - protected long getRealFileSplitSize(long blockSize) { - long realSplitSize = sessionVariable.getFileSplitSize(); - if (realSplitSize <= 0) { - realSplitSize = Math.max(DEFAULT_SPLIT_SIZE, blockSize); - } - return realSplitSize; - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java index c3e06999bba297..a7aa0f607ac504 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java @@ -62,9 +62,6 @@ * Base class for External File Scan, including external query and load. 
*/ public abstract class FileScanNode extends ExternalScanNode { - - public static final long DEFAULT_SPLIT_SIZE = 64 * 1024 * 1024; // 64MB - // For explain protected long totalFileSize = 0; protected long totalPartitionNum = 0; @@ -115,12 +112,7 @@ public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { } output.append(prefix); - boolean isBatch; - try { - isBatch = isBatchMode(); - } catch (UserException e) { - throw new RuntimeException(e); - } + boolean isBatch = isBatchMode(); if (isBatch) { output.append("(approximate)"); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java index 33b2d70bfb16a9..5fe8444197181c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplitter.java @@ -22,13 +22,19 @@ import org.apache.doris.spi.Split; import org.apache.doris.thrift.TFileCompressType; +import com.google.common.base.Preconditions; +import com.google.common.base.Verify; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; +import java.util.Arrays; import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; public class FileSplitter { private static final Logger LOG = LogManager.getLogger(FileSplitter.class); @@ -40,18 +46,66 @@ public static boolean needSplitForCountPushdown(int parallelism, int numBackends return totalFileNum < parallelism * numBackends; } - public static List splitFile( - LocationPath path, - long fileSplitSize, - BlockLocation[] blockLocations, - long length, - long modificationTime, - boolean splittable, - List partitionValues, - SplitCreator splitCreator) - throws IOException { + private long maxInitialSplitSize; + + private long maxSplitSize; + + private int maxInitialSplitNum; + private final AtomicInteger remainingInitialSplitNum; + + private long currentMaxSplitSize; + + public long getMaxInitialSplitSize() { + return maxInitialSplitSize; + } + + public void setMaxInitialSplitSize(long maxInitialSplitSize) { + this.maxInitialSplitSize = maxInitialSplitSize; + } + + public long getMaxSplitSize() { + return maxSplitSize; + } + + public void setMaxSplitSize(long maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + public int maxInitialSplitNum() { + return maxInitialSplitNum; + } + + public void setMaxInitialSplits(int maxInitialSplitNum) { + this.maxInitialSplitNum = maxInitialSplitNum; + } + + public long getRemainingInitialSplitNum() { + return remainingInitialSplitNum.get(); + } + + public FileSplitter(long maxInitialSplitSize, long maxSplitSize, int maxInitialSplitNum) { + this.maxInitialSplitSize = maxInitialSplitSize; + this.maxSplitSize = maxSplitSize; + this.maxInitialSplitNum = maxInitialSplitNum; + currentMaxSplitSize = maxInitialSplitSize; + remainingInitialSplitNum = new AtomicInteger(maxInitialSplitNum); + } + + public List splitFile( + LocationPath path, + long specifiedFileSplitSize, + BlockLocation[] blockLocations, + long length, + long modificationTime, + boolean splittable, + List partitionValues, + SplitCreator splitCreator) + throws IOException { + // Pass splitCreator.create() to set target file split size to calculate split weight. + long targetFileSplitSize = specifiedFileSplitSize > 0 ? 
specifiedFileSplitSize : maxSplitSize; if (blockLocations == null) { - blockLocations = new BlockLocation[0]; + blockLocations = new BlockLocation[1]; + blockLocations[0] = new BlockLocation(null, null, 0L, length); } List result = Lists.newArrayList(); TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.getNormalizedLocation()); @@ -60,23 +114,83 @@ public static List splitFile( LOG.debug("Path {} is not splittable.", path); } String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts(); - result.add(splitCreator.create(path, 0, length, length, fileSplitSize, + result.add(splitCreator.create(path, 0, length, length, + targetFileSplitSize, modificationTime, hosts, partitionValues)); + updateCurrentMaxSplitSize(); + return result; + } + + // if specified split size is not zero, split file by specified size + if (specifiedFileSplitSize > 0) { + long bytesRemaining; + for (bytesRemaining = length; (double) bytesRemaining / (double) specifiedFileSplitSize > 1.1D; + bytesRemaining -= specifiedFileSplitSize) { + int location = getBlockIndex(blockLocations, length - bytesRemaining); + String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); + result.add(splitCreator.create(path, length - bytesRemaining, specifiedFileSplitSize, + length, specifiedFileSplitSize, modificationTime, hosts, partitionValues)); + } + if (bytesRemaining != 0L) { + int location = getBlockIndex(blockLocations, length - bytesRemaining); + String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); + result.add(splitCreator.create(path, length - bytesRemaining, bytesRemaining, + length, specifiedFileSplitSize, modificationTime, hosts, partitionValues)); + } return result; } - long bytesRemaining; - for (bytesRemaining = length; (double) bytesRemaining / (double) fileSplitSize > 1.1D; - bytesRemaining -= fileSplitSize) { - int location = getBlockIndex(blockLocations, length - bytesRemaining); - String[] hosts = location == -1 ? null : blockLocations[location].getHosts(); - result.add(splitCreator.create(path, length - bytesRemaining, fileSplitSize, - length, fileSplitSize, modificationTime, hosts, partitionValues)); + + // split file by block + long start = 0; + ImmutableList.Builder blockBuilder = ImmutableList.builder(); + for (BlockLocation blockLocation : blockLocations) { + // clamp the block range + long blockStart = Math.max(start, blockLocation.getOffset()); + long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength()); + if (blockStart > blockEnd) { + // block is outside split range + continue; + } + if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) { + // skip zero-width block, except in the special circumstance: + // slice is empty, and the block covers the empty slice interval. + continue; + } + blockBuilder.add(new InternalBlock(blockStart, blockEnd, blockLocation.getHosts())); + } + List blocks = blockBuilder.build(); + if (blocks.isEmpty()) { + result.add(splitCreator.create(path, 0, length, length, + targetFileSplitSize, modificationTime, null, + partitionValues)); + updateCurrentMaxSplitSize(); + return result; } - if (bytesRemaining != 0L) { - int location = getBlockIndex(blockLocations, length - bytesRemaining); - String[] hosts = location == -1 ? 
null : blockLocations[location].getHosts(); - result.add(splitCreator.create(path, length - bytesRemaining, bytesRemaining, - length, fileSplitSize, modificationTime, hosts, partitionValues)); + + long splitStart = start; + int currentBlockIdx = 0; + while (splitStart < start + length) { + updateCurrentMaxSplitSize(); + long splitBytes; + long remainingBlockBytes = blocks.get(currentBlockIdx).getEnd() - splitStart; + if (remainingBlockBytes <= currentMaxSplitSize) { + splitBytes = remainingBlockBytes; + } else if (currentMaxSplitSize * 2 >= remainingBlockBytes) { + // Second to last split in this block, generate two evenly sized splits + splitBytes = remainingBlockBytes / 2; + } else { + splitBytes = currentMaxSplitSize; + } + result.add(splitCreator.create(path, splitStart, splitBytes, + length, targetFileSplitSize, modificationTime, blocks.get(currentBlockIdx).getHosts(), + partitionValues)); + splitStart += splitBytes; + if (splitStart == blocks.get(currentBlockIdx).getEnd()) { + currentBlockIdx++; + if (currentBlockIdx != blocks.size()) { + Verify.verify(splitStart == blocks.get(currentBlockIdx).getStart()); + } + } } if (LOG.isDebugEnabled()) { @@ -85,7 +199,19 @@ public static List splitFile( return result; } - private static int getBlockIndex(BlockLocation[] blkLocations, long offset) { + private void updateCurrentMaxSplitSize() { + currentMaxSplitSize = maxSplitSize; + int cur = remainingInitialSplitNum.get(); + while (cur > 0) { + if (remainingInitialSplitNum.compareAndSet(cur, cur - 1)) { + currentMaxSplitSize = maxInitialSplitSize; + break; + } + cur = remainingInitialSplitNum.get(); + } + } + + private int getBlockIndex(BlockLocation[] blkLocations, long offset) { if (blkLocations == null || blkLocations.length == 0) { return -1; } @@ -100,5 +226,59 @@ private static int getBlockIndex(BlockLocation[] blkLocations, long offset) { throw new IllegalArgumentException(String.format("Offset %d is outside of file (0..%d)", offset, fileLength)); } + private static class InternalBlock { + private final long start; + private final long end; + private final String[] hosts; + + public InternalBlock(long start, long end, String[] hosts) { + Preconditions.checkArgument(start <= end, "block end cannot be before block start"); + this.start = start; + this.end = end; + this.hosts = hosts; + } + + public long getStart() { + return start; + } + + public long getEnd() { + return end; + } + + public String[] getHosts() { + return hosts; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + InternalBlock that = (InternalBlock) o; + return start == that.start && end == that.end && Arrays.equals(hosts, that.hosts); + } + + @Override + public int hashCode() { + int result = Objects.hash(start, end); + result = 31 * result + Arrays.hashCode(hosts); + return result; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("InternalBlock{"); + sb.append("start=").append(start); + sb.append(", end=").append(end); + sb.append(", hosts=").append(Arrays.toString(hosts)); + sb.append('}'); + return sb.toString(); + } + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java index 0b8a1022d5a50d..391552a5106a83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/SplitGenerator.java @@ -40,7 +40,7 @@ default List getSplits(int numBackends) throws UserException { /** * Whether the producer(e.g. ScanNode) support batch mode. */ - default boolean isBatchMode() throws UserException { + default boolean isBatchMode() { return false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 744423f622cce8..5bcf2f5546a51f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -186,7 +186,7 @@ public List getSplits(int numBackends) throws UserException { .getMetaStoreCache((HMSExternalCatalog) hmsTable.getCatalog()); String bindBrokerName = hmsTable.getCatalog().bindBrokerName(); List allFiles = Lists.newArrayList(); - getFileSplitByPartitions(cache, prunedPartitions, allFiles, bindBrokerName, numBackends); + getFileSplitByPartitions(cache, prunedPartitions, allFiles, bindBrokerName, numBackends, false); if (ConnectContext.get().getExecutor() != null) { ConnectContext.get().getExecutor().getSummaryProfile().setGetPartitionFilesFinishTime(); } @@ -226,7 +226,8 @@ public void startSplit(int numBackends) { try { List allFiles = Lists.newArrayList(); getFileSplitByPartitions( - cache, Collections.singletonList(partition), allFiles, bindBrokerName, numBackends); + cache, Collections.singletonList(partition), allFiles, bindBrokerName, + numBackends, true); if (allFiles.size() > numSplitsPerPartition.get()) { numSplitsPerPartition.set(allFiles.size()); } @@ -277,7 +278,8 @@ public int numApproximateSplits() { } private void getFileSplitByPartitions(HiveMetaStoreCache cache, List partitions, - List allFiles, String bindBrokerName, int numBackends) throws IOException, UserException { + List allFiles, String bindBrokerName, int numBackends, + boolean isBatchMode) throws IOException, UserException { List fileCaches; if (hiveTransaction != null) { try { @@ -293,9 +295,11 @@ private void getFileSplitByPartitions(HiveMetaStoreCache cache, List 1, directoryLister, hmsTable); } + + long targetFileSplitSize = determineTargetFileSplitSize(fileCaches, isBatchMode); if (tableSample != null) { List hiveFileStatuses = selectFiles(fileCaches); - splitAllFiles(allFiles, hiveFileStatuses); + splitAllFiles(allFiles, hiveFileStatuses, targetFileSplitSize); return; } @@ -319,27 +323,67 @@ private void getFileSplitByPartitions(HiveMetaStoreCache cache, List fileCaches, + boolean isBatchMode) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + /** Hive batch split mode will return 0. and FileSplitter + * will determine file split size. 
+ */ + if (isBatchMode) { + return 0; + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (HiveMetaStoreCache.FileCacheValue fileCacheValue : fileCaches) { + if (fileCacheValue.getFiles() == null) { + continue; + } + for (HiveMetaStoreCache.HiveFileStatus status : fileCacheValue.getFiles()) { + totalFileSize += status.getLength(); + if (totalFileSize >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; } } } + return result; } private void splitAllFiles(List allFiles, - List hiveFileStatuses) throws IOException { + List hiveFileStatuses, + long realFileSplitSize) throws IOException { for (HiveMetaStoreCache.HiveFileStatus status : hiveFileStatuses) { - allFiles.addAll(FileSplitter.splitFile(status.getPath(), getRealFileSplitSize(status.getBlockSize()), - status.getBlockLocations(), status.getLength(), status.getModificationTime(), - status.isSplittable(), status.getPartitionValues(), + allFiles.addAll(fileSplitter.splitFile( + status.getPath(), + realFileSplitSize, + status.getBlockLocations(), + status.getLength(), + status.getModificationTime(), + status.isSplittable(), + status.getPartitionValues(), new HiveSplitCreator(status.getAcidInfo()))); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 0ffe86edb315d6..133ac0676448c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -85,6 +85,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.util.ScanTaskUtil; import org.apache.iceberg.util.TableScanUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -119,7 +120,7 @@ public class IcebergScanNode extends FileQueryScanNode { private boolean tableLevelPushDownCount = false; private long countFromSnapshot; private static final long COUNT_WITH_PARALLEL_SPLITS = 10000; - private long targetSplitSize; + private long targetSplitSize = 0; // This is used to avoid repeatedly calculating partition info map for the same partition data. 
private Map> partitionMapInfos; private boolean isPartitionedTable; @@ -131,6 +132,8 @@ public class IcebergScanNode extends FileQueryScanNode { private Map storagePropertiesMap; private Map backendStorageProperties; + private Boolean isBatchMode = null; + // for test @VisibleForTesting public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, SessionVariable sv) { @@ -171,7 +174,6 @@ public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckCol @Override protected void doInitialize() throws UserException { icebergTable = source.getIcebergTable(); - targetSplitSize = getRealFileSplitSize(0); partitionMapInfos = new HashMap<>(); isPartitionedTable = icebergTable.spec().isPartitioned(); formatVersion = ((BaseTable) icebergTable).operations().current().formatVersion(); @@ -375,18 +377,57 @@ public TableScan createTableScan() throws UserException { private CloseableIterable planFileScanTask(TableScan scan) { if (!Config.iceberg_manifest_cache_enable) { - long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return splitFiles(scan); } try { return planFileScanTaskWithManifestCache(scan); } catch (Exception e) { LOG.warn("Plan with manifest cache failed, fallback to original scan: {}", e.getMessage()); - long targetSplitSize = getRealFileSplitSize(0); - return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize); + return splitFiles(scan); } } + private CloseableIterable splitFiles(TableScan scan) { + if (sessionVariable.getFileSplitSize() > 0) { + return TableScanUtil.splitFiles(scan.planFiles(), + sessionVariable.getFileSplitSize()); + } + if (isBatchMode()) { + // Currently iceberg batch split mode will use max split size. + // TODO: dynamic split size in batch split mode need to customize iceberg splitter. + return TableScanUtil.splitFiles(scan.planFiles(), sessionVariable.getMaxSplitSize()); + } + + // Non Batch Mode + // Materialize planFiles() into a list to avoid iterating the CloseableIterable twice. + // RISK: It will cost memory if the table is large. 
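// Decision order for the Iceberg target split size (restating splitFiles() above and
// determineTargetFileSplitSize() below):
//   - file_split_size > 0 -> use it directly;
//   - batch split mode    -> use max_file_split_size;
//   - otherwise           -> start from max_initial_file_split_size and switch to
//     max_file_split_size once the accumulated content size of the planned tasks reaches
//     max_file_split_size * max_initial_file_split_num.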
+ List fileScanTaskList = new ArrayList<>(); + try (CloseableIterable scanTasksIter = scan.planFiles()) { + for (FileScanTask task : scanTasksIter) { + fileScanTaskList.add(task); + } + } catch (Exception e) { + throw new RuntimeException("Failed to materialize file scan tasks", e); + } + + targetSplitSize = determineTargetFileSplitSize(fileScanTaskList); + return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTaskList), targetSplitSize); + } + + private long determineTargetFileSplitSize(Iterable tasks) { + long result = sessionVariable.getMaxInitialSplitSize(); + long accumulatedTotalFileSize = 0; + for (FileScanTask task : tasks) { + accumulatedTotalFileSize += ScanTaskUtil.contentSizeInBytes(task.file()); + if (accumulatedTotalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + return result; + } + private CloseableIterable planFileScanTaskWithManifestCache(TableScan scan) throws IOException { // Get the snapshot from the scan; return empty if no snapshot exists Snapshot snapshot = scan.snapshot(); @@ -502,7 +543,7 @@ private CloseableIterable planFileScanTaskWithManifestCache(TableS } // Split tasks into smaller chunks based on target split size for parallel processing - long targetSplitSize = getRealFileSplitSize(0); + targetSplitSize = determineTargetFileSplitSize(tasks); return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } @@ -592,21 +633,36 @@ private List doGetSplits(int numBackends) throws UserException { } @Override - public boolean isBatchMode() throws UserException { + public boolean isBatchMode() { + Boolean cached = isBatchMode; + if (cached != null) { + return cached; + } TPushAggOp aggOp = getPushDownAggNoGroupingOp(); if (aggOp.equals(TPushAggOp.COUNT)) { - countFromSnapshot = getCountFromSnapshot(); + try { + countFromSnapshot = getCountFromSnapshot(); + } catch (UserException e) { + throw new RuntimeException(e); + } if (countFromSnapshot >= 0) { tableLevelPushDownCount = true; + isBatchMode = false; return false; } } - if (createTableScan().snapshot() == null) { - return false; + try { + if (createTableScan().snapshot() == null) { + isBatchMode = false; + return false; + } + } catch (UserException e) { + throw new RuntimeException(e); } if (!sessionVariable.getEnableExternalTableBatchMode()) { + isBatchMode = false; return false; } @@ -622,10 +678,12 @@ public boolean isBatchMode() throws UserException { ManifestFile next = matchingManifest.next(); cnt += next.addedFilesCount() + next.existingFilesCount(); if (cnt >= sessionVariable.getNumFilesInBatchMode()) { + isBatchMode = true; return true; } } } + isBatchMode = false; return false; }); } catch (Exception e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index 402bf3d0ef6625..1671ce0f17a336 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -28,7 +28,6 @@ import org.apache.doris.common.util.LocationPath; import org.apache.doris.datasource.ExternalUtil; import org.apache.doris.datasource.FileQueryScanNode; -import org.apache.doris.datasource.FileSplitter; import org.apache.doris.datasource.credentials.CredentialUtils; import 
org.apache.doris.datasource.credentials.VendedCredentialsFactory; import org.apache.doris.datasource.paimon.PaimonExternalCatalog; @@ -293,7 +292,8 @@ public List getSplits(int numBackends) throws UserException { // And for counting the number of selected partitions for this paimon table. Map> partitionInfoMaps = new HashMap<>(); // if applyCountPushdown is true, we can't split the DataSplit - long realFileSplitSize = getRealFileSplitSize(applyCountPushdown ? Long.MAX_VALUE : 0); + boolean hasDeterminedTargetFileSplitSize = false; + long targetFileSplitSize = 0; for (DataSplit dataSplit : dataSplits) { SplitStat splitStat = new SplitStat(); splitStat.setRowCount(dataSplit.rowCount()); @@ -325,6 +325,10 @@ public List getSplits(int numBackends) throws UserException { if (ignoreSplitType == SessionVariable.IgnoreSplitType.IGNORE_NATIVE) { continue; } + if (!hasDeterminedTargetFileSplitSize) { + targetFileSplitSize = determineTargetFileSplitSize(dataSplits, isBatchMode()); + hasDeterminedTargetFileSplitSize = true; + } splitStat.setType(SplitReadType.NATIVE); splitStat.setRawFileConvertable(true); List rawFiles = optRawFiles.get(); @@ -332,13 +336,13 @@ public List getSplits(int numBackends) throws UserException { RawFile file = rawFiles.get(i); LocationPath locationPath = LocationPath.of(file.path(), storagePropertiesMap); try { - List dorisSplits = FileSplitter.splitFile( + List dorisSplits = fileSplitter.splitFile( locationPath, - realFileSplitSize, + targetFileSplitSize, null, file.length(), -1, - true, + !applyCountPushdown, null, PaimonSplit.PaimonSplitCreator.DEFAULT); for (Split dorisSplit : dorisSplits) { @@ -383,12 +387,43 @@ public List getSplits(int numBackends) throws UserException { // We need to set the target size for all splits so that we can calculate the // proportion of each split later. - splits.forEach(s -> s.setTargetSplitSize(realFileSplitSize)); + splits.forEach(s -> s.setTargetSplitSize(sessionVariable.getFileSplitSize() > 0 + ? sessionVariable.getFileSplitSize() : sessionVariable.getMaxSplitSize())); this.selectedPartitionNum = partitionInfoMaps.size(); return splits; } + private long determineTargetFileSplitSize(List dataSplits, + boolean isBatchMode) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + /** Paimon batch split mode will return 0. and FileSplitter + * will determine file split size. 
+ */ + if (isBatchMode) { + return 0; + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (DataSplit dataSplit : dataSplits) { + Optional> rawFiles = dataSplit.convertToRawFiles(); + if (!supportNativeReader(rawFiles)) { + continue; + } + for (RawFile rawFile : rawFiles.get()) { + totalFileSize += rawFile.fileSize(); + if (totalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + } + return result; + } + @VisibleForTesting public Map getIncrReadParams() throws UserException { Map paimonScanParams = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java index e75675597622d3..c3b0e3e8b6d04a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java @@ -146,12 +146,18 @@ public List getSplits(int numBackends) throws UserException { needSplit = FileSplitter.needSplitForCountPushdown(parallelNum, numBackends, totalFileNum); } + long targetFileSplitSize = determineTargetFileSplitSize(fileStatuses); + for (TBrokerFileStatus fileStatus : fileStatuses) { try { - splits.addAll(FileSplitter.splitFile(LocationPath.of(fileStatus.getPath()), - getRealFileSplitSize(needSplit ? fileStatus.getBlockSize() : Long.MAX_VALUE), - null, fileStatus.getSize(), - fileStatus.getModificationTime(), fileStatus.isSplitable, null, + splits.addAll(fileSplitter.splitFile( + LocationPath.of(fileStatus.getPath()), + targetFileSplitSize, + null, + fileStatus.getSize(), + fileStatus.getModificationTime(), + fileStatus.isSplitable && needSplit, + null, FileSplitCreator.DEFAULT)); } catch (IOException e) { LOG.warn("get file split failed for TVF: {}", fileStatus.getPath(), e); @@ -161,6 +167,23 @@ public List getSplits(int numBackends) throws UserException { return splits; } + private long determineTargetFileSplitSize(List fileStatuses) { + if (sessionVariable.getFileSplitSize() > 0) { + return sessionVariable.getFileSplitSize(); + } + long result = sessionVariable.getMaxInitialSplitSize(); + long totalFileSize = 0; + for (TBrokerFileStatus fileStatus : fileStatuses) { + totalFileSize += fileStatus.getSize(); + if (totalFileSize + >= sessionVariable.getMaxSplitSize() * sessionVariable.getMaxInitialSplitNum()) { + result = sessionVariable.getMaxSplitSize(); + break; + } + } + return result; + } + @Override protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof FileSplit) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 4fddfd0332d952..14a25a992b8025 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -513,6 +513,12 @@ public class SessionVariable implements Serializable, Writable { // Split size for ExternalFileScanNode. Default value 0 means use the block size of HDFS/S3. 
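// The three max_* variables below only take effect when file_split_size is 0. With the defaults
// (32MB initial size, 64MB max size, 200 initial splits), a scan switches its target from 32MB to
// 64MB splits once its accumulated input reaches 64MB * 200 = 12800MB; the Hive, Iceberg, Paimon
// and TVF scan nodes all apply this same threshold in their determineTargetFileSplitSize().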
public static final String FILE_SPLIT_SIZE = "file_split_size"; + public static final String MAX_INITIAL_FILE_SPLIT_SIZE = "max_initial_file_split_size"; + + public static final String MAX_FILE_SPLIT_SIZE = "max_file_split_size"; + + public static final String MAX_INITIAL_FILE_SPLIT_NUM = "max_initial_file_split_num"; + // Target file size in bytes for Iceberg write operations public static final String ICEBERG_WRITE_TARGET_FILE_SIZE_BYTES = "iceberg_write_target_file_size_bytes"; @@ -2161,6 +2167,15 @@ public boolean isEnableHboNonStrictMatchingMode() { @VariableMgr.VarAttr(name = FILE_SPLIT_SIZE, needForward = true) public long fileSplitSize = 0; + @VariableMgr.VarAttr(name = MAX_INITIAL_FILE_SPLIT_SIZE, needForward = true) + public long maxInitialSplitSize = 32L * 1024L * 1024L; + + @VariableMgr.VarAttr(name = MAX_FILE_SPLIT_SIZE, needForward = true) + public long maxSplitSize = 64L * 1024L * 1024L; + + @VariableMgr.VarAttr(name = MAX_INITIAL_FILE_SPLIT_NUM, needForward = true) + public int maxInitialSplitNum = 200; + // Target file size for Iceberg write operations // Default 0 means use config::iceberg_sink_max_file_size @VariableMgr.VarAttr(name = ICEBERG_WRITE_TARGET_FILE_SIZE_BYTES, needForward = true) @@ -4181,6 +4196,30 @@ public void setFileSplitSize(long fileSplitSize) { this.fileSplitSize = fileSplitSize; } + public long getMaxInitialSplitSize() { + return maxInitialSplitSize; + } + + public void setMaxInitialSplitSize(long maxInitialSplitSize) { + this.maxInitialSplitSize = maxInitialSplitSize; + } + + public long getMaxSplitSize() { + return maxSplitSize; + } + + public void setMaxSplitSize(long maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + public int getMaxInitialSplitNum() { + return maxInitialSplitNum; + } + + public void setMaxInitialSplitNum(int maxInitialSplitNum) { + this.maxInitialSplitNum = maxInitialSplitNum; + } + public long getIcebergWriteTargetFileSizeBytes() { return icebergWriteTargetFileSizeBytes; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java new file mode 100644 index 00000000000000..a455923da4d91e --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/FileSplitterTest.java @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.datasource; + +import org.apache.doris.common.util.LocationPath; +import org.apache.doris.spi.Split; + +import org.apache.hadoop.fs.BlockLocation; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collections; +import java.util.List; + +public class FileSplitterTest { + + private static final long MB = 1024L * 1024L; + + private static final int DEFAULT_INITIAL_SPLITS = 200; + + @Test + public void testNonSplittableCompressedFileProducesSingleSplit() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file.gz"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + Split s = splits.get(0); + Assert.assertEquals(10 * MB, ((org.apache.doris.datasource.FileSplit) s).getLength()); + // host should be preserved + Assert.assertArrayEquals(new String[]{"h1"}, ((org.apache.doris.datasource.FileSplit) s).getHosts()); + Assert.assertEquals(DEFAULT_INITIAL_SPLITS - 1, fileSplitter.getRemainingInitialSplitNum()); + } + + @Test + public void testEmptyBlockLocationsProducesSingleSplitAndNullHosts() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file"); + BlockLocation[] locations = new BlockLocation[0]; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 5 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + org.apache.doris.datasource.FileSplit s = (org.apache.doris.datasource.FileSplit) splits.get(0); + Assert.assertEquals(5 * MB, s.getLength()); + // hosts should be empty array when passing null + Assert.assertNotNull(s.getHosts()); + Assert.assertEquals(0, s.getHosts().length); + } + + @Test + public void testSplittableSingleBigBlockProducesExpectedSplitsWithInitialSmallChunks() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/bigfile"); + long length = 200 * MB; + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, length)}; + // set maxInitialSplits to 2 to force the first two splits to be small. 
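// Expected sizes for the 200MB single-block file: two 32MB initial splits, then one 64MB split,
// and the remaining 72MB is at most 2 * 64MB, so splitFile() halves it into two 36MB splits
// (matching the `expected` array below).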
+ FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 2); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + length, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + + // expect splits sizes: 32MB, 32MB, 64MB, 36MB, 36MB -> sum is 200MB + long[] expected = new long[]{32 * MB, 32 * MB, 64 * MB, 36 * MB, 36 * MB}; + Assert.assertEquals(expected.length, splits.size()); + long sum = 0L; + for (int i = 0; i < expected.length; i++) { + org.apache.doris.datasource.FileSplit s = (org.apache.doris.datasource.FileSplit) splits.get(i); + Assert.assertEquals(expected[i], s.getLength()); + sum += s.getLength(); + // ensure host preserved + Assert.assertArrayEquals(new String[]{"h1"}, s.getHosts()); + } + Assert.assertEquals(length, sum); + // ensure the initial small-split counter is consumed for the two initial small splits + Assert.assertEquals(0, fileSplitter.getRemainingInitialSplitNum()); + } + + @Test + public void testMultiBlockSplitsAndHostPreservation() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/twoblocks"); + long len = 96 * MB; + BlockLocation[] locations = new BlockLocation[]{ + new BlockLocation(null, new String[]{"h1"}, 0L, 48 * MB), + new BlockLocation(null, new String[]{"h2"}, 48 * MB, 48 * MB) + }; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 0); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + len, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(2, splits.size()); + FileSplit s0 = (FileSplit) splits.get(0); + FileSplit s1 = (FileSplit) splits.get(1); + Assert.assertEquals(48 * MB, s0.getLength()); + Assert.assertEquals(48 * MB, s1.getLength()); + Assert.assertArrayEquals(new String[]{"h1"}, s0.getHosts()); + Assert.assertArrayEquals(new String[]{"h2"}, s1.getHosts()); + } + + @Test + public void testZeroLengthBlockIsSkipped() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/zeroblock"); + long length = 10 * MB; + BlockLocation[] locations = new BlockLocation[]{ + new BlockLocation(null, new String[]{"h1"}, 0L, 0L), + new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB) + }; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + length, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + FileSplit s = (FileSplit) splits.get(0); + Assert.assertEquals(10 * MB, s.getLength()); + Assert.assertArrayEquals(new String[]{"h1"}, s.getHosts()); + } + + @Test + public void testNonSplittableFlagDecrementsCounter() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/file.gz"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 2); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + false, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + } + + @Test + public void testNullRemainingInitialSplitIsAllowed() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/somefile"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 10 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, 
DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 10 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + } + + @Test + public void testSmallFileNoSplit() throws Exception { + LocationPath loc = LocationPath.of("hdfs://example.com/path/small"); + BlockLocation[] locations = new BlockLocation[]{new BlockLocation(null, new String[]{"h1"}, 0L, 2 * MB)}; + FileSplitter fileSplitter = new FileSplitter(32 * MB, 64 * MB, DEFAULT_INITIAL_SPLITS); + List splits = fileSplitter.splitFile( + loc, + 0L, + locations, + 2 * MB, + 0L, + true, + Collections.emptyList(), + FileSplit.FileSplitCreator.DEFAULT); + Assert.assertEquals(1, splits.size()); + FileSplit s = (FileSplit) splits.get(0); + Assert.assertEquals(2 * MB, s.getLength()); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java index 93afa390530e6e..692a0db12caa63 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/paimon/source/PaimonScanNodeTest.java @@ -21,6 +21,8 @@ import org.apache.doris.analysis.TupleId; import org.apache.doris.common.ExceptionChecker; import org.apache.doris.common.UserException; +import org.apache.doris.datasource.FileQueryScanNode; +import org.apache.doris.datasource.FileSplitter; import org.apache.doris.datasource.paimon.PaimonFileExternalCatalog; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.qe.SessionVariable; @@ -92,11 +94,26 @@ public void testSplitWeight() throws UserException { } }).when(spyPaimonScanNode).getPaimonSplitFromAPI(); + long maxInitialSplitSize = 32L * 1024L * 1024L; + long maxSplitSize = 64L * 1024L * 1024L; + // Ensure fileSplitter is initialized on the spy as doInitialize() is not called in this unit test + FileSplitter fileSplitter = new FileSplitter(maxInitialSplitSize, maxSplitSize, + 0); + try { + java.lang.reflect.Field field = FileQueryScanNode.class.getDeclaredField("fileSplitter"); + field.setAccessible(true); + field.set(spyPaimonScanNode, fileSplitter); + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException("Failed to inject FileSplitter into PaimonScanNode test", e); + } + // Note: The original PaimonSource is sufficient for this test // No need to mock catalog properties since doInitialize() is not called in this test // Mock SessionVariable behavior Mockito.when(sv.isForceJniScanner()).thenReturn(false); Mockito.when(sv.getIgnoreSplitType()).thenReturn("NONE"); + Mockito.when(sv.getMaxInitialSplitSize()).thenReturn(maxInitialSplitSize); + Mockito.when(sv.getMaxSplitSize()).thenReturn(maxSplitSize); // native mockNativeReader(spyPaimonScanNode); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java index 30582224f7603f..f6e8efd5294583 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/FederationBackendPolicyTest.java @@ -769,9 +769,9 @@ public ComputeGroupMgr getComputeGroupMgr() { Map> backendListMap2 = mergeAssignment(assignment2); backendListMap2.forEach((k, v) -> { if (k.getId() == 1) { - Assert.assertEquals(900000L, 
v.stream().mapToLong(Split::getLength).sum()); + Assert.assertEquals(1000000L, v.stream().mapToLong(Split::getLength).sum()); } else if (k.getId() == 2) { - Assert.assertEquals(500000L, v.stream().mapToLong(Split::getLength).sum()); + Assert.assertEquals(400000L, v.stream().mapToLong(Split::getLength).sum()); } else if (k.getId() == 3) { Assert.assertEquals(1000000L, v.stream().mapToLong(Split::getLength).sum()); } diff --git a/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy b/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy index 4117501eff2c3c..44dd9104411196 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_compress_type.groovy @@ -48,7 +48,7 @@ suite("test_hive_compress_type", "p0,external,hive,external_docker,external_dock sql """set file_split_size=8388608""" explain { sql("select count(*) from test_compress_partitioned") - contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82" + contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16" contains "partition=8/8" } From 8147ffe47402258e912ba54f4bb11605892be54f Mon Sep 17 00:00:00 2001 From: daidai Date: Fri, 19 Dec 2025 12:01:42 +0800 Subject: [PATCH 10/12] [Enhancement](parquet)update runtime filter when read next parquet row group.(#59053) (#59181) bp #59053 --- .../runtime_filter_consumer_helper.h | 2 + .../format/parquet/vparquet_group_reader.h | 7 + .../exec/format/parquet/vparquet_reader.cpp | 38 +++- .../vec/exec/format/parquet/vparquet_reader.h | 16 ++ be/src/vec/exec/scan/file_scanner.cpp | 24 ++- be/src/vec/exec/scan/file_scanner.h | 3 +- be/src/vec/exec/scan/scanner.cpp | 1 + .../create_preinstalled_scripts/run84.hql | 20 ++ .../dim_small.parquet | Bin 0 -> 4230 bytes .../runtime_filter_fact_big/fact_big.parquet | Bin 0 -> 129338 bytes .../test_parquet_join_runtime_filter.groovy | 174 ++++++++++++++++++ 11 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet create mode 100644 regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy diff --git a/be/src/runtime_filter/runtime_filter_consumer_helper.h b/be/src/runtime_filter/runtime_filter_consumer_helper.h index 212df4338cbdd8..36da3cd10c0167 100644 --- a/be/src/runtime_filter/runtime_filter_consumer_helper.h +++ b/be/src/runtime_filter/runtime_filter_consumer_helper.h @@ -52,6 +52,8 @@ class RuntimeFilterConsumerHelper { // parent_operator_profile is owned by LocalState so update it is safe at here. void collect_realtime_profile(RuntimeProfile* parent_operator_profile); + size_t runtime_filter_nums() const { return _runtime_filter_descs.size(); } + private: // Append late-arrival runtime filters to the vconjunct_ctx. 
Status _append_rf_into_conjuncts(RuntimeState* state, diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index 265a95f4470537..f81d660734931c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -79,7 +79,14 @@ class RowGroupReader : public ProfileCollector { // table name struct LazyReadContext { + // all conjuncts: in sql, join runtime filter, topn runtime filter. VExprContextSPtrs conjuncts; + + // ParquetReader::set_fill_columns(xxx, xxx) will set these two members + std::unordered_map> + fill_partition_columns; + std::unordered_map fill_missing_columns; + bool can_lazy_read = false; // block->rows() returns the number of rows of the first column, // so we should check and resize the first column diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index fb30e5d4a613bf..45cf3e2c5edde1 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -383,11 +383,17 @@ bool ParquetReader::_type_matches(const VSlotRef* slot_ref) const { !is_complex_type(table_col_type->get_primitive_type()); } -Status ParquetReader::set_fill_columns( - const std::unordered_map>& - partition_columns, - const std::unordered_map& missing_columns) { - SCOPED_RAW_TIMER(&_statistics.parse_meta_time); +Status ParquetReader::_update_lazy_read_ctx(const VExprContextSPtrs& new_conjuncts) { + RowGroupReader::LazyReadContext new_lazy_read_ctx; + new_lazy_read_ctx.conjuncts = new_conjuncts; + new_lazy_read_ctx.fill_partition_columns = std::move(_lazy_read_ctx.fill_partition_columns); + new_lazy_read_ctx.fill_missing_columns = std::move(_lazy_read_ctx.fill_missing_columns); + _lazy_read_ctx = std::move(new_lazy_read_ctx); + + _top_runtime_vexprs.clear(); + _push_down_predicates.clear(); + _useless_predicates.clear(); + // std::unordered_map> std::unordered_map> predicate_columns; // visit_slot for lazy mat. 
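The rebuild in _update_lazy_read_ctx above swaps in the new conjuncts while moving the cached fill_partition_columns and fill_missing_columns into the fresh context, then clears _top_runtime_vexprs and the push-down/useless predicate lists so they can be re-derived, so a late-arriving runtime filter can refresh the conjuncts without the caller passing the partition and missing column maps again. A minimal standalone sketch of that move-and-rebuild pattern, with simplified stand-in types rather than the real Doris classes:

// Sketch of the rebuild pattern used by _update_lazy_read_ctx: keep the
// expensive-to-recompute members, swap in the new conjuncts. Simplified types.
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct LazyReadContextSketch {
    std::vector<std::string> conjuncts; // stands in for VExprContextSPtrs
    std::unordered_map<std::string, std::string> fill_partition_columns;
    std::unordered_map<std::string, bool> fill_missing_columns;
};

void update_lazy_read_ctx(LazyReadContextSketch& ctx, std::vector<std::string> new_conjuncts) {
    LazyReadContextSketch fresh;
    fresh.conjuncts = std::move(new_conjuncts);
    // Carry over what set_fill_columns() already stored instead of redoing it.
    fresh.fill_partition_columns = std::move(ctx.fill_partition_columns);
    fresh.fill_missing_columns = std::move(ctx.fill_missing_columns);
    ctx = std::move(fresh);
    // The real reader then re-collects predicate columns and push-down
    // predicates from ctx.conjuncts before opening the next row group.
}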
@@ -494,7 +500,7 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.all_predicate_col_ids.emplace_back(_row_id_column_iterator_pair.second); } - for (auto& kv : partition_columns) { + for (auto& kv : _lazy_read_ctx.fill_partition_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second); @@ -504,7 +510,7 @@ Status ParquetReader::set_fill_columns( } } - for (auto& kv : missing_columns) { + for (auto& kv : _lazy_read_ctx.fill_missing_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second); @@ -536,6 +542,17 @@ Status ParquetReader::set_fill_columns( } } + return Status::OK(); +} + +Status ParquetReader::set_fill_columns( + const std::unordered_map>& + partition_columns, + const std::unordered_map& missing_columns) { + _lazy_read_ctx.fill_partition_columns = partition_columns; + _lazy_read_ctx.fill_missing_columns = missing_columns; + RETURN_IF_ERROR(_update_lazy_read_ctx(_lazy_read_ctx.conjuncts)); + if (_filter_groups && (_total_groups == 0 || _t_metadata->num_rows == 0 || _range_size < 0)) { return Status::EndOfFile("No row group to read"); } @@ -673,6 +690,13 @@ Status ParquetReader::_next_row_group_reader() { continue; } + bool has_late_rf_cond = false; + VExprContextSPtrs new_push_down_conjuncts; + RETURN_IF_ERROR(_call_late_rf_func(&has_late_rf_cond, new_push_down_conjuncts)); + if (has_late_rf_cond) { + RETURN_IF_ERROR(_update_lazy_read_ctx(new_push_down_conjuncts)); + } + size_t before_predicate_size = _push_down_predicates.size(); _push_down_predicates.reserve(before_predicate_size + _top_runtime_vexprs.size()); for (const auto& vexpr : _top_runtime_vexprs) { diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index c3c73d98bea398..e2ba5d82a706b2 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -160,6 +160,10 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { bool count_read_rows() override { return true; } + void set_update_late_rf_func(std::function&& func) { + _call_late_rf_func = std::move(func); + } + protected: void _collect_profile_before_close() override; @@ -252,6 +256,9 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { bool _exists_in_file(const VSlotRef* slot) const override; bool _type_matches(const VSlotRef*) const override; + // update lazy read context when runtime filter changed + Status _update_lazy_read_ctx(const VExprContextSPtrs& new_conjuncts); + RuntimeProfile* _profile = nullptr; const TFileScanRangeParams& _scan_params; const TFileRangeDesc& _scan_range; @@ -337,6 +344,15 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { std::vector> _push_down_predicates; std::vector> _useless_predicates; Arena _arena; + + // when creating a new row group reader, call this function to get the latest runtime filter conjuncts. + // The default implementation does nothing, sets 'changed' to false, and returns OK. + // This is used when iceberg read position delete file ... 
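// (Illustrative aside; simplified stand-in types, not the real Doris classes.)
// The callback declared just below is how FileScanner feeds late-arriving
// runtime filters to the reader: the scanner registers a std::function via
// set_update_late_rf_func(), and _next_row_group_reader() invokes it before
// opening each row group so push-down conjuncts are refreshed only when a
// filter actually arrived.
#include <functional>
#include <string>
#include <utility>
#include <vector>

using ConjunctsSketch = std::vector<std::string>; // stands in for VExprContextSPtrs

struct RowGroupPlannerSketch {
    // Default mirrors default_late_rf_func: report "no change" and succeed.
    std::function<bool(bool*, ConjunctsSketch&)> call_late_rf =
            [](bool* changed, ConjunctsSketch&) { *changed = false; return true; };

    void set_update_late_rf_func(std::function<bool(bool*, ConjunctsSketch&)> func) {
        call_late_rf = std::move(func);
    }

    void next_row_group() {
        bool changed = false;
        ConjunctsSketch latest;
        if (call_late_rf(&changed, latest) && changed) {
            // rebuild the lazy-read context / push-down predicates from `latest`
        }
    }
};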
+ static Status default_late_rf_func(bool* changed, VExprContextSPtrs&) { + *changed = false; + return Status::OK(); + } + std::function _call_late_rf_func = default_late_rf_func; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index c7d10c89dc0144..8629737a320214 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -357,8 +357,11 @@ Status FileScanner::_process_conjuncts() { return Status::OK(); } -Status FileScanner::_process_late_arrival_conjuncts() { +Status FileScanner::_process_late_arrival_conjuncts(bool* changed, + VExprContextSPtrs& new_push_down_conjuncts) { + *changed = false; if (_push_down_conjuncts.size() < _conjuncts.size()) { + *changed = true; _push_down_conjuncts.clear(); _push_down_conjuncts.resize(_conjuncts.size()); for (size_t i = 0; i != _conjuncts.size(); ++i) { @@ -366,6 +369,7 @@ Status FileScanner::_process_late_arrival_conjuncts() { } RETURN_IF_ERROR(_process_conjuncts()); _discard_conjuncts(); + new_push_down_conjuncts = _push_down_conjuncts; } if (_applied_rf_num == _total_rf_num) { _local_state->scanner_profile()->add_info_string("ApplyAllRuntimeFilters", "True"); @@ -1045,9 +1049,17 @@ Status FileScanner::_get_next_reader() { // ATTN: the push down agg type may be set back to NONE, // see IcebergTableReader::init_row_filters for example. parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); - if (push_down_predicates) { - RETURN_IF_ERROR(_process_late_arrival_conjuncts()); - } + + std::function update_late_rf = + [&](bool* changed, VExprContextSPtrs& new_push_down_conjuncts) -> Status { + if (!_is_load) { + RETURN_IF_ERROR(try_append_late_arrival_runtime_filter()); + RETURN_IF_ERROR( + _process_late_arrival_conjuncts(changed, new_push_down_conjuncts)); + } + return Status::OK(); + }; + parquet_reader->set_update_late_rf_func(std::move(update_late_rf)); RETURN_IF_ERROR(_init_parquet_reader(std::move(parquet_reader), file_meta_cache_ptr)); need_to_get_parsed_schema = true; @@ -1068,7 +1080,9 @@ Status FileScanner::_get_next_reader() { orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { - RETURN_IF_ERROR(_process_late_arrival_conjuncts()); + bool changed = false; + VExprContextSPtrs new_push_down_conjuncts; + RETURN_IF_ERROR(_process_late_arrival_conjuncts(&changed, new_push_down_conjuncts)); } RETURN_IF_ERROR(_init_orc_reader(std::move(orc_reader), file_meta_cache_ptr)); diff --git a/be/src/vec/exec/scan/file_scanner.h b/be/src/vec/exec/scan/file_scanner.h index d26186eeef621b..1cbe9c1bbcf12a 100644 --- a/be/src/vec/exec/scan/file_scanner.h +++ b/be/src/vec/exec/scan/file_scanner.h @@ -251,7 +251,8 @@ class FileScanner : public Scanner { void _init_runtime_filter_partition_prune_block(); Status _process_runtime_filters_partition_prune(bool& is_partition_pruned); Status _process_conjuncts(); - Status _process_late_arrival_conjuncts(); + Status _process_late_arrival_conjuncts(bool* changed, + VExprContextSPtrs& new_push_down_conjuncts); void _get_slot_ids(VExpr* expr, std::vector* slot_ids); Status _generate_truncate_columns(bool need_to_get_parsed_schema); Status _set_fill_or_truncate_columns(bool need_to_get_parsed_schema); diff --git a/be/src/vec/exec/scan/scanner.cpp b/be/src/vec/exec/scan/scanner.cpp index 5dced63feb6507..2857738297fd09 100644 --- a/be/src/vec/exec/scan/scanner.cpp +++ b/be/src/vec/exec/scan/scanner.cpp @@ -41,6 +41,7 @@ Scanner::Scanner(RuntimeState* state, 
pipeline::ScanLocalStateBase* local_state, _output_tuple_desc(_local_state->output_tuple_desc()), _output_row_descriptor(_local_state->_parent->output_row_descriptor()), _has_prepared(false) { + _total_rf_num = cast_set(_local_state->_helper.runtime_filter_nums()); DorisMetrics::instance()->scanner_cnt->increment(1); } diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql new file mode 100644 index 00000000000000..4b4e7b6e549b29 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run84.hql @@ -0,0 +1,20 @@ +use `default`; + +create table fact_big ( + k INT, + c1 INT, + c2 BIGINT, + c3 DOUBLE, + c4 STRING +)stored as parquet +LOCATION '/user/doris/preinstalled_data/parquet_table/runtime_filter_fact_big'; + +create table dim_small ( + k INT, + c1 INT, + c2 BIGINT +)stored as parquet +LOCATION '/user/doris/preinstalled_data/parquet_table/runtime_filter_dim_small'; + +msck repair table fact_big; +msck repair table dim_small; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_dim_small/dim_small.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e998f3c817a3748ef8c2feaba3718937f2123a0f GIT binary patch literal 4230 zcmcIoPe>F|7=JU4>tsuA<{RJ2mTTq1gPID`AMEf}jQ*&p6%lk15zQ!Vt&GkcC8U#d z3h5*f5fPn2y5u1uIz)$%P98dhM5l=8`(}Lm?aa=!1hbEL^L^i&?>E2s{nmMpM$}k0 zx46xdgWTk-o-r$FFvd9J_1tNPT8bxlA|ouUdb7jYiZUqg$eJ*lhG8=K$jERr-OCcG zv@ou6(=nYT!&x?Dm$d(dl5PhYjGOr66kLEpoV4Co5iN6~yNK6@DPNlZ%`C+mv|x6o z7TDq_5zW>sq7Z?2g_x?)TrA>hBEzk15zUDnm+dRVSu^CrYWdB|P$NYVz~yyq2xD$x zRk$8RSbp~+C6LWtACBH+g3JYfhq%1(}*-j$nH+-#DF4qDx zY7GE|GV>XZFNLGjaJ?dI9RYYDrYbZS8@{+J%JAPw`7Q~DYdDG`fJ52){=VD7m|IvC z?u>F!7GU}{MY{S7->Q@Abs#Hs25`y~s|qECRbkJ8MOw0 zLYest-vodKNk#H>(aSQQq|2uN9g*v}>+<~RIall+nfX4D!03T5Uq z9Dl_)N)6X5!qyRh7hqo7;_7&!ktkL$^uLu zXGvF|;V+uyVl&7}odKNk#HvGyVO3Z-BOqk~VxRUCF~8xPE%J2>Fr(H0P$)B>;XUv$ z9HoZq6=CZLzzZ=|p}E-b=dDqOuN;&=4uIhrj-m+QF!t%eUM3Rr6VsE|#(4q`POT=K zx4bKe1E(h_z}bO0wdu?0z8KGCGS;({tms+C`CR3g(ss70VE+Ua;ykX%SdSR8qDKHv zhs_!k!bBnB+{R_BTd%C>*60bbUWY)DJRw#aQHTTz5$9)L#`>w2 z75$WYLM%3;5H^EjLsE^6oxOO>`?QKD<{=zMrxc7)oM4<$YC+|u)Cr}gRKL5Vayg}j zaIs&hQ+S13Um?S#To=}dadM&hXAv5boSJrv7jT9?Jpf9nOLqb0^bFj21k;>LV)*=a zKF~ivRX4=cdvtDQ2yVVJ0=F(;ZNOy;H^FnJkpD@14;Z8LP>ts`;G#DWI+w>7NWsfM z|AO~^?)o`*KF&mTZajbE*5u8OiMhFs@qB)EA$xl=KYwNRdbay;XIE!ex-Y|+_m90z KFg61J1MwFiVwOk% literal 0 HcmV?d00001 diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/runtime_filter_fact_big/fact_big.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b3ad736022e91b9462bd4c6f32b302873536cca5 GIT binary patch literal 129338 zcmcG%eOzVLnI_6P9NT1`!bv5fKrwM8pyiOGHFOL_{nR5s@GwVu^@|h=_*HBjv9q%1vo~w+ z$M^30D*4Xdo4YsnydVZ|c-)uu743}N`R7OJ7>z`tS)wU&-rnfB2eWd|J~xQ`=e^N= z(S6mCef1G>fr$Tqp`80wqLH;XikZ)*KZwHrpR~rZ6|Lxx_iManqoO;SF|)Jxo+A?$ zKaa>JK2Fl&awJGul)RL!y!7|OqdV)Vxc5^Q@S`vxMSAxA97&QljE%LB~j z^`FbWn4P59V&9T~-rhYa_5~XIy0}=(Q67uFPK>OXB070IYZ}(#`7{}X7t`LWdR?%ctKo_C<`s(dg)?a zIW_0|lvDCKG4WaIJr}|Q%;wc#AGYyhUgbRYP1)*?eG`+&zpuO&`>HScl(X}BG4}bC za?SvlWzQ<-fSvrJq;d{~$~j<>oc^*;Ion?nBVS4>Cv`bX#ZXQ**8(wkgT#@Nr=3}R 
zHh)PqsG*$lUlMbI`qPW^3ntd~D~gz@oZep+HNTuOGo$yuiYF<3?BWG+Iif7|p6jKH zb>)=(nol_!zbb})m3q&G@Bp)UHP{yx;PVut961uirG-hPt-Kcdk_&vwS^t_C{2H-$ zAw0k=dsaCGcJhmo$|(qyQ(%!CJ>XN$+J4c$Kc$@1dzL0npB5os&0mBy_jKIJSQ z7QKh5_gn}MFq>C{efSJ{mQjuzkx!RrwP((=sVC91-?Z3oeBGy<#a|cQzn*&L=n!D< zPM5{9XO&avQ%+&1oI;D_z&Cx$S@?$N{69OH zP(wLuN5$|_>Q8m0h4Tw0R`V@I%v4VEZ;7qn%$S+cd#An1kt0D|T4)hzE3cKt)!*?c zXZE*6`)^b4xey*;Hm?SInwvgzjB@0NT!xrSD981ZM}J~2|7)LeX1*=jzD?|12(Wj( z87SwV-Lpl>XU@S;IR`D0UElF3XZm+V%kQR?le(Nuq@kQ#n!;ug#NZ8&6r*-#@tOR6 z)u4uQ7Jg6k{T}tF7v~pDtn3dIF;h8p-xaI>He+T+?|mQN-st0`EiOlbl!X>?y>zkO zd-H$jQ_jTqMDzEk_gn}MFq>C{efaA15~CbBB44jwN+`$mTI^^4#HXCGKNgLDOzd3< zuy?%~D5uCyeo<06MWJ$vERwDN&ZnHw?~8`-r<9YroK2*moII`tV(J}kNu@jIRk$os{SISoYdtk6+=0AXxc1-7`)+;V${woK7D_!8q`qE z=wFHBe?|T2#rXvjTm4Upn5mqdzY$Y^lQA=+_s++<2YsBh#pOtlvd|)~moCD+3@3{~jU^cG?`|#_Q8m0h4Tw0HviugF;h9~{~$*GA!BAn@0EI$BS(U`w9w$vR$eQO z{r|(Koc8}NcK(ri&xP;+vw1bxKQW5W3Zoo362zs2Nu;g37W>}+Ux%oA)P%mhfO1h!5bbaNbSrb z)KcQxL(qLuL{wcAp&s=@rRPSoG2GxigY6Bor5ZO=KC2f;#L&ec=fz^l87;Wv6_6YW z;?hE=NLzVna{bXWa48Yy_Y!oJMMQBK5oS<4*y#g6>{(fBjE3Zh{Bpmhtre`%(?^ce zpLzg;6%+u!ouKowh$y)X0my5Wg#cVH|E1@gy@A|Nu;)wo7L9!a`?(zjx#38V{L`XZ ze+5ysl~ftoR~r#Ml@U=|8G)8;3^u2Rnm$DHxN3qoh~Y@FCjJc1EK-A4Y8`7li6&^X zLt7OZ)(e%M8y2=)Zt$MLriZPoG;XGvDz1u%`D&2!VzJ~5HGL-`MTTHpjsz(S4dZ&P zWS*)e!n~SBua1agSECtS6c2X#0KylmHyJg_5&81rHC!QNujBuC`(*<0Fb z+*_n3Ic^28b({j=)wFzbMD*T_0OYmGLIAEelV=mR^B`2yfux%3y$9pB5>;MJYb_Dc z*P^LOVX!$h)bwGtW8n>AI6?yI%+B9!TE`k{s<@4A|E*|Pb+HANo*Uk!Ts6Ua2Ad-~ zPiWjsH4WZQ8!>MOIWHDV&QQ}i_=hg~I0WNzBuH6k7}rY|>n}>RcM@ST;SZ%!xFN+U^UQK0pMZ{DG0&r0ffa}dfP26sTP)!9% zHQDM4TSb)TA)joEmESDBH2{1~D8V0d;2Q@1)kT zhMEQ+rrZC+XjpZz1(lu~`qfpFck_d(rsW}xo2jO)Mp@vE&Rjo${(ljs$UO zp<$%0yjC(tA0xuNn)-$#qH-9`=%RSA(+5D)(`S=WlN^yN1)B*qxnA<<57)6#3V>Hr z|KqeM^*91>Q4oOZ%|uPyu7yxd2a{^D_bnWMlBn`(Iyn{*)nl5P6b749LrrH>Aa2LP z8^myg1k@QhjQssv>sUigv*UF8A4kKgi!G@1+;+9GqHI#*W~!;~sfbve0J+RIR~TwK zififPBrh&Uf|P}ZalLf0p3J2wBFw94`ss*heHzW^qIj^`>>2@t>jCdFYLX*z&EVaH znq03H!1A*c0I#N*X@|X|2!I2U2H+6=eDcO7R@he+)Oof&P7DU z9LQz1;lgOf<6bq%ksvNDG>o*B*GgvT0ukobbb3A_PR^qlT@(*Cn_VM-@SFd8jGE+# zTq$@jp(fXB1yJ@11;DFm<7HZudKm$@Cq6@Z7)T{ z@RFt`g~8_3P!s*XRrmH9gEvSVApvz}=Wj*pSVK*9uhH#)84as0wxH5;!wZ0O>-TS* zFmLt!s~R^`O=GV|M9b?ShZ)$-94tAb8RJkQeH?;uITEBSG>q$|i}hzy)0;$?S5x&` zM9jT`W^_?J*y#fZzxi)7YLX*z&0ssBCf92P(0rN#;MG*~7A;DxBLEi#0l3~w)Wq$0 z2-S2rsU~~R!{!E2<<(Tb5fKY-Yid#$Y)%a|eS&L<7`#E^2nnb&JAdzL9c!qmbCYiW zn`l^du?3Z$+pacc%)F;@Gu5=b6%m75AeY(33`0#n@v2FV1aWDhVWh3RRx*);+PzxnSlYLX*z&0r^?Cf92P(0@_Xr>6GesMz@c0k|j#!1ZRL zCTrRPSoG2GxigS`+NlSIP_=J^yNgDSJQAsRMb}xVFty6&1Tn76Y@x(qfYgcYXVtE<(fg(QA`s4b=^xI zJ%E`i3V?q$ja(5G4Obul7sZ3k2EhL7?E&vzhyxddYC3R%MYZE9qROjjygDk5RVy`V z7%U}2O?zn`C!h3_L(s7}QmopUMQY}1tz(U6({fEz4A!7wy-?}7?P@PXO`XQgR8#Xc zQL$AEa=X|IQJ8EDCwXx>LSGV+e+LaCZRNF+dAgnm^J-eSHY&QVMKii69&9$dhMK~+ zD^5*vM821WYI40+02|j+0KA$O8|bIPG#~&M1p&C;44zE~yn7)IB=<2K*l$rCZ6vC^ znpSRzik=%ZH7N`>r-qvL(L7E=#NZ7QN2p1iS)?|aw2n2@RMted{~OV;>S7BjJ-1!$ zg=oK7<7TRw8s=S(x_e90oy_%X72AfkuO##j0stMj8h9e}P&MZ>x_iG(%sHwk~ZvVY# zSaq=lm7d$K_ChQ^pm8(Rw9yw8V|^gEi@gv(!U_g`oaDvjNRYD7Fs_#_*1vWP4-jEq zO+61rMfrnhMi<3{&1Tn7Q@ARf&8SI^$o1^(gqmEh6~M^D6afEh>K%-Via`Y6q96d* zn~9pddm#=a_c0tOw5T=>5mjDI1CK;So*B*GlHX z6GWI-)8uGW93Mq9x+orOHoJzJXnOj@7&Xa}Ao=)>CDi14$)n#67RM<7UQJU^(xTLp z2*5=_0IoL^HF@_!97yhCIB?LS+V>Pu<<)d*A}U%ZG&Lyp@!Lrvi~e@;zuM6MZxYI40+041{& z0I#O?Q?w{`3IVt%2*CAbq9*TNhy%%e3c|3sG_M1x-x~gUzX-rcZMX z5ra2K93cU9W|1m+N$XfcO*Qj$`+pG)t1h;n(sRQaw9~Bq{R?Ju_P(rfGu1S*5Eab} zAcq;)A7a6hGn%o|t0p-T#HEFXk+$+$$vn11gn2bpz7iFu7SW6@iU&J=0O2?P9Ol_1 zN93A8PU6|*daVE&U!wqcHB~LsqSUJhz(qj-t~V1kdG|sbNbX}eaLA&1`gNkptEu+& 
zsF+*P)TA)joEmDvkE3=w7TzF+BP5{CEK-eYTE`k{YJY=n|8JmS)x{Q6dTw|DaJj+D z?q4v~G_|gAGu5>CW>oaQ336U6mYkucHLsfFND!A68b;d6YbCSiZ6eI8sr7VJY`%qN zbWuFm=>rJA`R6ifk|T1>AUB~V*J}mPyGa4?YHE9j7Ns^2fQy0vTyG|7^6rH=kle>` z;IKut`aPn`tEpovDz@L%)TA)joEmES49(+qEWAMsM@T@OS)_X3*E-fv)5td6{S7BjJ-1!$g;@VU<7TR<)5d6E%7FLL5l$V>occqI#m7sPbwWy);|YUz(lT3!!1KIW^SuS*{^s@CJz^B%sbL zQd1Q~irovbcv-gSzl_qSU2H+6=SH)+&5C6>;)Wvt)friOb5>oUaWmD_SeY$0E(bYv zwS~o!Gn%o-t0p-T#HB^xDO-6h!fRI&VO~vhRoSAW3e6}HmF2-s9{^&{%8D~;k|T1> zAf8Z@>m`q#@%0)CfLGJ}RoSBRDg@x7AOP2!ftm`udm##vdm#$!y%59K5LI4HOSRde zt5&H=!(el2sOfVQh;w_5!5buw=rVOiZXTk;yORZ=| z7sZ3kX4gHTCsni}F5AO$vj}siCGXP)b}i!5hSIgap)?oxcaQjy2RY z)lawoel)DQ*n&#W4PR?qZt$`{%fWnZ?mVP%Gu2e}P_|ec068xfOU`J$eE&z&T~yqd-z&K6A%qZwTk4|e(h!f*Z`V$>u@390}smLc>T~d97q_|C|W(YFZi37JcJrMi<3{&1ToQ9nkdjIg3$~9Fc1V zXC>6+ddZ{T4t6Ff0A5Y2Ptl^(QwYFCK>)5d6E%7FLKGzTF%;N)Ats+8s=S&`Pi2dN zDNRiZgUzX-rY}(-&h0e@Z;&`b0_u$1M*g1FI@VBA)w6W_pGL!~i!G@1+;+7WqWhG_ z%~aFSbJ?QtIgs1MUWl{t! z*)`M@e)H$lBuC_$L8vCzYX#6SPXX|1s(6tWrRETTi-G`LZzgK;?u95w?qevh_d={K z5LI4H)eG6;)JvM06b749Lrq`i8X^X7kT^mD>demHD_X}IYHC}g+y5dOR$XjCrRTP* zy$}18ydi{im% zvumg+{O12*=Gi1iE(!v0y_u-VyBDG$xsRd1 z-V0H=PE>g{op>`_Y_4f)QW$Jb4K)c$iF136!5buwkbpX~^LJY7SVK)iZ_(}lEi|mU z*n&#W4KDyLH+aurFU0DG#?4gI&fD2y>TQtoVzK0mW}Nn_Nsa_@X`x}Ht-Mw;$KNHw zyqX3!vqkkLn$bn^V5bit{O11=Mon@=t{HqJp(fXB1u(Hq0q|-Xe2*5T-a`N`3IcGw znW)LT7os4!kD;G+pOxn3)P)ypUVUQM&* zG0|R*09+IV;CeGrlXovfL2@rdfxQ>vWF=AM)wFPVOmtixOYMcwFxZ?LYWfP-5HWaz z#1RruXLkOow2n2NO*>b_#MBjNSaq=lm7W{T<~C~#-ZR(>(Q=i>%~Vr&bxc%MgPa$O zC1*5alUGf0B#2834I^#krO9_&W^_?J*y#g6>{(f7Gis6}a?RlE zgqmD0dGr7(uB8BYHEmrJ6C>9k02c)TxZVuZROsCcQJCBdQE2aln7fXs@@gu+E+$6n zm6|jRHm8P~ewhMsZm%(TgTxUMP-o;e@^?(@SVK(>*VFC40S&7zwxH5;i?H0_J%ito zG1#baGu1S4Lrk>Y0CHX|mYkuc#E;$N2*%||kh0J)uGdQD@tcS+ucq3jm{_demHZCb|~YU*yK+kYz>R$XjCrRNr9 zxxpLmVgR5zv$)OPu5mNfwAvOELv0|ZuC}mPa)z4z8b8FAKF+NuE=PitMc^r0dFf*P z)-=#ggn2b}+z}JScc2;17(0Cc;W`?pCOINk;6gRIUMqmXyC?u&O`RPvQPP0`Tofe7 z^=6_b?_P+)qRrVC?4$e0Td)YA7kG1&7R_EP}7n#LZ8 ziN*&IfQ#b6W&>b<0$8DUFGOK-A48$N7ozJSqROjjav&y}1~fG(43?6irmwOc3vUp^ zkz!5!8J^ksdsypOLrtrLbo(Df!+N38bHmpf=hpAvzhK_#>xMLLrkYwFiHV()Acq;) zeHAP@LroXpM?%xbAsCkQCn6JcIWOOM7x&!cEY7sZ2}K7eplnp2Y; zk?YwXOMGu~y;cBQk5d4=nwCdsQECJMxF`s~^=9&H^6rHwOzvYSwD&@cj}cW~O>0lY zMBfvdniK|`Q$tO^#x+C?-XL*=n$(${zj3W&4K-Da)9wFBG_1PVf=bUVhvf$E8SI7V zd`jbHs%dZ{CK@I{&Wpv8Gt^Y;Rg)YE;?hFHNLzWWWY$g*VO~w8PshaMB%0Ah@nEM9 zfTpL<$C+o79Fc1VAJ`vs6wS6f&tIYUjGUNy;)ATBKe zPua?A5$<@22=i)cnvaRK7txFoQCS}B^Z|t5{5dtr5xHg%s>$_Q0dy`>0KA%-U#3N= z1q9%tAOP2!$+OA37osq^kD<`s3sL?mQRUUtx)c+qU(wX0FxZ?LYWf$H5?4*|1~DAb zW$Mh%-)maO8fqF`rrZBA8dhCwL8a%G$8v+0{aFs?opSkgjhm^at(BOVSOGaN7E8`h z(}nnT1@v)}7ndVJ%0k1qUb^pS(e+Lb#F1Dc3bHi(a%MIQ$*b7nip2p2o zQ{7fftiB6!UM!ZJp{4<^n&e0jmlhgE+RAGsbLo8|%&Td7J0@DU(TpyN2RnTL;lG%E z;;571DNlnr1$rMX3)EfQ#b6W&_~g^t^i^3X}U73hliR z0~ZrjUQKf)IpRb~PU>H8H4K)Lp(d=vJNb;k8zhbtt9EAR?-C-#{!jL;(i|~SnnMli zg-XvYpXCN`xQhYY)S1Ptxm@FBs;TqR98qy8$f>I>ES8+1rZ3@_f6&KCUR;g@DGLqb zdg)^Q*;INt5$4r&`m!8x@-iaKpm?y;2M~Vq=hP%e@b#L||2Lvx)x{Q6 zdTzVg3o+BIaWmDld{d4Xyb0uXu@_?H6O;`4ILV94ksxKEVO%d=tXH`279z~6>BR9I zv3)a|(M9oKv)MIn2jMq=PEB$|t{H@Ca=lgn{jC%Lucr1}bHq*y0&r0ffa}dfP2Rl_ z2b2344&nt#&votXM3q-lS6hxKZqwAHFxZ?LYQjpq(-1LugTxUMP-hmY{yVgeHPkeA zf^PpO(6H)a3o1RgUG0U~xKrb1s;TVG95LGta=X|I@!fMM8T4_I7ndVJ%0k1qUb*8fpr^`EzQLBXZ3kRFmtq0+_jn0^rp&(v>3`x)6Yi zf&g4^CTjBTg*cep$8ZoYNP4b2dWb5orty1o#IbudH7N`>r-qua67Mub4BjAdgap)? 
zMQWy3>sUig%lFak|2{OVy4ZqB&uv$GA!;7bxS48d?#mHd_k-Ln_Cj=d)g(uPxU|qP z(pFw8nWrBl!n~Rm`g25AKbp}+@nEyrHPjTY2kbrSR6n^=us5{|u{ZIiC;z@&N!WYT z3SeW90^rrO_)w1Meh2}$CS7BjJ-1!$g=inqxS49|e=JAT zJqB{S*b5QtrDV{@NnTuz1Sty*<9g|0y~5Q`5Mf?T#iKc5{Bbm+i{im%vumg+{N~T8 zNsh>sf>2Ga*9xF!oC4t0RPrP(N{t}^7X<;h-b~cw-3xIrxsTx>UXb)$FHI0tUQOi_ zIb!nXnwk^_n^QwgSc!KUA_i}eI6?yI%pz6uwAQhPnwlr+_Wu+bR$XjCrRTP*y$~bM zXxvOS%}nKp&MA=F#a@Vi@~TOW1aWDhVWh3RRx(e_5Mf?T$DYj*E7NF37sZ3kX4g;? zO;4YF%(F?3$d!V9+A74p)cV1`qxARm7Hxl?0^rruc#0OKop@!Lrvi~e@;zuM6MZxYI40+03*v30I#OrS7}k|RRrLoAOP2!$+OA3 z7vf-YAHzYsAn9sqdY!29Y8qI{5tS>NniK|`Q$tNyiFefmZxF)~5>RIrsgXCdjy2RY zvr4!BRWz)+*n&#WZC86CO4c=QrkZNr%n{3LAh(OX5Z~HI$)Jyuyto_*QWhG<_0q+9 zg%?f}VO~v>Z{>*NZ=o4o6c09=T|-UbH-An|azw5fglclVRsf6dPyoD|rZ#9%Y6Ahd zC7Q_|5;5N1Y5O*91P9T7~$e zwhEzptpG|cp#XR_tzVog1}{bca#m#_0N0y|nhx6`6lHxS_wRr9(;ps8?u9sr7bIOx z)0YxeUQL^2x#DD5ZfY-thQa34P}8r|JT{pi25)$zShX{YR7nMqV)sJST$U@A%PD=@ z#THb0ZoAqG(Oao;Gu1S5d9G-_9OQPf7vii>QZne{Brh&Uf|P}ZalLf0Ug2X`5@G(? zR9TfPPF;a!bWuFmY<7*?LHN!8lZ=|=h+H!W)#Q4u02*s30A5X1SLKS?Y6Re-AOP2! ziJH87Ar2<@F&xATlAi0+wM3OyQ*CXon7dk2lfqzgYN+Y!Ttmd*4H8F4K%H5n8n4wl z)_69x*X4?`Iy9`h*n&#Wjb?M3)xUqiOxVb-bexPYHDlD6Kro|4r{db^Y)x{Q6dTzVg3sKdjaWmD_ zcz3SY=mfc4?1eZSP%`M_Brh&Uf|P}ZalLf0ersRrCc?a$=I+TA9rvIaT@(*Cn_WXq z;WvLyO>#u88H8$by;cD0_fY`6n&x|QMQ0BJa8VF|>&-+>-n|e-$$bn(_FjnLKBCI2 zY3cr4(RIJ3CWXP~)KJqmxrT_r8zhd9fI74D*ROS~p{A04y8S6fdXxg-)wJ_St{8g+0h}>58vy^N=iLiYl-$Qq zWbcJoc#Np>YASs!SBwv9YEl?1B|}X}iF136!5buwkbpWPw|+aV8r3@1P*daMbo(Dc z!>WrdsPx=+wHIP&Oyg#%Y3hkw(f$O;?P4!P*141n`Z&pp%MtpLko-I7cxfvyU92aw z_2)#GS5y6Xu2^~!&FG?du-WVyY6`#kb83sUigz0-92pGL!~i!G@1+;+7WV*WXeo2jPtnOre419H3A3-RY(HOY}6E-f^Sw3XLN z=E+$i%&V#E`CL)@Jetu(@nEyrHPl4Y)8|vnvq_G~HG@xSs}P?`twMb2DE&SCb};lJ z1;DGRdyW>R<`BRcW3vJ9&nE9)h@#{^h9Y|}#IXgU%B!jGrCd?|lBOnw!BR5R^qUl( zb9;@!8zhd9fI74Dx2Scjp{A*q>GuCJ8dhCwL8a%G%W{MF41O-e&XUH>R8!Sbu2_5p zwQb839^R9g*S-d2nnb&JAZF!9c!p*eVuOq>u6YYu?3Z$+phLPG;Ciu|Zbfl95~M6NjO(R~^;`S)CK2Yh4^%872?xJtpFM>qX2j{Rg~w6nM)DC8Dp~n@NatFy%0soy%0tA zUWm2Ji7KzA>dW)QsfxVRUI-0?rDUk-w<#sg?KK8(kT^mD>dekxmDaJwv#IThJW+B5 z8dhCwL8a%mtGy5t)fzWbP4idgiQX$gZWntYPJ7iPM}oMt(DBk%UMrbhR}*1gO~-5U z#ObThj4p}?o6WAFrtq6TrzSZf*9<~6xn3)P?m7y9S5wP1d19j$0k|j#!1ZSGZ1V1f zC`#^QC^}+Mt-Ow?@@hI!pC>l2)zqXg*qj<_`W>3bRTI2H3`a;nomr&1uh%-(P}5KY z-ToWUuE@lYEm}iq5k!uF$X{!+DrB)%%J8A_m zaWe(Lt7)(~Pt-IcfHTHs17M#`hrD|s4khA@5#@L&<#%hwQx&C+{YzyqXp| z^F&9drY42K=G0Krx7m(`H;CZ~38*tWfA?q|Yp7|bi*Em2XjpZz1(lv#l;sBR8T^in zmL83psiy9G^F-CXAm_zm$r)-o|2*PBALmvSmm@*SLc_RTx>&z8mETWQ49_;i1gx~x*HOUdVW)P~$^;!W`^iu%5nzs7##7G|ka8VF|>&-+>-n|fqlKU7A z*?S@828b%Jrs9D-G5VmUCWXP~)KJsE<{BafZ;&`b0_x1p-@{tR8ft17q}%^PXjpZz z1(lv#Hp>m(Gx!}DgO6z3Of^lM%oA-VLC%ZCk~7p)>Q$2*3F6X1!$@0stz;e_Cc?a$ zY9GxL3qxo|7sZ2}K7epN;4{pdo*a>D2A@g1>A7AjfR@K80A5XXBeW>>7y@un5P<8= zL`~kk5Qmcc7!KKcA-10&s=S(xJ&`AtMm04l3^u2Rn*I&jvG4{l93cU9X6J8Q>sUig z-A~f(|4B5gy4ZqB&kbK|oLj$t|AKj|pPkUSnQB`7d7c>hImls#f9CJVSn{e#js$UO zp<$%0yjC&?o+iS)nmQ))MDZk=(M9oKv)MIn2jMq=PEB$|t{H@Ca=lgngVPiMucppt zXi@4J1mL0|0N0y|n!I}<4kh<79J2R9)IUd5c{TOSL-t;Xt|g+%t7-C;Jkj)u zrY42K=G0KrcPJ32A!6_bi6bPS&d9Ad!0fWtv4)yfU!~jst7uqtu?3Z$TQ18D-ZR(> zQTMvW%~Vs%N}kwx4dlF7EIC6>9bPrbksvNDG>o*B*GlH*8$_5_)6#05=vhTGx+os( z^Z|t5{5dtr5xHg%s>$_Q0c@>P0KA%(-=syUHxYn~f&g4^CTjBTg*cSl$8gBr3o-sS zQRUUNb~;b=oz~Q(FxZ?LYWh94W8n>AI6?yI%+B9CTE`k{s(6QP{~Ksnb+HANo?9Nv z4c;^O9T}Zl8aGo-gYV{vhIc{Ei^Y;N)O5~gi3fe0TTxt&1Sty*<9g|0{nk|bJ`v{C zRJxrfCf`Fdx+os(^Z|t5{5dtr5xHg%s>$_Q0n`=8eQGNEfEJ~85P*w<09Zu~I@*c{Np*#KrVQ@zlTGY8Y%z4K@8f*AOvygTxUMP-k}jE+JCv|733| zjf*8n=V+oB!vSH$6Eb*9<Pyqb1 zskt&P)-Oi@E(!v0y_u-VyBFe6axcUodoM(JHBsf&)OuxHoUYQ;q%hc=8fyBtY{$YI 
z#BhWJ)R~>X8m(iEXVc(SanW!U8dhCwL8a%G&vJwJ41Pz(a;?VARMXbgaWQc<$a%3? zaz-43nvQ-B7c&IoawJGuXc*T^7wd0&OUH>Yucqmn7Qf_>Kc z%(F?3$TfrW6VE2sYXz`;D+R!-X{IGE+FB5Ri-G`LZw6{Q?A;4-IJp<%@C6ptfi|Mb zt7-1GxHxf}Qj>HpRbEZo-ElG8t*J?2usJo<^gWu#RTI2H3`a;nomr&HdbN%<)KqsL-Tr&fubmH;2*d9SMx+orOHoJzJ!tW}an&gQ5mJ_PU^;!Y+KS=@bYHA;&MX4tcfQy0v zTyG|7^6rH=oZQE7xWJ-XJ3&-=HFf!iNOA8Gn zZRMqlbu~>tON4nf4Nu2K{WO}*)`M@u1a%ik|T0GJ5-bFwE~zqMFH??8hMTu zrJh3oE(!v0y_u-VyBFecav#ItLW^q03q+My)A(#$9GlhDq%hc=8fyAut|4OZ28kmi zpw28(GcRf#Yp7{?j&A>RXjpZz1(lxLuJ%IIENI+JH8sB!7hCfnw~M_Hg`cNn(8pDi z9HHglS7BjJ-1!$g=l|M z<7TR$_Q0o1%h0q|-n*`P(Kw-JDgf&g4^CeJ4CUWmiVeGG?-EUHWI5>;MJ z5v4)zO-=o|A78+JvY(b^xwyV7m zBRd*5Q%y7P$3^G+Ah(OX5Zhig$&nx~Ei{a@mDftnd)5d6E%7FLL5%+g*bf3qFQ<> zQRUTiyewa=U6P;L3!!1KIW^SueVWHr6TCqTM@T@OS)|%8BU0>Mi2m|?QCCjs(=N84 z(sRRWfy)hEcK?F;;U>dowWcP8!RFLZ)4$^yA_i}eI6?yI%px^X zr**9HY?`?yUvyrBhE*3^Q0cklv)tf4gS`+X*J<2LHPzJTi{)!U&Wpv8Gt_j_t0p-T z#HEFXk+$+$$y~Ue2=i*1Y{(bK8_ouI(M9oKrw<^(KI;odo$BL( zO`k8M)*HT{twQKt`b&BMB_}8VUQO$5`C_mQ0VJT$1e*C6|+ognAMV#yh5s`RQ!js$UOq2r~kyjC)gbrWG;O_lfL zi&I@_Mi<3{oj!o@n?I)}IU?5#LN&QwD}cuPC;(nfRXzD)_Fe?wq96d*o58c`h<7i< zk>oyxBl|6?r|&1Kyqaq7&lhvOnwk^_n^Qwg7t%bgn&1s$I6?yI%p%p;uXU`UruGNu z_WuAHR$XjCrRTP*y%19a8aGo-ix1|D{s%#B7keSrylRpoL0npB7-=i7mCT-pi7>CG z*1>$S`4F1XMe$&>*)?tl;WvLyO>#u88H8$by;cCdLlgk7rnX1&#nwp#;G!S^*PDr& zyn7*zB=<2KvG+n$KSoq}HFXTH7N`>r-qvTlv3iV3Em)vBP5{C?EF2hb*!PL zkrBH6kDy`I#THb0ZrLn1c+cSHLaaZbaWmCa@sUigi_g&Q{~0u_y4ZqB&uv$GA*!C!xS48doXHm(&w|`8_CoY{)g(uP zxU|qP(pFw8nQPAzVO~vhr}9O|DKw*t;=yLKYupaPZ~mN`|lAb>N*W&_|?A-sDbjwJUn9I^L8EUXb#UQMNI z`C|MHO-%}erDUk-M--lu<Qx;s^<-Gji(cL2eg&A&R|fk|RM}TIhIbE3cKz)=eVJtEv8-e6h5FW^_?J*lczU zHHH7*l~a=(k^fURRFmtq0%&`W0^rruutkef?;-#f1p&C;OrA~Ny%0x|`xuVcdm)N< zh$^q9ruXSz%C|K&DGWBJhMN8(rNmVeyg>{{NI;$0`77Sz`PbXt59s#)0UB0aY(b^x zmdA2~_Y8h6#C(ay%~aF+MSH}^MSF;x7mFolG~!iNOA8GnZRNF+dGZn>%&V!Z zbdM-4CBh7f2RnTL;WvLyO>#u88H8$by;cB2W>~f;YtEsPIk0`I$liCZRVX!$h)bwX;$HE)LaD)WZnVr8Yw2n2NO;eS7 zM0+I~R$XjCrRRp%0$1tqp21#-ohvnNrkbj*+#?pNK+cQBk~7rwZ9zQf;}DF?ksxKE zVO%d=tUu`IYKSnertz!xh^DL1j4p}?JADA*H-An|azw5fglclVRsi$YPyoD|CTjPH z=2`^cq96d*n~9pddm)Y__c0u?_d@j46IEVK)7S11$FJ4Yq%hc=8fyA6*AOvygTxUM zP-k}j8nliz)UWrdsPx=+wHKn{292AkrnX~yM9DFb+r?gp6J9mRksvND zG>o*B*GlI0jYODN(@NtW(btG(bWuFmY<7*?LHNy|QIsB8lE^BX$9MK;i$lALnVH9NTJu6`pemUzq=TQdt4M)!1 z{_}G$6ipG&d)@^CTT1D#k|c42j?@{GC^43;pv=~vjf(DQhG=cx-z(OTBbH-SRItHE zStx4g)4d#5Y_i~+hJF~zB7V%3gHz0+7jx)qA9Cwl7TH%75w#x@^NA|w{Qb{<`ojx? z@DB!KUtR9LsEF_D&pjtAi|SPU5tUryp4aqYv3VBdHiAqfj;NG6W3a{G*$T|8)WkSPr_W3GRu~*ETL+If{@xE^=3+S$wLthAd zl`0R!)F<(-m?+Wyuu}UOjIdIhXQ5P4l&bX8D!WFhYCk1b&ZXo=6Q#ltrBY`Mxaj_j z0y9fBbe?EDFGIAN^L?c%`J9;eEG2U|RVryAnbKC{9M`HZ`bxF)c`^3+RH+_ImFlr? 
zs#57*4t@H|zEW*}NsN3cRjLDFsSYq0VX0QmMX6#a)#fj$>>8yi|0OXeD7o2*QsIbF zsWS##wEc<#GfUO`%cAC&Gej%gG z(8Fbrr>IoY0=l%7mqQ;t;49VIe$l@_RjPupR0Rx1SgJXJQstslD}^e%Mya+7#AE>_ z7f+`%{F&dP@sI*DOVwT^%8D{X+dASa)$(D{dzg}0ohp^IkW6W-aV8OBVONR_HEEL9VBb^`wf*{qf~20 z#qd!|ZU+BwAZos)z|2xL|CZSL%?#02f5%s<+20oJzfH+Johp^IP%3GwaV8OB`Ct1= zHS=xJ_H9B>2sUeuN+m6zOIvw4^sevtN;UnvqUCo}r8*dv>L7y=mTLGYO0@^2n*4p0 zU87VBzbE>BkCL0gs7y(p)Acfm z5VL>cE7jN^i^e}D^zc^qQtAfyQbJX(mqTy;cfL}MeqS_vKUJ!tuvA41Mp&x8-$SX+ zLaB!Ty~?gps_7qyjvr8RGg$HzTYsv+%u-eSshGboL$s+M`ARkPLs9obN@icGRMJAJ zq^-u8M2LwW`${$VXQJlM2t8b6T%=M-3+U2TUJm`(U;0Wl@E4-$FH)sC6qf1`gAtah z;|D0!M^LK1zgF2bN;Udd;`m=ta-n#3ZO;9&CszNH0y9gs^EYDZZ!$z1`7gdw_5MUu z{DhKuKHhS`FJ&Q_u9r!K82US3sk;AGl>IHChp*+Yrk+)=YR@X&%b{2Qy{}YVKNTfE zO_l0!SgOMeMp&xje}z(=jZ$^|SCw6(R0BT~^*^KJX0TWz=Kq@lGfTDp4`SpWGDPeD zAHGtx|97$TkCe>PRH>weQb}8lGl>wr|I1gZwto^^|C7+e<&hOCm9&5^ZRO?AON(h; z!CvYJzXG@ZAF=VjQ?)u0*6IiY64t8zXQwlQP@Qpr1D6sYznIoh77@i|1Q=gR6-`=5tF+ZPH^E>9 zO~9{PbY2z_C6{3WaLmeDqq<286G&Tmxe4m8pb6}{RYvyJMnq3#M3h!WP`O+zUhGf) zCx!hNFhKi`VflhevhQT>Ip?5wgI8+FHk39&>mqGc$n}SaWOzib)fqE^*t$x?GmBSo zRYc5J15ho408pKAfTwB+kT2fo)e&**YUK7ii4MTcQCi5Yw3U~eVD?&?fG^%yT|_k2 zVS@1f_-5+<_@;J$)Vo`ro7jOCIi0Hi;6NE2^Z&C52g$bmsyxauix6%ZB@zz=*qOV0O zUUK{Nes+x`Q9LRMiWi`GTeoS+Hi}nq8wsTqx&CmXcsL^0>WpLBc|yZ8i#K?CL^RwE zK($l?Ky}6e*4{~geDO-#BVzIntaQ-?eDNx~B4YY(t$4}Zy8GExk3{jPBq-jeP`tW(wPYK`Yw0GT+=E4!!wIF+Y=GpJphDG&U7iIqmKaj;x*kL5o^83?LVf9CoL3D+G?C* z-1#6)z!$H%KO)v2zy#sD{st9KT9`oE%F9hqK1dVr#cO>iB2Euz#Y=7u-OsLrB#K8R zLGjK*@j6dx$u^2N_%I3OVdVOwiCp7|;;A!^>GF_8B&2^=ahxC=kIiWg)k&mt(yAEKR@{Z)Td_*`_f;_`>oo z6;E22K-$X7O)zkZCg6)V_gqArcup%`a_ifEcC92)JSqu_cRq@@JgX(!DBjldB$Vfo z>yIHVi6e@q&N!ycFKT#Z@jB-sqGAq!nJmDRE)XDJywmd$adIBHJ)SC_w2)hAt8tET z*()>wU%ZW%>7DImOc1`Byhp{87ABCk@^TZLdX*;Ni?_WL5yMMb@shi>_Oq)eiQ-X7 zP`odocx5YEvW?=^y+%S=My@}eC?1X|o;u^0_OEJqX7R>ekBFAn0f_s}nJ&pRy-9$4 z@v7G%V(ty(HjZ`xXv#uvT`$MD`7}+y7q8|mdS_e51mUa6HWg1=m_XXf%T2JkK@;%B ztKW!-g}1fhB{%l$XV+5_#iNp-cwa*Cn%~uuZ4|F_lZ3K~Tz>*-NgPo;b;dEBc~8SL zi?_TL5rbO*%w&GLDNVo^uj}HdD84wVzN9Djk?dy|SQ5pflAw43#p^F4WcG(Pja?EI zEtg;u2kl3o;^BzmsWXo0M!AM(7O$*4DrPUubXUmqMIB^KAtL`w2)hA zt8tF;OchPQD{thAsA#wX6Hx5<990%32%Wl@o1o(=nt(6fcy&}9t5(H3kX&Io;9p^( zlAw5BLGfm;){<=~Z@DHa25XS(y+krRqIl|zV_H+E;hDv2z9uTRY5}O$Q~;>XIKb2O z1jrX};o7L^x)!-DOy*Hp$gS(;CfK;1Cg6*=*g$)b8Zbe4gG8C6g$bmsyxat%jWhvY zypx_z(yO7)MRPm&R z+)7)Ga}%`F{ytCf8t;jU)hMny(E-gqJFe)krF+upHAe)LOEleP7<>e-58lnmK z;tf0!6_t-@#Y?WR9PqEOP)SfcdMI8fMuxRy8^xP>l!Wpqa(yn+k~pGx>WpJr^0F$;t$4{5mIMA37AgsfNB7kW#p2UivW?0A{knQv4hN^2J+wHYx_5MQ*=>e{!UcvXEQX z%P}sQr3v`rt)HTIwo{lOd^JJwq=gBjt-Ra>({nTdU%bs1qT=KWTJe%AEC>86EL0K{ zkED5_D0xXswo$yAc@oNt$n~d@mc$XoQ)e90-j_8zvv?y5QPI2rz)V(Hjx7-&U%bj! 
zqTDOrjzIe5- zN5$NVR=ng2%K`ri3zY=Lqv~EL8rQUB8^vpXgM{)1a*h8c5FSxHb;dEBTG#N*;w`=z z75#4l5ciuiU18~Yn*jOZwVsZO&9{)-wN&w>h1^P8jdP59H)#UCcx~^{JKF{(2wzQd zsd&=D1kzSsZi4FfXac@?9a~Yc{jOHL60%#B5aXA1Jc}^Sw zsLnXRwJQmbFWy{Lw&Pzb zHPHlo@pc-s#aJUI2wzQ5JZWJ9X)7-`!9p`lz!$HyIa`e1q!ll@!cyR0VWETq)fvZh=vEESEZ$U0wrFnwpjuM_pgQ9KTiXbbFJAp^ z*#T_&OU%aL} zv&BlgR=ng2OM!odg-U|r9YyilI<;gQ#p}I`gmM>h{rN=ka73=v8OLWl+Cc`pI-#p~+M7Nye+g_5e-57q71`Ta@={#Y?WR6!=$Is3a)fZ=!fZ4{FIaiZ|6yLg`1Y zzmO;%j>xq-e+=Tc8Q};#DtXi&HOY#Y?WR6!=$Is3a)fw^6)?SF~gs#cNw6p)4ZTUq)II zM-)$;aZD#()$q*X%`at(-X#ELvcl4}LV$enj=z>IPA?<3KT8!)TF9-m)i}qv`wg0a zFJ8+ky|cZJ3Bp&C4^#1^g$bmsyxatp>ofshyc2I`i_JByc*zx(0{;pNl?286Hz;2B zX)W1C@rK?ap}d7$i$w8oMDf%a$8>c=!!wJw^LDnFdK-Ya-<;_R%lNwl$QN&5Gh0+| zBDd42;zwL0UN4mN6dX7MI&h>5lv02oOC;s`)> z#sMC`i2(WH)i%Y%!i~snVka;_Qx!cyp8VWEnZU*Ek~A>WpJL zd%K2b7H_pJCWhJo7)=1;2takl0S>eiAYZ(WJ7S{v4&?T)v5lKP%0g~kFE_#9T{Hn- zyv~l8DCxii;d>>DCoN1MZRO=AsPCc)_~P~49TTN@YsE{huoU`NSg0f@-XEcOgWX!P zjp9w*LqfR+x&Bq8C2>Tq)fvZht4G5#i&xPT6Z7{1P_3x|P@Qprr|u^}zIda(F>$OH zxgAXvPg=;WwADB_!E8TGz!z`qftYA~02742qvm5&JZWJ9X)7-`LDxex0bjhyftY9- z(2AE_VJY;luuw@*ygxzlW*^p)Z4_^Hkc2XbTz@rDJRFg0b;dER8`AL1;y6FVmX zh{s%b1fV+O05^vTkT2fSqcPF*C~|uN_TU3FWg)k&mz!YgahiZH-tq{&vyEVa@Jj)T zCoN1MZRO=A7$2hv_~NZS5fgn+XvIseuoU`NSg0f@-uF?wt#K{cM)4}fNhnVu*T06e zB#y|nI^&pjKBeKA#T%T6iG~RP<|F`d1fV+O0BffRkS|{8(=jnQiQLwviYG1PR@!Qu zo1pGlnt(4}*)+YgJ%b5|Dt$gq#gi5$khb!26RbQ(6Y#~Wd@d%YXSCuaS6B-DD=btJ z6z>NpUfrygY@>KB&y!G2A=h6clHn1#R%aa3u{jOTEZ*!3G12`305e%(>3E3%`QkOr z$Hdx;$n9pTc+x^{rLD#}#+{2a0bji4m+75t0TYC;CMce?FoCp{mz$vcRhob=Uh7g! zoPI?sUUG${(7(b$B|-5nMDaRb(~@lzZ*ZA}f}b)N`xlAg;fUg?Gmh!<>l&U}ysed( zm{4V}kJ21jUmU zCXlxBauXcipb7ZmoqRhcs^8X%mt0{f^slf`Nl?81fZ~m9YRNW=H~S6=>Sy zzT(Y%K<{iHV1n@L_$R1%(!vDNR$gv`fs1JZzIbybIpRb~PU=f~VRD6~(7(b$B|-6i zgyJnU8Y2r&L7{MRM=C=0oDy&U7RDw=>--bQ7P7^=hs;j0OXCoN1MZRO=AI8{v(@WtD{ zGDi$wsTD7|!cyp8VWEe;Wyn!a*i&uX`j#xORigz%%!gA2R!a^lM@%|jeYi`n#Z4|Hb zMiR=6$aMjc43EgQI^&qmG;4Tf@s@AO5ra1YP_3x|P@QpreYX%GU%V5?bHw(|$n8q1 zc+x^{rLD%f3Hn=U0={_dx8{hQ7EBPnnxJ^n!UWP*UT%Wg+i3#6cwKEdqPR^fUUG%y zpnrvhN`m73C5qR7hn8%kcw;9>C?}BX!bI_KM6T5t$8_UP4bLoI*_}CJwjF?KO$C7J zj02qRBtX7+!*}I~`n!WA|#sORlgS^slf`Nl?7MM)78PwPYK`TfUEkavySyU$qn-k!y9v zF|Bz(!!wK5+?OM^?gyY+Qvskl;{Z=TNPv9t7W#8US3h#wl`5XJkXvc1ac+W*L7IRs z-r_?!qWd9C5Wbr1rQ%5o6G&Tmxd}!ep$Yint(?peJtwu|C0AGu`d3(}Bq-j0Lh&{p z)sk%#uk2A0$`Ep0lqepK$hA7-n6{5-cxLhXAIlMSj{#7vsQ^%&ae&oN5FlT?;?W#2 z{y1_Q;D3D5M_I_N>*Xe>8K(*O;*~r}?`&h3Abd4J@uY-l!!ZQ2{0rJJ`S_&LLGh%838byO+ys4FGyz|{Q}5=8)_1ky zC0AGu`d3(}Bq-ibQM|=%E!jr#Hr^wlyoX%>%S7>TMDf%a$F%VS4bLoI`%aE1+W{c% zH)p!SQe5KwB_V6YxniI=mvY;hDxS2ETWPCtj&aE)Gyz|{^^0@G;Ki6Ad^Pzb6;E22 zK-$X7O)z~aO~4m#vn*GfEXz%;upCUTupIQSuuw@*yq}?XB^89suCUZxmMfOaiFxeT z6S>9_#ZzY-)80xA&n(`^<+-ByasXzs!gB0N0_2rfS(PhJU4h)5h5sl|A7vr8u9sun zSVI%=#jCn1SIkypg7DP@#gi5$khb!26P&K43HajG*5-=2tF_`KS6B}ES6HYdDBeGy zc#YR;$u^YNUY9G%>X7TNBQ1#|il@#vrc>8xcxLey>vKhaJpdDVP86mR5a63WfU^}hn|#1XkxXB^Y@77foV zUP()?n7IXjYE1=z>Wl-Nyo~_);+<^G6}7F%?PRKW(n4;ft;V?trcTfVeDQ{E&lPpI zV}kJ21jUmUCXlxBaub}mlP2JcH`<;n>f5#AC0AIA{3|R}5)|)$pmo@D=bC+6&5N9iub=!ysA+x*+%giA19%VAlF9|#lsP~R%aa3 zp)n25EZ)=;xuX3E0ID?=0ID+%u=VEz$QQ4EJXb6|iQH!4=ZnxsS;(#H#m~?LeDRv5a>dHiTJe%AEJgkm7AgsfR~$k0+Md<2 zZB(y!nv^n)Wd8>8k~kvS>Wrg0|D1+rR&RYKSB%U6P%WweP@QprCua$euU^;lxuW!W zB=^r#)sq&ID{VE-O)&H#O~6;LdyZb(<}g8`H?lrO)sq$`khb!26C7Kh3Ha*uy_74; zU(%|VTw*EmFR@TbP`#2w^@bL;Y#Y^^dYPp1GLrq9iR$5qWUDid>dumeXI8IjDOW7M z0>DhxSms_MK)!n8%ekUy8Oa?^RZm(-uC&!S$9R5~Cg7_#@jAV=y^aaOmlIS^T9`oE z%F9jA^CnHeS8sYPR~%o{s+U}2De|weP)Sg|(nR&<-_o*eRBwHqq_U1=|5l=UIHG#$ zjHBAHq2Za;YkNCal)MeVOcq(THwloh-pV_Y 
zx9GKP3loHIC#asZFoCp{mz!X6hbG{wclv!=X?S0&UUHG8$iK)!B|-Jd64l!&&hvax zuPV+Hiyt7_-$GszM^sOpaa6l6*6_^g4VC1H#uEDNrWviW)Rz$;U%m26^2Bs0eqCyH zs(R8wa{u4T&i|>d;tb=NX=lzkclrm|scli)o&KWL8AqTqo#`bee*YG=YHM55j1%kF zYGd0t{!)TSkPt!$A>>L3xgmrg5=xK|f&>u}kr0A}AVGpi2qB0Bkq{)L@3ZIZ-95v5 zX2Kn_d3O&y&*u5Ku)HEswSLa+IPw-5@bog$Ez!3T2JZFbCqz$GFi^E#)y|T@gNZE|AS4esoK~K-Pt#)M_dVYFC zTP#tx1%gXyvs7;*L7rZ5jwPnIBHZa1JyjuGRqN;6j%)JBfTvfo-4ZjoFmUfDprs8L6w1ffZ(qLK`FdIJY7QE?Ck?)?PxR0RW7>s8L6`4AcK^oFY}QCVfs^BOJj!HpI| z0(x1J-o$%G+CF*<)pS&<5%#^39!7+%XWUjxY7IRx|e7q4;#84YB>)0?ZeL|Z)!-1`aWsR{~WuxgT3%xCP;+#^|XE;i_6c=L||dB?F#bUK?H9K7oOIKlv%q zQxyzUtyejN`Qv23(#}I&}HcP=}nxpM9WDC;#1xA{{KRar$~^eSJ`ce8SK0><47n7(q|ZxUJ5eHT3-SGR|6JXaIt`y8=Ny;{^MMNRX%3{-q@fzC^eW;-?bQ zOIHYY@hWF9aE=UkdL6@baT|t#EB`a1rz#kzTCZ{j)uUv<)9XHOiK6ocJ+IdiAKYsp zB%qfk=?#n-Y5V9+T%e6aYr#Cub ziP{N-TN$ILDuk>OME$fu z&+E0s2lrYC3FzfZdUKbJw0-n4F4Iw&LD&yTdKf`Z&$z8t%^7-rdQDd>5xD}vrSw{I zuaY27FLmA$-QOcz`Wvp~OIHYY@hZ3Dykt9wUfMOfxGliIy`O-ds$igMy~-Jkr;q_p zFEhm!y&LS@#ndJk zxc8Hv6FpVIK-GGcGf2-S1D;;#7F$eb8T7nfOMGyzg^++=fuvWJW2Egndrezy5!s5c zACX~W1U)_DwmOz;==tf*ZL>w!HV7`I*V4X&1bKS(dA7*RL%97hda6RWs@BiB9d|^? zfT!26(-v7fVc_0RKu=XLP_s8L6v5X9OdV>dSQGP(vOYnLv z3BkPg1Wl`K|SLH zQ>#djr#D?`i{?s%TO6aODukdL7GbZF^e`f9J>$08P;cn@>2(~n zMafYJ>h1~z^^6lNI!1y#z3c{C48D(W?Ks*Ty>x|e7q4;#(IztB>E(Q2i=jpsxVKBt zQxyzUtyejN*=91}>E$=uV)#RYp4V$h2=28I63{D_^r9adY5V9^wa`&%LD-KY7dawq zJ>$08_lcqBr#IGWi>6ix>h1~z^^6m&Zzn;XUiqiCm~TV4FU5f!=&1_fs#>pd1`Q|3 zfTvf{K^M2hplZF!8RT}60Z*^`q%D$98uYwgOG0q3g^++=iKN%iZKUm^ z*YO!0mCq3N6Uar52wTs%tG)1;M5CT6#YxL7rY~uPyR>5pHK3*nys^ z5U#5AD!1dlelpo+SgGUgv-<3I+^%Uaut~xYt5R zK(AEN>-*A3+edF~kdDeA!hRCD$Px7PjN59?u%YLt7yZf>b3+i+-4zJx87DYBLV`TK z;d8dAK8JAcyh*|%da6RWs@AKV!OR6R;OUKw(#35Q2Cn=~L{C*PP_8~JJyjuGRqN;6j&m-P0Z%VwhAwX3!N9$rfS#&gplZF! z8H`>b1D;;`6c2=6{%#v(<^+#5fdp68MyZo z&{Gu*RIOJzgVc>=z|$*EbHwDEPOR6G;PqM(f_p851oSE-y^6O;n(ehTq&p%nowokN zrzJg%pr>ctR!24&dVYE{8II`4fFM5PTyL+XHH!rKvsb&>5$T%|?sGSjB1B^q!d<+| z?YM0#8SwP#vK^6;4FmUn0(z=~fvWW?XHb+&20XpSZH~y?X3+C`EeXNB7D57gm6Bdt zo{_fi?DcJTMAdeL{S0!EBk1WFx7C!LhMu2ZPQD|?c0h0`y_Vs(Nsy=49dSf)1mQNv z=&1_fs#-tic05u@20XorFJ-_`UIy9tikB3cDO-Ccp8o^gT&H6+N>%Y4rf{nZFJGe%EU2v^nmIcHFKgba9kS+$NBI1B^# zegb-`f`O{_DrYc#lni)!xknu_SZC1ldM$~;y%s_OdNq<>VS|yjk6y+5bX49)*w0FO z7!kIfaa-+aH1z!RMvgh6;TQyUcLjoa#tGJbNP;}Q(k4gDet>W%WAs#oa8<3Ja|U%S zWWdua`^XV<%`kB9C!nV)7^qsWat7I-kO5Dxvegmu9~<<%UQ1$duZ56+Uah28_o2KyaR*^h8*WAs#oa8<3Jb32|IBmYS0bk6y|-Ix52m`!6zVjG(7y+*Zp*4Lv`-y7P|6 zIS)bIU4fvUae|o_Nsy;Ef58#$7ZC1^zogC4OIHYY@hWGKH9-bEy@heQxQ)ZWmA{4P zsR{etJFAj;NT1;8J=mrL!c+)62i?h|w8@n;fI3DukUPj6~NNVIJTk>FB#EzNI|AWyG4H6&6~NzfjS(Nh(|RkeQ3 z?YJeK40w7q8$%*(BMjX83FxT`2CCMpoI$}RGT`aeXM{w0Mkv;6N%VRxiNU=VLIQeC zl3vSZBW>T=>&Xm>icEz4S4j^e=;;}^)rBpFo}XUUmXH|9f*?NRTyL*sFoy(rdYxNC zqG&6^y;qPTL}L}gUA)TecxXEr@btQJLn4|B1NVLcda8nfs`V;oP@7K%JiXo>AyK@; zpy%~k5`%j!gaq`OCB31Dk+zTC)J{4oI}!F8TOOmQDukqG1mV-1`aWsR{d>?+Nx%!=EPtvcuF@#^jaF3K)j}pS+ zm3Na#0u?J(uE3ncF~Q)zAeHpu!x-f{-aqM1Y)vk`Mq@a<27}Z?A-56^Kl8fW79>gJ z8Bdy?u3i3e?EPPmZ!h<^`2FeK?thWq6#EM&$(=v^gg0NY+}*ESU9{haX!*5A=}qOI zbcOmukT(<1d@Txoh E53Q5_%m4rY literal 0 HcmV?d00001 diff --git a/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy b/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy new file mode 100644 index 00000000000000..8c0b1516459a57 --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_parquet_join_runtime_filter.groovy @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper + +suite("test_parquet_join_runtime_filter", "p0,external,hive,external_docker,external_docker_hive") { + + def getProfileList = { + def dst = 'http://' + context.config.feHttpAddress + def conn = new URL(dst + "/rest/v1/query_profile").openConnection() + conn.setRequestMethod("GET") + def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" + + (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword)).getBytes("UTF-8")) + conn.setRequestProperty("Authorization", "Basic ${encoding}") + return conn.getInputStream().getText() + } + + def getProfile = { id -> + def dst = 'http://' + context.config.feHttpAddress + def conn = new URL(dst + "/api/profile/text/?query_id=$id").openConnection() + conn.setRequestMethod("GET") + def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" + + (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword)).getBytes("UTF-8")) + conn.setRequestProperty("Authorization", "Basic ${encoding}") + return conn.getInputStream().getText() + } + + + def extractFilteredGroupsValue = { String profileText -> + def values = (profileText =~ /FilteredGroups:\s*(\d+)/).collect { it[1].toLong() } + return values.sort { a, b -> b <=> a } + } + + def getProfileWithToken = { token -> + String profileId = "" + int attempts = 0 + while (attempts < 10 && (profileId == null || profileId == "")) { + List profileData = new JsonSlurper().parseText(getProfileList()).data.rows + for (def profileItem in profileData) { + if (profileItem["Sql Statement"].toString().contains(token)) { + profileId = profileItem["Profile ID"].toString() + break + } + } + if (profileId == null || profileId == "") { + Thread.sleep(300) + } + attempts++ + } + assertTrue(profileId != null && profileId != "") + Thread.sleep(800) + return getProfile(profileId).toString() + } + // session vars + sql "unset variable all;" + sql "set profile_level=2;" + sql "set enable_profile=true;" + + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (!"true".equalsIgnoreCase(enabled)) { + return; + } + for (String hivePrefix : ["hive2"]) { + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String hmsPort = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String catalog_name = "test_parquet_join_runtime_filter" + + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """ use `default` """ + + + for (int wait_time : [0, 10, 100]) { 
+ sql """ set runtime_filter_wait_time_ms = ${wait_time}; """ + + def f1 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 = 5 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 40) + } + + + + def f2 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 in (1,2) + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 30) + } + + + + + def f3 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c1 < 3 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 30) + } + + + + def f4 = { + def t1 = UUID.randomUUID().toString() + def sql_result = sql """ + select *, "${t1}" from fact_big as a join dim_small as b on a.k = b.k where b.c2 >= 50 + """ + def filter_result = extractFilteredGroupsValue(getProfileWithToken(t1)); + logger.info("sql_result = ${sql_result}"); + logger.info("filter_result = ${filter_result}"); + + assertTrue(filter_result.size() == 2) + assertTrue(filter_result[0] > 40) + } + + + f1() + f2() + f3() + f4() + } + + sql """drop catalog ${catalog_name};""" + } + + + + + +} From c7572648fb955873e24cadc5cbbdcd2bb816b758 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sat, 20 Dec 2025 22:35:38 +0800 Subject: [PATCH 11/12] Optimize location for tpch1000 (#59218) --- .../doris/common/util/LocationPath.java | 89 +++++++++++++++---- .../iceberg/source/IcebergScanNode.java | 87 +++++++++++++++++- .../property/storage/S3PropertyUtils.java | 48 +++++++++- 3 files changed, 205 insertions(+), 19 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java index e4b9aa0b25c121..cbe2b01d912584 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java @@ -96,27 +96,25 @@ private LocationPath(String schema, } private static String parseScheme(String finalLocation) { - String scheme = ""; - String[] schemeSplit = finalLocation.split(SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } else { - schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } + // Use indexOf instead of split for better performance + int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM); + if (schemeDelimIndex > 0) { + return finalLocation.substring(0, schemeDelimIndex); + } + + int nonstandardDelimIndex = finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM); + if (nonstandardDelimIndex > 0) { + return finalLocation.substring(0, nonstandardDelimIndex); } // if not get scheme, need consider /path/to/local to no scheme - if (scheme.isEmpty()) { - 
try { - Paths.get(finalLocation); - } catch (InvalidPathException exception) { - throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation); - } + try { + Paths.get(finalLocation); + } catch (InvalidPathException exception) { + throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation); } - return scheme; + return ""; } /** @@ -201,6 +199,65 @@ public static LocationPath of(String location, } } + /** + * Ultra-fast factory method that directly constructs LocationPath without any parsing. + * This is used when the normalized location is already known (e.g., from prefix transformation). + * + * @param normalizedLocation the already-normalized location string + * @param schema pre-computed schema + * @param fsIdentifier pre-computed filesystem identifier + * @param storageProperties the storage properties (can be null) + * @return a new LocationPath instance + */ + public static LocationPath ofDirect(String normalizedLocation, + String schema, + String fsIdentifier, + StorageProperties storageProperties) { + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } + + /** + * Fast factory method that reuses pre-computed schema and fsIdentifier. + * This is optimized for batch processing where many files share the same bucket/prefix. + * + * @param location the input URI location string + * @param storageProperties pre-computed storage properties for normalization + * @param cachedSchema pre-computed schema (can be null to compute) + * @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://" (can be null to compute) + * @return a new LocationPath instance + */ + public static LocationPath ofWithCache(String location, + StorageProperties storageProperties, + String cachedSchema, + String cachedFsIdPrefix) { + try { + String normalizedLocation = storageProperties.validateAndNormalizeUri(location); + + String fsIdentifier; + if (cachedFsIdPrefix != null && normalizedLocation.startsWith(cachedFsIdPrefix)) { + // Fast path: extract authority from normalized location without full URI parsing + int authorityStart = cachedFsIdPrefix.length(); + int authorityEnd = normalizedLocation.indexOf('/', authorityStart); + if (authorityEnd == -1) { + authorityEnd = normalizedLocation.length(); + } + String authority = normalizedLocation.substring(authorityStart, authorityEnd); + fsIdentifier = cachedFsIdPrefix + authority; + } else { + // Fallback to full URI parsing + String encodedLocation = encodedLocation(normalizedLocation); + URI uri = URI.create(encodedLocation); + fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://" + + Strings.nullToEmpty(uri.getAuthority()); + } + + String schema = cachedSchema != null ? cachedSchema : extractScheme(location); + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } catch (UserException e) { + throw new StoragePropertiesException("Failed to create LocationPath for location: " + location, e); + } + } + /** * Extracts the URI scheme (e.g., "s3", "hdfs") from the location string. 
* diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 133ac0676448c7..698a6a380f0c18 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -133,6 +133,17 @@ public class IcebergScanNode extends FileQueryScanNode { private Map backendStorageProperties; private Boolean isBatchMode = null; + // Cached values for LocationPath creation optimization + // These are lazily initialized on first use to avoid parsing overhead for each file + private volatile StorageProperties cachedStorageProperties; + private volatile String cachedSchema; + private volatile String cachedFsIdPrefix; + private volatile boolean locationPathCacheInitialized = false; + // Cache for path prefix transformation to avoid repeated S3URI parsing + // Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to normalized prefix (e.g., "s3://bucket/") + private volatile String cachedOriginalPathPrefix; + private volatile String cachedNormalizedPathPrefix; + private volatile String cachedFsIdentifier; // for test @VisibleForTesting @@ -547,9 +558,83 @@ private CloseableIterable planFileScanTaskWithManifestCache(TableS return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } + /** + * Initialize cached values for LocationPath creation on first use. + * This avoids repeated StorageProperties lookup, scheme parsing, and S3URI regex parsing for each file. + */ + private void initLocationPathCache(String samplePath) { + if (locationPathCacheInitialized) { + return; + } + synchronized (this) { + if (locationPathCacheInitialized) { + return; + } + try { + // Create a LocationPath using the full method to get all cached values + LocationPath sampleLocationPath = LocationPath.of(samplePath, storagePropertiesMap); + cachedStorageProperties = sampleLocationPath.getStorageProperties(); + cachedSchema = sampleLocationPath.getSchema(); + cachedFsIdentifier = sampleLocationPath.getFsIdentifier(); + + // Extract fsIdPrefix like "s3://" from fsIdentifier like "s3://bucket" + int schemeEnd = cachedFsIdentifier.indexOf("://"); + if (schemeEnd > 0) { + cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd + 3); + } + + // Cache path prefix mapping for fast transformation + // This allows subsequent files to skip S3URI regex parsing entirely + String normalizedPath = sampleLocationPath.getNormalizedLocation(); + + // Find the common prefix by looking for the last '/' before the filename + int lastSlashInOriginal = samplePath.lastIndexOf('/'); + int lastSlashInNormalized = normalizedPath.lastIndexOf('/'); + + if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) { + cachedOriginalPathPrefix = samplePath.substring(0, lastSlashInOriginal + 1); + cachedNormalizedPathPrefix = normalizedPath.substring(0, lastSlashInNormalized + 1); + } + + locationPathCacheInitialized = true; + } catch (Exception e) { + // If caching fails, we'll fall back to the full method each time + LOG.warn("Failed to initialize LocationPath cache, will use full parsing", e); + locationPathCacheInitialized = true; + } + } + } + + /** + * Create a LocationPath with cached values for better performance. + * Uses cached path prefix mapping to completely bypass S3URI regex parsing for most files. 
+ * Falls back to full parsing if cache is not available or path doesn't match cached prefix. + */ + private LocationPath createLocationPathWithCache(String path) { + // Initialize cache on first call + if (!locationPathCacheInitialized) { + initLocationPathCache(path); + } + + // Fast path: if path starts with cached original prefix, directly transform without any parsing + if (cachedOriginalPathPrefix != null && path.startsWith(cachedOriginalPathPrefix)) { + // Transform: replace original prefix with normalized prefix + String normalizedPath = cachedNormalizedPathPrefix + path.substring(cachedOriginalPathPrefix.length()); + return LocationPath.ofDirect(normalizedPath, cachedSchema, cachedFsIdentifier, cachedStorageProperties); + } + + // Medium path: use cached StorageProperties but still need validateAndNormalizeUri + if (cachedStorageProperties != null) { + return LocationPath.ofWithCache(path, cachedStorageProperties, cachedSchema, cachedFsIdPrefix); + } + + // Fallback to full parsing + return LocationPath.of(path, storagePropertiesMap); + } + private Split createIcebergSplit(FileScanTask fileScanTask) { String originalPath = fileScanTask.file().path().toString(); - LocationPath locationPath = LocationPath.of(originalPath, storagePropertiesMap); + LocationPath locationPath = createLocationPathWithCache(originalPath); IcebergSplit split = new IcebergSplit( locationPath, fileScanTask.start(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java index 99064b4e2e2d3b..71360fc47996e4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java @@ -33,6 +33,15 @@ public class S3PropertyUtils { private static final Logger LOG = LogManager.getLogger(S3PropertyUtils.class); + private static final String SCHEME_DELIM = "://"; + private static final String S3_SCHEME_PREFIX = "s3://"; + + // S3-compatible schemes that can be converted to s3:// with simple string replacement + // Format: scheme://bucket/key -> s3://bucket/key + private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = { + "s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs" + }; + /** * Constructs the S3 endpoint from a given URI in the props map. * @@ -113,7 +122,8 @@ public static String constructRegionFromUrl(Map props, /** * Validates and normalizes the given path into a standard S3 URI. - * If the input already starts with "s3://", it is returned as-is. + * If the input already starts with a known S3-compatible scheme (s3://, s3a://, oss://, etc.), + * it is returned as-is to avoid expensive regex parsing. * Otherwise, it is parsed and converted into an S3-compatible URI format. * * @param path the raw S3-style path or full URI @@ -132,16 +142,50 @@ public static String validateAndNormalizeUri(String path, if (StringUtils.isBlank(path)) { throw new StoragePropertiesException("path is null"); } - if (path.startsWith("s3://")) { + + // Fast path 1: s3:// paths are already in the normalized format expected by BE + if (path.startsWith(S3_SCHEME_PREFIX)) { return path; } + // Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, etc.) 
+ // can be converted with simple string replacement: scheme://bucket/key -> s3://bucket/key + String normalized = trySimpleSchemeConversion(path); + if (normalized != null) { + return normalized; + } + + // Full parsing path: for HTTP URLs and other complex formats boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle); boolean forceParsingByStandardUri = Boolean.parseBoolean(stringForceParsingByStandardUri); S3URI s3uri = S3URI.create(path, usePathStyle, forceParsingByStandardUri); return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey(); } + /** + * Try to convert simple S3-compatible scheme URIs to s3:// format using string replacement. + * This avoids expensive regex parsing for common cases like oss://bucket/key, s3a://bucket/key, etc. + * + * @param path the input path + * @return converted s3:// path if successful, null if the path doesn't match simple pattern + */ + private static String trySimpleSchemeConversion(String path) { + int delimIndex = path.indexOf(SCHEME_DELIM); + if (delimIndex <= 0) { + return null; + } + + String scheme = path.substring(0, delimIndex).toLowerCase(); + for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) { + if (compatibleScheme.equals(scheme)) { + // Simple conversion: replace scheme with "s3" + // e.g., "oss://bucket/key" -> "s3://bucket/key" + return S3_SCHEME_PREFIX + path.substring(delimIndex + SCHEME_DELIM.length()); + } + } + return null; + } + /** * Extracts and returns the raw URI string from the given props map. * From 3e58ab24b707ee00e6b2f6d2f8c3b51a17408757 Mon Sep 17 00:00:00 2001 From: Mryange Date: Fri, 26 Dec 2025 17:58:12 +0800 Subject: [PATCH 12/12] pick 58636 --- be/src/common/config.cpp | 2 +- be/src/olap/column_predicate.h | 5 +- .../runtime_filter_selectivity.h | 96 ++++++++ be/src/vec/exprs/vexpr.cpp | 55 +++++ be/src/vec/exprs/vexpr.h | 10 +- be/src/vec/exprs/vexpr_context.cpp | 90 +------ be/src/vec/exprs/vexpr_context.h | 9 + be/src/vec/exprs/vruntimefilter_wrapper.cpp | 117 +++++++-- be/src/vec/exprs/vruntimefilter_wrapper.h | 46 +--- .../runtime_filter_selectivity_test.cpp | 222 ++++++++++++++++++ 10 files changed, 495 insertions(+), 157 deletions(-) create mode 100644 be/src/runtime_filter/runtime_filter_selectivity.h create mode 100644 be/test/runtime_filter/runtime_filter_selectivity_test.cpp diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 29f84798fcf006..03a25a6f891583 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1037,7 +1037,7 @@ DEFINE_mInt64(big_column_size_buffer, "65535"); DEFINE_mInt64(small_column_size_buffer, "100"); // Perform the always_true check at intervals determined by runtime_filter_sampling_frequency -DEFINE_mInt32(runtime_filter_sampling_frequency, "64"); +DEFINE_mInt32(runtime_filter_sampling_frequency, "32"); DEFINE_mInt32(execution_max_rpc_timeout_sec, "3600"); DEFINE_mBool(execution_ignore_eovercrowded, "true"); // cooldown task configs diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 6e6671ff33766c..7162a96399da72 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -25,6 +25,7 @@ #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/inverted_index_iterator.h" #include "runtime/define_primitive_type.h" +#include "runtime_filter/runtime_filter_selectivity.h" #include "util/defer_op.h" #include "util/runtime_profile.h" #include "vec/columns/column.h" @@ -372,8 +373,8 @@ class ColumnPredicate { if 
(!_always_true) { _judge_filter_rows += filter_rows; _judge_input_rows += input_rows; - vectorized::VRuntimeFilterWrapper::judge_selectivity( - get_ignore_threshold(), _judge_filter_rows, _judge_input_rows, _always_true); + RuntimeFilterSelectivity::judge_selectivity(get_ignore_threshold(), _judge_filter_rows, + _judge_input_rows, _always_true); } } diff --git a/be/src/runtime_filter/runtime_filter_selectivity.h b/be/src/runtime_filter/runtime_filter_selectivity.h new file mode 100644 index 00000000000000..1b0a82143de57a --- /dev/null +++ b/be/src/runtime_filter/runtime_filter_selectivity.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/config.h" +#include "common/logging.h" + +namespace doris { + +// Used to track the selectivity of runtime filters +// If the selectivity of a runtime filter is very low, it is considered ineffective and can be ignored +// Considering that the selectivity of runtime filters may change with data variations +// A dynamic selectivity tracking mechanism is needed +// Note: this is not a thread-safe class + +class RuntimeFilterSelectivity { +public: + RuntimeFilterSelectivity() = default; + + RuntimeFilterSelectivity(const RuntimeFilterSelectivity&) = delete; + void update_judge_counter() { + if ((_judge_counter++) >= config::runtime_filter_sampling_frequency) { + reset_judge_selectivity(); + } + } + + void update_judge_selectivity(int filter_id, uint64_t filter_rows, uint64_t input_rows, + double ignore_thredhold) { + if (!_always_true) { + _judge_filter_rows += filter_rows; + _judge_input_rows += input_rows; + judge_selectivity(ignore_thredhold, _judge_filter_rows, _judge_input_rows, + _always_true); + } + + VLOG_ROW << fmt::format( + "Runtime filter[{}] selectivity update: filter_rows: {}, input_rows: {}, filter " + "rate: {}, " + "ignore_thredhold: {}, counter: {} , always_true: {}", + filter_id, _judge_filter_rows, _judge_input_rows, + static_cast(_judge_filter_rows) / static_cast(_judge_input_rows), + ignore_thredhold, _judge_counter, _always_true); + } + + bool maybe_always_true_can_ignore() const { + /// TODO: maybe we can use session variable to control this behavior ? 
+ if (config::runtime_filter_sampling_frequency <= 0) { + return false; + } else { + return _always_true; + } + } + + static void judge_selectivity(double ignore_threshold, int64_t filter_rows, int64_t input_rows, + bool& always_true) { + // if the judged input rows is too small, we think the selectivity is not reliable + if (input_rows > min_judge_input_rows) { + always_true = (static_cast(filter_rows) / static_cast(input_rows)) < + ignore_threshold; + } + } + +private: + void reset_judge_selectivity() { + _always_true = false; + _judge_counter = 0; + _judge_input_rows = 0; + _judge_filter_rows = 0; + } + + int64_t _judge_input_rows = 0; + int64_t _judge_filter_rows = 0; + int _judge_counter = 0; + bool _always_true = false; + + constexpr static int64_t min_judge_input_rows = 4096 * 10; +}; + +} // namespace doris diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp index 1bafe01ad710de..52d4ca01eac856 100644 --- a/be/src/vec/exprs/vexpr.cpp +++ b/be/src/vec/exprs/vexpr.cpp @@ -1015,5 +1015,60 @@ bool VExpr::ann_dist_is_fulfilled() const { return _virtual_column_is_fulfilled; } +Status VExpr::execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, bool accept_null, + bool* can_filter_all) const { + ColumnPtr filter_column; + RETURN_IF_ERROR(execute_column(context, block, filter_column)); + if (const auto* const_column = check_and_get_column(*filter_column)) { + // const(nullable) or const(bool) + const bool result = accept_null + ? (const_column->is_null_at(0) || const_column->get_bool(0)) + : (!const_column->is_null_at(0) && const_column->get_bool(0)); + if (!result) { + // filter all + *can_filter_all = true; + memset(result_filter_data, 0, rows); + return Status::OK(); + } + } else if (const auto* nullable_column = check_and_get_column(*filter_column)) { + // nullable(bool) + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const IColumn::Filter& filter = assert_cast(*nested_column).get_data(); + const auto* __restrict filter_data = filter.data(); + const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); + + if (accept_null) { + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (null_map_data[i]) || filter_data[i]; + } + } else { + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; + } + } + + if ((memchr(result_filter_data, 0x1, rows) == nullptr)) { + *can_filter_all = true; + return Status::OK(); + } + } else { + // bool + const IColumn::Filter& filter = assert_cast(*filter_column).get_data(); + const auto* __restrict filter_data = filter.data(); + + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= filter_data[i]; + } + + if (memchr(result_filter_data, 0x1, rows) == nullptr) { + *can_filter_all = true; + return Status::OK(); + } + } + + return Status::OK(); +} + #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h index 35a0d3733b094f..2a0abe439f9778 100644 --- a/be/src/vec/exprs/vexpr.h +++ b/be/src/vec/exprs/vexpr.h @@ -147,6 +147,10 @@ class VExpr { // Therefore we need a function like this to return the actual type produced by execution. 
virtual DataTypePtr execute_type(const Block* block) const { return _data_type; } + virtual Status execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all) const; + // `is_blockable` means this expr will be blocked in `execute` (e.g. AI Function, Remote Function) [[nodiscard]] virtual bool is_blockable() const { return std::any_of(_children.begin(), _children.end(), @@ -204,12 +208,6 @@ class VExpr { [](VExprSPtr child) { return child->is_rf_wrapper(); }); } - virtual void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) { - for (auto child : _children) { - child->do_judge_selectivity(filter_rows, input_rows); - } - } - static Status create_expr_tree(const TExpr& texpr, VExprContextSPtr& ctx); static Status create_expr_trees(const std::vector& texprs, VExprContextSPtrs& ctxs); diff --git a/be/src/vec/exprs/vexpr_context.cpp b/be/src/vec/exprs/vexpr_context.cpp index a7b71b77646435..2a9c049e303b77 100644 --- a/be/src/vec/exprs/vexpr_context.cpp +++ b/be/src/vec/exprs/vexpr_context.cpp @@ -199,7 +199,12 @@ Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, return execute_conjuncts(ctxs, filters, false, block, result_filter, can_filter_all); } -// TODO: Performance Optimization +Status VExprContext::execute_filter(const Block* block, uint8_t* __restrict result_filter_data, + size_t rows, bool accept_null, bool* can_filter_all) { + return _root->execute_filter(this, block, result_filter_data, rows, accept_null, + can_filter_all); +} + Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, const std::vector* filters, bool accept_null, const Block* block, @@ -209,85 +214,10 @@ Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, *can_filter_all = false; auto* __restrict result_filter_data = result_filter->data(); for (const auto& ctx : ctxs) { - // Statistics are only required when an rf wrapper exists in the expr. - bool is_rf_wrapper = ctx->root()->is_rf_wrapper(); - ColumnPtr filter_column; - RETURN_IF_ERROR(ctx->execute(block, filter_column)); - if (const auto* nullable_column = check_and_get_column(*filter_column)) { - size_t column_size = nullable_column->size(); - if (column_size == 0) { - *can_filter_all = true; - return Status::OK(); - } else { - const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); - const IColumn::Filter& filter = - assert_cast(*nested_column).get_data(); - const auto* __restrict filter_data = filter.data(); - const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); - - size_t input_rows = - rows - (is_rf_wrapper - ? simd::count_zero_num((int8_t*)result_filter_data, rows) - : 0); - - if (accept_null) { - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= (null_map_data[i]) || filter_data[i]; - } - } else { - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; - } - } - - size_t output_rows = - rows - (is_rf_wrapper - ? 
simd::count_zero_num((int8_t*)result_filter_data, rows) - : 0); - - if (is_rf_wrapper) { - ctx->root()->do_judge_selectivity(input_rows - output_rows, input_rows); - } - - if ((is_rf_wrapper && output_rows == 0) || - (!is_rf_wrapper && memchr(result_filter_data, 0x1, rows) == nullptr)) { - *can_filter_all = true; - return Status::OK(); - } - } - } else if (const auto* const_column = check_and_get_column(*filter_column)) { - // filter all - if (!const_column->get_bool(0)) { - *can_filter_all = true; - memset(result_filter_data, 0, result_filter->size()); - return Status::OK(); - } - } else { - const IColumn::Filter& filter = - assert_cast(*filter_column).get_data(); - const auto* __restrict filter_data = filter.data(); - - size_t input_rows = - rows - - (is_rf_wrapper ? simd::count_zero_num((int8_t*)result_filter_data, rows) : 0); - - for (size_t i = 0; i < rows; ++i) { - result_filter_data[i] &= filter_data[i]; - } - - size_t output_rows = - rows - - (is_rf_wrapper ? simd::count_zero_num((int8_t*)result_filter_data, rows) : 0); - - if (is_rf_wrapper) { - ctx->root()->do_judge_selectivity(input_rows - output_rows, input_rows); - } - - if ((is_rf_wrapper && output_rows == 0) || - (!is_rf_wrapper && memchr(result_filter_data, 0x1, rows) == nullptr)) { - *can_filter_all = true; - return Status::OK(); - } + RETURN_IF_ERROR( + ctx->execute_filter(block, result_filter_data, rows, accept_null, can_filter_all)); + if (*can_filter_all) { + return Status::OK(); } } if (filters != nullptr) { diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h index 3179526ec546d6..349f199af234b6 100644 --- a/be/src/vec/exprs/vexpr_context.h +++ b/be/src/vec/exprs/vexpr_context.h @@ -33,6 +33,7 @@ #include "olap/rowset/segment_v2/inverted_index_reader.h" #include "runtime/runtime_state.h" #include "runtime/types.h" +#include "runtime_filter/runtime_filter_selectivity.h" #include "udf/udf.h" #include "vec/columns/column.h" #include "vec/core/block.h" @@ -210,6 +211,9 @@ class VExprContext { bool all_expr_inverted_index_evaluated(); + Status execute_filter(const Block* block, uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all); + [[nodiscard]] static Status filter_block(VExprContext* vexpr_ctx, Block* block); [[nodiscard]] static Status filter_block(const VExprContextSPtrs& expr_contexts, Block* block, @@ -246,6 +250,8 @@ class VExprContext { return _last_result_column_id; } + RuntimeFilterSelectivity& get_runtime_filter_selectivity() { return *_rf_selectivity; } + FunctionContext::FunctionStateScope get_function_state_scope() const { return _is_clone ? 
FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL; } @@ -337,5 +343,8 @@ class VExprContext { segment_v2::AnnRangeSearchRuntime _ann_range_search_runtime; bool _suitable_for_ann_index = true; + + std::unique_ptr _rf_selectivity = + std::make_unique(); }; } // namespace doris::vectorized diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.cpp b/be/src/vec/exprs/vruntimefilter_wrapper.cpp index 8e915ffff675f0..b24df4860dae1b 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.cpp +++ b/be/src/vec/exprs/vruntimefilter_wrapper.cpp @@ -62,9 +62,7 @@ VRuntimeFilterWrapper::VRuntimeFilterWrapper(const TExprNode& node, VExprSPtr im _impl(std::move(impl)), _ignore_thredhold(ignore_thredhold), _null_aware(null_aware), - _filter_id(filter_id) { - reset_judge_selectivity(); -} + _filter_id(filter_id) {} Status VRuntimeFilterWrapper::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { @@ -89,38 +87,105 @@ void VRuntimeFilterWrapper::close(VExprContext* context, Status VRuntimeFilterWrapper::execute_column(VExprContext* context, const Block* block, ColumnPtr& result_column) const { - DCHECK(_open_finished || _getting_const_col); - if (_judge_counter.fetch_sub(1) == 0) { - reset_judge_selectivity(); + return Status::InternalError("Not implement VRuntimeFilterWrapper::execute_column"); +} + +const std::string& VRuntimeFilterWrapper::expr_name() const { + return _expr_name; +} + +Status VRuntimeFilterWrapper::execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, + bool accept_null, bool* can_filter_all) const { + DCHECK(_open_finished); + if (accept_null) { + return Status::InternalError( + "Runtime filter does not support accept_null in execute_filter"); } - if (_always_true) { - size_t size = block->rows(); - result_column = create_always_true_column(size, _data_type->is_nullable()); - COUNTER_UPDATE(_always_true_filter_rows, size); + + auto& rf_selectivity = context->get_runtime_filter_selectivity(); + Defer auto_update_judge_counter = [&]() { rf_selectivity.update_judge_counter(); }; + + // if always true, skip evaluate runtime filter + if (rf_selectivity.maybe_always_true_can_ignore()) { + COUNTER_UPDATE(_always_true_filter_rows, rows); return Status::OK(); - } else { - if (_getting_const_col) { - _impl->set_getting_const_col(true); + } + + ColumnPtr filter_column; + ColumnPtr arg_column = nullptr; + RETURN_IF_ERROR(_impl->execute_runtime_filter(context, block, filter_column, &arg_column)); + + // bloom filter will handle null aware inside itself + if (_null_aware && TExprNodeType::BLOOM_PRED != node_type()) { + DCHECK(arg_column); + change_null_to_true(filter_column->assume_mutable(), arg_column); + } + + if (const auto* const_column = check_and_get_column(*filter_column)) { + // const(nullable) or const(bool) + if (!const_column->get_bool(0)) { + // filter all + COUNTER_UPDATE(_rf_filter_rows, rows); + COUNTER_UPDATE(_rf_input_rows, rows); + rf_selectivity.update_judge_selectivity(_filter_id, rows, rows, _ignore_thredhold); + *can_filter_all = true; + memset(result_filter_data, 0, rows); + return Status::OK(); + } else { + // filter none + COUNTER_UPDATE(_rf_input_rows, rows); + rf_selectivity.update_judge_selectivity(_filter_id, 0, rows, _ignore_thredhold); + return Status::OK(); } + } else if (const auto* nullable_column = check_and_get_column(*filter_column)) { + // nullable(bool) + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const IColumn::Filter& filter = 
assert_cast(*nested_column).get_data(); + const auto* __restrict filter_data = filter.data(); + const auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); + + const size_t input_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); - ColumnPtr arg_column = nullptr; - RETURN_IF_ERROR(_impl->execute_runtime_filter(context, block, result_column, &arg_column)); - if (_getting_const_col) { - _impl->set_getting_const_col(false); + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= (!null_map_data[i]) & filter_data[i]; } - // bloom filter will handle null aware inside itself - if (_null_aware && TExprNodeType::BLOOM_PRED != node_type()) { - DCHECK(arg_column); - change_null_to_true(result_column->assume_mutable(), arg_column); + const size_t output_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); + + COUNTER_UPDATE(_rf_filter_rows, input_rows - output_rows); + COUNTER_UPDATE(_rf_input_rows, input_rows); + rf_selectivity.update_judge_selectivity(_filter_id, input_rows - output_rows, input_rows, + _ignore_thredhold); + + if (output_rows == 0) { + *can_filter_all = true; + return Status::OK(); } + } else { + // bool + const IColumn::Filter& filter = assert_cast(*filter_column).get_data(); + const auto* __restrict filter_data = filter.data(); - return Status::OK(); - } -} + const size_t input_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); -const std::string& VRuntimeFilterWrapper::expr_name() const { - return _expr_name; + for (size_t i = 0; i < rows; ++i) { + result_filter_data[i] &= filter_data[i]; + } + + const size_t output_rows = rows - simd::count_zero_num((int8_t*)result_filter_data, rows); + + COUNTER_UPDATE(_rf_filter_rows, input_rows - output_rows); + COUNTER_UPDATE(_rf_input_rows, input_rows); + rf_selectivity.update_judge_selectivity(_filter_id, input_rows - output_rows, input_rows, + _ignore_thredhold); + + if (output_rows == 0) { + *can_filter_all = true; + return Status::OK(); + } + } + return Status::OK(); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.h b/be/src/vec/exprs/vruntimefilter_wrapper.h index 3535898915b2ba..09bc8a815c7d6d 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.h +++ b/be/src/vec/exprs/vruntimefilter_wrapper.h @@ -63,6 +63,10 @@ class VRuntimeFilterWrapper final : public VExpr { const std::string& expr_name() const override; const VExprSPtrs& children() const override { return _impl->children(); } + Status execute_filter(VExprContext* context, const Block* block, + uint8_t* __restrict result_filter_data, size_t rows, bool accept_null, + bool* can_filter_all) const override; + uint64_t get_digest(uint64_t seed) const override { seed = _impl->get_digest(seed); if (seed) { @@ -91,33 +95,10 @@ class VRuntimeFilterWrapper final : public VExpr { } } - void update_counters(int64_t filter_rows, int64_t input_rows) { - COUNTER_UPDATE(_rf_filter_rows, filter_rows); - COUNTER_UPDATE(_rf_input_rows, input_rows); - } - - template - static void judge_selectivity(double ignore_threshold, int64_t filter_rows, int64_t input_rows, - T& always_true) { - always_true = static_cast(filter_rows) / static_cast(input_rows) < - ignore_threshold; - } - bool is_rf_wrapper() const override { return true; } int filter_id() const { return _filter_id; } - void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) override { - update_counters(filter_rows, input_rows); - - if (!_always_true) { - _judge_filter_rows += filter_rows; - 
_judge_input_rows += input_rows; - judge_selectivity(_ignore_thredhold, _judge_filter_rows, _judge_input_rows, - _always_true); - } - } - std::shared_ptr predicate_filtered_rows_counter() const { return _rf_filter_rows; } @@ -129,26 +110,7 @@ class VRuntimeFilterWrapper final : public VExpr { } private: - void reset_judge_selectivity() const { - _always_true = false; - _judge_counter = config::runtime_filter_sampling_frequency; - _judge_input_rows = 0; - _judge_filter_rows = 0; - } - VExprSPtr _impl; - // VRuntimeFilterWrapper and ColumnPredicate share the same logic, - // but it's challenging to unify them, so the code is duplicated. - // _judge_counter, _judge_input_rows, _judge_filter_rows, and _always_true - // are variables used to implement the _always_true logic, calculated periodically - // based on runtime_filter_sampling_frequency. During each period, if _always_true - // is evaluated as true, the logic for always_true is applied for the rest of that period - // without recalculating. At the beginning of the next period, - // reset_judge_selectivity is used to reset these variables. - mutable std::atomic_int _judge_counter = 0; - mutable std::atomic_uint64_t _judge_input_rows = 0; - mutable std::atomic_uint64_t _judge_filter_rows = 0; - mutable std::atomic_int _always_true = false; std::shared_ptr _rf_input_rows = std::make_shared(TUnit::UNIT, 0); diff --git a/be/test/runtime_filter/runtime_filter_selectivity_test.cpp b/be/test/runtime_filter/runtime_filter_selectivity_test.cpp new file mode 100644 index 00000000000000..b8504f950c21d1 --- /dev/null +++ b/be/test/runtime_filter/runtime_filter_selectivity_test.cpp @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "runtime_filter/runtime_filter_selectivity.h" + +#include +#include + +namespace doris { + +class RuntimeFilterSelectivityTest : public testing::Test { +protected: + void SetUp() override { + // Save original config value + _original_sampling_frequency = config::runtime_filter_sampling_frequency; + } + + void TearDown() override { + // Restore original config value + config::runtime_filter_sampling_frequency = _original_sampling_frequency; + } + + int _original_sampling_frequency; +}; + +TEST_F(RuntimeFilterSelectivityTest, basic_initialization) { + RuntimeFilterSelectivity selectivity; + // Initially should be false (not always_true) + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, disabled_sampling_frequency) { + RuntimeFilterSelectivity selectivity; + config::runtime_filter_sampling_frequency = 0; + + // Even if conditions are met, should return false when sampling is disabled + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, negative_sampling_frequency) { + RuntimeFilterSelectivity selectivity; + config::runtime_filter_sampling_frequency = -1; + + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_below_threshold) { + bool always_true = false; + // filter_rows/input_rows = 5/50000 = 0.0001 < 0.1 + // input_rows (50000) > min_judge_input_rows (40960) + RuntimeFilterSelectivity::judge_selectivity(0.1, 5, 50000, always_true); + EXPECT_TRUE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_above_threshold) { + bool always_true = false; + // filter_rows/input_rows = 25000/50000 = 0.5 >= 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 25000, 50000, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, judge_selectivity_insufficient_input_rows) { + bool always_true = false; + // Even though 5/100 = 0.05 < 0.1, input_rows (100) < min_judge_input_rows (40960) + RuntimeFilterSelectivity::judge_selectivity(0.1, 5, 100, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, update_with_low_selectivity) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // filter_rows/input_rows = 2000/50000 = 0.04 < 0.1 + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, update_with_high_selectivity) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // filter_rows/input_rows = 45000/50000 = 0.9 >= 0.1 + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, once_always_true_stays_true) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: low selectivity + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Second update: high selectivity, but should be ignored + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, accumulated_selectivity_low) { + 
config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: 1000/50000 = 0.02 + selectivity.update_judge_selectivity(-1, 1000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, accumulated_selectivity_high) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // First update: 20000/50000 = 0.4 + selectivity.update_judge_selectivity(-1, 20000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + // Second update: accumulated (20000+20000)/(50000+50000) = 0.4 + selectivity.update_judge_selectivity(-1, 20000, 50000, 0.1); + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, counter_triggers_reset) { + config::runtime_filter_sampling_frequency = 3; + RuntimeFilterSelectivity selectivity; + + // Mark as always_true + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Update counter to trigger reset + selectivity.update_judge_counter(); // counter = 1 + selectivity.update_judge_counter(); // counter = 2 + selectivity.update_judge_counter(); // counter = 3, triggers reset + + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, reset_allows_reevaluation) { + config::runtime_filter_sampling_frequency = 2; + RuntimeFilterSelectivity selectivity; + + // First cycle: mark as always_true + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + + // Trigger reset + selectivity.update_judge_counter(); // counter = 1 + selectivity.update_judge_counter(); // counter = 2, triggers reset + + // Second cycle: now with high selectivity + selectivity.update_judge_selectivity(-1, 45000, 50000, 0.1); + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, edge_case_zero_rows) { + bool always_true = false; + RuntimeFilterSelectivity::judge_selectivity(0.1, 0, 0, always_true); + EXPECT_FALSE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, edge_case_exact_threshold) { + bool always_true = false; + // Exactly at threshold: 5000/50000 = 0.1, NOT less than 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 5000, 50000, always_true); + EXPECT_FALSE(always_true); + + // Just below threshold: 4999/50000 = 0.09998 < 0.1 + RuntimeFilterSelectivity::judge_selectivity(0.1, 4999, 50000, always_true); + EXPECT_TRUE(always_true); +} + +TEST_F(RuntimeFilterSelectivityTest, multiple_updates_before_threshold) { + config::runtime_filter_sampling_frequency = 100; + RuntimeFilterSelectivity selectivity; + + // Multiple updates with insufficient rows each time + selectivity.update_judge_selectivity(-1, 100, 1000, 0.1); // 100/1000, insufficient + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + selectivity.update_judge_selectivity(-1, 200, 2000, 0.1); // 300/3000, insufficient + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + + // Now accumulated rows are sufficient: 300+2000 = 2300, 3000+40000 = 43000 + selectivity.update_judge_selectivity(-1, 2000, 40000, 0.1); // 2300/43000 = 0.053 < 0.1 + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); +} + +TEST_F(RuntimeFilterSelectivityTest, different_thresholds) { + config::runtime_filter_sampling_frequency = 100; + + // Test with threshold 0.05 + { + RuntimeFilterSelectivity selectivity; 
+ selectivity.update_judge_selectivity(-1, 2000, 50000, 0.05); // 0.04 < 0.05 + EXPECT_TRUE(selectivity.maybe_always_true_can_ignore()); + } + + // Test with threshold 0.03 + { + RuntimeFilterSelectivity selectivity; + selectivity.update_judge_selectivity(-1, 2000, 50000, 0.03); // 0.04 >= 0.03 + EXPECT_FALSE(selectivity.maybe_always_true_can_ignore()); + } +} + +} // namespace doris