Merge branch 'main' into chore/upgrade-rust-2024-group-5

timsaucer · web-flow · commit eacf226f1bf5 · 2025-12-09T07:40:48.000+01:00
diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs
@@ -579,14 +579,32 @@ pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ {
 pub fn snapshot_physical_expr(
     expr: Arc<dyn PhysicalExpr>,
 ) -> Result<Arc<dyn PhysicalExpr>> {
+    snapshot_physical_expr_opt(expr).data()
+}
+
+/// Take a snapshot of the given `PhysicalExpr` if it is dynamic.
+///
+/// Take a snapshot of this `PhysicalExpr` if it is dynamic.
+/// This is used to capture the current state of `PhysicalExpr`s that may contain
+/// dynamic references to other operators in order to serialize it over the wire
+/// or treat it via downcast matching.
+///
+/// See the documentation of [`PhysicalExpr::snapshot`] for more details.
+///
+/// # Returns
+///
+/// Returns a `[`Transformed`] indicating whether a snapshot was taken,
+/// along with the resulting `PhysicalExpr`.
+pub fn snapshot_physical_expr_opt(
+    expr: Arc<dyn PhysicalExpr>,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
     expr.transform_up(|e| {
         if let Some(snapshot) = e.snapshot()? {
             Ok(Transformed::yes(snapshot))
         } else {
             Ok(Transformed::no(Arc::clone(&e)))
         }
     })
-    .data()
 }
 
 /// Check the generation of this `PhysicalExpr`.
diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs
@@ -15,17 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray};
+use arrow::array::{
+    Array, ArrayRef, Float32Array, Int32Array, StringArray, StringViewArray,
+};
 use arrow::datatypes::{Field, Schema};
 use arrow::record_batch::RecordBatch;
 use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
 use datafusion_physical_expr::expressions::{col, in_list, lit};
 use rand::distr::Alphanumeric;
 use rand::prelude::*;
+use std::any::TypeId;
 use std::hint::black_box;
 use std::sync::Arc;
 
+/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch.
 fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) {
     let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]);
     let exprs = exprs.iter().map(|s| lit(s.clone())).collect();
@@ -37,79 +41,128 @@ fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValu
     });
 }
 
+/// Generates a random alphanumeric string of the specified length.
 fn random_string(rng: &mut StdRng, len: usize) -> String {
     let value = rng.sample_iter(&Alphanumeric).take(len).collect();
     String::from_utf8(value).unwrap()
 }
 
-fn do_benches(
-    c: &mut Criterion,
-    array_length: usize,
-    in_list_length: usize,
-    null_percent: f64,
-) {
-    let mut rng = StdRng::seed_from_u64(120320);
-    let non_null_percent = 1.0 - null_percent;
-
-    for string_length in [5, 10, 20] {
-        let values: StringArray = (0..array_length)
-            .map(|_| {
-                rng.random_bool(non_null_percent)
-                    .then(|| random_string(&mut rng, string_length))
-            })
-            .collect();
-
-        let in_list: Vec<_> = (0..in_list_length)
-            .map(|_| ScalarValue::from(random_string(&mut rng, string_length)))
-            .collect();
-
-        do_bench(
-            c,
-            &format!(
-                "in_list_utf8({string_length}) ({array_length}, {null_percent}) IN ({in_list_length}, 0)"
-            ),
-            Arc::new(values),
-            &in_list,
-        )
+const IN_LIST_LENGTHS: [usize; 3] = [3, 8, 100];
+const NULL_PERCENTS: [f64; 2] = [0., 0.2];
+const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
+const ARRAY_LENGTH: usize = 1024;
+
+/// Returns a friendly type name for the array type.
+fn array_type_name<A: 'static>() -> &'static str {
+    let id = TypeId::of::<A>();
+    if id == TypeId::of::<StringArray>() {
+        "Utf8"
+    } else if id == TypeId::of::<StringViewArray>() {
+        "Utf8View"
+    } else if id == TypeId::of::<Float32Array>() {
+        "Float32"
+    } else if id == TypeId::of::<Int32Array>() {
+        "Int32"
+    } else {
+        "Unknown"
     }
+}
 
-    let values: Float32Array = (0..array_length)
-        .map(|_| rng.random_bool(non_null_percent).then(|| rng.random()))
-        .collect();
+/// Builds a benchmark name from array type, list size, and null percentage.
+fn bench_name<A: 'static>(in_list_length: usize, null_percent: f64) -> String {
+    format!(
+        "in_list/{}/list={in_list_length}/nulls={}%",
+        array_type_name::<A>(),
+        (null_percent * 100.0) as u32
+    )
+}
 
-    let in_list: Vec<_> = (0..in_list_length)
-        .map(|_| ScalarValue::Float32(Some(rng.random())))
-        .collect();
+/// Runs in_list benchmarks for a string array type across all list-size × null-ratio × string-length combinations.
+fn bench_string_type<A>(
+    c: &mut Criterion,
+    rng: &mut StdRng,
+    make_scalar: fn(String) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<String>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for null_percent in NULL_PERCENTS {
+            for string_length in STRING_LENGTHS {
+                let values: A = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        rng.random_bool(1.0 - null_percent)
+                            .then(|| random_string(rng, string_length))
+                    })
+                    .collect();
+
+                let in_list: Vec<_> = (0..in_list_length)
+                    .map(|_| make_scalar(random_string(rng, string_length)))
+                    .collect();
+
+                do_bench(
+                    c,
+                    &format!(
+                        "{}/str={string_length}",
+                        bench_name::<A>(in_list_length, null_percent)
+                    ),
+                    Arc::new(values),
+                    &in_list,
+                )
+            }
+        }
+    }
+}
 
-    do_bench(
-        c,
-        &format!("in_list_f32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
-        Arc::new(values),
-        &in_list,
-    );
+/// Runs in_list benchmarks for a numeric array type across all list-size × null-ratio combinations.
+fn bench_numeric_type<T, A>(
+    c: &mut Criterion,
+    rng: &mut StdRng,
+    mut gen_value: impl FnMut(&mut StdRng) -> T,
+    make_scalar: fn(T) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<T>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for null_percent in NULL_PERCENTS {
+            let values: A = (0..ARRAY_LENGTH)
+                .map(|_| rng.random_bool(1.0 - null_percent).then(|| gen_value(rng)))
+                .collect();
+
+            let in_list: Vec<_> = (0..in_list_length)
+                .map(|_| make_scalar(gen_value(rng)))
+                .collect();
+
+            do_bench(
+                c,
+                &bench_name::<A>(in_list_length, null_percent),
+                Arc::new(values),
+                &in_list,
+            );
+        }
+    }
+}
 
-    let values: Int32Array = (0..array_length)
-        .map(|_| rng.random_bool(non_null_percent).then(|| rng.random()))
-        .collect();
+/// Entry point: registers in_list benchmarks for Utf8, Utf8View, Float32, and Int32 arrays.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(120320);
 
-    let in_list: Vec<_> = (0..in_list_length)
-        .map(|_| ScalarValue::Int32(Some(rng.random())))
-        .collect();
+    // Benchmarks for string array types (Utf8, Utf8View)
+    bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
+    bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
 
-    do_bench(
+    // Benchmarks for numeric types
+    bench_numeric_type::<f32, Float32Array>(
         c,
-        &format!("in_list_i32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
-        Arc::new(values),
-        &in_list,
-    )
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    for in_list_length in [1, 3, 10, 100] {
-        for null_percent in [0., 0.2] {
-            do_benches(c, 1024, in_list_length, null_percent)
-        }
-    }
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::Float32(Some(v)),
+    );
+    bench_numeric_type::<i32, Int32Array>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::Int32(Some(v)),
+    );
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs