From c53a9ddd976279572b4b25f2749910244c8c1f02 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 23 Mar 2026 15:17:50 -0400 Subject: [PATCH 1/6] Add benchmark for string_to_array --- datafusion/functions-nested/Cargo.toml | 4 + .../benches/string_to_array.rs | 242 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 datafusion/functions-nested/benches/string_to_array.rs diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 2ce9532a22ee7..4bec795ec2858 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -113,3 +113,7 @@ name = "array_position" [[bench]] harness = false name = "array_sort" + +[[bench]] +harness = false +name = "string_to_array" diff --git a/datafusion/functions-nested/benches/string_to_array.rs b/datafusion/functions-nested/benches/string_to_array.rs new file mode 100644 index 0000000000000..603b8207c8f58 --- /dev/null +++ b/datafusion/functions-nested/benches/string_to_array.rs @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, StringArray}; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::string::StringToArray; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 1000; +const SEED: u64 = 42; + +fn criterion_benchmark(c: &mut Criterion) { + // Single-char delimiter + bench_string_to_array( + c, + "string_to_array_single_char_delim", + create_csv_strings, + ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), + None, + ); + + // Multi-char delimiter + bench_string_to_array( + c, + "string_to_array_multi_char_delim", + create_multi_delim_strings, + ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))), + None, + ); + + // With null_str argument + bench_string_to_array( + c, + "string_to_array_with_null_str", + create_csv_strings_with_nulls, + ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), + Some(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "NULL".to_string(), + )))), + ); + + // NULL delimiter + bench_string_to_array( + c, + "string_to_array_null_delim", + create_short_strings, + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + None, + ); + + // Columnar delimiter (fall-back path) + bench_string_to_array_columnar_delim(c); +} + +fn bench_string_to_array_columnar_delim(c: &mut Criterion) { + let mut group = c.benchmark_group("string_to_array_columnar_delim"); + + for &num_elements in &[5, 20, 100] { + let string_array = create_csv_strings(num_elements); + let delimiter_array: ArrayRef = + Arc::new(StringArray::from(vec![Some(","); NUM_ROWS])); + + let args = vec![ + ColumnarValue::Array(string_array.clone()), + ColumnarValue::Array(delimiter_array), + ]; + let arg_fields = vec![ + Field::new("str", DataType::Utf8, true).into(), + Field::new("delimiter", DataType::Utf8, false).into(), + ]; + + let return_field = Field::new( + "result", + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ); + + group.bench_with_input( + BenchmarkId::from_parameter(num_elements), + &num_elements, + |b, _| { + let udf = StringToArray::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone().into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +fn bench_string_to_array( + c: &mut Criterion, + group_name: &str, + make_strings: fn(usize) -> ArrayRef, + delimiter: ColumnarValue, + null_str: Option, +) { + let mut group = c.benchmark_group(group_name); + + for &num_elements in &[5, 20, 100] { + let string_array = make_strings(num_elements); + + let mut args = vec![ + ColumnarValue::Array(string_array.clone()), + delimiter.clone(), + ]; + let mut arg_fields = vec![ + Field::new("str", DataType::Utf8, true).into(), + Field::new("delimiter", DataType::Utf8, true).into(), + ]; + if let Some(ref ns) = null_str { + args.push(ns.clone()); + arg_fields.push(Field::new("null_str", DataType::Utf8, true).into()); + } + + let return_field = Field::new( + "result", + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ); + + group.bench_with_input( + BenchmarkId::from_parameter(num_elements), + &num_elements, + |b, _| { + let udf = StringToArray::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone().into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +/// Creates strings like "val1,val2,val3,...,valN" with `num_elements` elements. +fn create_csv_strings(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| format!("val{}", rng.random_range(0..1000))) + .collect(); + Some(parts.join(",")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates strings like "val1::val2::val3::...::valN". +fn create_multi_delim_strings(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| format!("val{}", rng.random_range(0..1000))) + .collect(); + Some(parts.join("::")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates CSV strings where ~10% of elements are the literal "NULL". +fn create_csv_strings_with_nulls(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| { + if rng.random::() < 0.1 { + "NULL".to_string() + } else { + format!("val{}", rng.random_range(0..1000)) + } + }) + .collect(); + Some(parts.join(",")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates short strings (length = `num_chars`) for the NULL-delimiter +/// (split-into-characters) benchmark. +fn create_short_strings(num_chars: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let s: String = (0..num_chars) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + Some(s) + }) + .collect(); + Arc::new(strings) +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 828223ea35d2a3f80d2102a75f1a15e15fdd7289 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 23 Mar 2026 15:17:58 -0400 Subject: [PATCH 2/6] Optimize string_to_array --- Cargo.lock | 1 + datafusion/functions-nested/Cargo.toml | 1 + .../benches/string_to_array.rs | 23 +- datafusion/functions-nested/src/string.rs | 531 +++++++++--------- datafusion/sqllogictest/test_files/array.slt | 68 +++ 5 files changed, 342 insertions(+), 282 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9c06c7856e30c..9f59fdeb2ad16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2324,6 +2324,7 @@ dependencies = [ "itertools 0.14.0", "itoa", "log", + "memchr", "rand 0.9.2", ] diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 4bec795ec2858..be0dc88e1dc61 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -60,6 +60,7 @@ datafusion-physical-expr-common = { workspace = true } hashbrown = { workspace = true } itertools = { workspace = true, features = ["use_std"] } itoa = { workspace = true } +memchr = { workspace = true } log = { workspace = true } [dev-dependencies] diff --git a/datafusion/functions-nested/benches/string_to_array.rs b/datafusion/functions-nested/benches/string_to_array.rs index 603b8207c8f58..3620854d0a8bb 100644 --- a/datafusion/functions-nested/benches/string_to_array.rs +++ b/datafusion/functions-nested/benches/string_to_array.rs @@ -32,40 +32,43 @@ const SEED: u64 = 42; fn criterion_benchmark(c: &mut Criterion) { // Single-char delimiter + let comma = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))); bench_string_to_array( c, "string_to_array_single_char_delim", create_csv_strings, - ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), + &comma, None, ); // Multi-char delimiter + let double_colon = + ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))); bench_string_to_array( c, "string_to_array_multi_char_delim", create_multi_delim_strings, - ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))), + &double_colon, None, ); // With null_str argument + let null_str = ColumnarValue::Scalar(ScalarValue::Utf8(Some("NULL".to_string()))); bench_string_to_array( c, "string_to_array_with_null_str", create_csv_strings_with_nulls, - ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), - Some(ColumnarValue::Scalar(ScalarValue::Utf8(Some( - "NULL".to_string(), - )))), + &comma, + Some(&null_str), ); // NULL delimiter + let null_delim = ColumnarValue::Scalar(ScalarValue::Utf8(None)); bench_string_to_array( c, "string_to_array_null_delim", create_short_strings, - ColumnarValue::Scalar(ScalarValue::Utf8(None)), + &null_delim, None, ); @@ -124,8 +127,8 @@ fn bench_string_to_array( c: &mut Criterion, group_name: &str, make_strings: fn(usize) -> ArrayRef, - delimiter: ColumnarValue, - null_str: Option, + delimiter: &ColumnarValue, + null_str: Option<&ColumnarValue>, ) { let mut group = c.benchmark_group(group_name); @@ -140,7 +143,7 @@ fn bench_string_to_array( Field::new("str", DataType::Utf8, true).into(), Field::new("delimiter", DataType::Utf8, true).into(), ]; - if let Some(ref ns) = null_str { + if let Some(ns) = null_str { args.push(ns.clone()); arg_fields.push(Field::new("null_str", DataType::Utf8, true).into()); } diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 619e43f40bf71..0bbcb76ebbff4 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -26,14 +26,14 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field}; use datafusion_common::utils::ListCoercion; -use datafusion_common::{DataFusionError, Result, not_impl_err}; +use datafusion_common::{DataFusionError, Result, ScalarValue, not_impl_err}; use std::any::Any; use std::fmt::{self, Write}; use crate::utils::make_scalar_function; use arrow::array::{ - GenericStringArray, StringArrayType, StringViewArray, + StringArrayType, StringViewArray, builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder}, cast::AsArray, }; @@ -194,11 +194,17 @@ make_udf_expr_and_func!( ) )] #[derive(Debug, PartialEq, Eq, Hash)] -pub(super) struct StringToArray { +pub struct StringToArray { signature: Signature, aliases: Vec, } +impl Default for StringToArray { + fn default() -> Self { + Self::new() + } +} + impl StringToArray { pub fn new() -> Self { Self { @@ -242,13 +248,77 @@ impl ScalarUDFImpl for StringToArray { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let args = &args.args; - match args[0].data_type() { - Utf8 | Utf8View => make_scalar_function(string_to_array_inner::)(args), - LargeUtf8 => make_scalar_function(string_to_array_inner::)(args), + let ScalarFunctionArgs { args, .. } = args; + + let delimiter_is_scalar = matches!(&args[1], ColumnarValue::Scalar(_)); + let null_str_is_scalar = args + .get(2) + .is_none_or(|a| matches!(a, ColumnarValue::Scalar(_))); + + if !(delimiter_is_scalar && null_str_is_scalar) { + return make_scalar_function(string_to_array_fallback)(&args); + } + + // Delimiter and null_str (if given) are scalar, so use the fast path + let delimiter = match &args[1] { + ColumnarValue::Scalar(s) => match s.try_as_str() { + Some(s) => s, + None => { + return exec_err!( + "unsupported type for string_to_array delimiter: {:?}", + args[1].data_type() + ); + } + }, + _ => unreachable!("delimiter must be scalar in this branch"), + }; + let null_value = match args.get(2) { + Some(ColumnarValue::Scalar(s)) => match s.try_as_str() { + Some(s) => s, + None => { + return exec_err!( + "unsupported type for string_to_array null_str: {:?}", + args[2].data_type() + ); + } + }, + _ => None, + }; + + let (all_scalar, string_array) = match &args[0] { + ColumnarValue::Array(a) => (false, Arc::clone(a)), + ColumnarValue::Scalar(s) => (true, s.to_array_of_size(1)?), + }; + + let result = match string_array.data_type() { + Utf8 => { + let arr = string_array.as_string::(); + let builder = + StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size()); + string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } + Utf8View => { + let arr = string_array.as_string_view(); + let builder = StringViewBuilder::with_capacity(arr.len()); + string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } + LargeUtf8 => { + let arr = string_array.as_string::(); + let builder = LargeStringBuilder::with_capacity( + arr.len(), + arr.get_buffer_memory_size(), + ); + string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } other => { exec_err!("unsupported type for string_to_array function as {other:?}") } + }?; + + if all_scalar { + ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar) + } else { + Ok(ColumnarValue::Array(result)) } } @@ -261,6 +331,188 @@ impl ScalarUDFImpl for StringToArray { } } +/// Appends `value` to the string builder, or NULL if it matches `null_value`. +#[inline(always)] +fn append_part( + builder: &mut impl StringArrayBuilderType, + value: &str, + null_value: Option<&str>, +) { + if null_value == Some(value) { + builder.append_null(); + } else { + builder.append_value(value); + } +} + +/// Optimized `string_to_array` implementation for the common case where +/// delimiter and null_value are scalar values. +fn string_to_array_scalar_args<'a, StringArrType, StringBuilderType>( + string_array: &StringArrType, + delimiter: Option<&str>, + null_value: Option<&str>, + string_builder: StringBuilderType, +) -> Result +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ + let mut list_builder = ListBuilder::new(string_builder); + + match delimiter { + Some("") => { + // Empty delimiter: each string becomes a single-element list. + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + append_part(list_builder.values(), string, null_value); + list_builder.append(true); + } + } + Some(delimiter) => { + // Rather than using `str::split`, do the split ourselves using + // `memmem::Finder`. This allows pre-compiling the delimiter search + // pattern once and reusing it for all rows. + let finder = memchr::memmem::Finder::new(delimiter.as_bytes()); + let delim_len = delimiter.len(); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + let bytes = string.as_bytes(); + let mut start = 0; + for pos in finder.find_iter(bytes) { + append_part(list_builder.values(), &string[start..pos], null_value); + start = pos + delim_len; + } + // Trailing part after last delimiter (or entire string if no + // delimiter was found). + append_part(list_builder.values(), &string[start..], null_value); + list_builder.append(true); + } + } + None => { + // NULL delimiter: split into individual characters. + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + for (pos, c) in string.char_indices() { + append_part( + list_builder.values(), + &string[pos..pos + c.len_utf8()], + null_value, + ); + } + list_builder.append(true); + } + } + } + + Ok(Arc::new(list_builder.finish()) as ArrayRef) +} + +/// Fallback path for `string_to_array` when delimiter and/or null_value +/// are array columns rather than scalars. +fn string_to_array_fallback(args: &[ArrayRef]) -> Result { + let null_value_array = args.get(2); + + match args[0].data_type() { + Utf8 => { + let arr = args[0].as_string::(); + let builder = + StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size()); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + Utf8View => { + let arr = args[0].as_string_view(); + let builder = StringViewBuilder::with_capacity(arr.len()); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + LargeUtf8 => { + let arr = args[0].as_string::(); + let builder = LargeStringBuilder::with_capacity( + arr.len(), + arr.get_buffer_memory_size(), + ); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + other => exec_err!("unsupported type for string_to_array function as {other:?}"), + } +} + +fn string_to_array_column_args<'a, StringArrType, StringBuilderType>( + string_array: &StringArrType, + delimiter_array: &ArrayRef, + null_value_array: Option<&ArrayRef>, + string_builder: StringBuilderType, +) -> Result +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ + let mut list_builder = ListBuilder::new(string_builder); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + + let string = string_array.value(i); + let delimiter = get_str_value(delimiter_array, i); + let null_value = null_value_array.and_then(|arr| get_str_value(arr, i)); + + match delimiter { + Some("") => { + append_part(list_builder.values(), string, null_value); + } + Some(delimiter) => { + for part in string.split(delimiter) { + append_part(list_builder.values(), part, null_value); + } + } + None => { + for (pos, c) in string.char_indices() { + append_part( + list_builder.values(), + &string[pos..pos + c.len_utf8()], + null_value, + ); + } + } + } + + list_builder.append(true); + } + + Ok(Arc::new(list_builder.finish()) as ArrayRef) +} + +/// Returns the string value at index `i` from a string array of any type. +fn get_str_value(array: &ArrayRef, i: usize) -> Option<&str> { + if array.is_null(i) { + return None; + } + match array.data_type() { + Utf8 => Some(array.as_string::().value(i)), + LargeUtf8 => Some(array.as_string::().value(i)), + Utf8View => Some(array.as_string_view().value(i)), + other => { + debug_assert!(false, "unexpected type in get_str_value: {other:?}"); + None + } + } +} + fn array_to_string_inner(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { return exec_err!("array_to_string expects two or three arguments"); @@ -530,271 +782,6 @@ where Ok(()) } -/// String_to_array SQL function -/// Splits string at occurrences of delimiter and returns an array of parts -/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' -fn string_to_array_inner(args: &[ArrayRef]) -> Result { - if args.len() < 2 || args.len() > 3 { - return exec_err!("string_to_array expects two or three arguments"); - } - - match args[0].data_type() { - Utf8 => { - let string_array = args[0].as_string::(); - let builder = StringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - ); - string_to_array_inner_2::<&GenericStringArray, StringBuilder>( - args, - &string_array, - builder, - ) - } - Utf8View => { - let string_array = args[0].as_string_view(); - let builder = StringViewBuilder::with_capacity(string_array.len()); - string_to_array_inner_2::<&StringViewArray, StringViewBuilder>( - args, - &string_array, - builder, - ) - } - LargeUtf8 => { - let string_array = args[0].as_string::(); - let builder = LargeStringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - ); - string_to_array_inner_2::<&GenericStringArray, LargeStringBuilder>( - args, - &string_array, - builder, - ) - } - other => exec_err!( - "unsupported type for first argument to string_to_array function as {other:?}" - ), - } -} - -fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>( - args: &'a [ArrayRef], - string_array: &StringArrType, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - match args[1].data_type() { - Utf8 => { - let delimiter_array = args[1].as_string::(); - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &GenericStringArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &GenericStringArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - Utf8View => { - let delimiter_array = args[1].as_string_view(); - - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &StringViewArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &StringViewArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - LargeUtf8 => { - let delimiter_array = args[1].as_string::(); - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &GenericStringArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &GenericStringArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - other => exec_err!( - "unsupported type for second argument to string_to_array function as {other:?}" - ), - } -} - -fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>( - args: &'a [ArrayRef], - string_array: &StringArrType, - delimiter_array: &DelimiterArrType, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - DelimiterArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - match args[2].data_type() { - Utf8 => { - let null_type_array = Some(args[2].as_string::()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &GenericStringArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - Utf8View => { - let null_type_array = Some(args[2].as_string_view()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &StringViewArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - LargeUtf8 => { - let null_type_array = Some(args[2].as_string::()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &GenericStringArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - other => { - exec_err!("unsupported type for string_to_array function as {other:?}") - } - } -} - -fn string_to_array_impl< - 'a, - StringArrType, - DelimiterArrType, - NullValueArrType, - StringBuilderType, ->( - string_array: &StringArrType, - delimiter_array: &DelimiterArrType, - null_value_array: Option, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - DelimiterArrType: StringArrayType<'a>, - NullValueArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - let mut list_builder = ListBuilder::new(string_builder); - - match null_value_array { - None => { - string_array.iter().zip(delimiter_array.iter()).for_each( - |(string, delimiter)| { - match (string, delimiter) { - (Some(string), Some("")) => { - list_builder.values().append_value(string); - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - string.split(delimiter).for_each(|s| { - list_builder.values().append_value(s); - }); - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - list_builder.values().append_value(c.as_str()); - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value - } - }, - ) - } - Some(null_value_array) => string_array - .iter() - .zip(delimiter_array.iter()) - .zip(null_value_array.iter()) - .for_each(|((string, delimiter), null_value)| { - match (string, delimiter) { - (Some(string), Some("")) => { - if Some(string) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(string); - } - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - string.split(delimiter).for_each(|s| { - if Some(s) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(s); - } - }); - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - if Some(c.as_str()) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(c.as_str()); - } - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value - } - }), - }; - - let list_array = list_builder.finish(); - Ok(Arc::new(list_array) as ArrayRef) -} - trait StringArrayBuilderType: ArrayBuilder { fn append_value(&mut self, val: &str); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 6e0d7a882ed41..77c87395eb018 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -9223,6 +9223,74 @@ select string_to_list(e, 'm') from values; [adipiscing] NULL +# string_to_array: single-char delimiter producing multiple elements +query ? +SELECT string_to_array('a,b,c', ',') +---- +[a, b, c] + +# string_to_array: delimiter not found in input +query ? +SELECT string_to_array('abc', ',') +---- +[abc] + +# string_to_array: empty string input +query ? +SELECT string_to_array('', ',') +---- +[] + +# string_to_array: null_str matching multiple elements +query ? +SELECT string_to_array('a,NULL,b,NULL,c', ',', 'NULL') +---- +[a, NULL, b, NULL, c] + +# string_to_array: null_str matching all elements +query ? +SELECT string_to_array('x,x,x', ',', 'x') +---- +[NULL, NULL, NULL] + +# string_to_array: null_str with empty-string delimiter +query ? +SELECT string_to_array('abc', '', 'abc') +---- +[NULL] + +# string_to_array: NULL string input +query ? +SELECT string_to_array(NULL, ',') +---- +NULL + +# string_to_array: columnar delimiter +query ?? +SELECT string_to_array('a,b,c', col1), string_to_array('a::b::c', col2) + FROM (VALUES (',', '::')) AS t(col1, col2) +---- +[a, b, c] [a, b, c] + +# string_to_array: columnar null_str +query ? +SELECT string_to_array('a,NULL,b', ',', col1) + FROM (VALUES ('NULL')) AS t(col1) +---- +[a, NULL, b] + +# string_to_array: adjacent delimiters produce empty strings +query ? +SELECT string_to_array('a,,b', ',') +---- +[a, , b] + +# string_to_array: delimiter at start and end +query ? +SELECT string_to_array(',a,b,', ',') +---- +[, a, b, ] + # array_resize scalar function #1 query ? select array_resize(make_array(1, 2, 3), 1); From a389480c1c0171595f3b45979da6bc46720d7aa4 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 23 Mar 2026 17:12:19 -0400 Subject: [PATCH 3/6] cargo fmt --- datafusion/functions-nested/benches/string_to_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/functions-nested/benches/string_to_array.rs b/datafusion/functions-nested/benches/string_to_array.rs index 3620854d0a8bb..e403d5e51bac8 100644 --- a/datafusion/functions-nested/benches/string_to_array.rs +++ b/datafusion/functions-nested/benches/string_to_array.rs @@ -42,8 +42,7 @@ fn criterion_benchmark(c: &mut Criterion) { ); // Multi-char delimiter - let double_colon = - ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))); + let double_colon = ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))); bench_string_to_array( c, "string_to_array_multi_char_delim", From 0c8d00ff1efacb75acaf857580cd4ba58625f825 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Mar 2026 09:54:24 -0400 Subject: [PATCH 4/6] Fix TOML formatting --- datafusion/functions-nested/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 7900f7bdf95ec..31462d5e509ed 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -60,8 +60,8 @@ datafusion-physical-expr-common = { workspace = true } hashbrown = { workspace = true } itertools = { workspace = true, features = ["use_std"] } itoa = { workspace = true } -memchr = { workspace = true } log = { workspace = true } +memchr = { workspace = true } [dev-dependencies] criterion = { workspace = true, features = ["async_tokio"] } From b849dc000912dc7970448e5b7646fdcff8526b8e Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 18:19:36 -0400 Subject: [PATCH 5/6] Refactor per code review suggestion --- datafusion/functions-nested/src/string.rs | 32 +++++++++-------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 6541928f7f0ca..565748d8aa0f5 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -44,7 +44,7 @@ use arrow::datatypes::DataType::{ use datafusion_common::cast::{ as_fixed_size_list_array, as_large_list_array, as_list_array, }; -use datafusion_common::exec_err; +use datafusion_common::{exec_datafusion_err, exec_err}; use datafusion_common::types::logical_string; use datafusion_expr::{ ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue, @@ -261,27 +261,21 @@ impl ScalarUDFImpl for StringToArray { // Delimiter and null_str (if given) are scalar, so use the fast path let delimiter = match &args[1] { - ColumnarValue::Scalar(s) => match s.try_as_str() { - Some(s) => s, - None => { - return exec_err!( - "unsupported type for string_to_array delimiter: {:?}", - args[1].data_type() - ); - } - }, + ColumnarValue::Scalar(s) => s.try_as_str().ok_or_else(|| { + exec_datafusion_err!( + "unsupported type for string_to_array delimiter: {:?}", + args[1].data_type() + ) + })?, _ => unreachable!("delimiter must be scalar in this branch"), }; let null_value = match args.get(2) { - Some(ColumnarValue::Scalar(s)) => match s.try_as_str() { - Some(s) => s, - None => { - return exec_err!( - "unsupported type for string_to_array null_str: {:?}", - args[2].data_type() - ); - } - }, + Some(ColumnarValue::Scalar(s)) => s.try_as_str().ok_or_else(|| { + exec_datafusion_err!( + "unsupported type for string_to_array null_str: {:?}", + args[2].data_type() + ) + })?, _ => None, }; From fa1b678cf271c234fc3ac27e59184111521b34a9 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 18:25:46 -0400 Subject: [PATCH 6/6] cargo fmt --- datafusion/functions-nested/src/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 565748d8aa0f5..f059b6ca7dc89 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -44,8 +44,8 @@ use arrow::datatypes::DataType::{ use datafusion_common::cast::{ as_fixed_size_list_array, as_large_list_array, as_list_array, }; -use datafusion_common::{exec_datafusion_err, exec_err}; use datafusion_common::types::logical_string; +use datafusion_common::{exec_datafusion_err, exec_err}; use datafusion_expr::{ ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,