From ae0741b8d6f203d881de12f9f5d8f0d326f5093c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 07:43:26 -0600 Subject: [PATCH 01/18] feat: add standalone shuffle benchmark binary for profiling Add a `shuffle_bench` binary that benchmarks shuffle write and read performance independently from Spark, making it easy to profile with tools like `cargo flamegraph`, `perf`, or `instruments`. Supports reading Parquet files (e.g. TPC-H/TPC-DS) or generating synthetic data with configurable schema. Covers different scenarios including compression codecs, partition counts, partitioning schemes, and memory-constrained spilling. --- native/Cargo.lock | 88 +++- native/core/Cargo.toml | 5 + native/core/src/bin/shuffle_bench.rs | 725 +++++++++++++++++++++++++++ 3 files changed, 816 insertions(+), 2 deletions(-) create mode 100644 native/core/src/bin/shuffle_bench.rs diff --git a/native/Cargo.lock b/native/Cargo.lock index 5f99c614b3..f43b41dd9a 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -96,12 +96,56 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = 
"anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -1331,6 +1375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -1339,8 +1384,22 @@ version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -1358,6 +1417,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "combine" version = "4.6.7" @@ -1834,6 +1899,7 @@ dependencies = [ "aws-config", "aws-credential-types", "bytes", + "clap", "crc32fast", "criterion", "datafusion", @@ -1885,7 +1951,7 @@ dependencies = [ [[package]] name = "datafusion-comet-common" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "datafusion", @@ -1911,7 +1977,7 @@ dependencies = [ [[package]] name = 
"datafusion-comet-jni-bridge" -version = "0.14.0" +version = "0.15.0" dependencies = [ "arrow", "assertables", @@ -3609,6 +3675,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -4289,6 +4361,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "oorandom" version = "11.1.5" @@ -6339,6 +6417,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.22.0" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 3f305a631d..3df9e55719 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -72,6 +72,7 @@ url = { workspace = true } aws-config = { workspace = true } aws-credential-types = { workspace = true } parking_lot = "0.12.5" +clap = { version = "4", features = ["derive"] } datafusion-comet-objectstore-hdfs = { path = "../hdfs", optional = true, default-features = false, features = ["hdfs"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"] } object_store_opendal = {version = "0.55.0", optional = true} @@ -113,6 +114,10 @@ name = "comet" # "rlib" is for 
benchmarking with criterion. crate-type = ["cdylib", "rlib"] +[[bin]] +name = "shuffle_bench" +path = "src/bin/shuffle_bench.rs" + [[bench]] name = "parquet_read" harness = false diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs new file mode 100644 index 0000000000..c1498161f7 --- /dev/null +++ b/native/core/src/bin/shuffle_bench.rs @@ -0,0 +1,725 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Standalone shuffle benchmark tool for profiling Comet shuffle write and read +//! outside of Spark. +//! +//! # Usage +//! +//! Read from Parquet files (e.g. TPC-H lineitem): +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 \ +//! --codec zstd --zstd-level 1 \ +//! --hash-columns 0,3 \ +//! --read-back +//! ``` +//! +//! Generate synthetic data: +//! ```sh +//! cargo run --release --bin shuffle_bench -- \ +//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ +//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ +//! --partitions 200 --codec lz4 --read-back +//! ``` +//! +//! Profile with flamegraph: +//! ```sh +//! cargo flamegraph --release --bin shuffle_bench -- \ +//! 
--input /data/tpch-sf100/lineitem/ \ +//! --partitions 200 --codec zstd --zstd-level 1 +//! ``` + +use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; +use arrow::array::RecordBatch; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use clap::Parser; +use comet::execution::shuffle::{ + read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, +}; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; +use datafusion::execution::config::SessionConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::common::collect; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use rand::RngExt; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +#[derive(Parser, Debug)] +#[command( + name = "shuffle_bench", + about = "Standalone benchmark for Comet shuffle write and read performance" +)] +struct Args { + /// Path to input Parquet file or directory of Parquet files + #[arg(long)] + input: Option, + + /// Generate synthetic data instead of reading from Parquet + #[arg(long, default_value_t = false)] + generate: bool, + + /// Number of rows to generate (requires --generate) + #[arg(long, default_value_t = 1_000_000)] + gen_rows: usize, + + /// Number of Int64 columns to generate + #[arg(long, default_value_t = 4)] + gen_int_cols: usize, + + /// Number of Utf8 string columns to generate + #[arg(long, default_value_t = 2)] + gen_string_cols: usize, + + /// Number of Decimal128 columns to generate + #[arg(long, default_value_t = 2)] + gen_decimal_cols: usize, + + /// Number of Date32 columns to generate + #[arg(long, default_value_t = 1)] + gen_date_cols: usize, + + /// Average string length for generated string columns + 
#[arg(long, default_value_t = 24)] + gen_avg_string_len: usize, + + /// Batch size for reading Parquet or generating data + #[arg(long, default_value_t = 8192)] + batch_size: usize, + + /// Number of output shuffle partitions + #[arg(long, default_value_t = 200)] + partitions: usize, + + /// Partitioning scheme: hash, single, round-robin + #[arg(long, default_value = "hash")] + partitioning: String, + + /// Column indices to hash on (comma-separated, e.g. "0,3") + #[arg(long, default_value = "0")] + hash_columns: String, + + /// Compression codec: none, lz4, zstd, snappy + #[arg(long, default_value = "zstd")] + codec: String, + + /// Zstd compression level (1-22) + #[arg(long, default_value_t = 1)] + zstd_level: i32, + + /// Memory limit in bytes (triggers spilling when exceeded) + #[arg(long)] + memory_limit: Option, + + /// Also benchmark reading back the shuffle output + #[arg(long, default_value_t = false)] + read_back: bool, + + /// Number of iterations to run + #[arg(long, default_value_t = 1)] + iterations: usize, + + /// Number of warmup iterations before timing + #[arg(long, default_value_t = 0)] + warmup: usize, + + /// Output directory for shuffle data/index files + #[arg(long, default_value = "/tmp/comet_shuffle_bench")] + output_dir: PathBuf, + + /// Write buffer size in bytes + #[arg(long, default_value_t = 1048576)] + write_buffer_size: usize, +} + +fn main() { + let args = Args::parse(); + + // Validate args + if args.input.is_none() && !args.generate { + eprintln!("Error: must specify either --input or --generate"); + std::process::exit(1); + } + + // Create output directory + fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); + + let data_file = args.output_dir.join("data.out"); + let index_file = args.output_dir.join("index.out"); + + // Load data + let load_start = Instant::now(); + let batches = if let Some(ref input_path) = args.input { + load_parquet(input_path, args.batch_size) + } else { + generate_data(&args) 
+ }; + let load_elapsed = load_start.elapsed(); + + let schema = batches[0].schema(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + + println!("=== Shuffle Benchmark ==="); + println!( + "Data source: {}", + if args.input.is_some() { + "parquet" + } else { + "generated" + } + ); + println!( + "Schema: {} columns ({} fields)", + schema.fields().len(), + describe_schema(&schema) + ); + println!("Total rows: {}", format_number(total_rows)); + println!("Total size: {}", format_bytes(total_bytes)); + println!("Batches: {}", batches.len()); + println!( + "Rows/batch: ~{}", + if batches.is_empty() { + 0 + } else { + total_rows / batches.len() + } + ); + println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); + println!(); + + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); + + println!("Partitioning: {}", args.partitioning); + println!("Partitions: {}", args.partitions); + println!("Codec: {:?}", codec); + println!("Hash columns: {:?}", hash_col_indices); + if let Some(mem_limit) = args.memory_limit { + println!("Memory limit: {}", format_bytes(mem_limit)); + } + println!( + "Iterations: {} (warmup: {})", + args.iterations, args.warmup + ); + println!(); + + // Run warmup + timed iterations + let total_iters = args.warmup + args.iterations; + let mut write_times = Vec::with_capacity(args.iterations); + let mut read_times = Vec::with_capacity(args.iterations); + let mut data_file_sizes = Vec::with_capacity(args.iterations); + + for i in 0..total_iters { + let is_warmup = i < args.warmup; + let label = if is_warmup { + format!("warmup {}/{}", i + 1, args.warmup) + } else { + format!("iter {}/{}", i - args.warmup + 1, args.iterations) + }; + + // Write phase + let write_elapsed = run_shuffle_write( + &batches, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + 
index_file.to_str().unwrap(), + ); + let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); + + if !is_warmup { + write_times.push(write_elapsed); + data_file_sizes.push(data_size); + } + + print!(" [{label}] write: {:.3}s", write_elapsed); + print!(" output: {}", format_bytes(data_size as usize)); + + // Read phase + if args.read_back { + let read_elapsed = run_shuffle_read( + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + args.partitions, + ); + if !is_warmup { + read_times.push(read_elapsed); + } + print!(" read: {:.3}s", read_elapsed); + } + println!(); + } + + // Print summary + if args.iterations > 0 { + println!(); + println!("=== Results ==="); + + let avg_write = write_times.iter().sum::() / write_times.len() as f64; + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_bytes = total_bytes as f64 / avg_write; + let compression_ratio = if avg_data_size > 0 { + total_bytes as f64 / avg_data_size as f64 + } else { + 0.0 + }; + + println!("Write:"); + println!(" avg time: {:.3}s", avg_write); + if write_times.len() > 1 { + let min = write_times.iter().cloned().fold(f64::INFINITY, f64::min); + let max = write_times + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s ({} rows/s)", + format_bytes(write_throughput_bytes as usize), + format_number(write_throughput_rows as usize) + ); + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + println!(" compression: {:.2}x", compression_ratio); + + if !read_times.is_empty() { + let avg_read = read_times.iter().sum::() / read_times.len() as f64; + let read_throughput_bytes = avg_data_size as f64 / avg_read; + + println!("Read:"); + println!(" avg time: {:.3}s", avg_read); + if read_times.len() > 1 { + let min = read_times.iter().cloned().fold(f64::INFINITY, 
f64::min); + let max = read_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s (from compressed)", + format_bytes(read_throughput_bytes as usize) + ); + } + } + + // Cleanup + let _ = fs::remove_file(&data_file); + let _ = fs::remove_file(&index_file); +} + +fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { + let mut batches = Vec::new(); + + let paths = if path.is_dir() { + let mut files: Vec = fs::read_dir(path) + .expect("Failed to read input directory") + .filter_map(|entry| { + let entry = entry.ok()?; + let p = entry.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) + } else { + None + } + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); + } + files + } else { + vec![path.clone()] + }; + + for file_path in &paths { + let file = fs::File::open(file_path) + .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { + panic!( + "Failed to read Parquet metadata from {}: {}", + file_path.display(), + e + ) + }); + let reader = builder + .with_batch_size(batch_size) + .build() + .unwrap_or_else(|e| { + panic!( + "Failed to build Parquet reader for {}: {}", + file_path.display(), + e + ) + }); + for batch_result in reader { + let batch = batch_result.unwrap_or_else(|e| { + panic!("Failed to read batch from {}: {}", file_path.display(), e) + }); + if batch.num_rows() > 0 { + batches.push(batch); + } + } + } + + if batches.is_empty() { + panic!("No data read from input"); + } + + println!( + "Loaded {} batches from {} file(s)", + batches.len(), + paths.len() + ); + batches +} + +fn generate_data(args: &Args) -> Vec { + let mut fields = Vec::new(); + let mut col_idx = 0; + + // Int64 columns + for _ in 0..args.gen_int_cols { + fields.push(Field::new( + 
format!("int_col_{col_idx}"), + DataType::Int64, + true, + )); + col_idx += 1; + } + // String columns + for _ in 0..args.gen_string_cols { + fields.push(Field::new( + format!("str_col_{col_idx}"), + DataType::Utf8, + true, + )); + col_idx += 1; + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + fields.push(Field::new( + format!("dec_col_{col_idx}"), + DataType::Decimal128(18, 2), + true, + )); + col_idx += 1; + } + // Date columns + for _ in 0..args.gen_date_cols { + fields.push(Field::new( + format!("date_col_{col_idx}"), + DataType::Date32, + true, + )); + col_idx += 1; + } + + let schema = Arc::new(Schema::new(fields)); + let mut batches = Vec::new(); + let mut rng = rand::rng(); + let mut remaining = args.gen_rows; + + while remaining > 0 { + let batch_rows = remaining.min(args.batch_size); + remaining -= batch_rows; + + let mut columns: Vec> = Vec::new(); + + // Int64 columns + for _ in 0..args.gen_int_cols { + let mut builder = Int64Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..1_000_000i64)); + } + } + columns.push(Arc::new(builder.finish())); + } + // String columns + for _ in 0..args.gen_string_cols { + let mut builder = + StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + let len = rng.random_range(1..args.gen_avg_string_len * 2); + let s: String = (0..len) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + builder.append_value(&s); + } + } + columns.push(Arc::new(builder.finish())); + } + // Decimal columns + for _ in 0..args.gen_decimal_cols { + let mut builder = Decimal128Builder::with_capacity(batch_rows) + .with_precision_and_scale(18, 2) + .unwrap(); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + 
builder.append_value(rng.random_range(0..100_000_000i128)); + } + } + columns.push(Arc::new(builder.finish())); + } + // Date columns + for _ in 0..args.gen_date_cols { + let mut builder = Date32Builder::with_capacity(batch_rows); + for _ in 0..batch_rows { + if rng.random_range(0..100) < 5 { + builder.append_null(); + } else { + builder.append_value(rng.random_range(0..20000i32)); + } + } + columns.push(Arc::new(builder.finish())); + } + + let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); + batches.push(batch); + } + + println!( + "Generated {} batches ({} rows)", + batches.len(), + args.gen_rows + ); + batches +} + +fn run_shuffle_write( + batches: &[RecordBatch], + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, + data_file: &str, + index_file: &str, +) -> f64 { + let partitioning = build_partitioning( + &args.partitioning, + args.partitions, + hash_col_indices, + schema, + ); + + let partitions = &[batches.to_vec()]; + let exec = ShuffleWriterExec::try_new( + Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), + ))), + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + ) + .expect("Failed to create ShuffleWriterExec"); + + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + let task_ctx = ctx.task_ctx(); + + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(collect(stream)).unwrap(); + start.elapsed().as_secs_f64() +} + +fn 
run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { + let start = Instant::now(); + + // Read index file to get partition offsets + let index_bytes = fs::read(index_file).expect("Failed to read index file"); + let num_offsets = index_bytes.len() / 8; + let offsets: Vec = (0..num_offsets) + .map(|i| { + let bytes: [u8; 8] = index_bytes[i * 8..(i + 1) * 8].try_into().unwrap(); + i64::from_le_bytes(bytes) + }) + .collect(); + + // Read data file + let data_bytes = fs::read(data_file).expect("Failed to read data file"); + + let mut total_rows = 0usize; + let mut total_batches = 0usize; + + // Decode each partition's data + for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { + let start_offset = offsets[p] as usize; + let end_offset = offsets[p + 1] as usize; + + if start_offset >= end_offset { + continue; // Empty partition + } + + // Read all IPC blocks within this partition + let mut offset = start_offset; + while offset < end_offset { + // First 8 bytes: IPC length + let ipc_length = + u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; + + // Skip 8-byte length prefix, then 8 bytes of field_count + codec header + let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; + let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); + total_rows += batch.num_rows(); + total_batches += 1; + + offset += 8 + ipc_length; + } + } + + let elapsed = start.elapsed().as_secs_f64(); + eprintln!( + " read back {} rows in {} batches from {} partitions", + format_number(total_rows), + total_batches, + num_partitions + ); + elapsed +} + +fn build_partitioning( + scheme: &str, + num_partitions: usize, + hash_col_indices: &[usize], + schema: &SchemaRef, +) -> CometPartitioning { + match scheme { + "single" => CometPartitioning::SinglePartition, + "round-robin" => CometPartitioning::RoundRobin(num_partitions, 0), + "hash" => { + let exprs: Vec> = hash_col_indices + .iter() + 
.map(|&idx| { + let field = schema.field(idx); + Arc::new(Column::new(field.name(), idx)) + as Arc + }) + .collect(); + CometPartitioning::Hash(exprs, num_partitions) + } + other => { + eprintln!("Unknown partitioning scheme: {other}. Using hash."); + build_partitioning("hash", num_partitions, hash_col_indices, schema) + } + } +} + +fn parse_codec(codec: &str, zstd_level: i32) -> CompressionCodec { + match codec.to_lowercase().as_str() { + "none" => CompressionCodec::None, + "lz4" => CompressionCodec::Lz4Frame, + "zstd" => CompressionCodec::Zstd(zstd_level), + "snappy" => CompressionCodec::Snappy, + other => { + eprintln!("Unknown codec: {other}. Using zstd."); + CompressionCodec::Zstd(zstd_level) + } + } +} + +fn parse_hash_columns(s: &str) -> Vec { + s.split(',') + .filter(|s| !s.is_empty()) + .map(|s| s.trim().parse::().expect("Invalid column index")) + .collect() +} + +fn describe_schema(schema: &Schema) -> String { + let mut counts = std::collections::HashMap::new(); + for field in schema.fields() { + let type_name = match field.data_type() { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => "int", + DataType::Float16 | DataType::Float32 | DataType::Float64 => "float", + DataType::Utf8 | DataType::LargeUtf8 => "string", + DataType::Boolean => "bool", + DataType::Date32 | DataType::Date64 => "date", + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => "decimal", + DataType::Timestamp(_, _) => "timestamp", + DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => "binary", + _ => "other", + }; + *counts.entry(type_name).or_insert(0) += 1; + } + let mut parts: Vec = counts + .into_iter() + .map(|(k, v)| format!("{}x{}", v, k)) + .collect(); + parts.sort(); + parts.join(", ") +} + +fn format_number(n: usize) -> String { + let s = n.to_string(); + let mut result = String::new(); + for (i, c) in s.chars().rev().enumerate() { + 
if i > 0 && i % 3 == 0 { + result.push(','); + } + result.push(c); + } + result.chars().rev().collect() +} + +fn format_bytes(bytes: usize) -> String { + if bytes >= 1024 * 1024 * 1024 { + format!("{:.2} GiB", bytes as f64 / (1024.0 * 1024.0 * 1024.0)) + } else if bytes >= 1024 * 1024 { + format!("{:.2} MiB", bytes as f64 / (1024.0 * 1024.0)) + } else if bytes >= 1024 { + format!("{:.2} KiB", bytes as f64 / 1024.0) + } else { + format!("{bytes} B") + } +} From 9b5b305cb125a76e22f55ac61ee4994ff9e9f484 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:05:16 -0600 Subject: [PATCH 02/18] feat: add --limit option to shuffle benchmark (default 1M rows) --- native/core/src/bin/shuffle_bench.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index c1498161f7..9b963c5803 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -152,6 +152,10 @@ struct Args { /// Write buffer size in bytes #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, + + /// Maximum number of rows to use (default: 1,000,000) + #[arg(long, default_value_t = 1_000_000)] + limit: usize, } fn main() { @@ -178,6 +182,26 @@ fn main() { }; let load_elapsed = load_start.elapsed(); + // Apply row limit + let batches = { + let mut limited = Vec::new(); + let mut rows_so_far = 0usize; + for batch in batches { + if rows_so_far >= args.limit { + break; + } + let remaining = args.limit - rows_so_far; + if batch.num_rows() <= remaining { + rows_so_far += batch.num_rows(); + limited.push(batch); + } else { + limited.push(batch.slice(0, remaining)); + rows_so_far += remaining; + } + } + limited + }; + let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); From e1ab490c57b04387e4885284811ccf2ad993f4a6 Mon Sep 17 00:00:00 
2001 From: Andy Grove Date: Sat, 21 Mar 2026 09:19:01 -0600 Subject: [PATCH 03/18] perf: apply limit during parquet read to avoid scanning all files --- native/core/src/bin/shuffle_bench.rs | 43 ++++++++++++---------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/native/core/src/bin/shuffle_bench.rs b/native/core/src/bin/shuffle_bench.rs index 9b963c5803..17b1a9a6ff 100644 --- a/native/core/src/bin/shuffle_bench.rs +++ b/native/core/src/bin/shuffle_bench.rs @@ -176,32 +176,12 @@ fn main() { // Load data let load_start = Instant::now(); let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size) + load_parquet(input_path, args.batch_size, args.limit) } else { generate_data(&args) }; let load_elapsed = load_start.elapsed(); - // Apply row limit - let batches = { - let mut limited = Vec::new(); - let mut rows_so_far = 0usize; - for batch in batches { - if rows_so_far >= args.limit { - break; - } - let remaining = args.limit - rows_so_far; - if batch.num_rows() <= remaining { - rows_so_far += batch.num_rows(); - limited.push(batch); - } else { - limited.push(batch.slice(0, remaining)); - rows_so_far += remaining; - } - } - limited - }; - let schema = batches[0].schema(); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); @@ -358,8 +338,9 @@ fn main() { let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { +fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { let mut batches = Vec::new(); + let mut total_rows = 0usize; let paths = if path.is_dir() { let mut files: Vec = fs::read_dir(path) @@ -383,7 +364,7 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { vec![path.clone()] }; - for file_path in &paths { + 'outer: for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", 
file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -407,8 +388,19 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { let batch = batch_result.unwrap_or_else(|e| { panic!("Failed to read batch from {}: {}", file_path.display(), e) }); - if batch.num_rows() > 0 { + if batch.num_rows() == 0 { + continue; + } + let remaining = limit - total_rows; + if batch.num_rows() <= remaining { + total_rows += batch.num_rows(); batches.push(batch); + } else { + batches.push(batch.slice(0, remaining)); + total_rows += remaining; + } + if total_rows >= limit { + break 'outer; } } } @@ -418,8 +410,9 @@ fn load_parquet(path: &PathBuf, batch_size: usize) -> Vec { } println!( - "Loaded {} batches from {} file(s)", + "Loaded {} batches ({} rows) from {} file(s)", batches.len(), + format_number(total_rows), paths.len() ); batches From ca36cbd579d5c512e9617eb54cd052ec0d2c244d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 23 Mar 2026 06:39:47 -0700 Subject: [PATCH 04/18] chore: add comment explaining parquet/rand deps in shuffle crate --- native/shuffle/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 1a64a36ff2..1337ea3b15 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,6 +43,7 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } +# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } rand = { workspace = true, optional = true } simd-adler32 = "0.3.7" From 6e8bed28902e7256e1e1b3e914439e0a5abf3622 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 26 Mar 2026 07:13:41 -0700 Subject: [PATCH 05/18] perf: add max_buffered_batches config and stream shuffle bench from parquet - Add `spark.comet.exec.shuffle.maxBufferedBatches` config to limit the number of 
batches buffered before spilling, allowing earlier spilling to reduce peak memory usage on executors - Fix too-many-open-files: close spill file FD after each spill and reopen in append mode, rather than holding one FD open per partition - Refactor shuffle_bench to stream directly from Parquet instead of loading all input data into memory; remove synthetic data generation - Add --max-buffered-batches CLI arg to shuffle_bench - Add shuffle benchmark documentation to README --- .../scala/org/apache/comet/CometConf.scala | 12 + native/core/src/execution/planner.rs | 2 + native/proto/src/proto/operator.proto | 3 + native/shuffle/README.md | 43 ++ native/shuffle/benches/shuffle_writer.rs | 1 + native/shuffle/src/bin/shuffle_bench.rs | 432 +++++------------- .../src/partitioners/multi_partition.rs | 11 + native/shuffle/src/shuffle_writer.rs | 11 + .../shuffle/src/writers/partition_writer.rs | 29 +- .../shuffle/CometNativeShuffleWriter.scala | 2 + 10 files changed, 209 insertions(+), 337 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index bfe90181ff..2ad9c1f609 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -534,6 +534,18 @@ object CometConf extends ShimCometConf { .checkValue(v => v > 0, "Write buffer size must be positive") .createWithDefault(1) + val COMET_SHUFFLE_MAX_BUFFERED_BATCHES: ConfigEntry[Int] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.maxBufferedBatches") + .category(CATEGORY_SHUFFLE) + .doc("Maximum number of batches to buffer in memory before spilling to disk during " + + "native shuffle. Setting this to a small value causes earlier spilling, which reduces " + + "peak memory usage on executors at the cost of more disk I/O. 
" + + "The default value of 0 disables this limit and spills only when the memory pool is " + + "exhausted.") + .intConf + .checkValue(v => v >= 0, "Max buffered batches must be non-negative") + .createWithDefault(0) + val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") .category(CATEGORY_SHUFFLE) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 5af31fcc22..b6e05fccd1 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1352,6 +1352,7 @@ impl PhysicalPlanner { }?; let write_buffer_size = writer.write_buffer_size as usize; + let max_buffered_batches = writer.max_buffered_batches as usize; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( Arc::clone(&child.native_plan), partitioning, @@ -1360,6 +1361,7 @@ impl PhysicalPlanner { writer.output_index_file.clone(), writer.tracing_enabled, write_buffer_size, + max_buffered_batches, )?); Ok(( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 344b9f0f21..5e23aad061 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -294,6 +294,9 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Maximum number of batches to buffer before spilling to disk. + // 0 means no limit (spill only when memory pool is exhausted). + int32 max_buffered_batches = 9; } message ParquetWriter { diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 8fba6b0323..0333ddbe8e 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -23,3 +23,46 @@ This crate provides the shuffle writer and reader implementation for Apache Data of the [Apache DataFusion Comet] subproject. 
[Apache DataFusion Comet]: https://github.com/apache/datafusion-comet/ + +## Shuffle Benchmark Tool + +A standalone benchmark binary (`shuffle_bench`) is included for profiling shuffle write and read +performance outside of Spark. It streams input data directly from Parquet files. + +### Basic usage + +```sh +cargo run --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 \ + --codec zstd --zstd-level 1 \ + --hash-columns 0,3 +``` + +### Options + +| Option | Default | Description | +|---|---|---| +| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. `0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | + +### Profiling with flamegraph + +```sh +cargo flamegraph --release --features shuffle-bench --bin shuffle_bench -- \ + --input /data/tpch-sf100/lineitem/ \ + --partitions 200 --codec zstd --zstd-level 1 +``` diff --git a/native/shuffle/benches/shuffle_writer.rs 
b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..8ff1f024d5 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -153,6 +153,7 @@ fn create_shuffle_writer_exec( "/tmp/index.out".to_string(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap() } diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 373d0f92b9..0f04954ffa 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -16,11 +16,10 @@ // under the License. //! Standalone shuffle benchmark tool for profiling Comet shuffle write and read -//! outside of Spark. +//! outside of Spark. Streams input directly from Parquet files. //! //! # Usage //! -//! Read from Parquet files (e.g. TPC-H lineitem): //! ```sh //! cargo run --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ @@ -30,14 +29,6 @@ //! --read-back //! ``` //! -//! Generate synthetic data: -//! ```sh -//! cargo run --release --bin shuffle_bench -- \ -//! --generate --gen-rows 10000000 --gen-string-cols 4 --gen-int-cols 4 \ -//! --gen-decimal-cols 2 --gen-avg-string-len 32 \ -//! --partitions 200 --codec lz4 --read-back -//! ``` -//! //! Profile with flamegraph: //! ```sh //! cargo flamegraph --release --bin shuffle_bench -- \ @@ -45,23 +36,19 @@ //! --partitions 200 --codec zstd --zstd-level 1 //! 
``` -use arrow::array::builder::{Date32Builder, Decimal128Builder, Int64Builder, StringBuilder}; -use arrow::array::RecordBatch; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, SchemaRef}; use clap::Parser; -use datafusion::datasource::memory::MemorySourceConfig; -use datafusion::datasource::source::DataSourceExec; use datafusion::execution::config::SessionConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::common::collect; use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::SessionContext; +use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use rand::RngExt; use std::fs; use std::path::PathBuf; use std::sync::Arc; @@ -75,37 +62,9 @@ use std::time::Instant; struct Args { /// Path to input Parquet file or directory of Parquet files #[arg(long)] - input: Option, - - /// Generate synthetic data instead of reading from Parquet - #[arg(long, default_value_t = false)] - generate: bool, - - /// Number of rows to generate (requires --generate) - #[arg(long, default_value_t = 1_000_000)] - gen_rows: usize, - - /// Number of Int64 columns to generate - #[arg(long, default_value_t = 4)] - gen_int_cols: usize, + input: PathBuf, - /// Number of Utf8 string columns to generate - #[arg(long, default_value_t = 2)] - gen_string_cols: usize, - - /// Number of Decimal128 columns to generate - #[arg(long, default_value_t = 2)] - gen_decimal_cols: usize, - - /// Number of Date32 columns to generate - #[arg(long, default_value_t = 1)] - gen_date_cols: usize, - - /// Average string length for generated string columns - #[arg(long, default_value_t = 24)] - 
gen_avg_string_len: usize, - - /// Batch size for reading Parquet or generating data + /// Batch size for reading Parquet data #[arg(long, default_value_t = 8192)] batch_size: usize, @@ -153,70 +112,37 @@ struct Args { #[arg(long, default_value_t = 1048576)] write_buffer_size: usize, - /// Maximum number of rows to use (default: 1,000,000) - #[arg(long, default_value_t = 1_000_000)] + /// Maximum number of batches to buffer before spilling (0 = no limit) + #[arg(long, default_value_t = 0)] + max_buffered_batches: usize, + + /// Limit rows processed per iteration (0 = no limit) + #[arg(long, default_value_t = 0)] limit: usize, } fn main() { let args = Args::parse(); - // Validate args - if args.input.is_none() && !args.generate { - eprintln!("Error: must specify either --input or --generate"); - std::process::exit(1); - } - // Create output directory fs::create_dir_all(&args.output_dir).expect("Failed to create output directory"); - let data_file = args.output_dir.join("data.out"); let index_file = args.output_dir.join("index.out"); - // Load data - let load_start = Instant::now(); - let batches = if let Some(ref input_path) = args.input { - load_parquet(input_path, args.batch_size, args.limit) - } else { - generate_data(&args) - }; - let load_elapsed = load_start.elapsed(); + let (schema, total_rows) = read_parquet_metadata(&args.input, args.limit); - let schema = batches[0].schema(); - let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); - let total_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum(); + let codec = parse_codec(&args.codec, args.zstd_level); + let hash_col_indices = parse_hash_columns(&args.hash_columns); println!("=== Shuffle Benchmark ==="); + println!("Input: {}", args.input.display()); println!( - "Data source: {}", - if args.input.is_some() { - "parquet" - } else { - "generated" - } - ); - println!( - "Schema: {} columns ({} fields)", + "Schema: {} columns ({})", schema.fields().len(), describe_schema(&schema) 
); - println!("Total rows: {}", format_number(total_rows)); - println!("Total size: {}", format_bytes(total_bytes)); - println!("Batches: {}", batches.len()); - println!( - "Rows/batch: ~{}", - if batches.is_empty() { - 0 - } else { - total_rows / batches.len() - } - ); - println!("Load time: {:.3}s", load_elapsed.as_secs_f64()); - println!(); - - let codec = parse_codec(&args.codec, args.zstd_level); - let hash_col_indices = parse_hash_columns(&args.hash_columns); - + println!("Total rows: {}", format_number(total_rows as usize)); + println!("Batch size: {}", format_number(args.batch_size)); println!("Partitioning: {}", args.partitioning); println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); @@ -224,13 +150,15 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.max_buffered_batches > 0 { + println!("Max buf batches:{}", args.max_buffered_batches); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup ); println!(); - // Run warmup + timed iterations let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); let mut read_times = Vec::with_capacity(args.iterations); @@ -244,9 +172,8 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - // Write phase let write_elapsed = run_shuffle_write( - &batches, + &args.input, &schema, &codec, &hash_col_indices, @@ -264,7 +191,6 @@ fn main() { print!(" [{label}] write: {:.3}s", write_elapsed); print!(" output: {}", format_bytes(data_size as usize)); - // Read phase if args.read_back { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), @@ -279,7 +205,6 @@ fn main() { println!(); } - // Print summary if args.iterations > 0 { println!(); println!("=== Results ==="); @@ -287,12 +212,6 @@ fn main() { let avg_write = write_times.iter().sum::() / write_times.len() as f64; let avg_data_size = data_file_sizes.iter().sum::() / 
data_file_sizes.len() as u64; let write_throughput_rows = total_rows as f64 / avg_write; - let write_throughput_bytes = total_bytes as f64 / avg_write; - let compression_ratio = if avg_data_size > 0 { - total_bytes as f64 / avg_data_size as f64 - } else { - 0.0 - }; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -305,15 +224,13 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } println!( - " throughput: {}/s ({} rows/s)", - format_bytes(write_throughput_bytes as usize), + " throughput: {} rows/s", format_number(write_throughput_rows as usize) ); println!( " output size: {}", format_bytes(avg_data_size as usize) ); - println!(" compression: {:.2}x", compression_ratio); if !read_times.is_empty() { let avg_read = read_times.iter().sum::() / read_times.len() as f64; @@ -333,38 +250,17 @@ fn main() { } } - // Cleanup let _ = fs::remove_file(&data_file); let _ = fs::remove_file(&index_file); } -fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec { - let mut batches = Vec::new(); - let mut total_rows = 0usize; +/// Read schema and total row count from Parquet metadata without loading any data. 
+fn read_parquet_metadata(path: &PathBuf, limit: usize) -> (SchemaRef, u64) { + let paths = collect_parquet_paths(path); + let mut schema = None; + let mut total_rows = 0u64; - let paths = if path.is_dir() { - let mut files: Vec = fs::read_dir(path) - .expect("Failed to read input directory") - .filter_map(|entry| { - let entry = entry.ok()?; - let p = entry.path(); - if p.extension().and_then(|e| e.to_str()) == Some("parquet") { - Some(p) - } else { - None - } - }) - .collect(); - files.sort(); - if files.is_empty() { - panic!("No .parquet files found in {}", path.display()); - } - files - } else { - vec![path.clone()] - }; - - 'outer: for file_path in &paths { + for file_path in &paths { let file = fs::File::open(file_path) .unwrap_or_else(|e| panic!("Failed to open {}: {}", file_path.display(), e)); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap_or_else(|e| { @@ -374,172 +270,44 @@ fn load_parquet(path: &PathBuf, batch_size: usize, limit: usize) -> Vec= limit { - break 'outer; - } + if schema.is_none() { + schema = Some(Arc::clone(builder.schema())); + } + total_rows += builder.metadata().file_metadata().num_rows() as u64; + if limit > 0 && total_rows >= limit as u64 { + total_rows = total_rows.min(limit as u64); + break; } } - if batches.is_empty() { - panic!("No data read from input"); - } - - println!( - "Loaded {} batches ({} rows) from {} file(s)", - batches.len(), - format_number(total_rows), - paths.len() - ); - batches + (schema.expect("No parquet files found"), total_rows) } -fn generate_data(args: &Args) -> Vec { - let mut fields = Vec::new(); - let mut col_idx = 0; - - // Int64 columns - for _ in 0..args.gen_int_cols { - fields.push(Field::new( - format!("int_col_{col_idx}"), - DataType::Int64, - true, - )); - col_idx += 1; - } - // String columns - for _ in 0..args.gen_string_cols { - fields.push(Field::new( - format!("str_col_{col_idx}"), - DataType::Utf8, - true, - )); - col_idx += 1; - } - // Decimal columns - for _ in 
0..args.gen_decimal_cols { - fields.push(Field::new( - format!("dec_col_{col_idx}"), - DataType::Decimal128(18, 2), - true, - )); - col_idx += 1; - } - // Date columns - for _ in 0..args.gen_date_cols { - fields.push(Field::new( - format!("date_col_{col_idx}"), - DataType::Date32, - true, - )); - col_idx += 1; - } - - let schema = Arc::new(Schema::new(fields)); - let mut batches = Vec::new(); - let mut rng = rand::rng(); - let mut remaining = args.gen_rows; - - while remaining > 0 { - let batch_rows = remaining.min(args.batch_size); - remaining -= batch_rows; - - let mut columns: Vec> = Vec::new(); - - // Int64 columns - for _ in 0..args.gen_int_cols { - let mut builder = Int64Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..1_000_000i64)); - } - } - columns.push(Arc::new(builder.finish())); - } - // String columns - for _ in 0..args.gen_string_cols { - let mut builder = - StringBuilder::with_capacity(batch_rows, batch_rows * args.gen_avg_string_len); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - let len = rng.random_range(1..args.gen_avg_string_len * 2); - let s: String = (0..len) - .map(|_| rng.random_range(b'a'..=b'z') as char) - .collect(); - builder.append_value(&s); - } - } - columns.push(Arc::new(builder.finish())); - } - // Decimal columns - for _ in 0..args.gen_decimal_cols { - let mut builder = Decimal128Builder::with_capacity(batch_rows) - .with_precision_and_scale(18, 2) - .unwrap(); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - builder.append_null(); - } else { - builder.append_value(rng.random_range(0..100_000_000i128)); - } - } - columns.push(Arc::new(builder.finish())); - } - // Date columns - for _ in 0..args.gen_date_cols { - let mut builder = Date32Builder::with_capacity(batch_rows); - for _ in 0..batch_rows { - if rng.random_range(0..100) < 5 { - 
builder.append_null(); +fn collect_parquet_paths(path: &PathBuf) -> Vec { + if path.is_dir() { + let mut files: Vec = fs::read_dir(path) + .unwrap_or_else(|e| panic!("Failed to read directory {}: {}", path.display(), e)) + .filter_map(|entry| { + let p = entry.ok()?.path(); + if p.extension().and_then(|e| e.to_str()) == Some("parquet") { + Some(p) } else { - builder.append_value(rng.random_range(0..20000i32)); + None } - } - columns.push(Arc::new(builder.finish())); + }) + .collect(); + files.sort(); + if files.is_empty() { + panic!("No .parquet files found in {}", path.display()); } - - let batch = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap(); - batches.push(batch); + files + } else { + vec![path.clone()] } - - println!( - "Generated {} batches ({} rows)", - batches.len(), - args.gen_rows - ); - batches } fn run_shuffle_write( - batches: &[RecordBatch], + input_path: &PathBuf, schema: &SchemaRef, codec: &CompressionCodec, hash_col_indices: &[usize], @@ -554,40 +322,61 @@ fn run_shuffle_write( schema, ); - let partitions = &[batches.to_vec()]; - let exec = ShuffleWriterExec::try_new( - Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(partitions, Arc::clone(schema), None).unwrap(), - ))), - partitioning, - codec.clone(), - data_file.to_string(), - index_file.to_string(), - false, - args.write_buffer_size, - ) - .expect("Failed to create ShuffleWriterExec"); - - let config = SessionConfig::new().with_batch_size(args.batch_size); - let mut runtime_builder = RuntimeEnvBuilder::new(); - if let Some(mem_limit) = args.memory_limit { - runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); - } - let runtime_env = Arc::new(runtime_builder.build().unwrap()); - let ctx = SessionContext::new_with_config_rt(config, runtime_env); - let task_ctx = ctx.task_ctx(); - - let start = Instant::now(); - let stream = exec.execute(0, task_ctx).unwrap(); let rt = tokio::runtime::Runtime::new().unwrap(); - 
rt.block_on(collect(stream)).unwrap(); - start.elapsed().as_secs_f64() + rt.block_on(async { + let config = SessionConfig::new().with_batch_size(args.batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = args.memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let path_str = input_path.to_str().unwrap(); + let mut df = ctx + .read_parquet(path_str, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if args.limit > 0 { + df = df.limit(0, Some(args.limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + // ShuffleWriterExec reads from a single input partition + let input: Arc = + if parquet_plan.properties().output_partitioning().partition_count() > 1 { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec.clone(), + data_file.to_string(), + index_file.to_string(), + false, + args.write_buffer_size, + args.max_buffered_batches, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let start = Instant::now(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + start.elapsed().as_secs_f64() + }) } fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { let start = Instant::now(); - // Read index file to get partition offsets let index_bytes = fs::read(index_file).expect("Failed to read index file"); let num_offsets = index_bytes.len() / 8; let offsets: Vec = (0..num_offsets) @@ -597,34 +386,27 @@ fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> }) .collect(); - // Read data file let data_bytes = 
fs::read(data_file).expect("Failed to read data file"); let mut total_rows = 0usize; let mut total_batches = 0usize; - // Decode each partition's data for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { let start_offset = offsets[p] as usize; let end_offset = offsets[p + 1] as usize; if start_offset >= end_offset { - continue; // Empty partition + continue; } - // Read all IPC blocks within this partition let mut offset = start_offset; while offset < end_offset { - // First 8 bytes: IPC length let ipc_length = u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; - - // Skip 8-byte length prefix, then 8 bytes of field_count + codec header let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; let batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); total_rows += batch.num_rows(); total_batches += 1; - offset += 8 + ipc_length; } } @@ -686,7 +468,7 @@ fn parse_hash_columns(s: &str) -> Vec { .collect() } -fn describe_schema(schema: &Schema) -> String { +fn describe_schema(schema: &arrow::datatypes::Schema) -> String { let mut counts = std::collections::HashMap::new(); for field in schema.fields() { let type_name = match field.data_type() { diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 42290c5510..4cc09e9679 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -124,6 +124,8 @@ pub(crate) struct MultiPartitionShuffleRepartitioner { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl MultiPartitionShuffleRepartitioner { @@ -140,6 +142,7 @@ impl MultiPartitionShuffleRepartitioner { codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> 
datafusion::common::Result { let num_output_partitions = partitioning.partition_count(); assert_ne!( @@ -189,6 +192,7 @@ impl MultiPartitionShuffleRepartitioner { reservation, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } @@ -397,6 +401,13 @@ impl MultiPartitionShuffleRepartitioner { partition_row_indices: &[u32], partition_starts: &[u32], ) -> datafusion::common::Result<()> { + // Spill before buffering if we've reached the configured batch count limit. + if self.max_buffered_batches > 0 + && self.buffered_batches.len() >= self.max_buffered_batches + { + self.spill()?; + } + let mut mem_growth: usize = input.get_array_memory_size(); let buffered_partition_idx = self.buffered_batches.len() as u32; self.buffered_batches.push(input); diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..95a09610a4 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -68,6 +68,8 @@ pub struct ShuffleWriterExec { tracing_enabled: bool, /// Size of the write buffer in bytes write_buffer_size: usize, + /// Maximum number of batches to buffer before spilling (0 = no limit) + max_buffered_batches: usize, } impl ShuffleWriterExec { @@ -81,6 +83,7 @@ impl ShuffleWriterExec { output_index_file: String, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { let cache = PlanProperties::new( EquivalenceProperties::new(Arc::clone(&input.schema())), @@ -99,6 +102,7 @@ impl ShuffleWriterExec { codec, tracing_enabled, write_buffer_size, + max_buffered_batches, }) } } @@ -163,6 +167,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.output_index_file.clone(), self.tracing_enabled, self.write_buffer_size, + self.max_buffered_batches, )?)), _ => panic!("ShuffleWriterExec wrong number of children"), } @@ -190,6 +195,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.codec.clone(), self.tracing_enabled, self.write_buffer_size, + 
self.max_buffered_batches, ) .map_err(|e| ArrowError::ExternalError(Box::new(e))), ) @@ -210,6 +216,7 @@ async fn external_shuffle( codec: CompressionCodec, tracing_enabled: bool, write_buffer_size: usize, + max_buffered_batches: usize, ) -> Result { with_trace_async("external_shuffle", tracing_enabled, || async { let schema = input.schema(); @@ -238,6 +245,7 @@ async fn external_shuffle( codec, tracing_enabled, write_buffer_size, + max_buffered_batches, )?), }; @@ -362,6 +370,7 @@ mod test { CompressionCodec::Lz4Frame, false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -466,6 +475,7 @@ mod test { "/tmp/index.out".to_string(), false, 1024 * 1024, // write_buffer_size: 1MB default + 0, // max_buffered_batches: no limit ) .unwrap(); @@ -525,6 +535,7 @@ mod test { index_file.clone(), false, 1024 * 1024, + 0, // max_buffered_batches: no limit ) .unwrap(); diff --git a/native/shuffle/src/writers/partition_writer.rs b/native/shuffle/src/writers/partition_writer.rs index 48017871db..4de307de62 100644 --- a/native/shuffle/src/writers/partition_writer.rs +++ b/native/shuffle/src/writers/partition_writer.rs @@ -26,7 +26,6 @@ use std::fs::{File, OpenOptions}; struct SpillFile { temp_file: RefCountedTempFile, - file: File, } pub(crate) struct PartitionWriter { @@ -53,26 +52,28 @@ impl PartitionWriter { runtime: &RuntimeEnv, ) -> datafusion::common::Result<()> { if self.spill_file.is_none() { - // Spill file is not yet created, create it let spill_file = runtime .disk_manager .create_tmp_file("shuffle writer spill")?; - let spill_data = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(spill_file.path()) - .map_err(|e| { - DataFusionError::Execution(format!("Error occurred while spilling {e}")) - })?; + // Create the file (truncating any pre-existing content) + File::create(spill_file.path()).map_err(|e| { + DataFusionError::Execution(format!("Error occurred while spilling {e}")) + })?; 
self.spill_file = Some(SpillFile { temp_file: spill_file, - file: spill_data, }); } Ok(()) } + fn open_spill_file_for_append(&self) -> datafusion::common::Result { + OpenOptions::new() + .write(true) + .append(true) + .open(self.spill_file.as_ref().unwrap().temp_file.path()) + .map_err(|e| DataFusionError::Execution(format!("Error occurred while spilling {e}"))) + } + pub(crate) fn spill( &mut self, iter: &mut PartitionedBatchIterator, @@ -84,10 +85,13 @@ impl PartitionWriter { if let Some(batch) = iter.next() { self.ensure_spill_file_created(runtime)?; + // Open the file for this spill and close it when done, so we don't + // hold open one FD per partition across multiple spill events. + let mut spill_data = self.open_spill_file_for_append()?; let total_bytes_written = { let mut buf_batch_writer = BufBatchWriter::new( &mut self.shuffle_block_writer, - &mut self.spill_file.as_mut().unwrap().file, + &mut spill_data, write_buffer_size, batch_size, ); @@ -104,6 +108,7 @@ impl PartitionWriter { buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; bytes_written }; + // spill_data is dropped here, closing the file descriptor Ok(total_bytes_written) } else { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 3fc222bd19..a80d8b2fa4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -192,6 +192,8 @@ class CometNativeShuffleWriter[K, V]( CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get) shuffleWriterBuilder.setWriteBufferSize( CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().max(Int.MaxValue).toInt) + shuffleWriterBuilder.setMaxBufferedBatches( + CometConf.COMET_SHUFFLE_MAX_BUFFERED_BATCHES.get()) outputPartitioning match { case p 
if isSinglePartitioning(p) => From 2ef57e79d4e9202d4fb1a70f62f5d8b717ca67b4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:05:23 -0600 Subject: [PATCH 06/18] cargo fmt --- native/shuffle/src/bin/shuffle_bench.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 7f919e7674..ec1227a2ee 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -340,12 +340,16 @@ fn run_shuffle_write( .expect("Failed to create physical plan"); // ShuffleWriterExec reads from a single input partition - let input: Arc = - if parquet_plan.properties().output_partitioning().partition_count() > 1 { - Arc::new(CoalescePartitionsExec::new(parquet_plan)) - } else { - parquet_plan - }; + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; let exec = ShuffleWriterExec::try_new( input, From 9136e104e87cae0c8920fcce790064de5716d425 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:08:20 -0600 Subject: [PATCH 07/18] prettier --- native/shuffle/README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 0333ddbe8e..74b8dbe656 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -41,23 +41,23 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ ### Options -| Option | Default | Description | -|---|---|---| -| `--input` | *(required)* | Path to a Parquet file or directory of Parquet files | -| `--partitions` | `200` | Number of output shuffle partitions | -| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | -| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | -| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | -| `--zstd-level` | `1` | Zstd compression level (1–22) | -| `--batch-size` | `8192` | Batch size for reading Parquet data | -| `--memory-limit` | *(none)* | Memory limit in bytes; triggers spilling when exceeded | -| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | -| `--write-buffer-size` | `1048576` | Write buffer size in bytes | -| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | -| `--iterations` | `1` | Number of timed iterations | -| `--warmup` | `0` | Number of warmup iterations before timing | -| `--read-back` | `false` | Also benchmark reading back the shuffle output | -| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | +| Option | Default | Description | +| ------------------------ | -------------------------- | ------------------------------------------------------------ | +| `--input` | _(required)_ | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | +| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--read-back` | `false` | Also benchmark reading back the shuffle output | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | ### Profiling with flamegraph From 7e16819013e744eecd548483fcf414798f071744 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Mar 2026 12:19:44 -0600 Subject: [PATCH 08/18] machete --- native/Cargo.lock | 1 - native/shuffle/Cargo.toml | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 88007af70a..2c117631a4 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1996,7 +1996,6 @@ dependencies = [ "log", "lz4_flex 0.13.0", "parquet", - "rand 0.10.0", "simd-adler32", "snap", "tempfile", diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 14ed5f60a0..a5982c05fa 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -43,9 +43,8 @@ itertools = "0.14.0" jni = "0.21" log = "0.4" lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } -# parquet and rand are only used by the shuffle_bench binary (shuffle-bench feature) +# parquet is only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } -rand = { workspace = true, optional = true } simd-adler32 = "0.3.9" snap = 
"1.1" tokio = { version = "1", features = ["rt-multi-thread"] } @@ -58,7 +57,7 @@ itertools = "0.14.0" tempfile = "3.26.0" [features] -shuffle-bench = ["clap", "parquet", "rand"] +shuffle-bench = ["clap", "parquet"] [lib] name = "datafusion_comet_shuffle" From 22fe80491e111d1f261988a7bd06f73764ba8360 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Mar 2026 16:50:51 -0600 Subject: [PATCH 09/18] feat: add --concurrent-tasks flag to shuffle benchmark Spawns N parallel shuffle tasks to simulate executor parallelism. Each task reads the same input and writes to its own output files. Extracts core shuffle logic into shared async helper to avoid code duplication between single and concurrent paths. --- native/shuffle/src/bin/shuffle_bench.rs | 231 +++++++++++++++++------- 1 file changed, 169 insertions(+), 62 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index ec1227a2ee..bb0952365b 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -115,6 +115,11 @@ struct Args { /// Limit rows processed per iteration (0 = no limit) #[arg(long, default_value_t = 0)] limit: usize, + + /// Number of concurrent shuffle tasks to simulate executor parallelism. + /// Each task reads the same input and writes to its own output files. 
+ #[arg(long, default_value_t = 1)] + concurrent_tasks: usize, } fn main() { @@ -146,6 +151,9 @@ fn main() { if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); } + if args.concurrent_tasks > 1 { + println!("Concurrent: {} tasks", args.concurrent_tasks); + } println!( "Iterations: {} (warmup: {})", args.iterations, args.warmup @@ -165,15 +173,19 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - let write_elapsed = run_shuffle_write( - &args.input, - &schema, - &codec, - &hash_col_indices, - &args, - data_file.to_str().unwrap(), - index_file.to_str().unwrap(), - ); + let write_elapsed = if args.concurrent_tasks > 1 { + run_concurrent_shuffle_writes(&args.input, &schema, &codec, &hash_col_indices, &args) + } else { + run_shuffle_write( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + ) + }; let data_size = fs::metadata(&data_file).map(|m| m.len()).unwrap_or(0); if !is_warmup { @@ -182,9 +194,11 @@ fn main() { } print!(" [{label}] write: {:.3}s", write_elapsed); - print!(" output: {}", format_bytes(data_size as usize)); + if args.concurrent_tasks <= 1 { + print!(" output: {}", format_bytes(data_size as usize)); + } - if args.read_back { + if args.read_back && args.concurrent_tasks <= 1 { let read_elapsed = run_shuffle_read( data_file.to_str().unwrap(), index_file.to_str().unwrap(), @@ -203,8 +217,7 @@ fn main() { println!("=== Results ==="); let avg_write = write_times.iter().sum::() / write_times.len() as f64; - let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; - let write_throughput_rows = total_rows as f64 / avg_write; + let write_throughput_rows = (total_rows as f64 * args.concurrent_tasks as f64) / avg_write; println!("Write:"); println!(" avg time: {:.3}s", avg_write); @@ -217,15 +230,20 @@ fn main() { println!(" min/max: {:.3}s / {:.3}s", min, max); } println!( - " 
throughput: {} rows/s", - format_number(write_throughput_rows as usize) - ); - println!( - " output size: {}", - format_bytes(avg_data_size as usize) + " throughput: {} rows/s (total across {} tasks)", + format_number(write_throughput_rows as usize), + args.concurrent_tasks ); + if args.concurrent_tasks <= 1 { + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + println!( + " output size: {}", + format_bytes(avg_data_size as usize) + ); + } if !read_times.is_empty() { + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; let avg_read = read_times.iter().sum::() / read_times.len() as f64; let read_throughput_bytes = avg_data_size as f64 / avg_read; @@ -317,55 +335,144 @@ fn run_shuffle_write( let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { - let config = SessionConfig::new().with_batch_size(args.batch_size); - let mut runtime_builder = RuntimeEnvBuilder::new(); - if let Some(mem_limit) = args.memory_limit { - runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); - } - let runtime_env = Arc::new(runtime_builder.build().unwrap()); - let ctx = SessionContext::new_with_config_rt(config, runtime_env); - - let path_str = input_path.to_str().unwrap(); - let mut df = ctx - .read_parquet(path_str, ParquetReadOptions::default()) - .await - .expect("Failed to create Parquet scan"); - if args.limit > 0 { - df = df.limit(0, Some(args.limit)).unwrap(); - } - - let parquet_plan = df - .create_physical_plan() - .await - .expect("Failed to create physical plan"); - - // ShuffleWriterExec reads from a single input partition - let input: Arc = if parquet_plan - .properties() - .output_partitioning() - .partition_count() - > 1 - { - Arc::new(CoalescePartitionsExec::new(parquet_plan)) - } else { - parquet_plan - }; - - let exec = ShuffleWriterExec::try_new( - input, - partitioning, + let start = Instant::now(); + execute_shuffle_write( + input_path.to_str().unwrap(), codec.clone(), + 
partitioning, + args.batch_size, + args.memory_limit, + args.write_buffer_size, + args.limit, data_file.to_string(), index_file.to_string(), - false, - args.write_buffer_size, ) - .expect("Failed to create ShuffleWriterExec"); + .await + .unwrap(); + start.elapsed().as_secs_f64() + }) +} + +/// Core async shuffle write logic shared by single and concurrent paths. +async fn execute_shuffle_write( + input_path: &str, + codec: CompressionCodec, + partitioning: CometPartitioning, + batch_size: usize, + memory_limit: Option, + write_buffer_size: usize, + limit: usize, + data_file: String, + index_file: String, +) -> datafusion::common::Result<()> { + let config = SessionConfig::new().with_batch_size(batch_size); + let mut runtime_builder = RuntimeEnvBuilder::new(); + if let Some(mem_limit) = memory_limit { + runtime_builder = runtime_builder.with_memory_limit(mem_limit, 1.0); + } + let runtime_env = Arc::new(runtime_builder.build().unwrap()); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + + let mut df = ctx + .read_parquet(input_path, ParquetReadOptions::default()) + .await + .expect("Failed to create Parquet scan"); + if limit > 0 { + df = df.limit(0, Some(limit)).unwrap(); + } + + let parquet_plan = df + .create_physical_plan() + .await + .expect("Failed to create physical plan"); + + let input: Arc = if parquet_plan + .properties() + .output_partitioning() + .partition_count() + > 1 + { + Arc::new(CoalescePartitionsExec::new(parquet_plan)) + } else { + parquet_plan + }; + + let exec = ShuffleWriterExec::try_new( + input, + partitioning, + codec, + data_file, + index_file, + false, + write_buffer_size, + ) + .expect("Failed to create ShuffleWriterExec"); + + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + collect(stream).await.unwrap(); + Ok(()) +} - let task_ctx = ctx.task_ctx(); +/// Run N concurrent shuffle writes to simulate executor parallelism. +/// Returns wall-clock time for all tasks to complete. 
+fn run_concurrent_shuffle_writes( + input_path: &PathBuf, + schema: &SchemaRef, + codec: &CompressionCodec, + hash_col_indices: &[usize], + args: &Args, +) -> f64 { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { let start = Instant::now(); - let stream = exec.execute(0, task_ctx).unwrap(); - collect(stream).await.unwrap(); + + let mut handles = Vec::with_capacity(args.concurrent_tasks); + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + fs::create_dir_all(&task_dir).expect("Failed to create task output directory"); + let data_file = task_dir.join("data.out").to_str().unwrap().to_string(); + let index_file = task_dir.join("index.out").to_str().unwrap().to_string(); + + let input_str = input_path.to_str().unwrap().to_string(); + let codec = codec.clone(); + let partitioning = build_partitioning( + &args.partitioning, + args.partitions, + hash_col_indices, + schema, + ); + let batch_size = args.batch_size; + let memory_limit = args.memory_limit; + let write_buffer_size = args.write_buffer_size; + let limit = args.limit; + + handles.push(tokio::spawn(async move { + execute_shuffle_write( + &input_str, + codec, + partitioning, + batch_size, + memory_limit, + write_buffer_size, + limit, + data_file, + index_file, + ) + .await + .unwrap(); + })); + } + + for handle in handles { + handle.await.expect("Task panicked"); + } + + for task_id in 0..args.concurrent_tasks { + let task_dir = args.output_dir.join(format!("task_{task_id}")); + let _ = fs::remove_dir_all(&task_dir); + } + start.elapsed().as_secs_f64() }) } From 58ab9270669d4ddf27d7f935bd1cec408280f54d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 30 Mar 2026 08:42:23 -0600 Subject: [PATCH 10/18] show metrics --- native/shuffle/src/bin/shuffle_bench.rs | 104 ++++++++++++++++++++---- 1 file changed, 90 insertions(+), 14 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs 
b/native/shuffle/src/bin/shuffle_bench.rs index bb0952365b..bf476817f5 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -43,6 +43,7 @@ use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_expr::expressions::Column; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::common::collect; +use datafusion::physical_plan::metrics::MetricsSet; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ @@ -50,7 +51,7 @@ use datafusion_comet_shuffle::{ }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Instant; @@ -164,6 +165,7 @@ fn main() { let mut write_times = Vec::with_capacity(args.iterations); let mut read_times = Vec::with_capacity(args.iterations); let mut data_file_sizes = Vec::with_capacity(args.iterations); + let mut last_metrics: Option = None; for i in 0..total_iters { let is_warmup = i < args.warmup; @@ -173,8 +175,15 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - let write_elapsed = if args.concurrent_tasks > 1 { - run_concurrent_shuffle_writes(&args.input, &schema, &codec, &hash_col_indices, &args) + let (write_elapsed, metrics) = if args.concurrent_tasks > 1 { + let elapsed = run_concurrent_shuffle_writes( + &args.input, + &schema, + &codec, + &hash_col_indices, + &args, + ); + (elapsed, None) } else { run_shuffle_write( &args.input, @@ -191,6 +200,7 @@ fn main() { if !is_warmup { write_times.push(write_elapsed); data_file_sizes.push(data_size); + last_metrics = metrics; } print!(" [{label}] write: {:.3}s", write_elapsed); @@ -259,14 +269,79 @@ fn main() { format_bytes(read_throughput_bytes as usize) ); } + + if let Some(ref metrics) = last_metrics { + println!(); + println!("Shuffle 
Metrics (last iteration):"); + print_shuffle_metrics(metrics, avg_write); + } } let _ = fs::remove_file(&data_file); let _ = fs::remove_file(&index_file); } +fn print_shuffle_metrics(metrics: &MetricsSet, total_wall_time_secs: f64) { + let get_metric = |name: &str| -> Option { + metrics + .iter() + .find(|m| m.value().name() == name) + .map(|m| m.value().as_usize()) + }; + + let total_ns = (total_wall_time_secs * 1e9) as u64; + let fmt_time = |nanos: usize| -> String { + let secs = nanos as f64 / 1e9; + let pct = if total_ns > 0 { + (nanos as f64 / total_ns as f64) * 100.0 + } else { + 0.0 + }; + format!("{:.3}s ({:.1}%)", secs, pct) + }; + + if let Some(input_batches) = get_metric("input_batches") { + println!(" input batches: {}", format_number(input_batches)); + } + if let Some(nanos) = get_metric("repart_time") { + println!(" repart time: {}", fmt_time(nanos)); + } + if let Some(nanos) = get_metric("encode_time") { + println!(" encode time: {}", fmt_time(nanos)); + } + if let Some(nanos) = get_metric("write_time") { + println!(" write time: {}", fmt_time(nanos)); + } + if let (Some(repart), Some(encode), Some(write)) = ( + get_metric("repart_time"), + get_metric("encode_time"), + get_metric("write_time"), + ) { + let accounted = repart + encode + write; + if total_ns as usize > accounted { + let other = total_ns as usize - accounted; + println!(" other time: {}", fmt_time(other)); + } + } + if let Some(spill_count) = get_metric("spill_count") { + if spill_count > 0 { + println!(" spill count: {}", format_number(spill_count)); + } + } + if let Some(spilled_bytes) = get_metric("spilled_bytes") { + if spilled_bytes > 0 { + println!(" spilled bytes: {}", format_bytes(spilled_bytes)); + } + } + if let Some(data_size) = get_metric("data_size") { + if data_size > 0 { + println!(" data size: {}", format_bytes(data_size)); + } + } +} + /// Read schema and total row count from Parquet metadata without loading any data. 
-fn read_parquet_metadata(path: &PathBuf, limit: usize) -> (SchemaRef, u64) { +fn read_parquet_metadata(path: &Path, limit: usize) -> (SchemaRef, u64) { let paths = collect_parquet_paths(path); let mut schema = None; let mut total_rows = 0u64; @@ -294,7 +369,7 @@ fn read_parquet_metadata(path: &PathBuf, limit: usize) -> (SchemaRef, u64) { (schema.expect("No parquet files found"), total_rows) } -fn collect_parquet_paths(path: &PathBuf) -> Vec { +fn collect_parquet_paths(path: &Path) -> Vec { if path.is_dir() { let mut files: Vec = fs::read_dir(path) .unwrap_or_else(|e| panic!("Failed to read directory {}: {}", path.display(), e)) @@ -313,19 +388,19 @@ fn collect_parquet_paths(path: &PathBuf) -> Vec { } files } else { - vec![path.clone()] + vec![path.to_path_buf()] } } fn run_shuffle_write( - input_path: &PathBuf, + input_path: &Path, schema: &SchemaRef, codec: &CompressionCodec, hash_col_indices: &[usize], args: &Args, data_file: &str, index_file: &str, -) -> f64 { +) -> (f64, Option) { let partitioning = build_partitioning( &args.partitioning, args.partitions, @@ -336,7 +411,7 @@ fn run_shuffle_write( let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let start = Instant::now(); - execute_shuffle_write( + let metrics = execute_shuffle_write( input_path.to_str().unwrap(), codec.clone(), partitioning, @@ -349,11 +424,12 @@ fn run_shuffle_write( ) .await .unwrap(); - start.elapsed().as_secs_f64() + (start.elapsed().as_secs_f64(), Some(metrics)) }) } /// Core async shuffle write logic shared by single and concurrent paths. 
+#[allow(clippy::too_many_arguments)] async fn execute_shuffle_write( input_path: &str, codec: CompressionCodec, @@ -364,7 +440,7 @@ async fn execute_shuffle_write( limit: usize, data_file: String, index_file: String, -) -> datafusion::common::Result<()> { +) -> datafusion::common::Result { let config = SessionConfig::new().with_batch_size(batch_size); let mut runtime_builder = RuntimeEnvBuilder::new(); if let Some(mem_limit) = memory_limit { @@ -411,13 +487,13 @@ async fn execute_shuffle_write( let task_ctx = ctx.task_ctx(); let stream = exec.execute(0, task_ctx).unwrap(); collect(stream).await.unwrap(); - Ok(()) + Ok(exec.metrics().unwrap_or_default()) } /// Run N concurrent shuffle writes to simulate executor parallelism. /// Returns wall-clock time for all tasks to complete. fn run_concurrent_shuffle_writes( - input_path: &PathBuf, + input_path: &Path, schema: &SchemaRef, codec: &CompressionCodec, hash_col_indices: &[usize], @@ -460,7 +536,7 @@ fn run_concurrent_shuffle_writes( index_file, ) .await - .unwrap(); + .unwrap() })); } From c469077e49fa42b37ba77c987497a1a5e789fb8a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 30 Mar 2026 09:02:01 -0600 Subject: [PATCH 11/18] improve metrics --- native/shuffle/src/bin/shuffle_bench.rs | 101 ++++++++++++++---- native/shuffle/src/metrics.rs | 12 +++ .../src/partitioners/multi_partition.rs | 15 ++- .../partitioned_batch_iterator.rs | 17 ++- .../src/partitioners/single_partition.rs | 10 +- native/shuffle/src/shuffle_writer.rs | 17 ++- .../shuffle/src/writers/buf_batch_writer.rs | 6 ++ native/shuffle/src/writers/spill.rs | 15 ++- 8 files changed, 160 insertions(+), 33 deletions(-) diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index bf476817f5..a412d11eed 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -43,7 +43,7 @@ use datafusion::execution::runtime_env::RuntimeEnvBuilder; use 
datafusion::physical_expr::expressions::Column; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::common::collect; -use datafusion::physical_plan::metrics::MetricsSet; +use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ @@ -166,6 +166,7 @@ fn main() { let mut read_times = Vec::with_capacity(args.iterations); let mut data_file_sizes = Vec::with_capacity(args.iterations); let mut last_metrics: Option = None; + let mut last_input_metrics: Option = None; for i in 0..total_iters { let is_warmup = i < args.warmup; @@ -175,7 +176,7 @@ fn main() { format!("iter {}/{}", i - args.warmup + 1, args.iterations) }; - let (write_elapsed, metrics) = if args.concurrent_tasks > 1 { + let (write_elapsed, metrics, input_metrics) = if args.concurrent_tasks > 1 { let elapsed = run_concurrent_shuffle_writes( &args.input, &schema, @@ -183,7 +184,7 @@ fn main() { &hash_col_indices, &args, ); - (elapsed, None) + (elapsed, None, None) } else { run_shuffle_write( &args.input, @@ -201,6 +202,7 @@ fn main() { write_times.push(write_elapsed); data_file_sizes.push(data_size); last_metrics = metrics; + last_input_metrics = input_metrics; } print!(" [{label}] write: {:.3}s", write_elapsed); @@ -270,6 +272,12 @@ fn main() { ); } + if let Some(ref metrics) = last_input_metrics { + println!(); + println!("Input Metrics (last iteration):"); + print_input_metrics(metrics); + } + if let Some(ref metrics) = last_metrics { println!(); println!("Shuffle Metrics (last iteration):"); @@ -312,17 +320,16 @@ fn print_shuffle_metrics(metrics: &MetricsSet, total_wall_time_secs: f64) { if let Some(nanos) = get_metric("write_time") { println!(" write time: {}", fmt_time(nanos)); } - if let (Some(repart), Some(encode), Some(write)) = ( - get_metric("repart_time"), - get_metric("encode_time"), - 
get_metric("write_time"), - ) { - let accounted = repart + encode + write; - if total_ns as usize > accounted { - let other = total_ns as usize - accounted; - println!(" other time: {}", fmt_time(other)); - } + if let Some(nanos) = get_metric("interleave_time") { + println!(" interleave time: {}", fmt_time(nanos)); } + if let Some(nanos) = get_metric("coalesce_time") { + println!(" coalesce time: {}", fmt_time(nanos)); + } + if let Some(nanos) = get_metric("memcopy_time") { + println!(" memcopy time: {}", fmt_time(nanos)); + } + if let Some(spill_count) = get_metric("spill_count") { if spill_count > 0 { println!(" spill count: {}", format_number(spill_count)); @@ -340,6 +347,37 @@ fn print_shuffle_metrics(metrics: &MetricsSet, total_wall_time_secs: f64) { } } +fn print_input_metrics(metrics: &MetricsSet) { + let aggregated = metrics.aggregate_by_name(); + for m in aggregated.iter() { + let value = m.value(); + let name = value.name(); + let v = value.as_usize(); + if v == 0 { + continue; + } + // Format time metrics as seconds, everything else as a number + // Skip start/end timestamps — not useful in benchmark output + if matches!( + value, + MetricValue::StartTimestamp(_) | MetricValue::EndTimestamp(_) + ) { + continue; + } + let is_time = matches!( + value, + MetricValue::ElapsedCompute(_) | MetricValue::Time { .. } + ); + if is_time { + println!(" {name}: {:.3}s", v as f64 / 1e9); + } else if name.contains("bytes") || name.contains("size") { + println!(" {name}: {}", format_bytes(v)); + } else { + println!(" {name}: {}", format_number(v)); + } + } +} + /// Read schema and total row count from Parquet metadata without loading any data. 
fn read_parquet_metadata(path: &Path, limit: usize) -> (SchemaRef, u64) { let paths = collect_parquet_paths(path); @@ -400,7 +438,7 @@ fn run_shuffle_write( args: &Args, data_file: &str, index_file: &str, -) -> (f64, Option) { +) -> (f64, Option, Option) { let partitioning = build_partitioning( &args.partitioning, args.partitions, @@ -411,7 +449,7 @@ fn run_shuffle_write( let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let start = Instant::now(); - let metrics = execute_shuffle_write( + let (shuffle_metrics, input_metrics) = execute_shuffle_write( input_path.to_str().unwrap(), codec.clone(), partitioning, @@ -424,7 +462,11 @@ fn run_shuffle_write( ) .await .unwrap(); - (start.elapsed().as_secs_f64(), Some(metrics)) + ( + start.elapsed().as_secs_f64(), + Some(shuffle_metrics), + Some(input_metrics), + ) }) } @@ -440,7 +482,7 @@ async fn execute_shuffle_write( limit: usize, data_file: String, index_file: String, -) -> datafusion::common::Result { +) -> datafusion::common::Result<(MetricsSet, MetricsSet)> { let config = SessionConfig::new().with_batch_size(batch_size); let mut runtime_builder = RuntimeEnvBuilder::new(); if let Some(mem_limit) = memory_limit { @@ -487,7 +529,30 @@ async fn execute_shuffle_write( let task_ctx = ctx.task_ctx(); let stream = exec.execute(0, task_ctx).unwrap(); collect(stream).await.unwrap(); - Ok(exec.metrics().unwrap_or_default()) + + // Collect metrics from the input plan (Parquet scan + optional coalesce) + let input_metrics = collect_input_metrics(&exec); + + Ok((exec.metrics().unwrap_or_default(), input_metrics)) +} + +/// Walk the plan tree and aggregate all metrics from input plans (everything below shuffle writer). 
+fn collect_input_metrics(exec: &ShuffleWriterExec) -> MetricsSet { + let mut all_metrics = MetricsSet::new(); + fn gather(plan: &dyn ExecutionPlan, out: &mut MetricsSet) { + if let Some(metrics) = plan.metrics() { + for m in metrics.iter() { + out.push(Arc::clone(m)); + } + } + for child in plan.children() { + gather(child.as_ref(), out); + } + } + for child in exec.children() { + gather(child.as_ref(), &mut all_metrics); + } + all_metrics } /// Run N concurrent shuffle writes to simulate executor parallelism. diff --git a/native/shuffle/src/metrics.rs b/native/shuffle/src/metrics.rs index 1de751cf41..8755e2c65f 100644 --- a/native/shuffle/src/metrics.rs +++ b/native/shuffle/src/metrics.rs @@ -33,6 +33,15 @@ pub(crate) struct ShufflePartitionerMetrics { /// Time spent writing to disk. Maps to "shuffleWriteTime" in Spark SQL Metrics. pub(crate) write_time: Time, + /// Time spent in interleave_record_batch gathering rows into shuffled batches + pub(crate) interleave_time: Time, + + /// Time spent coalescing small batches before serialization + pub(crate) coalesce_time: Time, + + /// Time spent buffering partition indices and memory accounting + pub(crate) memcopy_time: Time, + /// Number of input batches pub(crate) input_batches: Count, @@ -53,6 +62,9 @@ impl ShufflePartitionerMetrics { repart_time: MetricBuilder::new(metrics).subset_time("repart_time", partition), encode_time: MetricBuilder::new(metrics).subset_time("encode_time", partition), write_time: MetricBuilder::new(metrics).subset_time("write_time", partition), + interleave_time: MetricBuilder::new(metrics).subset_time("interleave_time", partition), + coalesce_time: MetricBuilder::new(metrics).subset_time("coalesce_time", partition), + memcopy_time: MetricBuilder::new(metrics).subset_time("memcopy_time", partition), input_batches: MetricBuilder::new(metrics).counter("input_batches", partition), spill_count: MetricBuilder::new(metrics).spill_count(partition), spilled_bytes: 
MetricBuilder::new(metrics).spilled_bytes(partition), diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 655bee3511..819c569064 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -398,6 +398,7 @@ impl MultiPartitionShuffleRepartitioner { partition_row_indices: &[u32], partition_starts: &[u32], ) -> datafusion::common::Result<()> { + let start_time = Instant::now(); let mut mem_growth: usize = input.get_array_memory_size(); let buffered_partition_idx = self.buffered_batches.len() as u32; self.buffered_batches.push(input); @@ -426,6 +427,7 @@ impl MultiPartitionShuffleRepartitioner { let after_size = indices.allocated_size(); mem_growth += after_size.saturating_sub(before_size); } + self.metrics.memcopy_time.add_duration(start_time.elapsed()); if self.reservation.try_grow(mem_growth).is_err() { self.spill()?; @@ -434,12 +436,14 @@ impl MultiPartitionShuffleRepartitioner { Ok(()) } + #[allow(clippy::too_many_arguments)] fn shuffle_write_partition( partition_iter: &mut PartitionedBatchIterator, shuffle_block_writer: &mut ShuffleBlockWriter, output_data: &mut BufWriter, encode_time: &Time, write_time: &Time, + coalesce_time: &Time, write_buffer_size: usize, batch_size: usize, ) -> datafusion::common::Result<()> { @@ -451,9 +455,9 @@ impl MultiPartitionShuffleRepartitioner { ); for batch in partition_iter { let batch = batch?; - buf_batch_writer.write(&batch, encode_time, write_time)?; + buf_batch_writer.write(&batch, encode_time, write_time, coalesce_time)?; } - buf_batch_writer.flush(encode_time, write_time)?; + buf_batch_writer.flush(encode_time, write_time, coalesce_time)?; Ok(()) } @@ -506,7 +510,8 @@ impl MultiPartitionShuffleRepartitioner { for partition_id in 0..num_output_partitions { let partition_writer = &mut self.partition_writers[partition_id]; - let mut iter = partitioned_batches.produce(partition_id); + let mut 
iter = + partitioned_batches.produce(partition_id, &self.metrics.interleave_time); spilled_bytes += partition_writer.spill( &mut iter, &self.runtime, @@ -589,13 +594,15 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { } // Write in memory batches to output data file - let mut partition_iter = partitioned_batches.produce(i); + let mut partition_iter = + partitioned_batches.produce(i, &self.metrics.interleave_time); Self::shuffle_write_partition( &mut partition_iter, &mut self.shuffle_block_writer, &mut output_data, &self.metrics.encode_time, &self.metrics.write_time, + &self.metrics.coalesce_time, self.write_buffer_size, self.batch_size, )?; diff --git a/native/shuffle/src/partitioners/partitioned_batch_iterator.rs b/native/shuffle/src/partitioners/partitioned_batch_iterator.rs index 8309a8ed4a..f124d98ff2 100644 --- a/native/shuffle/src/partitioners/partitioned_batch_iterator.rs +++ b/native/shuffle/src/partitioners/partitioned_batch_iterator.rs @@ -18,6 +18,7 @@ use arrow::array::RecordBatch; use arrow::compute::interleave_record_batch; use datafusion::common::DataFusionError; +use datafusion::physical_plan::metrics::Time; /// A helper struct to produce shuffled batches. 
/// This struct takes ownership of the buffered batches and partition indices from the @@ -41,11 +42,16 @@ impl PartitionedBatchesProducer { } } - pub(super) fn produce(&mut self, partition_id: usize) -> PartitionedBatchIterator<'_> { + pub(super) fn produce<'a>( + &'a mut self, + partition_id: usize, + interleave_time: &'a Time, + ) -> PartitionedBatchIterator<'a> { PartitionedBatchIterator::new( &self.partition_indices[partition_id], &self.buffered_batches, self.batch_size, + interleave_time, ) } } @@ -56,6 +62,7 @@ pub(crate) struct PartitionedBatchIterator<'a> { batch_size: usize, indices: Vec<(usize, usize)>, pos: usize, + interleave_time: &'a Time, } impl<'a> PartitionedBatchIterator<'a> { @@ -63,6 +70,7 @@ impl<'a> PartitionedBatchIterator<'a> { indices: &'a [(u32, u32)], buffered_batches: &'a [RecordBatch], batch_size: usize, + interleave_time: &'a Time, ) -> Self { if indices.is_empty() { // Avoid unnecessary allocations when the partition is empty @@ -71,6 +79,7 @@ impl<'a> PartitionedBatchIterator<'a> { batch_size, indices: vec![], pos: 0, + interleave_time, }; } let record_batches = buffered_batches.iter().collect::>(); @@ -83,6 +92,7 @@ impl<'a> PartitionedBatchIterator<'a> { batch_size, indices: current_indices, pos: 0, + interleave_time, } } } @@ -97,7 +107,10 @@ impl Iterator for PartitionedBatchIterator<'_> { let indices_end = std::cmp::min(self.pos + self.batch_size, self.indices.len()); let indices = &self.indices[self.pos..indices_end]; - match interleave_record_batch(&self.record_batches, indices) { + let mut timer = self.interleave_time.timer(); + let result = interleave_record_batch(&self.record_batches, indices); + timer.stop(); + match result { Ok(batch) => { self.pos = indices_end; Some(Ok(batch)) diff --git a/native/shuffle/src/partitioners/single_partition.rs b/native/shuffle/src/partitioners/single_partition.rs index 5801ef613b..d866aae330 100644 --- a/native/shuffle/src/partitioners/single_partition.rs +++ 
b/native/shuffle/src/partitioners/single_partition.rs @@ -127,6 +127,7 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, + &self.metrics.coalesce_time, )?; } @@ -136,6 +137,7 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, + &self.metrics.coalesce_time, )?; } else { // Add the new batch to the buffer @@ -164,10 +166,14 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, + &self.metrics.coalesce_time, )?; } - self.output_data_writer - .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + self.output_data_writer.flush( + &self.metrics.encode_time, + &self.metrics.write_time, + &self.metrics.coalesce_time, + )?; // Write index file. It should only contain 2 entries: 0 and the total number of bytes written let index_file = OpenOptions::new() diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..03c9d3ff23 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -618,6 +618,7 @@ mod test { let codec = CompressionCodec::Lz4Frame; let encode_time = Time::default(); let write_time = Time::default(); + let coalesce_time = Time::default(); // Write with coalescing (batch_size=8192) let mut coalesced_output = Vec::new(); @@ -630,9 +631,13 @@ mod test { 8192, ); for batch in &small_batches { - buf_writer.write(batch, &encode_time, &write_time).unwrap(); + buf_writer + .write(batch, &encode_time, &write_time, &coalesce_time) + .unwrap(); } - buf_writer.flush(&encode_time, &write_time).unwrap(); + buf_writer + .flush(&encode_time, &write_time, &coalesce_time) + .unwrap(); } // Write without coalescing (batch_size=1) @@ -646,9 +651,13 @@ mod test { 1, ); for batch in &small_batches { - buf_writer.write(batch, &encode_time, &write_time).unwrap(); + 
buf_writer + .write(batch, &encode_time, &write_time, &coalesce_time) + .unwrap(); } - buf_writer.flush(&encode_time, &write_time).unwrap(); + buf_writer + .flush(&encode_time, &write_time, &coalesce_time) + .unwrap(); } // Coalesced output should be smaller due to fewer IPC schema blocks diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index cfddb46539..f6bb15137e 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -63,7 +63,9 @@ impl, W: Write> BufBatchWriter { batch: &RecordBatch, encode_time: &Time, write_time: &Time, + coalesce_time: &Time, ) -> datafusion::common::Result { + let mut coalesce_timer = coalesce_time.timer(); let coalescer = self .coalescer .get_or_insert_with(|| BatchCoalescer::new(batch.schema(), self.batch_size)); @@ -75,6 +77,7 @@ impl, W: Write> BufBatchWriter { while let Some(batch) = coalescer.next_completed_batch() { completed.push(batch); } + coalesce_timer.stop(); let mut bytes_written = 0; for batch in &completed { @@ -110,8 +113,10 @@ impl, W: Write> BufBatchWriter { &mut self, encode_time: &Time, write_time: &Time, + coalesce_time: &Time, ) -> datafusion::common::Result<()> { // Finish any remaining buffered rows in the coalescer + let mut coalesce_timer = coalesce_time.timer(); let mut remaining = Vec::new(); if let Some(coalescer) = &mut self.coalescer { coalescer.finish_buffered_batch()?; @@ -119,6 +124,7 @@ impl, W: Write> BufBatchWriter { remaining.push(batch); } } + coalesce_timer.stop(); for batch in &remaining { self.write_batch_to_buffer(batch, encode_time, write_time)?; } diff --git a/native/shuffle/src/writers/spill.rs b/native/shuffle/src/writers/spill.rs index c16caddbf9..05492791d1 100644 --- a/native/shuffle/src/writers/spill.rs +++ b/native/shuffle/src/writers/spill.rs @@ -93,17 +93,26 @@ impl PartitionWriter { write_buffer_size, batch_size, ); - let mut bytes_written = - 
buf_batch_writer.write(&batch?, &metrics.encode_time, &metrics.write_time)?; + let mut bytes_written = buf_batch_writer.write( + &batch?, + &metrics.encode_time, + &metrics.write_time, + &metrics.coalesce_time, + )?; for batch in iter { let batch = batch?; bytes_written += buf_batch_writer.write( &batch, &metrics.encode_time, &metrics.write_time, + &metrics.coalesce_time, )?; } - buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; + buf_batch_writer.flush( + &metrics.encode_time, + &metrics.write_time, + &metrics.coalesce_time, + )?; bytes_written }; From 8cccb45de4001ff3441611fad870bc1608df27ff Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 1 Apr 2026 13:49:18 -0600 Subject: [PATCH 12/18] refactor: address PR feedback, remove metrics and read-back Revert new shuffle metrics (interleave_time, coalesce_time, memcopy_time) to keep PR focused on the benchmark tool. Remove read-back functionality from shuffle_bench to focus on write performance. Remove undocumented --max-buffered-batches option from README. --- native/shuffle/README.md | 4 +- native/shuffle/src/bin/shuffle_bench.rs | 101 +----------------- native/shuffle/src/metrics.rs | 12 --- .../src/partitioners/multi_partition.rs | 21 ++-- .../partitioned_batch_iterator.rs | 17 +-- .../src/partitioners/single_partition.rs | 10 +- native/shuffle/src/shuffle_writer.rs | 17 +-- .../shuffle/src/writers/buf_batch_writer.rs | 6 -- native/shuffle/src/writers/spill.rs | 15 +-- 9 files changed, 25 insertions(+), 178 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 74b8dbe656..7484cc1093 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -26,7 +26,7 @@ of the [Apache DataFusion Comet] subproject. ## Shuffle Benchmark Tool -A standalone benchmark binary (`shuffle_bench`) is included for profiling shuffle write and read +A standalone benchmark binary (`shuffle_bench`) is included for profiling shuffle write performance outside of Spark. 
It streams input data directly from Parquet files. ### Basic usage @@ -51,12 +51,10 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ | `--zstd-level` | `1` | Zstd compression level (1–22) | | `--batch-size` | `8192` | Batch size for reading Parquet data | | `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | -| `--max-buffered-batches` | `0` | Max batches to buffer before spilling (0 = memory-pool-only) | | `--write-buffer-size` | `1048576` | Write buffer size in bytes | | `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | | `--iterations` | `1` | Number of timed iterations | | `--warmup` | `0` | Number of warmup iterations before timing | -| `--read-back` | `false` | Also benchmark reading back the shuffle output | | `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | ### Profiling with flamegraph diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index a412d11eed..112fa26bbc 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Standalone shuffle benchmark tool for profiling Comet shuffle write and read -//! outside of Spark. Streams input directly from Parquet files. +//! Standalone shuffle benchmark tool for profiling Comet shuffle write +//! performance outside of Spark. Streams input directly from Parquet files. //! //! # Usage //! @@ -25,8 +25,7 @@ //! --input /data/tpch-sf100/lineitem/ \ //! --partitions 200 \ //! --codec zstd --zstd-level 1 \ -//! --hash-columns 0,3 \ -//! --read-back +//! --hash-columns 0,3 //! ``` //! //! 
Profile with flamegraph: @@ -47,7 +46,7 @@ use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_comet_shuffle::{ - read_ipc_compressed, CometPartitioning, CompressionCodec, ShuffleWriterExec, + CometPartitioning, CompressionCodec, ShuffleWriterExec, }; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs; @@ -93,10 +92,6 @@ struct Args { #[arg(long)] memory_limit: Option, - /// Also benchmark reading back the shuffle output - #[arg(long, default_value_t = false)] - read_back: bool, - /// Number of iterations to run #[arg(long, default_value_t = 1)] iterations: usize, @@ -163,8 +158,7 @@ fn main() { let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); - let mut read_times = Vec::with_capacity(args.iterations); - let mut data_file_sizes = Vec::with_capacity(args.iterations); +let mut data_file_sizes = Vec::with_capacity(args.iterations); let mut last_metrics: Option = None; let mut last_input_metrics: Option = None; @@ -210,17 +204,6 @@ fn main() { print!(" output: {}", format_bytes(data_size as usize)); } - if args.read_back && args.concurrent_tasks <= 1 { - let read_elapsed = run_shuffle_read( - data_file.to_str().unwrap(), - index_file.to_str().unwrap(), - args.partitions, - ); - if !is_warmup { - read_times.push(read_elapsed); - } - print!(" read: {:.3}s", read_elapsed); - } println!(); } @@ -254,24 +237,6 @@ fn main() { ); } - if !read_times.is_empty() { - let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; - let avg_read = read_times.iter().sum::() / read_times.len() as f64; - let read_throughput_bytes = avg_data_size as f64 / avg_read; - - println!("Read:"); - println!(" avg time: {:.3}s", avg_read); - if read_times.len() > 1 { - let min = read_times.iter().cloned().fold(f64::INFINITY, f64::min); - let max = 
read_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); - println!(" min/max: {:.3}s / {:.3}s", min, max); - } - println!( - " throughput: {}/s (from compressed)", - format_bytes(read_throughput_bytes as usize) - ); - } - if let Some(ref metrics) = last_input_metrics { println!(); println!("Input Metrics (last iteration):"); @@ -320,15 +285,6 @@ fn print_shuffle_metrics(metrics: &MetricsSet, total_wall_time_secs: f64) { if let Some(nanos) = get_metric("write_time") { println!(" write time: {}", fmt_time(nanos)); } - if let Some(nanos) = get_metric("interleave_time") { - println!(" interleave time: {}", fmt_time(nanos)); - } - if let Some(nanos) = get_metric("coalesce_time") { - println!(" coalesce time: {}", fmt_time(nanos)); - } - if let Some(nanos) = get_metric("memcopy_time") { - println!(" memcopy time: {}", fmt_time(nanos)); - } if let Some(spill_count) = get_metric("spill_count") { if spill_count > 0 { @@ -618,53 +574,6 @@ fn run_concurrent_shuffle_writes( }) } -fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { - let start = Instant::now(); - - let index_bytes = fs::read(index_file).expect("Failed to read index file"); - let num_offsets = index_bytes.len() / 8; - let offsets: Vec = (0..num_offsets) - .map(|i| { - let bytes: [u8; 8] = index_bytes[i * 8..(i + 1) * 8].try_into().unwrap(); - i64::from_le_bytes(bytes) - }) - .collect(); - - let data_bytes = fs::read(data_file).expect("Failed to read data file"); - - let mut total_rows = 0usize; - let mut total_batches = 0usize; - - for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { - let start_offset = offsets[p] as usize; - let end_offset = offsets[p + 1] as usize; - - if start_offset >= end_offset { - continue; - } - - let mut offset = start_offset; - while offset < end_offset { - let ipc_length = - u64::from_le_bytes(data_bytes[offset..offset + 8].try_into().unwrap()) as usize; - let block_data = &data_bytes[offset + 16..offset + 8 + ipc_length]; - let 
batch = read_ipc_compressed(block_data).expect("Failed to decode shuffle block"); - total_rows += batch.num_rows(); - total_batches += 1; - offset += 8 + ipc_length; - } - } - - let elapsed = start.elapsed().as_secs_f64(); - eprintln!( - " read back {} rows in {} batches from {} partitions", - format_number(total_rows), - total_batches, - num_partitions - ); - elapsed -} - fn build_partitioning( scheme: &str, num_partitions: usize, diff --git a/native/shuffle/src/metrics.rs b/native/shuffle/src/metrics.rs index 8755e2c65f..1de751cf41 100644 --- a/native/shuffle/src/metrics.rs +++ b/native/shuffle/src/metrics.rs @@ -33,15 +33,6 @@ pub(crate) struct ShufflePartitionerMetrics { /// Time spent writing to disk. Maps to "shuffleWriteTime" in Spark SQL Metrics. pub(crate) write_time: Time, - /// Time spent in interleave_record_batch gathering rows into shuffled batches - pub(crate) interleave_time: Time, - - /// Time spent coalescing small batches before serialization - pub(crate) coalesce_time: Time, - - /// Time spent buffering partition indices and memory accounting - pub(crate) memcopy_time: Time, - /// Number of input batches pub(crate) input_batches: Count, @@ -62,9 +53,6 @@ impl ShufflePartitionerMetrics { repart_time: MetricBuilder::new(metrics).subset_time("repart_time", partition), encode_time: MetricBuilder::new(metrics).subset_time("encode_time", partition), write_time: MetricBuilder::new(metrics).subset_time("write_time", partition), - interleave_time: MetricBuilder::new(metrics).subset_time("interleave_time", partition), - coalesce_time: MetricBuilder::new(metrics).subset_time("coalesce_time", partition), - memcopy_time: MetricBuilder::new(metrics).subset_time("memcopy_time", partition), input_batches: MetricBuilder::new(metrics).counter("input_batches", partition), spill_count: MetricBuilder::new(metrics).spill_count(partition), spilled_bytes: MetricBuilder::new(metrics).spilled_bytes(partition), diff --git 
a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 819c569064..7de9314f54 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -35,7 +35,7 @@ use itertools::Itertools; use std::fmt; use std::fmt::{Debug, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Seek, Write}; +use std::io::{BufWriter, Seek, Write}; use std::sync::Arc; use tokio::time::Instant; @@ -398,7 +398,6 @@ impl MultiPartitionShuffleRepartitioner { partition_row_indices: &[u32], partition_starts: &[u32], ) -> datafusion::common::Result<()> { - let start_time = Instant::now(); let mut mem_growth: usize = input.get_array_memory_size(); let buffered_partition_idx = self.buffered_batches.len() as u32; self.buffered_batches.push(input); @@ -427,7 +426,6 @@ impl MultiPartitionShuffleRepartitioner { let after_size = indices.allocated_size(); mem_growth += after_size.saturating_sub(before_size); } - self.metrics.memcopy_time.add_duration(start_time.elapsed()); if self.reservation.try_grow(mem_growth).is_err() { self.spill()?; @@ -436,14 +434,12 @@ impl MultiPartitionShuffleRepartitioner { Ok(()) } - #[allow(clippy::too_many_arguments)] fn shuffle_write_partition( partition_iter: &mut PartitionedBatchIterator, shuffle_block_writer: &mut ShuffleBlockWriter, output_data: &mut BufWriter, encode_time: &Time, write_time: &Time, - coalesce_time: &Time, write_buffer_size: usize, batch_size: usize, ) -> datafusion::common::Result<()> { @@ -455,9 +451,9 @@ impl MultiPartitionShuffleRepartitioner { ); for batch in partition_iter { let batch = batch?; - buf_batch_writer.write(&batch, encode_time, write_time, coalesce_time)?; + buf_batch_writer.write(&batch, encode_time, write_time)?; } - buf_batch_writer.flush(encode_time, write_time, coalesce_time)?; + buf_batch_writer.flush(encode_time, write_time)?; Ok(()) } @@ -510,8 +506,7 @@ impl 
MultiPartitionShuffleRepartitioner { for partition_id in 0..num_output_partitions { let partition_writer = &mut self.partition_writers[partition_id]; - let mut iter = - partitioned_batches.produce(partition_id, &self.metrics.interleave_time); + let mut iter = partitioned_batches.produce(partition_id); spilled_bytes += partition_writer.spill( &mut iter, &self.runtime, @@ -587,22 +582,22 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { // if we wrote a spill file for this partition then copy the // contents into the shuffle file if let Some(spill_path) = self.partition_writers[i].path() { - let mut spill_file = BufReader::new(File::open(spill_path)?); + // Use raw File handle (not BufReader) so that std::io::copy + // can use copy_file_range/sendfile for zero-copy on Linux. + let mut spill_file = File::open(spill_path)?; let mut write_timer = self.metrics.write_time.timer(); std::io::copy(&mut spill_file, &mut output_data)?; write_timer.stop(); } // Write in memory batches to output data file - let mut partition_iter = - partitioned_batches.produce(i, &self.metrics.interleave_time); + let mut partition_iter = partitioned_batches.produce(i); Self::shuffle_write_partition( &mut partition_iter, &mut self.shuffle_block_writer, &mut output_data, &self.metrics.encode_time, &self.metrics.write_time, - &self.metrics.coalesce_time, self.write_buffer_size, self.batch_size, )?; diff --git a/native/shuffle/src/partitioners/partitioned_batch_iterator.rs b/native/shuffle/src/partitioners/partitioned_batch_iterator.rs index f124d98ff2..8309a8ed4a 100644 --- a/native/shuffle/src/partitioners/partitioned_batch_iterator.rs +++ b/native/shuffle/src/partitioners/partitioned_batch_iterator.rs @@ -18,7 +18,6 @@ use arrow::array::RecordBatch; use arrow::compute::interleave_record_batch; use datafusion::common::DataFusionError; -use datafusion::physical_plan::metrics::Time; /// A helper struct to produce shuffled batches. 
/// This struct takes ownership of the buffered batches and partition indices from the @@ -42,16 +41,11 @@ impl PartitionedBatchesProducer { } } - pub(super) fn produce<'a>( - &'a mut self, - partition_id: usize, - interleave_time: &'a Time, - ) -> PartitionedBatchIterator<'a> { + pub(super) fn produce(&mut self, partition_id: usize) -> PartitionedBatchIterator<'_> { PartitionedBatchIterator::new( &self.partition_indices[partition_id], &self.buffered_batches, self.batch_size, - interleave_time, ) } } @@ -62,7 +56,6 @@ pub(crate) struct PartitionedBatchIterator<'a> { batch_size: usize, indices: Vec<(usize, usize)>, pos: usize, - interleave_time: &'a Time, } impl<'a> PartitionedBatchIterator<'a> { @@ -70,7 +63,6 @@ impl<'a> PartitionedBatchIterator<'a> { indices: &'a [(u32, u32)], buffered_batches: &'a [RecordBatch], batch_size: usize, - interleave_time: &'a Time, ) -> Self { if indices.is_empty() { // Avoid unnecessary allocations when the partition is empty @@ -79,7 +71,6 @@ impl<'a> PartitionedBatchIterator<'a> { batch_size, indices: vec![], pos: 0, - interleave_time, }; } let record_batches = buffered_batches.iter().collect::>(); @@ -92,7 +83,6 @@ impl<'a> PartitionedBatchIterator<'a> { batch_size, indices: current_indices, pos: 0, - interleave_time, } } } @@ -107,10 +97,7 @@ impl Iterator for PartitionedBatchIterator<'_> { let indices_end = std::cmp::min(self.pos + self.batch_size, self.indices.len()); let indices = &self.indices[self.pos..indices_end]; - let mut timer = self.interleave_time.timer(); - let result = interleave_record_batch(&self.record_batches, indices); - timer.stop(); - match result { + match interleave_record_batch(&self.record_batches, indices) { Ok(batch) => { self.pos = indices_end; Some(Ok(batch)) diff --git a/native/shuffle/src/partitioners/single_partition.rs b/native/shuffle/src/partitioners/single_partition.rs index d866aae330..5801ef613b 100644 --- a/native/shuffle/src/partitioners/single_partition.rs +++ 
b/native/shuffle/src/partitioners/single_partition.rs @@ -127,7 +127,6 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, - &self.metrics.coalesce_time, )?; } @@ -137,7 +136,6 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, - &self.metrics.coalesce_time, )?; } else { // Add the new batch to the buffer @@ -166,14 +164,10 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { &batch, &self.metrics.encode_time, &self.metrics.write_time, - &self.metrics.coalesce_time, )?; } - self.output_data_writer.flush( - &self.metrics.encode_time, - &self.metrics.write_time, - &self.metrics.coalesce_time, - )?; + self.output_data_writer + .flush(&self.metrics.encode_time, &self.metrics.write_time)?; // Write index file. It should only contain 2 entries: 0 and the total number of bytes written let index_file = OpenOptions::new() diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 03c9d3ff23..e649aaac69 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -618,7 +618,6 @@ mod test { let codec = CompressionCodec::Lz4Frame; let encode_time = Time::default(); let write_time = Time::default(); - let coalesce_time = Time::default(); // Write with coalescing (batch_size=8192) let mut coalesced_output = Vec::new(); @@ -631,13 +630,9 @@ mod test { 8192, ); for batch in &small_batches { - buf_writer - .write(batch, &encode_time, &write_time, &coalesce_time) - .unwrap(); + buf_writer.write(batch, &encode_time, &write_time).unwrap(); } - buf_writer - .flush(&encode_time, &write_time, &coalesce_time) - .unwrap(); + buf_writer.flush(&encode_time, &write_time).unwrap(); } // Write without coalescing (batch_size=1) @@ -651,13 +646,9 @@ mod test { 1, ); for batch in &small_batches { - buf_writer - .write(batch, &encode_time, &write_time, 
&coalesce_time) - .unwrap(); + buf_writer.write(batch, &encode_time, &write_time).unwrap(); } - buf_writer - .flush(&encode_time, &write_time, &coalesce_time) - .unwrap(); + buf_writer.flush(&encode_time, &write_time).unwrap(); } // Coalesced output should be smaller due to fewer IPC schema blocks diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index f6bb15137e..cfddb46539 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -63,9 +63,7 @@ impl, W: Write> BufBatchWriter { batch: &RecordBatch, encode_time: &Time, write_time: &Time, - coalesce_time: &Time, ) -> datafusion::common::Result { - let mut coalesce_timer = coalesce_time.timer(); let coalescer = self .coalescer .get_or_insert_with(|| BatchCoalescer::new(batch.schema(), self.batch_size)); @@ -77,7 +75,6 @@ impl, W: Write> BufBatchWriter { while let Some(batch) = coalescer.next_completed_batch() { completed.push(batch); } - coalesce_timer.stop(); let mut bytes_written = 0; for batch in &completed { @@ -113,10 +110,8 @@ impl, W: Write> BufBatchWriter { &mut self, encode_time: &Time, write_time: &Time, - coalesce_time: &Time, ) -> datafusion::common::Result<()> { // Finish any remaining buffered rows in the coalescer - let mut coalesce_timer = coalesce_time.timer(); let mut remaining = Vec::new(); if let Some(coalescer) = &mut self.coalescer { coalescer.finish_buffered_batch()?; @@ -124,7 +119,6 @@ impl, W: Write> BufBatchWriter { remaining.push(batch); } } - coalesce_timer.stop(); for batch in &remaining { self.write_batch_to_buffer(batch, encode_time, write_time)?; } diff --git a/native/shuffle/src/writers/spill.rs b/native/shuffle/src/writers/spill.rs index 05492791d1..c16caddbf9 100644 --- a/native/shuffle/src/writers/spill.rs +++ b/native/shuffle/src/writers/spill.rs @@ -93,26 +93,17 @@ impl PartitionWriter { write_buffer_size, batch_size, ); - let mut bytes_written = 
buf_batch_writer.write( - &batch?, - &metrics.encode_time, - &metrics.write_time, - &metrics.coalesce_time, - )?; + let mut bytes_written = + buf_batch_writer.write(&batch?, &metrics.encode_time, &metrics.write_time)?; for batch in iter { let batch = batch?; bytes_written += buf_batch_writer.write( &batch, &metrics.encode_time, &metrics.write_time, - &metrics.coalesce_time, )?; } - buf_batch_writer.flush( - &metrics.encode_time, - &metrics.write_time, - &metrics.coalesce_time, - )?; + buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; bytes_written }; From a56c201cb42f8953c97aeac876b3e08e556834f9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 1 Apr 2026 13:51:20 -0600 Subject: [PATCH 13/18] style: apply cargo fmt and prettier --- native/shuffle/README.md | 30 ++++++++++++------------- native/shuffle/src/bin/shuffle_bench.rs | 6 ++--- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 7484cc1093..4ba5daebda 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -41,21 +41,21 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ ### Options -| Option | Default | Description | -| ------------------------ | -------------------------- | ------------------------------------------------------------ | -| `--input` | _(required)_ | Path to a Parquet file or directory of Parquet files | -| `--partitions` | `200` | Number of output shuffle partitions | -| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | -| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | -| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | -| `--zstd-level` | `1` | Zstd compression level (1–22) | -| `--batch-size` | `8192` | Batch size for reading Parquet data | -| `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | -| `--write-buffer-size` | `1048576` | Write buffer size in bytes | -| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | -| `--iterations` | `1` | Number of timed iterations | -| `--warmup` | `0` | Number of warmup iterations before timing | -| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | +| Option | Default | Description | +| --------------------- | -------------------------- | ------------------------------------------------------ | +| `--input` | _(required)_ | Path to a Parquet file or directory of Parquet files | +| `--partitions` | `200` | Number of output shuffle partitions | +| `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | +| `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. 
`0,3`) | +| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--zstd-level` | `1` | Zstd compression level (1–22) | +| `--batch-size` | `8192` | Batch size for reading Parquet data | +| `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | +| `--write-buffer-size` | `1048576` | Write buffer size in bytes | +| `--limit` | `0` | Limit rows processed per iteration (0 = no limit) | +| `--iterations` | `1` | Number of timed iterations | +| `--warmup` | `0` | Number of warmup iterations before timing | +| `--output-dir` | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files | ### Profiling with flamegraph diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index 112fa26bbc..cb303243c0 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -45,9 +45,7 @@ use datafusion::physical_plan::common::collect; use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{ParquetReadOptions, SessionContext}; -use datafusion_comet_shuffle::{ - CometPartitioning, CompressionCodec, ShuffleWriterExec, -}; +use datafusion_comet_shuffle::{CometPartitioning, CompressionCodec, ShuffleWriterExec}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs; use std::path::{Path, PathBuf}; @@ -158,7 +156,7 @@ fn main() { let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); -let mut data_file_sizes = Vec::with_capacity(args.iterations); + let mut data_file_sizes = Vec::with_capacity(args.iterations); let mut last_metrics: Option = None; let mut last_input_metrics: Option = None; From cd4e5503eb7cee32eb7fc4acd8331b4506d90a7d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 1 Apr 2026 13:56:58 -0600 Subject: [PATCH 14/18] fix: default codec to lz4 to match Comet default, 
fix about string --- native/shuffle/README.md | 6 +++--- native/shuffle/src/bin/shuffle_bench.rs | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/native/shuffle/README.md b/native/shuffle/README.md index 4ba5daebda..0f53604fa3 100644 --- a/native/shuffle/README.md +++ b/native/shuffle/README.md @@ -35,7 +35,7 @@ performance outside of Spark. It streams input data directly from Parquet files. cargo run --release --features shuffle-bench --bin shuffle_bench -- \ --input /data/tpch-sf100/lineitem/ \ --partitions 200 \ - --codec zstd --zstd-level 1 \ + --codec lz4 \ --hash-columns 0,3 ``` @@ -47,7 +47,7 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ | `--partitions` | `200` | Number of output shuffle partitions | | `--partitioning` | `hash` | Partitioning scheme: `hash`, `single`, `round-robin` | | `--hash-columns` | `0` | Comma-separated column indices to hash on (e.g. `0,3`) | -| `--codec` | `zstd` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | +| `--codec` | `lz4` | Compression codec: `none`, `lz4`, `zstd`, `snappy` | | `--zstd-level` | `1` | Zstd compression level (1–22) | | `--batch-size` | `8192` | Batch size for reading Parquet data | | `--memory-limit` | _(none)_ | Memory limit in bytes; triggers spilling when exceeded | @@ -62,5 +62,5 @@ cargo run --release --features shuffle-bench --bin shuffle_bench -- \ ```sh cargo flamegraph --release --features shuffle-bench --bin shuffle_bench -- \ --input /data/tpch-sf100/lineitem/ \ - --partitions 200 --codec zstd --zstd-level 1 + --partitions 200 --codec lz4 ``` diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index cb303243c0..bb8c2a0380 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -24,7 +24,7 @@ //! cargo run --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ //! --partitions 200 \ -//! --codec zstd --zstd-level 1 \ +//! 
--codec lz4 \ //! --hash-columns 0,3 //! ``` //! @@ -32,7 +32,7 @@ //! ```sh //! cargo flamegraph --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ -//! --partitions 200 --codec zstd --zstd-level 1 +//! --partitions 200 --codec lz4 //! ``` use arrow::datatypes::{DataType, SchemaRef}; @@ -55,7 +55,7 @@ use std::time::Instant; #[derive(Parser, Debug)] #[command( name = "shuffle_bench", - about = "Standalone benchmark for Comet shuffle write and read performance" + about = "Standalone benchmark for Comet shuffle write performance" )] struct Args { /// Path to input Parquet file or directory of Parquet files @@ -79,7 +79,7 @@ struct Args { hash_columns: String, /// Compression codec: none, lz4, zstd, snappy - #[arg(long, default_value = "zstd")] + #[arg(long, default_value = "lz4")] codec: String, /// Zstd compression level (1-22) From cb0c88f3515499a2cf4cd46b553ca8b89f5c2e0b Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Mon, 6 Apr 2026 15:41:57 +0530 Subject: [PATCH 15/18] Refactor PartitionWriter and add SpillInfo for combined spill files --- native/shuffle/src/writers/mod.rs | 2 +- native/shuffle/src/writers/spill.rs | 118 +++++++++++++++------------- 2 files changed, 63 insertions(+), 57 deletions(-) diff --git a/native/shuffle/src/writers/mod.rs b/native/shuffle/src/writers/mod.rs index 75caf9f3a3..06f9622c12 100644 --- a/native/shuffle/src/writers/mod.rs +++ b/native/shuffle/src/writers/mod.rs @@ -23,4 +23,4 @@ mod spill; pub(crate) use buf_batch_writer::BufBatchWriter; pub(crate) use checksum::Checksum; pub use shuffle_block_writer::{CompressionCodec, ShuffleBlockWriter}; -pub(crate) use spill::PartitionWriter; +pub(crate) use spill::{PartitionSpillRange, PartitionWriter, SpillInfo}; diff --git a/native/shuffle/src/writers/spill.rs b/native/shuffle/src/writers/spill.rs index c16caddbf9..d4011845b8 100644 --- a/native/shuffle/src/writers/spill.rs +++ b/native/shuffle/src/writers/spill.rs @@ -19,24 +19,68 @@ use 
super::ShuffleBlockWriter;
 use crate::metrics::ShufflePartitionerMetrics;
 use crate::partitioners::PartitionedBatchIterator;
 use crate::writers::buf_batch_writer::BufBatchWriter;
-use datafusion::common::DataFusionError;
 use datafusion::execution::disk_manager::RefCountedTempFile;
-use datafusion::execution::runtime_env::RuntimeEnv;
-use std::fs::{File, OpenOptions};
+use std::fs::File;
+use std::io::{Read, Seek, SeekFrom, Write};
 
-/// A temporary disk file for spilling a partition's intermediate shuffle data.
-struct SpillFile {
-    temp_file: RefCountedTempFile,
-    file: File,
+/// The byte range of a single partition's data within a combined spill file.
+#[derive(Debug, Clone)]
+pub(crate) struct PartitionSpillRange {
+    pub offset: u64,
+    pub length: u64,
 }
 
-/// Manages encoding and optional disk spilling for a single shuffle partition.
+/// Represents a single spill file that contains data from multiple partitions.
+/// Data is written sequentially ordered by partition ID. Each partition's byte
+/// range is tracked in `partition_ranges` so it can be read back during merge.
+pub(crate) struct SpillInfo {
+    /// The temporary file handle -- kept alive to prevent cleanup until we are done.
+    _temp_file: RefCountedTempFile,
+    /// Path to the spill file on disk.
+    path: std::path::PathBuf,
+    /// Byte range for each partition. None means the partition had no data in this spill.
+    pub partition_ranges: Vec<Option<PartitionSpillRange>>,
+}
+
+impl SpillInfo {
+    pub(crate) fn new(
+        temp_file: RefCountedTempFile,
+        partition_ranges: Vec<Option<PartitionSpillRange>>,
+    ) -> Self {
+        let path = temp_file.path().to_path_buf();
+        Self {
+            _temp_file: temp_file,
+            path,
+            partition_ranges,
+        }
+    }
+
+    /// Copy the data for `partition_id` from this spill file into `output`.
+    /// Returns the number of bytes copied.
+    pub(crate) fn copy_partition_to(
+        &self,
+        partition_id: usize,
+        output: &mut impl Write,
+    ) -> datafusion::common::Result<u64> {
+        if let Some(ref range) = self.partition_ranges[partition_id] {
+            if range.length == 0 {
+                return Ok(0);
+            }
+            let mut spill_file = File::open(&self.path)?;
+            spill_file.seek(SeekFrom::Start(range.offset))?;
+            let mut limited = spill_file.take(range.length);
+            let copied = std::io::copy(&mut limited, output)?;
+            Ok(copied)
+        } else {
+            Ok(0)
+        }
+    }
+}
+
+/// Manages encoding for a single shuffle partition. Does not own any spill file --
+/// spill files are managed at the repartitioner level as combined SpillInfo objects.
 pub(crate) struct PartitionWriter {
-    /// Spill file for intermediate shuffle output for this partition. Each spill event
-    /// will append to this file and the contents will be copied to the shuffle file at
-    /// the end of processing.
-    spill_file: Option<SpillFile>,
-    /// Writer that performs encoding and compression
+    /// Writer that performs encoding and compression.
     shuffle_block_writer: ShuffleBlockWriter,
 }
 
@@ -45,51 +89,25 @@ impl PartitionWriter {
         shuffle_block_writer: ShuffleBlockWriter,
     ) -> datafusion::common::Result<Self> {
         Ok(Self {
-            spill_file: None,
             shuffle_block_writer,
         })
     }
 
-    fn ensure_spill_file_created(
-        &mut self,
-        runtime: &RuntimeEnv,
-    ) -> datafusion::common::Result<()> {
-        if self.spill_file.is_none() {
-            // Spill file is not yet created, create it
-            let spill_file = runtime
-                .disk_manager
-                .create_tmp_file("shuffle writer spill")?;
-            let spill_data = OpenOptions::new()
-                .write(true)
-                .create(true)
-                .truncate(true)
-                .open(spill_file.path())
-                .map_err(|e| {
-                    DataFusionError::Execution(format!("Error occurred while spilling {e}"))
-                })?;
-            self.spill_file = Some(SpillFile {
-                temp_file: spill_file,
-                file: spill_data,
-            });
-        }
-        Ok(())
-    }
-
-    pub(crate) fn spill(
+    /// Encode and write a partition's batches to the provided writer.
+    /// Returns the number of bytes written.
+    pub(crate) fn write_to<W: Write>(
         &mut self,
         iter: &mut PartitionedBatchIterator,
-        runtime: &RuntimeEnv,
+        writer: &mut W,
         metrics: &ShufflePartitionerMetrics,
         write_buffer_size: usize,
         batch_size: usize,
     ) -> datafusion::common::Result<usize> {
         if let Some(batch) = iter.next() {
-            self.ensure_spill_file_created(runtime)?;
-
             let total_bytes_written = {
                 let mut buf_batch_writer = BufBatchWriter::new(
                     &mut self.shuffle_block_writer,
-                    &mut self.spill_file.as_mut().unwrap().file,
+                    writer,
                     write_buffer_size,
                     batch_size,
                 );
@@ -106,21 +124,9 @@ impl PartitionWriter {
             buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?;
             bytes_written
         };
-
         Ok(total_bytes_written)
     } else {
         Ok(0)
     }
 }
-
-    pub(crate) fn path(&self) -> Option<&std::path::Path> {
-        self.spill_file
-            .as_ref()
-            .map(|spill_file| spill_file.temp_file.path())
-    }
-
-    #[cfg(test)]
-    pub(crate) fn has_spill_file(&self) -> bool {
-        self.spill_file.is_some()
-    }
 }

From dc5d494ef92ae0edac708bc780d43abea0d62c72 Mon Sep 17 00:00:00 2001
From: shekharrajak
Date: Mon, 6 Apr 2026 15:42:04 +0530
Subject: [PATCH 16/18] Use single spill file per spill event in shuffle
 repartitioner

---
 .../src/partitioners/multi_partition.rs       | 61 +++++++++++++++----
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs
index 7de9314f54..71c04f8d82 100644
--- a/native/shuffle/src/partitioners/multi_partition.rs
+++ b/native/shuffle/src/partitioners/multi_partition.rs
@@ -20,7 +20,7 @@ use crate::partitioners::partitioned_batch_iterator::{
     PartitionedBatchIterator, PartitionedBatchesProducer,
 };
 use crate::partitioners::ShufflePartitioner;
-use crate::writers::{BufBatchWriter, PartitionWriter};
+use crate::writers::{BufBatchWriter, PartitionSpillRange, PartitionWriter, SpillInfo};
 use crate::{comet_partitioning, CometPartitioning, CompressionCodec, ShuffleBlockWriter};
 use arrow::array::{ArrayRef, RecordBatch};
 use
arrow::datatypes::SchemaRef;
@@ -125,6 +125,9 @@ pub(crate) struct MultiPartitionShuffleRepartitioner {
     tracing_enabled: bool,
     /// Size of the write buffer in bytes
     write_buffer_size: usize,
+    /// Combined spill files. Each entry is a single file containing data from
+    /// multiple partitions, created during one spill event.
+    spill_infos: Vec<SpillInfo>,
 }
 
 impl MultiPartitionShuffleRepartitioner {
@@ -190,6 +193,7 @@ impl MultiPartitionShuffleRepartitioner {
             reservation,
             tracing_enabled,
             write_buffer_size,
+            spill_infos: vec![],
         })
     }
 
@@ -502,20 +506,53 @@ impl MultiPartitionShuffleRepartitioner {
         with_trace("shuffle_spill", self.tracing_enabled, || {
             let num_output_partitions = self.partition_writers.len();
             let mut partitioned_batches = self.partitioned_batches();
-            let mut spilled_bytes = 0;
+            let mut spilled_bytes: usize = 0;
+
+            // Create a single temporary file for this spill event
+            let temp_file = self
+                .runtime
+                .disk_manager
+                .create_tmp_file("shuffle writer spill")?;
+            let mut spill_file = OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(temp_file.path())
+                .map_err(|e| {
+                    DataFusionError::Execution(format!("Error occurred while spilling {e}"))
+                })?;
+
+            let mut partition_ranges = Vec::with_capacity(num_output_partitions);
 
             for partition_id in 0..num_output_partitions {
                 let partition_writer = &mut self.partition_writers[partition_id];
                 let mut iter = partitioned_batches.produce(partition_id);
-                spilled_bytes += partition_writer.spill(
+
+                let offset = spill_file.stream_position()?;
+                let bytes_written = partition_writer.write_to(
                     &mut iter,
-                    &self.runtime,
+                    &mut spill_file,
                     &self.metrics,
                     self.write_buffer_size,
                     self.batch_size,
                 )?;
+
+                if bytes_written > 0 {
+                    partition_ranges.push(Some(PartitionSpillRange {
+                        offset,
+                        length: bytes_written as u64,
+                    }));
+                    spilled_bytes += bytes_written;
+                } else {
+                    partition_ranges.push(None);
+                }
             }
 
+            spill_file.flush()?;
+
+            self.spill_infos
+                .push(SpillInfo::new(temp_file, partition_ranges));
+ self.reservation.free(); self.metrics.spill_count.add(1); self.metrics.spilled_bytes.add(spilled_bytes); @@ -524,8 +561,8 @@ impl MultiPartitionShuffleRepartitioner { } #[cfg(test)] - pub(crate) fn partition_writers(&self) -> &[PartitionWriter] { - &self.partition_writers + pub(crate) fn spill_count_files(&self) -> usize { + self.spill_infos.len() } } @@ -579,14 +616,12 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { for i in 0..num_output_partitions { offsets[i] = output_data.stream_position()?; - // if we wrote a spill file for this partition then copy the - // contents into the shuffle file - if let Some(spill_path) = self.partition_writers[i].path() { - // Use raw File handle (not BufReader) so that std::io::copy - // can use copy_file_range/sendfile for zero-copy on Linux. - let mut spill_file = File::open(spill_path)?; + // Copy spilled data for this partition from each spill file. + // Each SpillInfo is a single file containing data from all partitions + // ordered by partition ID, with byte ranges tracked per partition. 
+ for spill_info in &self.spill_infos { let mut write_timer = self.metrics.write_time.timer(); - std::io::copy(&mut spill_file, &mut output_data)?; + spill_info.copy_partition_to(i, &mut output_data)?; write_timer.stop(); } From 9fa440ba80542976c127f2b568b839dd08b97de2 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Mon, 6 Apr 2026 15:42:09 +0530 Subject: [PATCH 17/18] Add unit tests for single spill file shuffle behavior --- native/shuffle/src/shuffle_writer.rs | 274 +++++++++++++++++++++++++-- 1 file changed, 261 insertions(+), 13 deletions(-) diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index e649aaac69..1bfe15c0c6 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -367,25 +367,20 @@ mod test { repartitioner.insert_batch(batch.clone()).await.unwrap(); - { - let partition_writers = repartitioner.partition_writers(); - assert_eq!(partition_writers.len(), 2); - - assert!(!partition_writers[0].has_spill_file()); - assert!(!partition_writers[1].has_spill_file()); - } + // before spill, no spill files should exist + assert_eq!(repartitioner.spill_count_files(), 0); repartitioner.spill().unwrap(); - // after spill, there should be spill files - { - let partition_writers = repartitioner.partition_writers(); - assert!(partition_writers[0].has_spill_file()); - assert!(partition_writers[1].has_spill_file()); - } + // after spill, exactly one combined spill file should exist (not one per partition) + assert_eq!(repartitioner.spill_count_files(), 1); // insert another batch after spilling repartitioner.insert_batch(batch.clone()).await.unwrap(); + + // spill again -- should create a second combined spill file + repartitioner.spill().unwrap(); + assert_eq!(repartitioner.spill_count_files(), 2); } fn create_runtime(memory_limit: usize) -> Arc { @@ -693,4 +688,257 @@ mod test { } total_rows } + + /// Verify that spilling an empty repartitioner produces no spill files. 
+ #[tokio::test] + async fn spill_empty_buffers_produces_no_file() { + let batch = create_batch(100); + let memory_limit = 512 * 1024; + let num_partitions = 4; + let runtime_env = create_runtime(memory_limit); + let metrics_set = ExecutionPlanMetricsSet::new(); + let mut repartitioner = MultiPartitionShuffleRepartitioner::try_new( + 0, + "/tmp/spill_empty_data.out".to_string(), + "/tmp/spill_empty_index.out".to_string(), + batch.schema(), + CometPartitioning::Hash(vec![Arc::new(Column::new("a", 0))], num_partitions), + ShufflePartitionerMetrics::new(&metrics_set, 0), + runtime_env, + 1024, + CompressionCodec::Lz4Frame, + false, + 1024 * 1024, + ) + .unwrap(); + + // Spill with no data inserted -- should be a no-op + repartitioner.spill().unwrap(); + assert_eq!(repartitioner.spill_count_files(), 0); + } + + /// Verify that spilling with many partitions (some empty) still creates + /// exactly one spill file per spill event, and that shuffle_write succeeds. + #[test] + #[cfg_attr(miri, ignore)] + fn test_spill_with_sparse_partitions() { + // 100 rows across 50 partitions -- many partitions will be empty + shuffle_write_test(100, 5, 50, Some(10 * 1024)); + } + + /// Verify that the output of a spill-based shuffle contains the same total + /// row count and valid partition structure as a non-spill shuffle. 
#[test]
+    #[cfg_attr(miri, ignore)]
+    fn test_spill_output_matches_non_spill() {
+        use std::fs;
+
+        let batch_size = 1000;
+        let num_batches = 10;
+        let num_partitions = 8;
+        let total_rows = batch_size * num_batches;
+
+        let batch = create_batch(batch_size);
+        let batches = (0..num_batches).map(|_| batch.clone()).collect::<Vec<_>>();
+
+        let parse_offsets = |index_data: &[u8]| -> Vec<i64> {
+            index_data
+                .chunks(8)
+                .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
+                .collect()
+        };
+
+        let count_rows_in_partition = |data: &[u8], start: i64, end: i64| -> usize {
+            if start == end {
+                return 0;
+            }
+            read_all_ipc_blocks(&data[start as usize..end as usize])
+        };
+
+        // Run 1: no spilling (large memory limit)
+        {
+            let partitions = std::slice::from_ref(&batches);
+            let exec = ShuffleWriterExec::try_new(
+                Arc::new(DataSourceExec::new(Arc::new(
+                    MemorySourceConfig::try_new(partitions, batch.schema(), None).unwrap(),
+                ))),
+                CometPartitioning::Hash(vec![Arc::new(Column::new("a", 0))], num_partitions),
+                CompressionCodec::Zstd(1),
+                "/tmp/no_spill_data.out".to_string(),
+                "/tmp/no_spill_index.out".to_string(),
+                false,
+                1024 * 1024,
+            )
+            .unwrap();
+
+            let config = SessionConfig::new();
+            let runtime_env = Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_limit(100 * 1024 * 1024, 1.0)
+                    .build()
+                    .unwrap(),
+            );
+            let ctx = SessionContext::new_with_config_rt(config, runtime_env);
+            let task_ctx = ctx.task_ctx();
+            let stream = exec.execute(0, task_ctx).unwrap();
+            let rt = Runtime::new().unwrap();
+            rt.block_on(collect(stream)).unwrap();
+        }
+
+        // Run 2: with spilling (memory limit forces spilling during insert_batch)
+        {
+            let partitions = std::slice::from_ref(&batches);
+            let exec = ShuffleWriterExec::try_new(
+                Arc::new(DataSourceExec::new(Arc::new(
+                    MemorySourceConfig::try_new(partitions, batch.schema(), None).unwrap(),
+                ))),
+                CometPartitioning::Hash(vec![Arc::new(Column::new("a", 0))], num_partitions),
+                CompressionCodec::Zstd(1),
+
"/tmp/spill_data.out".to_string(), + "/tmp/spill_index.out".to_string(), + false, + 1024 * 1024, + ) + .unwrap(); + + let config = SessionConfig::new(); + let runtime_env = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(512 * 1024, 1.0) + .build() + .unwrap(), + ); + let ctx = SessionContext::new_with_config_rt(config, runtime_env); + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let rt = Runtime::new().unwrap(); + rt.block_on(collect(stream)).unwrap(); + } + + let no_spill_index = fs::read("/tmp/no_spill_index.out").unwrap(); + let spill_index = fs::read("/tmp/spill_index.out").unwrap(); + + assert_eq!( + no_spill_index.len(), + spill_index.len(), + "Index files should have the same number of entries" + ); + + let no_spill_offsets = parse_offsets(&no_spill_index); + let spill_offsets = parse_offsets(&spill_index); + + let no_spill_data = fs::read("/tmp/no_spill_data.out").unwrap(); + let spill_data = fs::read("/tmp/spill_data.out").unwrap(); + + // Verify row counts per partition match between spill and non-spill runs + let mut no_spill_total_rows = 0; + let mut spill_total_rows = 0; + for i in 0..num_partitions { + let ns_rows = count_rows_in_partition( + &no_spill_data, + no_spill_offsets[i], + no_spill_offsets[i + 1], + ); + let s_rows = + count_rows_in_partition(&spill_data, spill_offsets[i], spill_offsets[i + 1]); + assert_eq!( + ns_rows, s_rows, + "Partition {i} row count mismatch: no_spill={ns_rows}, spill={s_rows}" + ); + no_spill_total_rows += ns_rows; + spill_total_rows += s_rows; + } + + assert_eq!( + no_spill_total_rows, total_rows, + "Non-spill total row count mismatch" + ); + assert_eq!( + spill_total_rows, total_rows, + "Spill total row count mismatch" + ); + + // Cleanup + let _ = fs::remove_file("/tmp/no_spill_data.out"); + let _ = fs::remove_file("/tmp/no_spill_index.out"); + let _ = fs::remove_file("/tmp/spill_data.out"); + let _ = fs::remove_file("/tmp/spill_index.out"); + } + + /// Verify 
multiple spill events with subsequent insert_batch calls
+    /// produce correct output.
+    #[tokio::test]
+    #[cfg_attr(miri, ignore)]
+    async fn test_multiple_spills_then_write() {
+        let batch = create_batch(500);
+        let memory_limit = 512 * 1024;
+        let num_partitions = 4;
+        let runtime_env = create_runtime(memory_limit);
+        let metrics_set = ExecutionPlanMetricsSet::new();
+        let mut repartitioner = MultiPartitionShuffleRepartitioner::try_new(
+            0,
+            "/tmp/multi_spill_data.out".to_string(),
+            "/tmp/multi_spill_index.out".to_string(),
+            batch.schema(),
+            CometPartitioning::Hash(vec![Arc::new(Column::new("a", 0))], num_partitions),
+            ShufflePartitionerMetrics::new(&metrics_set, 0),
+            runtime_env,
+            1024,
+            CompressionCodec::Lz4Frame,
+            false,
+            1024 * 1024,
+        )
+        .unwrap();
+
+        // Insert -> spill -> insert -> spill -> insert (3 inserts, 2 spills)
+        repartitioner.insert_batch(batch.clone()).await.unwrap();
+        repartitioner.spill().unwrap();
+        assert_eq!(repartitioner.spill_count_files(), 1);
+
+        repartitioner.insert_batch(batch.clone()).await.unwrap();
+        repartitioner.spill().unwrap();
+        assert_eq!(repartitioner.spill_count_files(), 2);
+
+        repartitioner.insert_batch(batch.clone()).await.unwrap();
+        // Final shuffle_write merges 2 spill files + in-memory data
+        repartitioner.shuffle_write().unwrap();
+
+        // Verify output files exist and are non-empty
+        let data = std::fs::read("/tmp/multi_spill_data.out").unwrap();
+        assert!(!data.is_empty(), "Output data file should be non-empty");
+
+        let index = std::fs::read("/tmp/multi_spill_index.out").unwrap();
+        // Index should have (num_partitions + 1) * 8 bytes
+        assert_eq!(
+            index.len(),
+            (num_partitions + 1) * 8,
+            "Index file should have correct number of offset entries"
+        );
+
+        // Parse offsets and verify they are monotonically non-decreasing
+        let offsets: Vec<i64> = index
+            .chunks(8)
+            .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
+            .collect();
+        assert_eq!(offsets[0], 0, "First offset should be 0");
+        for i in
1..offsets.len() { + assert!( + offsets[i] >= offsets[i - 1], + "Offsets must be monotonically non-decreasing: offset[{}]={} < offset[{}]={}", + i, + offsets[i], + i - 1, + offsets[i - 1] + ); + } + assert_eq!( + *offsets.last().unwrap() as usize, + data.len(), + "Last offset should equal data file size" + ); + + // Cleanup + let _ = std::fs::remove_file("/tmp/multi_spill_data.out"); + let _ = std::fs::remove_file("/tmp/multi_spill_index.out"); + } } From 989e7d18404b03ecaa216845666a2d9bf779d8e7 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Tue, 7 Apr 2026 09:19:25 +0530 Subject: [PATCH 18/18] Cache spill file handles during finalize to avoid N*S open() calls --- .../src/partitioners/multi_partition.rs | 28 +++++++++++++------ native/shuffle/src/writers/spill.rs | 18 +++++++++--- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 71c04f8d82..0113355e45 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -529,20 +529,22 @@ impl MultiPartitionShuffleRepartitioner { let mut iter = partitioned_batches.produce(partition_id); let offset = spill_file.stream_position()?; - let bytes_written = partition_writer.write_to( + partition_writer.write_to( &mut iter, &mut spill_file, &self.metrics, self.write_buffer_size, self.batch_size, )?; + let end_offset = spill_file.stream_position()?; + let actual_bytes = (end_offset - offset) as usize; - if bytes_written > 0 { + if actual_bytes > 0 { partition_ranges.push(Some(PartitionSpillRange { offset, - length: bytes_written as u64, + length: actual_bytes as u64, })); - spilled_bytes += bytes_written; + spilled_bytes += actual_bytes; } else { partition_ranges.push(None); } @@ -612,16 +614,24 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { let mut output_data = BufWriter::new(output_data); + // Pre-open all spill files 
once to avoid repeated File::open() calls.
+        // With N partitions and S spill files, this reduces open() calls from
+        // N*S to S.
+        let mut spill_handles: Vec<_> = self
+            .spill_infos
+            .iter()
+            .map(|info| info.open_for_read())
+            .collect::<datafusion::common::Result<Vec<_>>>()?;
+
         #[allow(clippy::needless_range_loop)]
         for i in 0..num_output_partitions {
             offsets[i] = output_data.stream_position()?;
 
-            // Copy spilled data for this partition from each spill file.
-            // Each SpillInfo is a single file containing data from all partitions
-            // ordered by partition ID, with byte ranges tracked per partition.
-            for spill_info in &self.spill_infos {
+            // Copy spilled data for this partition from each spill file
+            // using pre-opened file handles.
+            for (spill_info, handle) in self.spill_infos.iter().zip(spill_handles.iter_mut()) {
                 let mut write_timer = self.metrics.write_time.timer();
-                spill_info.copy_partition_to(i, &mut output_data)?;
+                spill_info.copy_partition_with_handle(i, handle, &mut output_data)?;
                 write_timer.stop();
             }
 
diff --git a/native/shuffle/src/writers/spill.rs b/native/shuffle/src/writers/spill.rs
index d4011845b8..093b9ce6b7 100644
--- a/native/shuffle/src/writers/spill.rs
+++ b/native/shuffle/src/writers/spill.rs
@@ -19,6 +19,7 @@ use super::ShuffleBlockWriter;
 use crate::metrics::ShufflePartitionerMetrics;
 use crate::partitioners::PartitionedBatchIterator;
 use crate::writers::buf_batch_writer::BufBatchWriter;
+use datafusion::common::DataFusionError;
 use datafusion::execution::disk_manager::RefCountedTempFile;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
@@ -55,26 +56,35 @@ impl SpillInfo {
         }
     }
 
-    /// Copy the data for `partition_id` from this spill file into `output`.
+    /// Copy the data for `partition_id` using a pre-opened file handle.
+    /// Avoids repeated File::open() calls when iterating over partitions.
     /// Returns the number of bytes copied.
-    pub(crate) fn copy_partition_to(
+    pub(crate) fn copy_partition_with_handle(
         &self,
         partition_id: usize,
+        spill_file: &mut File,
         output: &mut impl Write,
     ) -> datafusion::common::Result<u64> {
         if let Some(ref range) = self.partition_ranges[partition_id] {
             if range.length == 0 {
                 return Ok(0);
             }
-            let mut spill_file = File::open(&self.path)?;
             spill_file.seek(SeekFrom::Start(range.offset))?;
-            let mut limited = spill_file.take(range.length);
+            let mut limited = Read::take(spill_file, range.length);
             let copied = std::io::copy(&mut limited, output)?;
             Ok(copied)
         } else {
             Ok(0)
         }
     }
+
+    /// Open the spill file for reading. The returned handle can be reused
+    /// across multiple copy_partition_with_handle() calls.
+    pub(crate) fn open_for_read(&self) -> datafusion::common::Result<File> {
+        File::open(&self.path).map_err(|e| {
+            DataFusionError::Execution(format!("Failed to open spill file for reading: {e}"))
+        })
+    }
 }
 
 /// Manages encoding for a single shuffle partition. Does not own any spill file --