From 232fcedb1931df1701a29c6f4cab52f8ddb5b4c0 Mon Sep 17 00:00:00 2001 From: Soham <010Soham@users.noreply.github.com> Date: Fri, 19 Dec 2025 07:13:12 +0000 Subject: [PATCH] Fix bin pack chunk size for tiny target file size --- pyiceberg/io/pyarrow.py | 2 +- tests/io/test_pyarrow.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 1077f41f6a..efeb72cbd4 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[ from pyiceberg.utils.bin_packing import PackingIterator avg_row_size_bytes = tbl.nbytes / tbl.num_rows - target_rows_per_file = target_file_size // avg_row_size_bytes + target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes)) batches = tbl.to_batches(max_chunksize=target_rows_per_file) bin_packed_record_batches = PackingIterator( items=batches, diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 869e60f4aa..ea2928cae2 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: assert len(list(bin_packed)) == 5 +def test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null: pa.Table) -> None: + bin_packed = list(bin_pack_arrow_table(arrow_table_with_null, target_file_size=1)) + assert len(bin_packed) == arrow_table_with_null.num_rows + assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) == arrow_table_with_null.num_rows + + def test_schema_mismatch_type(table_schema_simple: Schema) -> None: other_schema = pa.schema( (