diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 1077f41f6a..efeb72cbd4 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[
     from pyiceberg.utils.bin_packing import PackingIterator
 
     avg_row_size_bytes = tbl.nbytes / tbl.num_rows
-    target_rows_per_file = target_file_size // avg_row_size_bytes
+    target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
     batches = tbl.to_batches(max_chunksize=target_rows_per_file)
     bin_packed_record_batches = PackingIterator(
         items=batches,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 869e60f4aa..ea2928cae2 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
     assert len(list(bin_packed)) == 5
 
 
+def test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null: pa.Table) -> None:
+    bin_packed = list(bin_pack_arrow_table(arrow_table_with_null, target_file_size=1))
+    assert len(bin_packed) == arrow_table_with_null.num_rows
+    assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) == arrow_table_with_null.num_rows
+
+
 def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
     other_schema = pa.schema(
         (