From 57a0ed6799016aae9ffc852524bc30491de75e06 Mon Sep 17 00:00:00 2001 From: Soham <010Soham@users.noreply.github.com> Date: Fri, 19 Dec 2025 09:57:02 +0000 Subject: [PATCH 1/4] Respect partition evolution in inspect.partitions --- pyiceberg/table/inspect.py | 6 +++--- pyiceberg/table/metadata.py | 14 ++++++++++---- tests/io/test_pyarrow.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index bfe2fffa56..5da343ccb6 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -285,7 +285,9 @@ def partitions( ] ) - partition_record = self.tbl.metadata.specs_struct() + snapshot = self._get_snapshot(snapshot_id) + spec_ids = {manifest.partition_spec_id for manifest in snapshot.manifests(self.tbl.io)} + partition_record = self.tbl.metadata.specs_struct(spec_ids=spec_ids) has_partitions = len(partition_record.fields) > 0 if has_partitions: @@ -299,8 +301,6 @@ def partitions( table_schema = pa.unify_schemas([partitions_schema, table_schema]) - snapshot = self._get_snapshot(snapshot_id) - scan = DataScan( table_metadata=self.tbl.metadata, io=self.tbl.io, diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 8ae930375a..8a55f77b11 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -18,6 +18,7 @@ import datetime import uuid +from collections.abc import Iterable from copy import copy from typing import Annotated, Any, Literal @@ -262,18 +263,23 @@ def specs(self) -> dict[int, PartitionSpec]: """Return a dict the partition specs this table.""" return {spec.spec_id: spec for spec in self.partition_specs} - def specs_struct(self) -> StructType: - """Produce a struct of all the combined PartitionSpecs. + def specs_struct(self, spec_ids: Iterable[int] | None = None) -> StructType: + """Produce a struct of the combined PartitionSpecs. The partition fields should be optional: Partition fields may be added later, in which case not all files would have the result field, and it may be null. - :return: A StructType that represents all the combined PartitionSpecs of the table + Args: + spec_ids: Optional iterable of spec IDs to include. When not provided, + all table specs are used. + + :return: A StructType that represents the combined PartitionSpecs of the table """ specs = self.specs() + selected_specs = specs.values() if spec_ids is None else [specs[spec_id] for spec_id in spec_ids if spec_id in specs] # Collect all the fields - struct_fields = {field.field_id: field for spec in specs.values() for field in spec.fields} + struct_fields = {field.field_id: field for spec in selected_specs for field in spec.fields} schema = self.schema() diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 869e60f4aa..a2ef72a412 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2588,6 +2588,34 @@ def test_inspect_partition_for_nested_field(catalog: InMemoryCatalog) -> None: assert {part["part"] for part in partitions} == {"data-a", "data-b"} +def test_inspect_partitions_respects_partition_evolution(catalog: InMemoryCatalog) -> None: + schema = Schema( + NestedField(id=1, name="dt", field_type=DateType(), required=False), + NestedField(id=2, name="category", field_type=StringType(), required=False), + ) + spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="dt")) + catalog.create_namespace("default") + table = catalog.create_table("default.test_partition_evolution_inspect", schema=schema, partition_spec=spec) + + old_spec_id = table.spec().spec_id + old_data = [{"dt": date(2025, 1, 1), "category": "old"}] + table.append(pa.Table.from_pylist(old_data, schema=table.schema().as_arrow())) + + table.update_spec().add_identity("category").commit() + new_spec_id = table.spec().spec_id + assert new_spec_id != old_spec_id + + partitions_table = table.inspect.partitions() + partitions = partitions_table["partition"].to_pylist() + assert all("category" not in partition for partition in partitions) + + new_data = [{"dt": date(2025, 1, 2), "category": "new"}] + table.append(pa.Table.from_pylist(new_data, schema=table.schema().as_arrow())) + + partitions_table = table.inspect.partitions() + assert set(partitions_table["spec_id"].to_pylist()) == {old_spec_id, new_spec_id} + + def test_identity_partition_on_multi_columns() -> None: test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) test_schema = Schema( From 751d44ed0407be86779aed8b5f4d5715cc94cc85 Mon Sep 17 00:00:00 2001 From: Soham <010Soham@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:15:17 +0000 Subject: [PATCH 2/4] Adjust partition comparison for partition evolution --- tests/integration/test_inspect_table.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 4add18cf3f..ea0cca9bc5 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -18,6 +18,7 @@ import math from datetime import date, datetime +from typing import Any import pyarrow as pa import pytest @@ -208,9 +209,18 @@ def _inspect_files_asserts(df: pa.Table, spark_df: DataFrame) -> None: def _check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None: lhs = df.to_pandas().sort_values("last_updated_at") rhs = spark_df.toPandas().sort_values("last_updated_at") + + def _normalize_partition(d: dict[str, Any]) -> dict[str, Any]: + return {k: v for k, v in d.items() if v is not None} + for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True): - assert left == right, f"Difference in column {column}: {left} != {right}" + if column == "partition": + assert _normalize_partition(left) == _normalize_partition(right), ( + f"Difference in column {column}: {left} != {right}" + ) + else: + assert left == right, f"Difference in column {column}: {left} != {right}" @pytest.mark.integration From 60a61df3b08182108106e744dd242057ebfba359 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 24 Dec 2025 11:24:11 -0800 Subject: [PATCH 3/4] Update tests/io/test_pyarrow.py --- tests/io/test_pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index a2ef72a412..db638c1412 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2595,7 +2595,7 @@ def test_inspect_partitions_respects_partition_evolution(catalog: InMemoryCatalo ) spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="dt")) catalog.create_namespace("default") - table = catalog.create_table("default.test_partition_evolution_inspect", schema=schema, partition_spec=spec) + table = catalog.create_table("default.test_inspect_partitions_respects_partition_evolution", schema=schema, partition_spec=spec) old_spec_id = table.spec().spec_id old_data = [{"dt": date(2025, 1, 1), "category": "old"}] From 53807374cd2a0bd9709fc2560b8a3be4ecb88d60 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 24 Dec 2025 11:26:24 -0800 Subject: [PATCH 4/4] make lint --- tests/io/test_pyarrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index db638c1412..677e16dab3 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2595,7 +2595,9 @@ def test_inspect_partitions_respects_partition_evolution(catalog: InMemoryCatalo ) spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="dt")) catalog.create_namespace("default") - table = catalog.create_table("default.test_inspect_partitions_respects_partition_evolution", schema=schema, partition_spec=spec) + table = catalog.create_table( + "default.test_inspect_partitions_respects_partition_evolution", schema=schema, partition_spec=spec + ) old_spec_id = table.spec().spec_id old_data = [{"dt": date(2025, 1, 1), "category": "old"}]