From 0b29e5e570db94c2d6434083b0f94905a9a1b4f2 Mon Sep 17 00:00:00 2001 From: nicolas-paris Date: Mon, 2 Feb 2026 16:54:30 +0100 Subject: [PATCH 1/3] fix(iceberg): normalize float/double stats before encoding --- .../iceberg/IcebergColumnStatsConverter.java | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java index 60d4a453b..379c797ca 100644 --- a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java +++ b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java @@ -67,13 +67,13 @@ public Metrics toIceberg(Schema schema, long totalRowCount, List fie valueCounts.put(fieldId, columnStats.getNumValues()); nullValueCounts.put(fieldId, columnStats.getNumNulls()); Type fieldType = icebergField.type(); - if (columnStats.getRange().getMinValue() != null) { - lowerBounds.put( - fieldId, Conversions.toByteBuffer(fieldType, columnStats.getRange().getMinValue())); + Object minValue = normalizeStatValue(fieldType, columnStats.getRange().getMinValue()); + if (minValue != null) { + lowerBounds.put(fieldId, Conversions.toByteBuffer(fieldType, minValue)); } - if (columnStats.getRange().getMaxValue() != null) { - upperBounds.put( - fieldId, Conversions.toByteBuffer(fieldType, columnStats.getRange().getMaxValue())); + Object maxValue = normalizeStatValue(fieldType, columnStats.getRange().getMaxValue()); + if (maxValue != null) { + upperBounds.put(fieldId, Conversions.toByteBuffer(fieldType, maxValue)); } }); return new Metrics( @@ -130,4 +130,18 @@ private Object convertFromIcebergValue(Type fieldType, ByteBuffer value) { } return convertedValue; } + + private Object normalizeStatValue(Type fieldType, Object value) { + if (value == null || !(value instanceof Number)) { + return value; + } + switch (fieldType.typeId()) { + case FLOAT: + return ((Number) value).floatValue(); + case DOUBLE: + return ((Number) value).doubleValue(); + default: + return value; + } + } } From 26c1c1de605029cb3d040b7ff7dfdcc864d0699a Mon Sep 17 00:00:00 2001 From: nicolas-paris Date: Wed, 4 Feb 2026 14:41:35 +0100 Subject: [PATCH 2/3] fix(iceberg): coerce stats when schema evolution did happen on source table --- .../iceberg/IcebergColumnStatsConverter.java | 63 +++++++++++++++---- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java index 379c797ca..7fd59dee9 100644 --- a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java +++ b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java @@ -27,6 +27,7 @@ import lombok.AccessLevel; import lombok.NoArgsConstructor; +import lombok.extern.log4j.Log4j2; import org.apache.iceberg.Metrics; import org.apache.iceberg.Schema; @@ -40,6 +41,7 @@ import org.apache.xtable.model.stat.Range; /** Column stats extractor for iceberg table format. */ +@Log4j2 @NoArgsConstructor(access = AccessLevel.PRIVATE) public class IcebergColumnStatsConverter { private static final IcebergSchemaExtractor SCHEMA_EXTRACTOR = @@ -67,13 +69,15 @@ public Metrics toIceberg(Schema schema, long totalRowCount, List fie valueCounts.put(fieldId, columnStats.getNumValues()); nullValueCounts.put(fieldId, columnStats.getNumNulls()); Type fieldType = icebergField.type(); - Object minValue = normalizeStatValue(fieldType, columnStats.getRange().getMinValue()); - if (minValue != null) { - lowerBounds.put(fieldId, Conversions.toByteBuffer(fieldType, minValue)); + Object minValue = coerceStatValue(fieldType, columnStats.getRange().getMinValue()); + ByteBuffer minBuffer = safeToByteBuffer(fieldType, minValue, field.getPath(), "min"); + if (minBuffer != null) { + lowerBounds.put(fieldId, minBuffer); } - Object maxValue = normalizeStatValue(fieldType, columnStats.getRange().getMaxValue()); - if (maxValue != null) { - upperBounds.put(fieldId, Conversions.toByteBuffer(fieldType, maxValue)); + Object maxValue = coerceStatValue(fieldType, columnStats.getRange().getMaxValue()); + ByteBuffer maxBuffer = safeToByteBuffer(fieldType, maxValue, field.getPath(), "max"); + if (maxBuffer != null) { + upperBounds.put(fieldId, maxBuffer); } }); return new Metrics( @@ -131,15 +135,52 @@ private Object convertFromIcebergValue(Type fieldType, ByteBuffer value) { return convertedValue; } - private Object normalizeStatValue(Type fieldType, Object value) { - if (value == null || !(value instanceof Number)) { - return value; + private ByteBuffer safeToByteBuffer(Type fieldType, Object value, String fieldPath, String side) { + if (value == null) { + return null; + } + try { + return Conversions.toByteBuffer(fieldType, value); + } catch (RuntimeException e) { + log.warn( + "Skipping {} bound for field {} due to uncoercible value {} for type {}", + side, + fieldPath, + value, + fieldType, + e); + return null; + } + } + + private Object coerceStatValue(Type fieldType, Object value) { + if (value == null) { + return null; } switch (fieldType.typeId()) { + case STRING: + return value.toString(); case FLOAT: - return ((Number) value).floatValue(); + return value instanceof Number ? ((Number) value).floatValue() : null; case DOUBLE: - return ((Number) value).doubleValue(); + return value instanceof Number ? ((Number) value).doubleValue() : null; + case INTEGER: + return value instanceof Number ? ((Number) value).intValue() : null; + case LONG: + return value instanceof Number ? ((Number) value).longValue() : null; + case DATE: + return value instanceof Number ? ((Number) value).intValue() : null; + case TIME: + case TIMESTAMP: + return value instanceof Number ? ((Number) value).longValue() : null; + case BOOLEAN: + if (value instanceof Boolean) { + return value; + } + if (value instanceof Number) { + return ((Number) value).intValue() != 0; + } + return null; default: return value; } From e27d8a0929e07926031ea543198b3a14430db0de Mon Sep 17 00:00:00 2001 From: nicolas-paris Date: Thu, 5 Feb 2026 15:38:16 +0100 Subject: [PATCH 3/3] test case, type evolution iceberg --- .../TestIcebergColumnStatsConverter.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/iceberg/TestIcebergColumnStatsConverter.java b/xtable-core/src/test/java/org/apache/xtable/iceberg/TestIcebergColumnStatsConverter.java index 9154a3d35..73edbe4b9 100644 --- a/xtable-core/src/test/java/org/apache/xtable/iceberg/TestIcebergColumnStatsConverter.java +++ b/xtable-core/src/test/java/org/apache/xtable/iceberg/TestIcebergColumnStatsConverter.java @@ -19,6 +19,7 @@ package org.apache.xtable.iceberg; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -382,6 +383,53 @@ public void fromIceberg() { assertEquals(expected, actual); } + @Test + public void testSchemaEvolutionStatsCoercionAndSkip() { + Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "int_field", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal_field", Types.DecimalType.of(9, 2))); + + InternalField intField = + InternalField.builder() + .name("int_field") + .schema(InternalSchema.builder().name("int").dataType(InternalType.INT).build()) + .build(); + InternalField decimalField = + InternalField.builder() + .name("decimal_field") + .schema(InternalSchema.builder().name("decimal").dataType(InternalType.DECIMAL).build()) + .build(); + + ColumnStat intStats = + ColumnStat.builder() + .field(intField) + .numValues(10) + .numNulls(0) + .totalSize(40) + .range(Range.vector(10L, 20L)) + .build(); + ColumnStat decimalStats = + ColumnStat.builder() + .field(decimalField) + .numValues(5) + .numNulls(0) + .totalSize(80) + .range(Range.vector("not-a-decimal", "also-bad")) + .build(); + + Metrics actual = + IcebergColumnStatsConverter.getInstance() + .toIceberg(icebergSchema, 10L, Arrays.asList(intStats, decimalStats)); + + assertEquals( + Conversions.toByteBuffer(Types.IntegerType.get(), 10), actual.lowerBounds().get(1)); + assertEquals( + Conversions.toByteBuffer(Types.IntegerType.get(), 20), actual.upperBounds().get(1)); + assertFalse(actual.lowerBounds().containsKey(2)); + assertFalse(actual.upperBounds().containsKey(2)); + } + private Map getBoundsMap( Object timestampColumnStatsValue, Object dateColumnStatsValue,