From 881b811bd9dd3fef3d43874b409cb4349449940b Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 12:16:58 -0400 Subject: [PATCH 1/6] Fix iceberg-rust diffs after #3739. --- dev/diffs/iceberg-rust/1.10.0.diff | 987 ---------------------------- dev/diffs/iceberg-rust/1.8.1.diff | 990 ----------------------------- dev/diffs/iceberg-rust/1.9.1.diff | 988 ---------------------------- 3 files changed, 2965 deletions(-) diff --git a/dev/diffs/iceberg-rust/1.10.0.diff b/dev/diffs/iceberg-rust/1.10.0.diff index f2100daa5a..92fc080769 100644 --- a/dev/diffs/iceberg-rust/1.10.0.diff +++ b/dev/diffs/iceberg-rust/1.10.0.diff @@ -1,21 +1,3 @@ -diff --git a/build.gradle b/build.gradle -index 6bc052885..db2aca3a5 100644 ---- a/build.gradle -+++ b/build.gradle -@@ -878,6 +878,13 @@ project(':iceberg-parquet') { - implementation project(':iceberg-core') - implementation project(':iceberg-common') - -+ implementation("org.apache.datafusion:comet-spark-spark${sparkVersionsString}_${scalaVersion}:${libs.versions.comet.get()}") { -+ exclude group: 'org.apache.arrow' -+ exclude group: 'org.apache.parquet' -+ exclude group: 'org.apache.spark' -+ exclude group: 'org.apache.iceberg' -+ } -+ - implementation(libs.parquet.avro) { - exclude group: 'org.apache.avro', module: 'avro' - // already shaded by Parquet diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index eeabe54f5..867018058 100644 --- a/gradle/libs.versions.toml @@ -29,631 +11,6 @@ index eeabe54f5..867018058 100644 datasketches = "6.2.0" delta-standalone = "3.3.2" delta-spark = "3.3.2" -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -new file mode 100644 -index 000000000..ddf6c7de5 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -@@ -0,0 +1,255 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. 
-+ */ -+package org.apache.iceberg.parquet; -+ -+import java.util.Map; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.iceberg.relocated.com.google.common.collect.Maps; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.schema.LogicalTypeAnnotation; -+import org.apache.parquet.schema.PrimitiveType; -+import org.apache.parquet.schema.Type; -+import org.apache.parquet.schema.Types; -+ -+public class CometTypeUtils { -+ -+ private CometTypeUtils() {} -+ -+ public static ParquetColumnSpec descriptorToParquetColumnSpec(ColumnDescriptor descriptor) { -+ -+ String[] path = descriptor.getPath(); -+ PrimitiveType primitiveType = descriptor.getPrimitiveType(); -+ String physicalType = primitiveType.getPrimitiveTypeName().name(); -+ -+ int typeLength = -+ primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY -+ ? primitiveType.getTypeLength() -+ : 0; -+ -+ boolean isRepeated = primitiveType.getRepetition() == Type.Repetition.REPEATED; -+ -+ // ToDo: extract this into a Util method -+ String logicalTypeName = null; -+ Map logicalTypeParams = Maps.newHashMap(); -+ LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); -+ -+ if (logicalType != null) { -+ logicalTypeName = logicalType.getClass().getSimpleName(); -+ -+ // Handle specific logical types -+ if (logicalType instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = -+ (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("precision", String.valueOf(decimal.getPrecision())); -+ logicalTypeParams.put("scale", String.valueOf(decimal.getScale())); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestamp = -+ (LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(timestamp.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", timestamp.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimeLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimeLogicalTypeAnnotation time = -+ (LogicalTypeAnnotation.TimeLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(time.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", time.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.IntLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.IntLogicalTypeAnnotation intType = -+ (LogicalTypeAnnotation.IntLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isSigned", String.valueOf(intType.isSigned())); -+ logicalTypeParams.put("bitWidth", String.valueOf(intType.getBitWidth())); -+ } -+ } -+ -+ return new ParquetColumnSpec( -+ 1, // ToDo: pass in the correct id -+ path, -+ physicalType, -+ typeLength, -+ isRepeated, -+ descriptor.getMaxDefinitionLevel(), -+ descriptor.getMaxRepetitionLevel(), -+ logicalTypeName, -+ logicalTypeParams); -+ } -+ -+ public static ColumnDescriptor buildColumnDescriptor(ParquetColumnSpec columnSpec) { -+ PrimitiveType.PrimitiveTypeName primType = -+ PrimitiveType.PrimitiveTypeName.valueOf(columnSpec.getPhysicalType()); -+ -+ Type.Repetition repetition; -+ if (columnSpec.getMaxRepetitionLevel() > 0) { -+ repetition = Type.Repetition.REPEATED; -+ } else if (columnSpec.getMaxDefinitionLevel() > 0) { -+ repetition = Type.Repetition.OPTIONAL; -+ } else { 
-+ repetition = Type.Repetition.REQUIRED; -+ } -+ -+ String name = columnSpec.getPath()[columnSpec.getPath().length - 1]; -+ // Reconstruct the logical type from parameters -+ LogicalTypeAnnotation logicalType = null; -+ if (columnSpec.getLogicalTypeName() != null) { -+ logicalType = -+ reconstructLogicalType( -+ columnSpec.getLogicalTypeName(), columnSpec.getLogicalTypeParams()); -+ } -+ -+ PrimitiveType primitiveType; -+ if (primType == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { -+ primitiveType = -+ org.apache.parquet.schema.Types.primitive(primType, repetition) -+ .length(columnSpec.getTypeLength()) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } else { -+ primitiveType = -+ Types.primitive(primType, repetition) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } -+ -+ return new ColumnDescriptor( -+ columnSpec.getPath(), -+ primitiveType, -+ columnSpec.getMaxRepetitionLevel(), -+ columnSpec.getMaxDefinitionLevel()); -+ } -+ -+ private static LogicalTypeAnnotation reconstructLogicalType( -+ String logicalTypeName, java.util.Map params) { -+ -+ switch (logicalTypeName) { -+ // MAP -+ case "MapLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.mapType(); -+ -+ // LIST -+ case "ListLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.listType(); -+ -+ // STRING -+ case "StringLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.stringType(); -+ -+ // MAP_KEY_VALUE -+ case "MapKeyValueLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); -+ -+ // ENUM -+ case "EnumLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.enumType(); -+ -+ // DECIMAL -+ case "DecimalLogicalTypeAnnotation": -+ if (!params.containsKey("scale") || !params.containsKey("precision")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for DecimalLogicalTypeAnnotation: " + params); -+ } -+ int scale = Integer.parseInt(params.get("scale")); -+ int precision = Integer.parseInt(params.get("precision")); -+ return LogicalTypeAnnotation.decimalType(scale, precision); -+ -+ // DATE -+ case "DateLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.dateType(); -+ -+ // TIME -+ case "TimeLogicalTypeAnnotation": -+ if (!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimeLogicalTypeAnnotation: " + params); -+ } -+ -+ boolean isUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String timeUnitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit timeUnit; -+ switch (timeUnitStr) { -+ case "MILLIS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown time unit: " + timeUnitStr); -+ } -+ return LogicalTypeAnnotation.timeType(isUTC, timeUnit); -+ -+ // TIMESTAMP -+ case "TimestampLogicalTypeAnnotation": -+ if (!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimestampLogicalTypeAnnotation: " + params); -+ } -+ boolean isAdjustedToUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String unitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit unit; -+ switch (unitStr) { -+ case "MILLIS": -+ unit = 
LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ unit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ unit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown timestamp unit: " + unitStr); -+ } -+ return LogicalTypeAnnotation.timestampType(isAdjustedToUTC, unit); -+ -+ // INTEGER -+ case "IntLogicalTypeAnnotation": -+ if (!params.containsKey("isSigned") || !params.containsKey("bitWidth")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for IntLogicalTypeAnnotation: " + params); -+ } -+ boolean isSigned = Boolean.parseBoolean(params.get("isSigned")); -+ int bitWidth = Integer.parseInt(params.get("bitWidth")); -+ return LogicalTypeAnnotation.intType(bitWidth, isSigned); -+ -+ // JSON -+ case "JsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.jsonType(); -+ -+ // BSON -+ case "BsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.bsonType(); -+ -+ // UUID -+ case "UUIDLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.uuidType(); -+ -+ // INTERVAL -+ case "IntervalLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); -+ -+ default: -+ throw new IllegalArgumentException("Unknown logical type: " + logicalTypeName); -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -new file mode 100644 -index 000000000..a3cba4018 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -@@ -0,0 +1,260 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. 
-+ */ -+package org.apache.iceberg.parquet; -+ -+import java.io.IOException; -+import java.io.UncheckedIOException; -+import java.nio.ByteBuffer; -+import java.util.List; -+import java.util.Map; -+import java.util.NoSuchElementException; -+import java.util.function.Function; -+import org.apache.comet.parquet.FileReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.ReadOptions; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.parquet.WrappedInputFile; -+import org.apache.hadoop.conf.Configuration; -+import org.apache.iceberg.Schema; -+import org.apache.iceberg.exceptions.RuntimeIOException; -+import org.apache.iceberg.expressions.Expression; -+import org.apache.iceberg.expressions.Expressions; -+import org.apache.iceberg.io.CloseableGroup; -+import org.apache.iceberg.io.CloseableIterable; -+import org.apache.iceberg.io.CloseableIterator; -+import org.apache.iceberg.io.InputFile; -+import org.apache.iceberg.mapping.NameMapping; -+import org.apache.iceberg.relocated.com.google.common.collect.Lists; -+import org.apache.iceberg.util.ByteBuffers; -+import org.apache.parquet.ParquetReadOptions; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -+import org.apache.parquet.hadoop.metadata.ColumnPath; -+import org.apache.parquet.schema.MessageType; -+ -+public class CometVectorizedParquetReader extends CloseableGroup -+ implements CloseableIterable { -+ private final InputFile input; -+ private final ParquetReadOptions options; -+ private final Schema expectedSchema; -+ private final Function> batchReaderFunc; -+ private final Expression filter; -+ private final boolean reuseContainers; -+ private final boolean caseSensitive; -+ private final int batchSize; -+ private final NameMapping nameMapping; -+ private final Map properties; -+ private Long start = null; -+ private Long length = null; -+ private ByteBuffer fileEncryptionKey = null; -+ private ByteBuffer fileAADPrefix = null; -+ -+ public CometVectorizedParquetReader( -+ InputFile input, -+ Schema expectedSchema, -+ ParquetReadOptions options, -+ Function> readerFunc, -+ NameMapping nameMapping, -+ Expression filter, -+ boolean reuseContainers, -+ boolean caseSensitive, -+ int maxRecordsPerBatch, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.input = input; -+ this.expectedSchema = expectedSchema; -+ this.options = options; -+ this.batchReaderFunc = readerFunc; -+ // replace alwaysTrue with null to avoid extra work evaluating a trivial filter -+ this.filter = filter == Expressions.alwaysTrue() ? 
null : filter; -+ this.reuseContainers = reuseContainers; -+ this.caseSensitive = caseSensitive; -+ this.batchSize = maxRecordsPerBatch; -+ this.nameMapping = nameMapping; -+ this.properties = properties; -+ this.start = start; -+ this.length = length; -+ this.fileEncryptionKey = fileEncryptionKey; -+ this.fileAADPrefix = fileAADPrefix; -+ } -+ -+ private ReadConf conf = null; -+ -+ private ReadConf init() { -+ if (conf == null) { -+ ReadConf readConf = -+ new ReadConf( -+ input, -+ options, -+ expectedSchema, -+ filter, -+ null, -+ batchReaderFunc, -+ nameMapping, -+ reuseContainers, -+ caseSensitive, -+ batchSize); -+ this.conf = readConf.copy(); -+ return readConf; -+ } -+ return conf; -+ } -+ -+ @Override -+ public CloseableIterator iterator() { -+ FileIterator iter = -+ new FileIterator<>(init(), properties, start, length, fileEncryptionKey, fileAADPrefix); -+ addCloseable(iter); -+ return iter; -+ } -+ -+ private static class FileIterator implements CloseableIterator { -+ // private final ParquetFileReader reader; -+ private final boolean[] shouldSkip; -+ private final VectorizedReader model; -+ private final long totalValues; -+ private final int batchSize; -+ private final List> columnChunkMetadata; -+ private final boolean reuseContainers; -+ private int nextRowGroup = 0; -+ private long nextRowGroupStart = 0; -+ private long valuesRead = 0; -+ private T last = null; -+ private final FileReader cometReader; -+ private ReadConf conf; -+ -+ FileIterator( -+ ReadConf conf, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.shouldSkip = conf.shouldSkip(); -+ this.totalValues = conf.totalValues(); -+ this.reuseContainers = conf.reuseContainers(); -+ this.model = conf.vectorizedModel(); -+ this.batchSize = conf.batchSize(); -+ this.model.setBatchSize(this.batchSize); -+ this.columnChunkMetadata = conf.columnChunkMetadataForRowGroups(); -+ this.cometReader = -+ newCometReader( -+ conf.file(), -+ conf.projection(), -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ this.conf = conf; -+ } -+ -+ private FileReader newCometReader( -+ InputFile file, -+ MessageType projection, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ try { -+ ReadOptions cometOptions = ReadOptions.builder(new Configuration()).build(); -+ -+ FileReader fileReader = -+ new FileReader( -+ new WrappedInputFile(file), -+ cometOptions, -+ properties, -+ start, -+ length, -+ ByteBuffers.toByteArray(fileEncryptionKey), -+ ByteBuffers.toByteArray(fileAADPrefix)); -+ -+ List columnDescriptors = projection.getColumns(); -+ -+ List specs = Lists.newArrayList(); -+ -+ for (ColumnDescriptor descriptor : columnDescriptors) { -+ ParquetColumnSpec spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ specs.add(spec); -+ } -+ -+ fileReader.setRequestedSchemaFromSpecs(specs); -+ return fileReader; -+ } catch (IOException e) { -+ throw new UncheckedIOException("Failed to open Parquet file: " + file.location(), e); -+ } -+ } -+ -+ @Override -+ public boolean hasNext() { -+ return valuesRead < totalValues; -+ } -+ -+ @Override -+ public T next() { -+ if (!hasNext()) { -+ throw new NoSuchElementException(); -+ } -+ if (valuesRead >= nextRowGroupStart) { -+ advance(); -+ } -+ -+ // batchSize is an integer, so casting to integer is safe -+ int numValuesToRead = (int) Math.min(nextRowGroupStart - valuesRead, batchSize); -+ if (reuseContainers) { -+ this.last = 
model.read(last, numValuesToRead); -+ } else { -+ this.last = model.read(null, numValuesToRead); -+ } -+ valuesRead += numValuesToRead; -+ -+ return last; -+ } -+ -+ private void advance() { -+ while (shouldSkip[nextRowGroup]) { -+ nextRowGroup += 1; -+ cometReader.skipNextRowGroup(); -+ } -+ RowGroupReader pages; -+ try { -+ pages = cometReader.readNextRowGroup(); -+ } catch (IOException e) { -+ throw new RuntimeIOException(e); -+ } -+ -+ model.setRowGroupInfo(pages, columnChunkMetadata.get(nextRowGroup)); -+ nextRowGroupStart += pages.getRowCount(); -+ nextRowGroup += 1; -+ } -+ -+ @Override -+ public void close() throws IOException { -+ model.close(); -+ cometReader.close(); -+ if (conf != null && conf.reader() != null) { -+ conf.reader().close(); -+ } -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -index 6f68fbe15..b740543f3 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -@@ -1161,6 +1161,7 @@ public class Parquet { - private NameMapping nameMapping = null; - private ByteBuffer fileEncryptionKey = null; - private ByteBuffer fileAADPrefix = null; -+ private boolean isComet; - - private ReadBuilder(InputFile file) { - this.file = file; -@@ -1205,6 +1206,11 @@ public class Parquet { - return this; - } - -+ public ReadBuilder enableComet(boolean enableComet) { -+ this.isComet = enableComet; -+ return this; -+ } -+ - /** - * @deprecated will be removed in 2.0.0; use {@link #createReaderFunc(Function)} instead - */ -@@ -1300,7 +1306,7 @@ public class Parquet { - } - - @Override -- @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"}) -+ @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity", "MethodLength"}) - public CloseableIterable build() { - FileDecryptionProperties fileDecryptionProperties = null; - if (fileEncryptionKey != null) { -@@ -1352,16 +1358,35 @@ public class Parquet { - } - - if (batchedReaderFunc != null) { -- return new VectorizedParquetReader<>( -- file, -- schema, -- options, -- batchedReaderFunc, -- mapping, -- filter, -- reuseContainers, -- caseSensitive, -- maxRecordsPerBatch); -+ if (isComet) { -+ LOG.info("Comet enabled"); -+ return new CometVectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch, -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ } else { -+ return new VectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch); -+ } - } else { - Function> readBuilder = - readerFuncWithSchema != null -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -index 1fb2372ba..142e5fbad 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -@@ -157,6 +157,14 @@ class ReadConf { - return newReader; - } - -+ InputFile file() { -+ return file; -+ } -+ -+ MessageType projection() { -+ return projection; -+ } -+ - ParquetValueReader model() { - return model; - } diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 69700d843..49ea338a4 100644 --- a/spark/v3.5/build.gradle @@ -817,350 +174,6 @@ index 68c537e34..1e9e90d53 
100644 if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -index 81b7d83a7..eba1a2a0f 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -@@ -19,18 +19,22 @@ - package org.apache.iceberg.spark.data.vectorized; - - import java.io.IOException; -+import org.apache.comet.CometConf; - import org.apache.comet.CometSchemaImporter; - import org.apache.comet.parquet.AbstractColumnReader; - import org.apache.comet.parquet.ColumnReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.RowGroupReader; - import org.apache.comet.parquet.TypeUtil; - import org.apache.comet.parquet.Utils; - import org.apache.comet.shaded.arrow.memory.RootAllocator; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.parquet.VectorizedReader; - import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - import org.apache.iceberg.spark.SparkSchemaUtil; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; --import org.apache.parquet.column.page.PageReader; -+import org.apache.spark.sql.internal.SQLConf; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.Metadata; - import org.apache.spark.sql.types.StructField; -@@ -42,23 +46,28 @@ class CometColumnReader implements VectorizedReader { - - private final ColumnDescriptor descriptor; - private final DataType sparkType; -+ private final int fieldId; - - // The delegated ColumnReader from Comet side - private AbstractColumnReader delegate; - private boolean initialized = false; - private int batchSize = DEFAULT_BATCH_SIZE; - private CometSchemaImporter importer; -+ private ParquetColumnSpec spec; - -- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { -+ CometColumnReader(DataType sparkType, ColumnDescriptor descriptor, int fieldId) { - this.sparkType = sparkType; - this.descriptor = descriptor; -+ this.fieldId = fieldId; - } - - CometColumnReader(Types.NestedField field) { - DataType dataType = SparkSchemaUtil.convert(field.type()); - StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); - this.sparkType = dataType; -- this.descriptor = TypeUtil.convertToParquet(structField); -+ this.descriptor = -+ CometTypeUtils.buildColumnDescriptor(TypeUtil.convertToParquetSpec(structField)); -+ this.fieldId = field.fieldId(); - } - - public AbstractColumnReader delegate() { -@@ -92,7 +101,26 @@ class CometColumnReader implements VectorizedReader { - } - - this.importer = new CometSchemaImporter(new RootAllocator()); -- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); -+ -+ spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ -+ boolean useLegacyTime = -+ Boolean.parseBoolean( -+ SQLConf.get() -+ .getConfString( -+ CometConf.COMET_EXCEPTION_ON_LEGACY_DATE_TIMESTAMP().key(), "false")); -+ boolean useLazyMaterialization = -+ Boolean.parseBoolean( -+ SQLConf.get().getConfString(CometConf.COMET_USE_LAZY_MATERIALIZATION().key(), "false")); -+ this.delegate = -+ Utils.getColumnReader( -+ sparkType, -+ spec, -+ importer, -+ batchSize, -+ 
true, // Comet sets this to true for native execution -+ useLazyMaterialization, -+ useLegacyTime); - this.initialized = true; - } - -@@ -111,9 +139,9 @@ class CometColumnReader implements VectorizedReader { - *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link - * CometColumnReader#reset} is called. - */ -- public void setPageReader(PageReader pageReader) throws IOException { -+ public void setPageReader(RowGroupReader pageStore) throws IOException { - Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); -- ((ColumnReader) delegate).setPageReader(pageReader); -+ ((ColumnReader) delegate).setRowGroupReader(pageStore, spec); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -index 04ac69476..916face2b 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -@@ -22,8 +22,12 @@ import java.io.IOException; - import java.io.UncheckedIOException; - import java.util.List; - import java.util.Map; -+import org.apache.comet.CometRuntimeException; - import org.apache.comet.parquet.AbstractColumnReader; --import org.apache.comet.parquet.BatchReader; -+import org.apache.comet.parquet.IcebergCometBatchReader; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.vector.CometSelectionVector; -+import org.apache.comet.vector.CometVector; - import org.apache.iceberg.Schema; - import org.apache.iceberg.data.DeleteFilter; - import org.apache.iceberg.parquet.VectorizedReader; -@@ -55,7 +59,7 @@ class CometColumnarBatchReader implements VectorizedReader { - // calling BatchReader.nextBatch, the isDeleted value is not yet available, so - // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is - // available. 
-- private final BatchReader delegate; -+ private final IcebergCometBatchReader delegate; - private DeleteFilter deletes = null; - private long rowStartPosInBatch = 0; - -@@ -65,9 +69,7 @@ class CometColumnarBatchReader implements VectorizedReader { - this.hasIsDeletedColumn = - readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); - -- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; -- this.delegate = new BatchReader(abstractColumnReaders); -- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); -+ this.delegate = new IcebergCometBatchReader(readers.size(), SparkSchemaUtil.convert(schema)); - } - - @Override -@@ -79,19 +81,22 @@ class CometColumnarBatchReader implements VectorizedReader { - && !(readers[i] instanceof CometPositionColumnReader) - && !(readers[i] instanceof CometDeleteColumnReader)) { - readers[i].reset(); -- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); -+ readers[i].setPageReader((RowGroupReader) pageStore); - } - } catch (IOException e) { - throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); - } - } - -+ AbstractColumnReader[] delegateReaders = new AbstractColumnReader[readers.length]; - for (int i = 0; i < readers.length; i++) { -- delegate.getColumnReaders()[i] = this.readers[i].delegate(); -+ delegateReaders[i] = readers[i].delegate(); - } - -+ delegate.init(delegateReaders); -+ - this.rowStartPosInBatch = -- pageStore -+ ((RowGroupReader) pageStore) - .getRowIndexOffset() - .orElseThrow( - () -> -@@ -148,9 +153,17 @@ class CometColumnarBatchReader implements VectorizedReader { - Pair pair = buildRowIdMapping(vectors); - if (pair != null) { - int[] rowIdMapping = pair.first(); -- numLiveRows = pair.second(); -- for (int i = 0; i < vectors.length; i++) { -- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); -+ if (pair.second() != null) { -+ numLiveRows = pair.second(); -+ for (int i = 0; i < vectors.length; i++) { -+ if (vectors[i] instanceof CometVector) { -+ vectors[i] = -+ new CometSelectionVector((CometVector) vectors[i], rowIdMapping, numLiveRows); -+ } else { -+ throw new CometRuntimeException( -+ "Unsupported column vector type: " + vectors[i].getClass()); -+ } -+ } - } - } - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -index 047c96314..88d691a60 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -@@ -21,6 +21,7 @@ package org.apache.iceberg.spark.data.vectorized; - import java.math.BigDecimal; - import java.nio.ByteBuffer; - import org.apache.comet.parquet.ConstantColumnReader; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.DataTypes; -@@ -34,7 +35,11 @@ class CometConstantColumnReader extends CometColumnReader { - super(field); - // use delegate to set constant value on the native side to be consumed by native execution. 
- setDelegate( -- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); -+ new ConstantColumnReader( -+ sparkType(), -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor()), -+ convertToSparkValue(value), -+ false)); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -index 6235bfe48..cba108e43 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -@@ -51,10 +51,10 @@ class CometDeleteColumnReader extends CometColumnReader { - DeleteColumnReader() { - super( - DataTypes.BooleanType, -- TypeUtil.convertToParquet( -+ TypeUtil.convertToParquetSpec( - new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), - false /* useDecimal128 = false */, -- false /* isConstant = false */); -+ false /* isConstant */); - this.isDeleted = new boolean[0]; - } - -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -index bcc0e514c..98e80068c 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -@@ -20,6 +20,7 @@ package org.apache.iceberg.spark.data.vectorized; - - import org.apache.comet.parquet.MetadataColumnReader; - import org.apache.comet.parquet.Native; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; - import org.apache.spark.sql.types.DataTypes; -@@ -44,7 +45,7 @@ class CometPositionColumnReader extends CometColumnReader { - PositionColumnReader(ColumnDescriptor descriptor) { - super( - DataTypes.LongType, -- descriptor, -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor), - false /* useDecimal128 = false */, - false /* isConstant = false */); - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -index d36f1a727..56f8c9bff 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -@@ -142,6 +142,7 @@ class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor extends BaseReader taskGroup = (ScanTaskGroup) task; -+ return taskGroup.tasks().stream().allMatch(this::supportsCometBatchReads); -+ -+ } else if (task.isFileScanTask() && !task.isDataTask()) { -+ FileScanTask fileScanTask = task.asFileScanTask(); -+ // Comet can't handle delete files for now -+ return fileScanTask.file().format() == FileFormat.PARQUET; -+ -+ } else { -+ return false; -+ } -+ } -+ - // conditions for using ORC batch reads: - // - ORC vectorization is enabled - // - all tasks are of type FileScanTask and read only ORC files with no delete files -diff --git 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -index 106b296de..967b0d41d 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -@@ -24,6 +24,7 @@ import java.util.Map; - import java.util.Optional; - import java.util.function.Supplier; - import java.util.stream.Collectors; -+import org.apache.comet.parquet.SupportsComet; - import org.apache.iceberg.BlobMetadata; - import org.apache.iceberg.ScanTask; - import org.apache.iceberg.ScanTaskGroup; -@@ -95,7 +96,7 @@ import org.apache.spark.sql.types.StructType; - import org.slf4j.Logger; - import org.slf4j.LoggerFactory; - --abstract class SparkScan implements Scan, SupportsReportStatistics { -+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet { - private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class); - private static final String NDV_KEY = "ndv"; - -@@ -351,4 +352,10 @@ abstract class SparkScan implements Scan, SupportsReportStatistics { - return splitSize; - } - } -+ -+ @Override -+ public boolean isCometEnabled() { -+ SparkBatch batch = (SparkBatch) this.toBatch(); -+ return batch.useCometBatchReads(); -+ } - } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284..00e97e96f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java diff --git a/dev/diffs/iceberg-rust/1.8.1.diff b/dev/diffs/iceberg-rust/1.8.1.diff index 8325696880..27686ab8d5 100644 --- a/dev/diffs/iceberg-rust/1.8.1.diff +++ b/dev/diffs/iceberg-rust/1.8.1.diff @@ -1,21 +1,3 @@ -diff --git a/build.gradle b/build.gradle -index 7327b3890..7967109f0 100644 ---- a/build.gradle -+++ b/build.gradle -@@ -780,6 +780,13 @@ project(':iceberg-parquet') { - implementation project(':iceberg-core') - implementation project(':iceberg-common') - -+ implementation("org.apache.datafusion:comet-spark-spark${sparkVersionsString}_${scalaVersion}:${libs.versions.comet.get()}") { -+ exclude group: 'org.apache.arrow' -+ exclude group: 'org.apache.parquet' -+ exclude group: 'org.apache.spark' -+ exclude group: 'org.apache.iceberg' -+ } -+ - implementation(libs.parquet.avro) { - exclude group: 'org.apache.avro', module: 'avro' - // already shaded by Parquet diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 04ffa8f4e..3a57af315 100644 --- a/gradle/libs.versions.toml @@ -37,631 +19,6 @@ index 04ffa8f4e..3a57af315 100644 sqlite-jdbc = "3.48.0.0" testcontainers = "1.20.4" tez010 = "0.10.4" -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -new file mode 100644 -index 000000000..ddf6c7de5 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -@@ -0,0 +1,255 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. 
You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. -+ */ -+package org.apache.iceberg.parquet; -+ -+import java.util.Map; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.iceberg.relocated.com.google.common.collect.Maps; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.schema.LogicalTypeAnnotation; -+import org.apache.parquet.schema.PrimitiveType; -+import org.apache.parquet.schema.Type; -+import org.apache.parquet.schema.Types; -+ -+public class CometTypeUtils { -+ -+ private CometTypeUtils() {} -+ -+ public static ParquetColumnSpec descriptorToParquetColumnSpec(ColumnDescriptor descriptor) { -+ -+ String[] path = descriptor.getPath(); -+ PrimitiveType primitiveType = descriptor.getPrimitiveType(); -+ String physicalType = primitiveType.getPrimitiveTypeName().name(); -+ -+ int typeLength = -+ primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY -+ ? primitiveType.getTypeLength() -+ : 0; -+ -+ boolean isRepeated = primitiveType.getRepetition() == Type.Repetition.REPEATED; -+ -+ // ToDo: extract this into a Util method -+ String logicalTypeName = null; -+ Map logicalTypeParams = Maps.newHashMap(); -+ LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); -+ -+ if (logicalType != null) { -+ logicalTypeName = logicalType.getClass().getSimpleName(); -+ -+ // Handle specific logical types -+ if (logicalType instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = -+ (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("precision", String.valueOf(decimal.getPrecision())); -+ logicalTypeParams.put("scale", String.valueOf(decimal.getScale())); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestamp = -+ (LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(timestamp.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", timestamp.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimeLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimeLogicalTypeAnnotation time = -+ (LogicalTypeAnnotation.TimeLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(time.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", time.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.IntLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.IntLogicalTypeAnnotation intType = -+ (LogicalTypeAnnotation.IntLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isSigned", String.valueOf(intType.isSigned())); -+ logicalTypeParams.put("bitWidth", String.valueOf(intType.getBitWidth())); -+ } -+ } -+ -+ return new ParquetColumnSpec( -+ 1, // ToDo: pass in the correct id -+ path, -+ physicalType, -+ typeLength, -+ isRepeated, -+ descriptor.getMaxDefinitionLevel(), -+ descriptor.getMaxRepetitionLevel(), -+ logicalTypeName, -+ logicalTypeParams); -+ } -+ -+ public 
static ColumnDescriptor buildColumnDescriptor(ParquetColumnSpec columnSpec) { -+ PrimitiveType.PrimitiveTypeName primType = -+ PrimitiveType.PrimitiveTypeName.valueOf(columnSpec.getPhysicalType()); -+ -+ Type.Repetition repetition; -+ if (columnSpec.getMaxRepetitionLevel() > 0) { -+ repetition = Type.Repetition.REPEATED; -+ } else if (columnSpec.getMaxDefinitionLevel() > 0) { -+ repetition = Type.Repetition.OPTIONAL; -+ } else { -+ repetition = Type.Repetition.REQUIRED; -+ } -+ -+ String name = columnSpec.getPath()[columnSpec.getPath().length - 1]; -+ // Reconstruct the logical type from parameters -+ LogicalTypeAnnotation logicalType = null; -+ if (columnSpec.getLogicalTypeName() != null) { -+ logicalType = -+ reconstructLogicalType( -+ columnSpec.getLogicalTypeName(), columnSpec.getLogicalTypeParams()); -+ } -+ -+ PrimitiveType primitiveType; -+ if (primType == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { -+ primitiveType = -+ org.apache.parquet.schema.Types.primitive(primType, repetition) -+ .length(columnSpec.getTypeLength()) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } else { -+ primitiveType = -+ Types.primitive(primType, repetition) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } -+ -+ return new ColumnDescriptor( -+ columnSpec.getPath(), -+ primitiveType, -+ columnSpec.getMaxRepetitionLevel(), -+ columnSpec.getMaxDefinitionLevel()); -+ } -+ -+ private static LogicalTypeAnnotation reconstructLogicalType( -+ String logicalTypeName, java.util.Map params) { -+ -+ switch (logicalTypeName) { -+ // MAP -+ case "MapLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.mapType(); -+ -+ // LIST -+ case "ListLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.listType(); -+ -+ // STRING -+ case "StringLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.stringType(); -+ -+ // MAP_KEY_VALUE -+ case "MapKeyValueLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); -+ -+ // ENUM -+ case "EnumLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.enumType(); -+ -+ // DECIMAL -+ case "DecimalLogicalTypeAnnotation": -+ if (!params.containsKey("scale") || !params.containsKey("precision")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for DecimalLogicalTypeAnnotation: " + params); -+ } -+ int scale = Integer.parseInt(params.get("scale")); -+ int precision = Integer.parseInt(params.get("precision")); -+ return LogicalTypeAnnotation.decimalType(scale, precision); -+ -+ // DATE -+ case "DateLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.dateType(); -+ -+ // TIME -+ case "TimeLogicalTypeAnnotation": -+ if (!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimeLogicalTypeAnnotation: " + params); -+ } -+ -+ boolean isUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String timeUnitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit timeUnit; -+ switch (timeUnitStr) { -+ case "MILLIS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown time unit: " + timeUnitStr); -+ } -+ return LogicalTypeAnnotation.timeType(isUTC, timeUnit); -+ -+ // TIMESTAMP -+ case "TimestampLogicalTypeAnnotation": -+ if 
(!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimestampLogicalTypeAnnotation: " + params); -+ } -+ boolean isAdjustedToUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String unitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit unit; -+ switch (unitStr) { -+ case "MILLIS": -+ unit = LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ unit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ unit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown timestamp unit: " + unitStr); -+ } -+ return LogicalTypeAnnotation.timestampType(isAdjustedToUTC, unit); -+ -+ // INTEGER -+ case "IntLogicalTypeAnnotation": -+ if (!params.containsKey("isSigned") || !params.containsKey("bitWidth")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for IntLogicalTypeAnnotation: " + params); -+ } -+ boolean isSigned = Boolean.parseBoolean(params.get("isSigned")); -+ int bitWidth = Integer.parseInt(params.get("bitWidth")); -+ return LogicalTypeAnnotation.intType(bitWidth, isSigned); -+ -+ // JSON -+ case "JsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.jsonType(); -+ -+ // BSON -+ case "BsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.bsonType(); -+ -+ // UUID -+ case "UUIDLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.uuidType(); -+ -+ // INTERVAL -+ case "IntervalLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); -+ -+ default: -+ throw new IllegalArgumentException("Unknown logical type: " + logicalTypeName); -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -new file mode 100644 -index 000000000..a3cba4018 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -@@ -0,0 +1,260 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. 
-+ */ -+package org.apache.iceberg.parquet; -+ -+import java.io.IOException; -+import java.io.UncheckedIOException; -+import java.nio.ByteBuffer; -+import java.util.List; -+import java.util.Map; -+import java.util.NoSuchElementException; -+import java.util.function.Function; -+import org.apache.comet.parquet.FileReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.ReadOptions; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.parquet.WrappedInputFile; -+import org.apache.hadoop.conf.Configuration; -+import org.apache.iceberg.Schema; -+import org.apache.iceberg.exceptions.RuntimeIOException; -+import org.apache.iceberg.expressions.Expression; -+import org.apache.iceberg.expressions.Expressions; -+import org.apache.iceberg.io.CloseableGroup; -+import org.apache.iceberg.io.CloseableIterable; -+import org.apache.iceberg.io.CloseableIterator; -+import org.apache.iceberg.io.InputFile; -+import org.apache.iceberg.mapping.NameMapping; -+import org.apache.iceberg.relocated.com.google.common.collect.Lists; -+import org.apache.iceberg.util.ByteBuffers; -+import org.apache.parquet.ParquetReadOptions; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -+import org.apache.parquet.hadoop.metadata.ColumnPath; -+import org.apache.parquet.schema.MessageType; -+ -+public class CometVectorizedParquetReader extends CloseableGroup -+ implements CloseableIterable { -+ private final InputFile input; -+ private final ParquetReadOptions options; -+ private final Schema expectedSchema; -+ private final Function> batchReaderFunc; -+ private final Expression filter; -+ private final boolean reuseContainers; -+ private final boolean caseSensitive; -+ private final int batchSize; -+ private final NameMapping nameMapping; -+ private final Map properties; -+ private Long start = null; -+ private Long length = null; -+ private ByteBuffer fileEncryptionKey = null; -+ private ByteBuffer fileAADPrefix = null; -+ -+ public CometVectorizedParquetReader( -+ InputFile input, -+ Schema expectedSchema, -+ ParquetReadOptions options, -+ Function> readerFunc, -+ NameMapping nameMapping, -+ Expression filter, -+ boolean reuseContainers, -+ boolean caseSensitive, -+ int maxRecordsPerBatch, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.input = input; -+ this.expectedSchema = expectedSchema; -+ this.options = options; -+ this.batchReaderFunc = readerFunc; -+ // replace alwaysTrue with null to avoid extra work evaluating a trivial filter -+ this.filter = filter == Expressions.alwaysTrue() ? 
null : filter; -+ this.reuseContainers = reuseContainers; -+ this.caseSensitive = caseSensitive; -+ this.batchSize = maxRecordsPerBatch; -+ this.nameMapping = nameMapping; -+ this.properties = properties; -+ this.start = start; -+ this.length = length; -+ this.fileEncryptionKey = fileEncryptionKey; -+ this.fileAADPrefix = fileAADPrefix; -+ } -+ -+ private ReadConf conf = null; -+ -+ private ReadConf init() { -+ if (conf == null) { -+ ReadConf readConf = -+ new ReadConf( -+ input, -+ options, -+ expectedSchema, -+ filter, -+ null, -+ batchReaderFunc, -+ nameMapping, -+ reuseContainers, -+ caseSensitive, -+ batchSize); -+ this.conf = readConf.copy(); -+ return readConf; -+ } -+ return conf; -+ } -+ -+ @Override -+ public CloseableIterator iterator() { -+ FileIterator iter = -+ new FileIterator<>(init(), properties, start, length, fileEncryptionKey, fileAADPrefix); -+ addCloseable(iter); -+ return iter; -+ } -+ -+ private static class FileIterator implements CloseableIterator { -+ // private final ParquetFileReader reader; -+ private final boolean[] shouldSkip; -+ private final VectorizedReader model; -+ private final long totalValues; -+ private final int batchSize; -+ private final List> columnChunkMetadata; -+ private final boolean reuseContainers; -+ private int nextRowGroup = 0; -+ private long nextRowGroupStart = 0; -+ private long valuesRead = 0; -+ private T last = null; -+ private final FileReader cometReader; -+ private ReadConf conf; -+ -+ FileIterator( -+ ReadConf conf, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.shouldSkip = conf.shouldSkip(); -+ this.totalValues = conf.totalValues(); -+ this.reuseContainers = conf.reuseContainers(); -+ this.model = conf.vectorizedModel(); -+ this.batchSize = conf.batchSize(); -+ this.model.setBatchSize(this.batchSize); -+ this.columnChunkMetadata = conf.columnChunkMetadataForRowGroups(); -+ this.cometReader = -+ newCometReader( -+ conf.file(), -+ conf.projection(), -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ this.conf = conf; -+ } -+ -+ private FileReader newCometReader( -+ InputFile file, -+ MessageType projection, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ try { -+ ReadOptions cometOptions = ReadOptions.builder(new Configuration()).build(); -+ -+ FileReader fileReader = -+ new FileReader( -+ new WrappedInputFile(file), -+ cometOptions, -+ properties, -+ start, -+ length, -+ ByteBuffers.toByteArray(fileEncryptionKey), -+ ByteBuffers.toByteArray(fileAADPrefix)); -+ -+ List columnDescriptors = projection.getColumns(); -+ -+ List specs = Lists.newArrayList(); -+ -+ for (ColumnDescriptor descriptor : columnDescriptors) { -+ ParquetColumnSpec spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ specs.add(spec); -+ } -+ -+ fileReader.setRequestedSchemaFromSpecs(specs); -+ return fileReader; -+ } catch (IOException e) { -+ throw new UncheckedIOException("Failed to open Parquet file: " + file.location(), e); -+ } -+ } -+ -+ @Override -+ public boolean hasNext() { -+ return valuesRead < totalValues; -+ } -+ -+ @Override -+ public T next() { -+ if (!hasNext()) { -+ throw new NoSuchElementException(); -+ } -+ if (valuesRead >= nextRowGroupStart) { -+ advance(); -+ } -+ -+ // batchSize is an integer, so casting to integer is safe -+ int numValuesToRead = (int) Math.min(nextRowGroupStart - valuesRead, batchSize); -+ if (reuseContainers) { -+ this.last = 
model.read(last, numValuesToRead); -+ } else { -+ this.last = model.read(null, numValuesToRead); -+ } -+ valuesRead += numValuesToRead; -+ -+ return last; -+ } -+ -+ private void advance() { -+ while (shouldSkip[nextRowGroup]) { -+ nextRowGroup += 1; -+ cometReader.skipNextRowGroup(); -+ } -+ RowGroupReader pages; -+ try { -+ pages = cometReader.readNextRowGroup(); -+ } catch (IOException e) { -+ throw new RuntimeIOException(e); -+ } -+ -+ model.setRowGroupInfo(pages, columnChunkMetadata.get(nextRowGroup)); -+ nextRowGroupStart += pages.getRowCount(); -+ nextRowGroup += 1; -+ } -+ -+ @Override -+ public void close() throws IOException { -+ model.close(); -+ cometReader.close(); -+ if (conf != null && conf.reader() != null) { -+ conf.reader().close(); -+ } -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -index 2c37a5244..3442cfc43 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -@@ -1075,6 +1075,7 @@ public class Parquet { - private NameMapping nameMapping = null; - private ByteBuffer fileEncryptionKey = null; - private ByteBuffer fileAADPrefix = null; -+ private boolean isComet; - - private ReadBuilder(InputFile file) { - this.file = file; -@@ -1172,6 +1173,11 @@ public class Parquet { - return this; - } - -+ public ReadBuilder enableComet(boolean enableComet) { -+ this.isComet = enableComet; -+ return this; -+ } -+ - public ReadBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { - this.fileEncryptionKey = encryptionKey; - return this; -@@ -1182,7 +1188,7 @@ public class Parquet { - return this; - } - -- @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"}) -+ @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity", "MethodLength"}) - public CloseableIterable build() { - FileDecryptionProperties fileDecryptionProperties = null; - if (fileEncryptionKey != null) { -@@ -1234,16 +1240,35 @@ public class Parquet { - } - - if (batchedReaderFunc != null) { -- return new VectorizedParquetReader<>( -- file, -- schema, -- options, -- batchedReaderFunc, -- mapping, -- filter, -- reuseContainers, -- caseSensitive, -- maxRecordsPerBatch); -+ if (isComet) { -+ LOG.info("Comet enabled"); -+ return new CometVectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch, -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ } else { -+ return new VectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch); -+ } - } else { - return new org.apache.iceberg.parquet.ParquetReader<>( - file, schema, options, readerFunc, mapping, filter, reuseContainers, caseSensitive); -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -index 1fb2372ba..142e5fbad 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -@@ -157,6 +157,14 @@ class ReadConf { - return newReader; - } - -+ InputFile file() { -+ return file; -+ } -+ -+ MessageType projection() { -+ return projection; -+ } -+ - ParquetValueReader model() { - return model; - } diff --git a/spark/v3.5/build.gradle 
b/spark/v3.5/build.gradle index e2d2c7a7a..f64232dc5 100644 --- a/spark/v3.5/build.gradle @@ -854,353 +211,6 @@ index 68c537e34..1e9e90d53 100644 if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -index 4794863ab..8bb508f19 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -@@ -20,21 +20,25 @@ package org.apache.iceberg.spark.data.vectorized; - - import java.io.IOException; - import java.util.Map; -+import org.apache.comet.CometConf; -+import org.apache.comet.CometSchemaImporter; - import org.apache.comet.parquet.AbstractColumnReader; - import org.apache.comet.parquet.ColumnReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.RowGroupReader; - import org.apache.comet.parquet.TypeUtil; - import org.apache.comet.parquet.Utils; --import org.apache.comet.shaded.arrow.c.CometSchemaImporter; - import org.apache.comet.shaded.arrow.memory.RootAllocator; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.parquet.VectorizedReader; - import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - import org.apache.iceberg.spark.SparkSchemaUtil; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; - import org.apache.parquet.column.page.PageReadStore; --import org.apache.parquet.column.page.PageReader; - import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; - import org.apache.parquet.hadoop.metadata.ColumnPath; -+import org.apache.spark.sql.internal.SQLConf; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.Metadata; - import org.apache.spark.sql.types.StructField; -@@ -46,23 +50,28 @@ class CometColumnReader implements VectorizedReader { - - private final ColumnDescriptor descriptor; - private final DataType sparkType; -+ private final int fieldId; - - // The delegated ColumnReader from Comet side - private AbstractColumnReader delegate; - private boolean initialized = false; - private int batchSize = DEFAULT_BATCH_SIZE; - private CometSchemaImporter importer; -+ private ParquetColumnSpec spec; - -- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { -+ CometColumnReader(DataType sparkType, ColumnDescriptor descriptor, int fieldId) { - this.sparkType = sparkType; - this.descriptor = descriptor; -+ this.fieldId = fieldId; - } - - CometColumnReader(Types.NestedField field) { - DataType dataType = SparkSchemaUtil.convert(field.type()); - StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); - this.sparkType = dataType; -- this.descriptor = TypeUtil.convertToParquet(structField); -+ this.descriptor = -+ CometTypeUtils.buildColumnDescriptor(TypeUtil.convertToParquetSpec(structField)); -+ this.fieldId = field.fieldId(); - } - - public AbstractColumnReader delegate() { -@@ -96,7 +105,26 @@ class CometColumnReader implements VectorizedReader { - } - - this.importer = new CometSchemaImporter(new RootAllocator()); -- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); -+ -+ spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ -+ boolean 
useLegacyTime = -+ Boolean.parseBoolean( -+ SQLConf.get() -+ .getConfString( -+ CometConf.COMET_EXCEPTION_ON_LEGACY_DATE_TIMESTAMP().key(), "false")); -+ boolean useLazyMaterialization = -+ Boolean.parseBoolean( -+ SQLConf.get().getConfString(CometConf.COMET_USE_LAZY_MATERIALIZATION().key(), "false")); -+ this.delegate = -+ Utils.getColumnReader( -+ sparkType, -+ spec, -+ importer, -+ batchSize, -+ true, // Comet sets this to true for native execution -+ useLazyMaterialization, -+ useLegacyTime); - this.initialized = true; - } - -@@ -115,9 +143,9 @@ class CometColumnReader implements VectorizedReader { - *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link - * CometColumnReader#reset} is called. - */ -- public void setPageReader(PageReader pageReader) throws IOException { -+ public void setPageReader(RowGroupReader pageStore) throws IOException { - Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); -- ((ColumnReader) delegate).setPageReader(pageReader); -+ ((ColumnReader) delegate).setRowGroupReader(pageStore, spec); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -index 1440e5d1d..85cca62e9 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -@@ -22,8 +22,12 @@ import java.io.IOException; - import java.io.UncheckedIOException; - import java.util.List; - import java.util.Map; -+import org.apache.comet.CometRuntimeException; - import org.apache.comet.parquet.AbstractColumnReader; --import org.apache.comet.parquet.BatchReader; -+import org.apache.comet.parquet.IcebergCometBatchReader; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.vector.CometSelectionVector; -+import org.apache.comet.vector.CometVector; - import org.apache.iceberg.Schema; - import org.apache.iceberg.data.DeleteFilter; - import org.apache.iceberg.parquet.VectorizedReader; -@@ -55,7 +59,7 @@ class CometColumnarBatchReader implements VectorizedReader { - // calling BatchReader.nextBatch, the isDeleted value is not yet available, so - // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is - // available. 
-- private final BatchReader delegate; -+ private final IcebergCometBatchReader delegate; - private DeleteFilter deletes = null; - private long rowStartPosInBatch = 0; - -@@ -65,9 +69,7 @@ class CometColumnarBatchReader implements VectorizedReader { - this.hasIsDeletedColumn = - readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); - -- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; -- this.delegate = new BatchReader(abstractColumnReaders); -- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); -+ this.delegate = new IcebergCometBatchReader(readers.size(), SparkSchemaUtil.convert(schema)); - } - - @Override -@@ -85,19 +87,22 @@ class CometColumnarBatchReader implements VectorizedReader { - && !(readers[i] instanceof CometPositionColumnReader) - && !(readers[i] instanceof CometDeleteColumnReader)) { - readers[i].reset(); -- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); -+ readers[i].setPageReader((RowGroupReader) pageStore); - } - } catch (IOException e) { - throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); - } - } - -+ AbstractColumnReader[] delegateReaders = new AbstractColumnReader[readers.length]; - for (int i = 0; i < readers.length; i++) { -- delegate.getColumnReaders()[i] = this.readers[i].delegate(); -+ delegateReaders[i] = readers[i].delegate(); - } - -+ delegate.init(delegateReaders); -+ - this.rowStartPosInBatch = -- pageStore -+ ((RowGroupReader) pageStore) - .getRowIndexOffset() - .orElseThrow( - () -> -@@ -154,9 +159,17 @@ class CometColumnarBatchReader implements VectorizedReader { - Pair pair = buildRowIdMapping(vectors); - if (pair != null) { - int[] rowIdMapping = pair.first(); -- numLiveRows = pair.second(); -- for (int i = 0; i < vectors.length; i++) { -- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); -+ if (pair.second() != null) { -+ numLiveRows = pair.second(); -+ for (int i = 0; i < vectors.length; i++) { -+ if (vectors[i] instanceof CometVector) { -+ vectors[i] = -+ new CometSelectionVector((CometVector) vectors[i], rowIdMapping, numLiveRows); -+ } else { -+ throw new CometRuntimeException( -+ "Unsupported column vector type: " + vectors[i].getClass()); -+ } -+ } - } - } - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -index 047c96314..88d691a60 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -@@ -21,6 +21,7 @@ package org.apache.iceberg.spark.data.vectorized; - import java.math.BigDecimal; - import java.nio.ByteBuffer; - import org.apache.comet.parquet.ConstantColumnReader; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.DataTypes; -@@ -34,7 +35,11 @@ class CometConstantColumnReader extends CometColumnReader { - super(field); - // use delegate to set constant value on the native side to be consumed by native execution. 
- setDelegate( -- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); -+ new ConstantColumnReader( -+ sparkType(), -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor()), -+ convertToSparkValue(value), -+ false)); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -index 6235bfe48..cba108e43 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -@@ -51,10 +51,10 @@ class CometDeleteColumnReader extends CometColumnReader { - DeleteColumnReader() { - super( - DataTypes.BooleanType, -- TypeUtil.convertToParquet( -+ TypeUtil.convertToParquetSpec( - new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), - false /* useDecimal128 = false */, -- false /* isConstant = false */); -+ false /* isConstant */); - this.isDeleted = new boolean[0]; - } - -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -index bcc0e514c..98e80068c 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -@@ -20,6 +20,7 @@ package org.apache.iceberg.spark.data.vectorized; - - import org.apache.comet.parquet.MetadataColumnReader; - import org.apache.comet.parquet.Native; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; - import org.apache.spark.sql.types.DataTypes; -@@ -44,7 +45,7 @@ class CometPositionColumnReader extends CometColumnReader { - PositionColumnReader(ColumnDescriptor descriptor) { - super( - DataTypes.LongType, -- descriptor, -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor), - false /* useDecimal128 = false */, - false /* isConstant = false */); - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -index d36f1a727..56f8c9bff 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -@@ -142,6 +142,7 @@ class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor extends BaseReader taskGroup = (ScanTaskGroup) task; -+ return taskGroup.tasks().stream().allMatch(this::supportsCometBatchReads); -+ -+ } else if (task.isFileScanTask() && !task.isDataTask()) { -+ FileScanTask fileScanTask = task.asFileScanTask(); -+ // Comet can't handle delete files for now -+ return fileScanTask.file().format() == FileFormat.PARQUET; -+ -+ } else { -+ return false; -+ } -+ } -+ - // conditions for using ORC batch reads: - // - ORC vectorization is enabled - // - all tasks are of type FileScanTask and read only ORC files with no delete files -diff --git 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -index 019f3919d..656e0600a 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -@@ -23,6 +23,7 @@ import java.util.List; - import java.util.Map; - import java.util.function.Supplier; - import java.util.stream.Collectors; -+import org.apache.comet.parquet.SupportsComet; - import org.apache.iceberg.BlobMetadata; - import org.apache.iceberg.ScanTask; - import org.apache.iceberg.ScanTaskGroup; -@@ -94,7 +95,7 @@ import org.apache.spark.sql.types.StructType; - import org.slf4j.Logger; - import org.slf4j.LoggerFactory; - --abstract class SparkScan implements Scan, SupportsReportStatistics { -+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet { - private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class); - private static final String NDV_KEY = "ndv"; - -@@ -348,4 +349,10 @@ abstract class SparkScan implements Scan, SupportsReportStatistics { - return splitSize; - } - } -+ -+ @Override -+ public boolean isCometEnabled() { -+ SparkBatch batch = (SparkBatch) this.toBatch(); -+ return batch.useCometBatchReads(); -+ } - } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284..00e97e96f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java diff --git a/dev/diffs/iceberg-rust/1.9.1.diff b/dev/diffs/iceberg-rust/1.9.1.diff index 0f780cc22d..b89526a81a 100644 --- a/dev/diffs/iceberg-rust/1.9.1.diff +++ b/dev/diffs/iceberg-rust/1.9.1.diff @@ -1,21 +1,3 @@ -diff --git a/build.gradle b/build.gradle -index 998f2ee9e..017e61be9 100644 ---- a/build.gradle -+++ b/build.gradle -@@ -814,6 +814,13 @@ project(':iceberg-parquet') { - implementation project(':iceberg-core') - implementation project(':iceberg-common') - -+ implementation("org.apache.datafusion:comet-spark-spark${sparkVersionsString}_${scalaVersion}:${libs.versions.comet.get()}") { -+ exclude group: 'org.apache.arrow' -+ exclude group: 'org.apache.parquet' -+ exclude group: 'org.apache.spark' -+ exclude group: 'org.apache.iceberg' -+ } -+ - implementation(libs.parquet.avro) { - exclude group: 'org.apache.avro', module: 'avro' - // already shaded by Parquet diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c50991c5f..3acb395a6 100644 --- a/gradle/libs.versions.toml @@ -28,631 +10,6 @@ index c50991c5f..3acb395a6 100644 datasketches = "6.2.0" delta-standalone = "3.3.1" delta-spark = "3.3.1" -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -new file mode 100644 -index 000000000..ddf6c7de5 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometTypeUtils.java -@@ -0,0 +1,255 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. 
You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. -+ */ -+package org.apache.iceberg.parquet; -+ -+import java.util.Map; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.iceberg.relocated.com.google.common.collect.Maps; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.schema.LogicalTypeAnnotation; -+import org.apache.parquet.schema.PrimitiveType; -+import org.apache.parquet.schema.Type; -+import org.apache.parquet.schema.Types; -+ -+public class CometTypeUtils { -+ -+ private CometTypeUtils() {} -+ -+ public static ParquetColumnSpec descriptorToParquetColumnSpec(ColumnDescriptor descriptor) { -+ -+ String[] path = descriptor.getPath(); -+ PrimitiveType primitiveType = descriptor.getPrimitiveType(); -+ String physicalType = primitiveType.getPrimitiveTypeName().name(); -+ -+ int typeLength = -+ primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY -+ ? primitiveType.getTypeLength() -+ : 0; -+ -+ boolean isRepeated = primitiveType.getRepetition() == Type.Repetition.REPEATED; -+ -+ // ToDo: extract this into a Util method -+ String logicalTypeName = null; -+ Map logicalTypeParams = Maps.newHashMap(); -+ LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); -+ -+ if (logicalType != null) { -+ logicalTypeName = logicalType.getClass().getSimpleName(); -+ -+ // Handle specific logical types -+ if (logicalType instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = -+ (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("precision", String.valueOf(decimal.getPrecision())); -+ logicalTypeParams.put("scale", String.valueOf(decimal.getScale())); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestamp = -+ (LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(timestamp.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", timestamp.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.TimeLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.TimeLogicalTypeAnnotation time = -+ (LogicalTypeAnnotation.TimeLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isAdjustedToUTC", String.valueOf(time.isAdjustedToUTC())); -+ logicalTypeParams.put("unit", time.getUnit().name()); -+ } else if (logicalType instanceof LogicalTypeAnnotation.IntLogicalTypeAnnotation) { -+ LogicalTypeAnnotation.IntLogicalTypeAnnotation intType = -+ (LogicalTypeAnnotation.IntLogicalTypeAnnotation) logicalType; -+ logicalTypeParams.put("isSigned", String.valueOf(intType.isSigned())); -+ logicalTypeParams.put("bitWidth", String.valueOf(intType.getBitWidth())); -+ } -+ } -+ -+ return new ParquetColumnSpec( -+ 1, // ToDo: pass in the correct id -+ path, -+ physicalType, -+ typeLength, -+ isRepeated, -+ descriptor.getMaxDefinitionLevel(), -+ descriptor.getMaxRepetitionLevel(), -+ logicalTypeName, -+ logicalTypeParams); -+ } -+ -+ public 
static ColumnDescriptor buildColumnDescriptor(ParquetColumnSpec columnSpec) { -+ PrimitiveType.PrimitiveTypeName primType = -+ PrimitiveType.PrimitiveTypeName.valueOf(columnSpec.getPhysicalType()); -+ -+ Type.Repetition repetition; -+ if (columnSpec.getMaxRepetitionLevel() > 0) { -+ repetition = Type.Repetition.REPEATED; -+ } else if (columnSpec.getMaxDefinitionLevel() > 0) { -+ repetition = Type.Repetition.OPTIONAL; -+ } else { -+ repetition = Type.Repetition.REQUIRED; -+ } -+ -+ String name = columnSpec.getPath()[columnSpec.getPath().length - 1]; -+ // Reconstruct the logical type from parameters -+ LogicalTypeAnnotation logicalType = null; -+ if (columnSpec.getLogicalTypeName() != null) { -+ logicalType = -+ reconstructLogicalType( -+ columnSpec.getLogicalTypeName(), columnSpec.getLogicalTypeParams()); -+ } -+ -+ PrimitiveType primitiveType; -+ if (primType == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { -+ primitiveType = -+ org.apache.parquet.schema.Types.primitive(primType, repetition) -+ .length(columnSpec.getTypeLength()) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } else { -+ primitiveType = -+ Types.primitive(primType, repetition) -+ .as(logicalType) -+ .id(columnSpec.getFieldId()) -+ .named(name); -+ } -+ -+ return new ColumnDescriptor( -+ columnSpec.getPath(), -+ primitiveType, -+ columnSpec.getMaxRepetitionLevel(), -+ columnSpec.getMaxDefinitionLevel()); -+ } -+ -+ private static LogicalTypeAnnotation reconstructLogicalType( -+ String logicalTypeName, java.util.Map params) { -+ -+ switch (logicalTypeName) { -+ // MAP -+ case "MapLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.mapType(); -+ -+ // LIST -+ case "ListLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.listType(); -+ -+ // STRING -+ case "StringLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.stringType(); -+ -+ // MAP_KEY_VALUE -+ case "MapKeyValueLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); -+ -+ // ENUM -+ case "EnumLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.enumType(); -+ -+ // DECIMAL -+ case "DecimalLogicalTypeAnnotation": -+ if (!params.containsKey("scale") || !params.containsKey("precision")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for DecimalLogicalTypeAnnotation: " + params); -+ } -+ int scale = Integer.parseInt(params.get("scale")); -+ int precision = Integer.parseInt(params.get("precision")); -+ return LogicalTypeAnnotation.decimalType(scale, precision); -+ -+ // DATE -+ case "DateLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.dateType(); -+ -+ // TIME -+ case "TimeLogicalTypeAnnotation": -+ if (!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimeLogicalTypeAnnotation: " + params); -+ } -+ -+ boolean isUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String timeUnitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit timeUnit; -+ switch (timeUnitStr) { -+ case "MILLIS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ timeUnit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown time unit: " + timeUnitStr); -+ } -+ return LogicalTypeAnnotation.timeType(isUTC, timeUnit); -+ -+ // TIMESTAMP -+ case "TimestampLogicalTypeAnnotation": -+ if 
(!params.containsKey("isAdjustedToUTC") || !params.containsKey("unit")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for TimestampLogicalTypeAnnotation: " + params); -+ } -+ boolean isAdjustedToUTC = Boolean.parseBoolean(params.get("isAdjustedToUTC")); -+ String unitStr = params.get("unit"); -+ -+ LogicalTypeAnnotation.TimeUnit unit; -+ switch (unitStr) { -+ case "MILLIS": -+ unit = LogicalTypeAnnotation.TimeUnit.MILLIS; -+ break; -+ case "MICROS": -+ unit = LogicalTypeAnnotation.TimeUnit.MICROS; -+ break; -+ case "NANOS": -+ unit = LogicalTypeAnnotation.TimeUnit.NANOS; -+ break; -+ default: -+ throw new IllegalArgumentException("Unknown timestamp unit: " + unitStr); -+ } -+ return LogicalTypeAnnotation.timestampType(isAdjustedToUTC, unit); -+ -+ // INTEGER -+ case "IntLogicalTypeAnnotation": -+ if (!params.containsKey("isSigned") || !params.containsKey("bitWidth")) { -+ throw new IllegalArgumentException( -+ "Missing required parameters for IntLogicalTypeAnnotation: " + params); -+ } -+ boolean isSigned = Boolean.parseBoolean(params.get("isSigned")); -+ int bitWidth = Integer.parseInt(params.get("bitWidth")); -+ return LogicalTypeAnnotation.intType(bitWidth, isSigned); -+ -+ // JSON -+ case "JsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.jsonType(); -+ -+ // BSON -+ case "BsonLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.bsonType(); -+ -+ // UUID -+ case "UUIDLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.uuidType(); -+ -+ // INTERVAL -+ case "IntervalLogicalTypeAnnotation": -+ return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); -+ -+ default: -+ throw new IllegalArgumentException("Unknown logical type: " + logicalTypeName); -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -new file mode 100644 -index 000000000..a3cba4018 ---- /dev/null -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/CometVectorizedParquetReader.java -@@ -0,0 +1,260 @@ -+/* -+ * Licensed to the Apache Software Foundation (ASF) under one -+ * or more contributor license agreements. See the NOTICE file -+ * distributed with this work for additional information -+ * regarding copyright ownership. The ASF licenses this file -+ * to you under the Apache License, Version 2.0 (the -+ * "License"); you may not use this file except in compliance -+ * with the License. You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, -+ * software distributed under the License is distributed on an -+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+ * KIND, either express or implied. See the License for the -+ * specific language governing permissions and limitations -+ * under the License. 
-+ */ -+package org.apache.iceberg.parquet; -+ -+import java.io.IOException; -+import java.io.UncheckedIOException; -+import java.nio.ByteBuffer; -+import java.util.List; -+import java.util.Map; -+import java.util.NoSuchElementException; -+import java.util.function.Function; -+import org.apache.comet.parquet.FileReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.ReadOptions; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.parquet.WrappedInputFile; -+import org.apache.hadoop.conf.Configuration; -+import org.apache.iceberg.Schema; -+import org.apache.iceberg.exceptions.RuntimeIOException; -+import org.apache.iceberg.expressions.Expression; -+import org.apache.iceberg.expressions.Expressions; -+import org.apache.iceberg.io.CloseableGroup; -+import org.apache.iceberg.io.CloseableIterable; -+import org.apache.iceberg.io.CloseableIterator; -+import org.apache.iceberg.io.InputFile; -+import org.apache.iceberg.mapping.NameMapping; -+import org.apache.iceberg.relocated.com.google.common.collect.Lists; -+import org.apache.iceberg.util.ByteBuffers; -+import org.apache.parquet.ParquetReadOptions; -+import org.apache.parquet.column.ColumnDescriptor; -+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -+import org.apache.parquet.hadoop.metadata.ColumnPath; -+import org.apache.parquet.schema.MessageType; -+ -+public class CometVectorizedParquetReader extends CloseableGroup -+ implements CloseableIterable { -+ private final InputFile input; -+ private final ParquetReadOptions options; -+ private final Schema expectedSchema; -+ private final Function> batchReaderFunc; -+ private final Expression filter; -+ private final boolean reuseContainers; -+ private final boolean caseSensitive; -+ private final int batchSize; -+ private final NameMapping nameMapping; -+ private final Map properties; -+ private Long start = null; -+ private Long length = null; -+ private ByteBuffer fileEncryptionKey = null; -+ private ByteBuffer fileAADPrefix = null; -+ -+ public CometVectorizedParquetReader( -+ InputFile input, -+ Schema expectedSchema, -+ ParquetReadOptions options, -+ Function> readerFunc, -+ NameMapping nameMapping, -+ Expression filter, -+ boolean reuseContainers, -+ boolean caseSensitive, -+ int maxRecordsPerBatch, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.input = input; -+ this.expectedSchema = expectedSchema; -+ this.options = options; -+ this.batchReaderFunc = readerFunc; -+ // replace alwaysTrue with null to avoid extra work evaluating a trivial filter -+ this.filter = filter == Expressions.alwaysTrue() ? 
null : filter; -+ this.reuseContainers = reuseContainers; -+ this.caseSensitive = caseSensitive; -+ this.batchSize = maxRecordsPerBatch; -+ this.nameMapping = nameMapping; -+ this.properties = properties; -+ this.start = start; -+ this.length = length; -+ this.fileEncryptionKey = fileEncryptionKey; -+ this.fileAADPrefix = fileAADPrefix; -+ } -+ -+ private ReadConf conf = null; -+ -+ private ReadConf init() { -+ if (conf == null) { -+ ReadConf readConf = -+ new ReadConf( -+ input, -+ options, -+ expectedSchema, -+ filter, -+ null, -+ batchReaderFunc, -+ nameMapping, -+ reuseContainers, -+ caseSensitive, -+ batchSize); -+ this.conf = readConf.copy(); -+ return readConf; -+ } -+ return conf; -+ } -+ -+ @Override -+ public CloseableIterator iterator() { -+ FileIterator iter = -+ new FileIterator<>(init(), properties, start, length, fileEncryptionKey, fileAADPrefix); -+ addCloseable(iter); -+ return iter; -+ } -+ -+ private static class FileIterator implements CloseableIterator { -+ // private final ParquetFileReader reader; -+ private final boolean[] shouldSkip; -+ private final VectorizedReader model; -+ private final long totalValues; -+ private final int batchSize; -+ private final List> columnChunkMetadata; -+ private final boolean reuseContainers; -+ private int nextRowGroup = 0; -+ private long nextRowGroupStart = 0; -+ private long valuesRead = 0; -+ private T last = null; -+ private final FileReader cometReader; -+ private ReadConf conf; -+ -+ FileIterator( -+ ReadConf conf, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ this.shouldSkip = conf.shouldSkip(); -+ this.totalValues = conf.totalValues(); -+ this.reuseContainers = conf.reuseContainers(); -+ this.model = conf.vectorizedModel(); -+ this.batchSize = conf.batchSize(); -+ this.model.setBatchSize(this.batchSize); -+ this.columnChunkMetadata = conf.columnChunkMetadataForRowGroups(); -+ this.cometReader = -+ newCometReader( -+ conf.file(), -+ conf.projection(), -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ this.conf = conf; -+ } -+ -+ private FileReader newCometReader( -+ InputFile file, -+ MessageType projection, -+ Map properties, -+ Long start, -+ Long length, -+ ByteBuffer fileEncryptionKey, -+ ByteBuffer fileAADPrefix) { -+ try { -+ ReadOptions cometOptions = ReadOptions.builder(new Configuration()).build(); -+ -+ FileReader fileReader = -+ new FileReader( -+ new WrappedInputFile(file), -+ cometOptions, -+ properties, -+ start, -+ length, -+ ByteBuffers.toByteArray(fileEncryptionKey), -+ ByteBuffers.toByteArray(fileAADPrefix)); -+ -+ List columnDescriptors = projection.getColumns(); -+ -+ List specs = Lists.newArrayList(); -+ -+ for (ColumnDescriptor descriptor : columnDescriptors) { -+ ParquetColumnSpec spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ specs.add(spec); -+ } -+ -+ fileReader.setRequestedSchemaFromSpecs(specs); -+ return fileReader; -+ } catch (IOException e) { -+ throw new UncheckedIOException("Failed to open Parquet file: " + file.location(), e); -+ } -+ } -+ -+ @Override -+ public boolean hasNext() { -+ return valuesRead < totalValues; -+ } -+ -+ @Override -+ public T next() { -+ if (!hasNext()) { -+ throw new NoSuchElementException(); -+ } -+ if (valuesRead >= nextRowGroupStart) { -+ advance(); -+ } -+ -+ // batchSize is an integer, so casting to integer is safe -+ int numValuesToRead = (int) Math.min(nextRowGroupStart - valuesRead, batchSize); -+ if (reuseContainers) { -+ this.last = 
model.read(last, numValuesToRead); -+ } else { -+ this.last = model.read(null, numValuesToRead); -+ } -+ valuesRead += numValuesToRead; -+ -+ return last; -+ } -+ -+ private void advance() { -+ while (shouldSkip[nextRowGroup]) { -+ nextRowGroup += 1; -+ cometReader.skipNextRowGroup(); -+ } -+ RowGroupReader pages; -+ try { -+ pages = cometReader.readNextRowGroup(); -+ } catch (IOException e) { -+ throw new RuntimeIOException(e); -+ } -+ -+ model.setRowGroupInfo(pages, columnChunkMetadata.get(nextRowGroup)); -+ nextRowGroupStart += pages.getRowCount(); -+ nextRowGroup += 1; -+ } -+ -+ @Override -+ public void close() throws IOException { -+ model.close(); -+ cometReader.close(); -+ if (conf != null && conf.reader() != null) { -+ conf.reader().close(); -+ } -+ } -+ } -+} -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -index 31f9e2a80..520f142c2 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java -@@ -1124,6 +1124,7 @@ public class Parquet { - private NameMapping nameMapping = null; - private ByteBuffer fileEncryptionKey = null; - private ByteBuffer fileAADPrefix = null; -+ private boolean isComet; - - private ReadBuilder(InputFile file) { - this.file = file; -@@ -1168,6 +1169,11 @@ public class Parquet { - return this; - } - -+ public ReadBuilder enableComet(boolean enableComet) { -+ this.isComet = enableComet; -+ return this; -+ } -+ - /** - * @deprecated will be removed in 2.0.0; use {@link #createReaderFunc(Function)} instead - */ -@@ -1263,7 +1269,7 @@ public class Parquet { - } - - @Override -- @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"}) -+ @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity", "MethodLength"}) - public CloseableIterable build() { - FileDecryptionProperties fileDecryptionProperties = null; - if (fileEncryptionKey != null) { -@@ -1315,16 +1321,35 @@ public class Parquet { - } - - if (batchedReaderFunc != null) { -- return new VectorizedParquetReader<>( -- file, -- schema, -- options, -- batchedReaderFunc, -- mapping, -- filter, -- reuseContainers, -- caseSensitive, -- maxRecordsPerBatch); -+ if (isComet) { -+ LOG.info("Comet enabled"); -+ return new CometVectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch, -+ properties, -+ start, -+ length, -+ fileEncryptionKey, -+ fileAADPrefix); -+ } else { -+ return new VectorizedParquetReader<>( -+ file, -+ schema, -+ options, -+ batchedReaderFunc, -+ mapping, -+ filter, -+ reuseContainers, -+ caseSensitive, -+ maxRecordsPerBatch); -+ } - } else { - Function> readBuilder = - readerFuncWithSchema != null -diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -index 1fb2372ba..142e5fbad 100644 ---- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java -@@ -157,6 +157,14 @@ class ReadConf { - return newReader; - } - -+ InputFile file() { -+ return file; -+ } -+ -+ MessageType projection() { -+ return projection; -+ } -+ - ParquetValueReader model() { - return model; - } diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 572c32f92..d155f634a 100644 --- a/spark/v3.5/build.gradle @@ -845,351 +202,6 @@ index 68c537e34..1e9e90d53 
100644 if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -index 16159dcbd..eba1a2a0f 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java -@@ -19,18 +19,22 @@ - package org.apache.iceberg.spark.data.vectorized; - - import java.io.IOException; -+import org.apache.comet.CometConf; -+import org.apache.comet.CometSchemaImporter; - import org.apache.comet.parquet.AbstractColumnReader; - import org.apache.comet.parquet.ColumnReader; -+import org.apache.comet.parquet.ParquetColumnSpec; -+import org.apache.comet.parquet.RowGroupReader; - import org.apache.comet.parquet.TypeUtil; - import org.apache.comet.parquet.Utils; --import org.apache.comet.shaded.arrow.c.CometSchemaImporter; - import org.apache.comet.shaded.arrow.memory.RootAllocator; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.parquet.VectorizedReader; - import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - import org.apache.iceberg.spark.SparkSchemaUtil; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; --import org.apache.parquet.column.page.PageReader; -+import org.apache.spark.sql.internal.SQLConf; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.Metadata; - import org.apache.spark.sql.types.StructField; -@@ -42,23 +46,28 @@ class CometColumnReader implements VectorizedReader { - - private final ColumnDescriptor descriptor; - private final DataType sparkType; -+ private final int fieldId; - - // The delegated ColumnReader from Comet side - private AbstractColumnReader delegate; - private boolean initialized = false; - private int batchSize = DEFAULT_BATCH_SIZE; - private CometSchemaImporter importer; -+ private ParquetColumnSpec spec; - -- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { -+ CometColumnReader(DataType sparkType, ColumnDescriptor descriptor, int fieldId) { - this.sparkType = sparkType; - this.descriptor = descriptor; -+ this.fieldId = fieldId; - } - - CometColumnReader(Types.NestedField field) { - DataType dataType = SparkSchemaUtil.convert(field.type()); - StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); - this.sparkType = dataType; -- this.descriptor = TypeUtil.convertToParquet(structField); -+ this.descriptor = -+ CometTypeUtils.buildColumnDescriptor(TypeUtil.convertToParquetSpec(structField)); -+ this.fieldId = field.fieldId(); - } - - public AbstractColumnReader delegate() { -@@ -92,7 +101,26 @@ class CometColumnReader implements VectorizedReader { - } - - this.importer = new CometSchemaImporter(new RootAllocator()); -- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); -+ -+ spec = CometTypeUtils.descriptorToParquetColumnSpec(descriptor); -+ -+ boolean useLegacyTime = -+ Boolean.parseBoolean( -+ SQLConf.get() -+ .getConfString( -+ CometConf.COMET_EXCEPTION_ON_LEGACY_DATE_TIMESTAMP().key(), "false")); -+ boolean useLazyMaterialization = -+ Boolean.parseBoolean( -+ SQLConf.get().getConfString(CometConf.COMET_USE_LAZY_MATERIALIZATION().key(), "false")); -+ this.delegate = -+ 
Utils.getColumnReader( -+ sparkType, -+ spec, -+ importer, -+ batchSize, -+ true, // Comet sets this to true for native execution -+ useLazyMaterialization, -+ useLegacyTime); - this.initialized = true; - } - -@@ -111,9 +139,9 @@ class CometColumnReader implements VectorizedReader { - *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link - * CometColumnReader#reset} is called. - */ -- public void setPageReader(PageReader pageReader) throws IOException { -+ public void setPageReader(RowGroupReader pageStore) throws IOException { - Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); -- ((ColumnReader) delegate).setPageReader(pageReader); -+ ((ColumnReader) delegate).setRowGroupReader(pageStore, spec); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -index 04ac69476..916face2b 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java -@@ -22,8 +22,12 @@ import java.io.IOException; - import java.io.UncheckedIOException; - import java.util.List; - import java.util.Map; -+import org.apache.comet.CometRuntimeException; - import org.apache.comet.parquet.AbstractColumnReader; --import org.apache.comet.parquet.BatchReader; -+import org.apache.comet.parquet.IcebergCometBatchReader; -+import org.apache.comet.parquet.RowGroupReader; -+import org.apache.comet.vector.CometSelectionVector; -+import org.apache.comet.vector.CometVector; - import org.apache.iceberg.Schema; - import org.apache.iceberg.data.DeleteFilter; - import org.apache.iceberg.parquet.VectorizedReader; -@@ -55,7 +59,7 @@ class CometColumnarBatchReader implements VectorizedReader { - // calling BatchReader.nextBatch, the isDeleted value is not yet available, so - // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is - // available. 
-- private final BatchReader delegate; -+ private final IcebergCometBatchReader delegate; - private DeleteFilter deletes = null; - private long rowStartPosInBatch = 0; - -@@ -65,9 +69,7 @@ class CometColumnarBatchReader implements VectorizedReader { - this.hasIsDeletedColumn = - readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); - -- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; -- this.delegate = new BatchReader(abstractColumnReaders); -- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); -+ this.delegate = new IcebergCometBatchReader(readers.size(), SparkSchemaUtil.convert(schema)); - } - - @Override -@@ -79,19 +81,22 @@ class CometColumnarBatchReader implements VectorizedReader { - && !(readers[i] instanceof CometPositionColumnReader) - && !(readers[i] instanceof CometDeleteColumnReader)) { - readers[i].reset(); -- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); -+ readers[i].setPageReader((RowGroupReader) pageStore); - } - } catch (IOException e) { - throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); - } - } - -+ AbstractColumnReader[] delegateReaders = new AbstractColumnReader[readers.length]; - for (int i = 0; i < readers.length; i++) { -- delegate.getColumnReaders()[i] = this.readers[i].delegate(); -+ delegateReaders[i] = readers[i].delegate(); - } - -+ delegate.init(delegateReaders); -+ - this.rowStartPosInBatch = -- pageStore -+ ((RowGroupReader) pageStore) - .getRowIndexOffset() - .orElseThrow( - () -> -@@ -148,9 +153,17 @@ class CometColumnarBatchReader implements VectorizedReader { - Pair pair = buildRowIdMapping(vectors); - if (pair != null) { - int[] rowIdMapping = pair.first(); -- numLiveRows = pair.second(); -- for (int i = 0; i < vectors.length; i++) { -- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); -+ if (pair.second() != null) { -+ numLiveRows = pair.second(); -+ for (int i = 0; i < vectors.length; i++) { -+ if (vectors[i] instanceof CometVector) { -+ vectors[i] = -+ new CometSelectionVector((CometVector) vectors[i], rowIdMapping, numLiveRows); -+ } else { -+ throw new CometRuntimeException( -+ "Unsupported column vector type: " + vectors[i].getClass()); -+ } -+ } - } - } - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -index 047c96314..88d691a60 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java -@@ -21,6 +21,7 @@ package org.apache.iceberg.spark.data.vectorized; - import java.math.BigDecimal; - import java.nio.ByteBuffer; - import org.apache.comet.parquet.ConstantColumnReader; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.spark.sql.types.DataType; - import org.apache.spark.sql.types.DataTypes; -@@ -34,7 +35,11 @@ class CometConstantColumnReader extends CometColumnReader { - super(field); - // use delegate to set constant value on the native side to be consumed by native execution. 
- setDelegate( -- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); -+ new ConstantColumnReader( -+ sparkType(), -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor()), -+ convertToSparkValue(value), -+ false)); - } - - @Override -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -index 6235bfe48..cba108e43 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java -@@ -51,10 +51,10 @@ class CometDeleteColumnReader extends CometColumnReader { - DeleteColumnReader() { - super( - DataTypes.BooleanType, -- TypeUtil.convertToParquet( -+ TypeUtil.convertToParquetSpec( - new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), - false /* useDecimal128 = false */, -- false /* isConstant = false */); -+ false /* isConstant */); - this.isDeleted = new boolean[0]; - } - -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -index bcc0e514c..98e80068c 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java -@@ -20,6 +20,7 @@ package org.apache.iceberg.spark.data.vectorized; - - import org.apache.comet.parquet.MetadataColumnReader; - import org.apache.comet.parquet.Native; -+import org.apache.iceberg.parquet.CometTypeUtils; - import org.apache.iceberg.types.Types; - import org.apache.parquet.column.ColumnDescriptor; - import org.apache.spark.sql.types.DataTypes; -@@ -44,7 +45,7 @@ class CometPositionColumnReader extends CometColumnReader { - PositionColumnReader(ColumnDescriptor descriptor) { - super( - DataTypes.LongType, -- descriptor, -+ CometTypeUtils.descriptorToParquetColumnSpec(descriptor), - false /* useDecimal128 = false */, - false /* isConstant = false */); - } -diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -index d36f1a727..56f8c9bff 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java -@@ -142,6 +142,7 @@ class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor extends BaseReader taskGroup = (ScanTaskGroup) task; -+ return taskGroup.tasks().stream().allMatch(this::supportsCometBatchReads); -+ -+ } else if (task.isFileScanTask() && !task.isDataTask()) { -+ FileScanTask fileScanTask = task.asFileScanTask(); -+ // Comet can't handle delete files for now -+ return fileScanTask.file().format() == FileFormat.PARQUET; -+ -+ } else { -+ return false; -+ } -+ } -+ - // conditions for using ORC batch reads: - // - ORC vectorization is enabled - // - all tasks are of type FileScanTask and read only ORC files with no delete files -diff --git 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -index 106b296de..967b0d41d 100644 ---- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -+++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java -@@ -24,6 +24,7 @@ import java.util.Map; - import java.util.Optional; - import java.util.function.Supplier; - import java.util.stream.Collectors; -+import org.apache.comet.parquet.SupportsComet; - import org.apache.iceberg.BlobMetadata; - import org.apache.iceberg.ScanTask; - import org.apache.iceberg.ScanTaskGroup; -@@ -95,7 +96,7 @@ import org.apache.spark.sql.types.StructType; - import org.slf4j.Logger; - import org.slf4j.LoggerFactory; - --abstract class SparkScan implements Scan, SupportsReportStatistics { -+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet { - private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class); - private static final String NDV_KEY = "ndv"; - -@@ -351,4 +352,10 @@ abstract class SparkScan implements Scan, SupportsReportStatistics { - return splitSize; - } - } -+ -+ @Override -+ public boolean isCometEnabled() { -+ SparkBatch batch = (SparkBatch) this.toBatch(); -+ return batch.useCometBatchReads(); -+ } - } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284..00e97e96f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java From 911f6f57822b8fb343d348ecbe5e5f040d493e13 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 12:45:01 -0400 Subject: [PATCH 2/6] Fix iceberg-rust diffs after #3739. 
--- dev/diffs/iceberg-rust/1.8.1.diff | 20 -------------------- dev/diffs/iceberg-rust/1.9.1.diff | 20 -------------------- 2 files changed, 40 deletions(-) diff --git a/dev/diffs/iceberg-rust/1.8.1.diff b/dev/diffs/iceberg-rust/1.8.1.diff index 27686ab8d5..581739d776 100644 --- a/dev/diffs/iceberg-rust/1.8.1.diff +++ b/dev/diffs/iceberg-rust/1.8.1.diff @@ -23,15 +23,6 @@ diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index e2d2c7a7a..f64232dc5 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle -@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { - exclude group: 'org.roaringbitmap' - } - -- compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0" -+ compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}" - - implementation libs.parquet.column - implementation libs.parquet.hadoop @@ -183,7 +183,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation libs.avro.avro testImplementation libs.parquet.hadoop @@ -49,17 +40,6 @@ index e2d2c7a7a..f64232dc5 100644 // runtime dependencies for running Hive Catalog based integration test integrationRuntimeOnly project(':iceberg-hive-metastore') -@@ -300,8 +301,8 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio - relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' - relocate 'avro.shaded', 'org.apache.iceberg.shaded.org.apache.avro.shaded' - relocate 'com.thoughtworks.paranamer', 'org.apache.iceberg.shaded.com.thoughtworks.paranamer' -- relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' -- relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' -+// relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' -+// relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' - relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' - relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' - relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 578845e3d..4f44a73db 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java diff --git a/dev/diffs/iceberg-rust/1.9.1.diff b/dev/diffs/iceberg-rust/1.9.1.diff index b89526a81a..c79ea70890 100644 --- a/dev/diffs/iceberg-rust/1.9.1.diff +++ b/dev/diffs/iceberg-rust/1.9.1.diff @@ -14,15 +14,6 @@ diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 572c32f92..d155f634a 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle -@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { - exclude group: 'org.roaringbitmap' - } - -- compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0" -+ compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}" - - implementation libs.parquet.column - implementation libs.parquet.hadoop @@ -184,7 +184,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation libs.avro.avro 
testImplementation libs.parquet.hadoop @@ -40,17 +31,6 @@ index 572c32f92..d155f634a 100644 // runtime dependencies for running Hive Catalog based integration test integrationRuntimeOnly project(':iceberg-hive-metastore') -@@ -302,8 +303,8 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio - relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' - relocate 'avro.shaded', 'org.apache.iceberg.shaded.org.apache.avro.shaded' - relocate 'com.thoughtworks.paranamer', 'org.apache.iceberg.shaded.com.thoughtworks.paranamer' -- relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' -- relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' -+// relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' -+// relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' - relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' - relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' - relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 578845e3d..4f44a73db 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java From 9531e93e207f56e1a4a8cb6bc7750b80a2f0b85d Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 13:10:18 -0400 Subject: [PATCH 3/6] Fix iceberg-rust diffs after #3739. --- dev/diffs/iceberg-rust/1.10.0.diff | 780 +++++++++++++++++++++++++++++ 1 file changed, 780 insertions(+) diff --git a/dev/diffs/iceberg-rust/1.10.0.diff b/dev/diffs/iceberg-rust/1.10.0.diff index 92fc080769..35336395c7 100644 --- a/dev/diffs/iceberg-rust/1.10.0.diff +++ b/dev/diffs/iceberg-rust/1.10.0.diff @@ -795,3 +795,783 @@ index 9d2ce2b38..5e2336884 100644 } else { assertThat(planAsString).as("Should be no post scan filter").doesNotContain("Filter ("); } +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 81b7d83a70..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import org.apache.comet.CometSchemaImporter; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. */ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 04ac69476a..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,197 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if (hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); 
+- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index 047c96314b..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 6235bfe486..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index bcc0e514c2..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.util.List; +-import java.util.Map; +-import java.util.function.Function; +-import java.util.stream.IntStream; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +-import org.apache.iceberg.relocated.com.google.common.collect.Lists; +-import org.apache.iceberg.relocated.com.google.common.collect.Maps; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.schema.GroupType; +-import org.apache.parquet.schema.MessageType; +-import org.apache.parquet.schema.PrimitiveType; +-import org.apache.parquet.schema.Type; +-import org.apache.spark.sql.catalyst.InternalRow; +- +-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor> { +- +- private final MessageType parquetSchema; +- private final Schema icebergSchema; +- private final Map idToConstant; +- private final Function>, VectorizedReader> readerFactory; +- private final DeleteFilter deleteFilter; +- +- CometVectorizedReaderBuilder( +- Schema expectedSchema, +- MessageType parquetSchema, +- Map idToConstant, +- Function>, VectorizedReader> readerFactory, +- DeleteFilter deleteFilter) { +- this.parquetSchema = parquetSchema; +- this.icebergSchema = expectedSchema; +- this.idToConstant = idToConstant; +- this.readerFactory = readerFactory; +- this.deleteFilter = deleteFilter; +- } +- +- @Override +- public VectorizedReader message( +- Types.StructType expected, MessageType message, List> fieldReaders) { +- GroupType groupType = message.asGroupType(); +- Map> readersById = Maps.newHashMap(); +- List fields = groupType.getFields(); +- +- IntStream.range(0, fields.size()) +- .filter(pos -> fields.get(pos).getId() != null) +- .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); +- +- List icebergFields = +- expected != null ? 
expected.fields() : ImmutableList.of(); +- +- List> reorderedFields = +- Lists.newArrayListWithExpectedSize(icebergFields.size()); +- +- for (Types.NestedField field : icebergFields) { +- int id = field.fieldId(); +- VectorizedReader reader = readersById.get(id); +- if (idToConstant.containsKey(id)) { +- CometConstantColumnReader constantReader = +- new CometConstantColumnReader<>(idToConstant.get(id), field); +- reorderedFields.add(constantReader); +- } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { +- reorderedFields.add(new CometPositionColumnReader(field)); +- } else if (id == MetadataColumns.IS_DELETED.fieldId()) { +- CometColumnReader deleteReader = new CometDeleteColumnReader<>(field); +- reorderedFields.add(deleteReader); +- } else if (reader != null) { +- reorderedFields.add(reader); +- } else if (field.initialDefault() != null) { +- CometColumnReader constantReader = +- new CometConstantColumnReader<>(field.initialDefault(), field); +- reorderedFields.add(constantReader); +- } else if (field.isOptional()) { +- CometColumnReader constantReader = new CometConstantColumnReader<>(null, field); +- reorderedFields.add(constantReader); +- } else { +- throw new IllegalArgumentException( +- String.format("Missing required field: %s", field.name())); +- } +- } +- return vectorizedReader(reorderedFields); +- } +- +- protected VectorizedReader vectorizedReader(List> reorderedFields) { +- VectorizedReader reader = readerFactory.apply(reorderedFields); +- if (deleteFilter != null) { +- ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter); +- } +- return reader; +- } +- +- @Override +- public VectorizedReader struct( +- Types.StructType expected, GroupType groupType, List> fieldReaders) { +- if (expected != null) { +- throw new UnsupportedOperationException( +- "Vectorized reads are not supported yet for struct fields"); +- } +- return null; +- } +- +- @Override +- public VectorizedReader primitive( +- org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { +- +- if (primitive.getId() == null) { +- return null; +- } +- int parquetFieldId = primitive.getId().intValue(); +- ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); +- // Nested types not yet supported for vectorized reads +- if (desc.getMaxRepetitionLevel() > 0) { +- return null; +- } +- Types.NestedField icebergField = icebergSchema.findField(parquetFieldId); +- if (icebergField == null) { +- return null; +- } +- +- return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc); +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +index d95baa724b..eecfa6b358 100644 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java ++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +@@ -83,22 +83,6 @@ public class VectorizedSparkParquetReaders { + expectedSchema, fileSchema, idToConstant, deleteFilter, ArrowAllocation.rootAllocator()); + } + +- public static CometColumnarBatchReader buildCometReader( +- Schema expectedSchema, +- MessageType fileSchema, +- Map idToConstant, +- DeleteFilter deleteFilter) { +- return (CometColumnarBatchReader) +- TypeWithSchemaVisitor.visit( +- expectedSchema.asStruct(), +- fileSchema, +- new CometVectorizedReaderBuilder( +- 
expectedSchema, +- fileSchema, +- idToConstant, +- readers -> new CometColumnarBatchReader(readers, expectedSchema), +- deleteFilter)); +- } + + // enables unsafe memory access to avoid costly checks to see if index is within bounds + // as long as it is not configured explicitly (see BoundsChecking in Arrow) +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +index a0f45e7610..78e7845911 100644 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java ++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +@@ -34,7 +34,6 @@ import org.apache.iceberg.parquet.Parquet; + import org.apache.iceberg.relocated.com.google.common.collect.Sets; + import org.apache.iceberg.spark.OrcBatchReadConf; + import org.apache.iceberg.spark.ParquetBatchReadConf; +-import org.apache.iceberg.spark.ParquetReaderType; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; + import org.apache.iceberg.types.TypeUtil; +@@ -94,15 +93,9 @@ abstract class BaseBatchReader extends BaseReader { +- if (parquetConf.readerType() == ParquetReaderType.COMET) { +- return VectorizedSparkParquetReaders.buildCometReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } else { +- return VectorizedSparkParquetReaders.buildReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } +- }) ++ fileSchema -> ++ VectorizedSparkParquetReaders.buildReader( ++ requiredSchema, fileSchema, idToConstant, deleteFilter)) + .recordsPerBatch(parquetConf.batchSize()) + .filter(residual) + .caseSensitive(caseSensitive()) From 05c98616e5390520f860c38d3c5510f0d463f8d8 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 13:47:13 -0400 Subject: [PATCH 4/6] Fix iceberg-rust diffs after #3739. 
--- dev/diffs/iceberg-rust/1.8.1.diff | 9 +++++++++ dev/diffs/iceberg-rust/1.9.1.diff | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/dev/diffs/iceberg-rust/1.8.1.diff b/dev/diffs/iceberg-rust/1.8.1.diff index 581739d776..92d99be749 100644 --- a/dev/diffs/iceberg-rust/1.8.1.diff +++ b/dev/diffs/iceberg-rust/1.8.1.diff @@ -23,6 +23,15 @@ diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index e2d2c7a7a..f64232dc5 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle +@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { + exclude group: 'org.roaringbitmap' + } + +- compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0" ++ compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}" + + implementation libs.parquet.column + implementation libs.parquet.hadoop @@ -183,7 +183,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation libs.avro.avro testImplementation libs.parquet.hadoop diff --git a/dev/diffs/iceberg-rust/1.9.1.diff b/dev/diffs/iceberg-rust/1.9.1.diff index c79ea70890..023e378e27 100644 --- a/dev/diffs/iceberg-rust/1.9.1.diff +++ b/dev/diffs/iceberg-rust/1.9.1.diff @@ -14,6 +14,15 @@ diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 572c32f92..d155f634a 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle +@@ -75,7 +75,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { + exclude group: 'org.roaringbitmap' + } + +- compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:0.5.0" ++ compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_${scalaVersion}:${libs.versions.comet.get()}" + + implementation libs.parquet.column + implementation libs.parquet.hadoop @@ -184,7 +184,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation libs.avro.avro testImplementation libs.parquet.hadoop From ad3b3d017472b5e969e93b5ab4222c07bb619131 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 13:54:45 -0400 Subject: [PATCH 5/6] Fix iceberg-rust diffs after #3739. --- dev/diffs/iceberg-rust/1.10.0.diff | 780 ++++++++++++++++++++++++++++ dev/diffs/iceberg-rust/1.8.1.diff | 796 +++++++++++++++++++++++++++++ dev/diffs/iceberg-rust/1.9.1.diff | 780 ++++++++++++++++++++++++++++ 3 files changed, 2356 insertions(+) diff --git a/dev/diffs/iceberg-rust/1.10.0.diff b/dev/diffs/iceberg-rust/1.10.0.diff index 35336395c7..57a6c1c801 100644 --- a/dev/diffs/iceberg-rust/1.10.0.diff +++ b/dev/diffs/iceberg-rust/1.10.0.diff @@ -1575,3 +1575,783 @@ index a0f45e7610..78e7845911 100644 .recordsPerBatch(parquetConf.batchSize()) .filter(residual) .caseSensitive(caseSensitive()) +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 81b7d83a70..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. 
The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import org.apache.comet.CometSchemaImporter; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. */ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 04ac69476a..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,197 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if (hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); 
+- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index c665002e8f..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((java.nio.ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 4a28fc51da..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index 1949a71798..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.util.List; +-import java.util.Map; +-import java.util.function.Function; +-import java.util.stream.IntStream; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +-import org.apache.iceberg.relocated.com.google.common.collect.Lists; +-import org.apache.iceberg.relocated.com.google.common.collect.Maps; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.schema.GroupType; +-import org.apache.parquet.schema.MessageType; +-import org.apache.parquet.schema.PrimitiveType; +-import org.apache.parquet.schema.Type; +-import org.apache.spark.sql.catalyst.InternalRow; +- +-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor> { +- +- private final MessageType parquetSchema; +- private final Schema icebergSchema; +- private final Map idToConstant; +- private final Function>, VectorizedReader> readerFactory; +- private final DeleteFilter deleteFilter; +- +- CometVectorizedReaderBuilder( +- Schema expectedSchema, +- MessageType parquetSchema, +- Map idToConstant, +- Function>, VectorizedReader> readerFactory, +- DeleteFilter deleteFilter) { +- this.parquetSchema = parquetSchema; +- this.icebergSchema = expectedSchema; +- this.idToConstant = idToConstant; +- this.readerFactory = readerFactory; +- this.deleteFilter = deleteFilter; +- } +- +- @Override +- public VectorizedReader message( +- Types.StructType expected, MessageType message, List> fieldReaders) { +- GroupType groupType = message.asGroupType(); +- Map> readersById = Maps.newHashMap(); +- List fields = groupType.getFields(); +- +- IntStream.range(0, fields.size()) +- .filter(pos -> fields.get(pos).getId() != null) +- .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); +- +- List icebergFields = +- expected != null ? 
expected.fields() : ImmutableList.of(); +- +- List> reorderedFields = +- Lists.newArrayListWithExpectedSize(icebergFields.size()); +- +- for (Types.NestedField field : icebergFields) { +- int id = field.fieldId(); +- VectorizedReader reader = readersById.get(id); +- if (idToConstant.containsKey(id)) { +- CometConstantColumnReader constantReader = +- new CometConstantColumnReader<>(idToConstant.get(id), field); +- reorderedFields.add(constantReader); +- } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { +- reorderedFields.add(new CometPositionColumnReader(field)); +- } else if (id == MetadataColumns.IS_DELETED.fieldId()) { +- CometColumnReader deleteReader = new CometDeleteColumnReader<>(field); +- reorderedFields.add(deleteReader); +- } else if (reader != null) { +- reorderedFields.add(reader); +- } else if (field.initialDefault() != null) { +- CometColumnReader constantReader = +- new CometConstantColumnReader<>(field.initialDefault(), field); +- reorderedFields.add(constantReader); +- } else if (field.isOptional()) { +- CometColumnReader constantReader = new CometConstantColumnReader<>(null, field); +- reorderedFields.add(constantReader); +- } else { +- throw new IllegalArgumentException( +- String.format("Missing required field: %s", field.name())); +- } +- } +- return vectorizedReader(reorderedFields); +- } +- +- protected VectorizedReader vectorizedReader(List> reorderedFields) { +- VectorizedReader reader = readerFactory.apply(reorderedFields); +- if (deleteFilter != null) { +- ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter); +- } +- return reader; +- } +- +- @Override +- public VectorizedReader struct( +- Types.StructType expected, GroupType groupType, List> fieldReaders) { +- if (expected != null) { +- throw new UnsupportedOperationException( +- "Vectorized reads are not supported yet for struct fields"); +- } +- return null; +- } +- +- @Override +- public VectorizedReader primitive( +- org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { +- +- if (primitive.getId() == null) { +- return null; +- } +- int parquetFieldId = primitive.getId().intValue(); +- ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); +- // Nested types not yet supported for vectorized reads +- if (desc.getMaxRepetitionLevel() > 0) { +- return null; +- } +- Types.NestedField icebergField = icebergSchema.findField(parquetFieldId); +- if (icebergField == null) { +- return null; +- } +- +- return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +index d95baa724b..eecfa6b358 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +@@ -83,22 +83,6 @@ public class VectorizedSparkParquetReaders { + expectedSchema, fileSchema, idToConstant, deleteFilter, ArrowAllocation.rootAllocator()); + } + +- public static CometColumnarBatchReader buildCometReader( +- Schema expectedSchema, +- MessageType fileSchema, +- Map idToConstant, +- DeleteFilter deleteFilter) { +- return (CometColumnarBatchReader) +- TypeWithSchemaVisitor.visit( +- expectedSchema.asStruct(), +- fileSchema, +- new CometVectorizedReaderBuilder( +- 
expectedSchema, +- fileSchema, +- idToConstant, +- readers -> new CometColumnarBatchReader(readers, expectedSchema), +- deleteFilter)); +- } + + // enables unsafe memory access to avoid costly checks to see if index is within bounds + // as long as it is not configured explicitly (see BoundsChecking in Arrow) +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +index a0f45e7610..78e7845911 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +@@ -34,7 +34,6 @@ import org.apache.iceberg.parquet.Parquet; + import org.apache.iceberg.relocated.com.google.common.collect.Sets; + import org.apache.iceberg.spark.OrcBatchReadConf; + import org.apache.iceberg.spark.ParquetBatchReadConf; +-import org.apache.iceberg.spark.ParquetReaderType; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; + import org.apache.iceberg.types.TypeUtil; +@@ -94,15 +93,9 @@ abstract class BaseBatchReader extends BaseReader { +- if (parquetConf.readerType() == ParquetReaderType.COMET) { +- return VectorizedSparkParquetReaders.buildCometReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } else { +- return VectorizedSparkParquetReaders.buildReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } +- }) ++ fileSchema -> ++ VectorizedSparkParquetReaders.buildReader( ++ requiredSchema, fileSchema, idToConstant, deleteFilter)) + .recordsPerBatch(parquetConf.batchSize()) + .filter(residual) + .caseSensitive(caseSensitive()) diff --git a/dev/diffs/iceberg-rust/1.8.1.diff b/dev/diffs/iceberg-rust/1.8.1.diff index 92d99be749..0b993c3898 100644 --- a/dev/diffs/iceberg-rust/1.8.1.diff +++ b/dev/diffs/iceberg-rust/1.8.1.diff @@ -899,3 +899,799 @@ index 6719c45ca..251545440 100644 sourceColumnName, tableName, tableName(OTHER_TABLE_NAME), +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 4794863ab1..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,150 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.c.CometSchemaImporter; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. */ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
+- * <p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageReadStore, Map map, long size) { +- throw new UnsupportedOperationException("Not supported"); +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 1440e5d1d3..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,203 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData, long rowPosition) { +- setRowGroupInfo(pageStore, metaData); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if 
(hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); +- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index c665002e8f..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((java.nio.ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 4a28fc51da..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index 1949a71798..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.util.List; +-import java.util.Map; +-import java.util.function.Function; +-import java.util.stream.IntStream; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +-import org.apache.iceberg.relocated.com.google.common.collect.Lists; +-import org.apache.iceberg.relocated.com.google.common.collect.Maps; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.schema.GroupType; +-import org.apache.parquet.schema.MessageType; +-import org.apache.parquet.schema.PrimitiveType; +-import org.apache.parquet.schema.Type; +-import org.apache.spark.sql.catalyst.InternalRow; +- +-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor> { +- +- private final MessageType parquetSchema; +- private final Schema icebergSchema; +- private final Map idToConstant; +- private final Function>, VectorizedReader> readerFactory; +- private final DeleteFilter deleteFilter; +- +- CometVectorizedReaderBuilder( +- Schema expectedSchema, +- MessageType parquetSchema, +- Map idToConstant, +- Function>, VectorizedReader> readerFactory, +- DeleteFilter deleteFilter) { +- this.parquetSchema = parquetSchema; +- this.icebergSchema = expectedSchema; +- this.idToConstant = idToConstant; +- this.readerFactory = readerFactory; +- this.deleteFilter = deleteFilter; +- } +- +- @Override +- public VectorizedReader message( +- Types.StructType expected, MessageType message, List> fieldReaders) { +- GroupType groupType = message.asGroupType(); +- Map> readersById = Maps.newHashMap(); +- List fields = groupType.getFields(); +- +- IntStream.range(0, fields.size()) +- .filter(pos -> fields.get(pos).getId() != null) +- .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); +- +- List icebergFields = +- expected != null ? 
expected.fields() : ImmutableList.of(); +- +- List> reorderedFields = +- Lists.newArrayListWithExpectedSize(icebergFields.size()); +- +- for (Types.NestedField field : icebergFields) { +- int id = field.fieldId(); +- VectorizedReader reader = readersById.get(id); +- if (idToConstant.containsKey(id)) { +- CometConstantColumnReader constantReader = +- new CometConstantColumnReader<>(idToConstant.get(id), field); +- reorderedFields.add(constantReader); +- } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { +- reorderedFields.add(new CometPositionColumnReader(field)); +- } else if (id == MetadataColumns.IS_DELETED.fieldId()) { +- CometColumnReader deleteReader = new CometDeleteColumnReader<>(field); +- reorderedFields.add(deleteReader); +- } else if (reader != null) { +- reorderedFields.add(reader); +- } else if (field.initialDefault() != null) { +- CometColumnReader constantReader = +- new CometConstantColumnReader<>(field.initialDefault(), field); +- reorderedFields.add(constantReader); +- } else if (field.isOptional()) { +- CometColumnReader constantReader = new CometConstantColumnReader<>(null, field); +- reorderedFields.add(constantReader); +- } else { +- throw new IllegalArgumentException( +- String.format("Missing required field: %s", field.name())); +- } +- } +- return vectorizedReader(reorderedFields); +- } +- +- protected VectorizedReader vectorizedReader(List> reorderedFields) { +- VectorizedReader reader = readerFactory.apply(reorderedFields); +- if (deleteFilter != null) { +- ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter); +- } +- return reader; +- } +- +- @Override +- public VectorizedReader struct( +- Types.StructType expected, GroupType groupType, List> fieldReaders) { +- if (expected != null) { +- throw new UnsupportedOperationException( +- "Vectorized reads are not supported yet for struct fields"); +- } +- return null; +- } +- +- @Override +- public VectorizedReader primitive( +- org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { +- +- if (primitive.getId() == null) { +- return null; +- } +- int parquetFieldId = primitive.getId().intValue(); +- ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); +- // Nested types not yet supported for vectorized reads +- if (desc.getMaxRepetitionLevel() > 0) { +- return null; +- } +- Types.NestedField icebergField = icebergSchema.findField(parquetFieldId); +- if (icebergField == null) { +- return null; +- } +- +- return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +index b523bc5bff..4aa274a0c7 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +@@ -70,22 +70,6 @@ public class VectorizedSparkParquetReaders { + deleteFilter)); + } + +- public static CometColumnarBatchReader buildCometReader( +- Schema expectedSchema, +- MessageType fileSchema, +- Map idToConstant, +- DeleteFilter deleteFilter) { +- return (CometColumnarBatchReader) +- TypeWithSchemaVisitor.visit( +- expectedSchema.asStruct(), +- fileSchema, +- new CometVectorizedReaderBuilder( +- expectedSchema, +- fileSchema, +- idToConstant, +- readers -> new 
CometColumnarBatchReader(readers, expectedSchema), +- deleteFilter)); +- } + + // enables unsafe memory access to avoid costly checks to see if index is within bounds + // as long as it is not configured explicitly (see BoundsChecking in Arrow) +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +index 780e1750a5..25f253eede 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +@@ -34,7 +34,6 @@ import org.apache.iceberg.parquet.Parquet; + import org.apache.iceberg.relocated.com.google.common.collect.Sets; + import org.apache.iceberg.spark.OrcBatchReadConf; + import org.apache.iceberg.spark.ParquetBatchReadConf; +-import org.apache.iceberg.spark.ParquetReaderType; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; + import org.apache.iceberg.types.TypeUtil; +@@ -92,15 +91,9 @@ abstract class BaseBatchReader extends BaseReader { +- if (parquetConf.readerType() == ParquetReaderType.COMET) { +- return VectorizedSparkParquetReaders.buildCometReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } else { +- return VectorizedSparkParquetReaders.buildReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } +- }) ++ fileSchema -> ++ VectorizedSparkParquetReaders.buildReader( ++ requiredSchema, fileSchema, idToConstant, deleteFilter)) + .recordsPerBatch(parquetConf.batchSize()) + .filter(residual) + .caseSensitive(caseSensitive()) diff --git a/dev/diffs/iceberg-rust/1.9.1.diff b/dev/diffs/iceberg-rust/1.9.1.diff index 023e378e27..67bd9fc88c 100644 --- a/dev/diffs/iceberg-rust/1.9.1.diff +++ b/dev/diffs/iceberg-rust/1.9.1.diff @@ -890,3 +890,783 @@ index 6719c45ca..251545440 100644 sourceColumnName, tableName, tableName(OTHER_TABLE_NAME), +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 16159dcbdf..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.c.CometSchemaImporter; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. */ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
+- * <p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 04ac69476a..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,197 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if (hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); 
+- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index c665002e8f..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((java.nio.ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 4a28fc51da..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index 1949a71798..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.util.List; +-import java.util.Map; +-import java.util.function.Function; +-import java.util.stream.IntStream; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +-import org.apache.iceberg.relocated.com.google.common.collect.Lists; +-import org.apache.iceberg.relocated.com.google.common.collect.Maps; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.schema.GroupType; +-import org.apache.parquet.schema.MessageType; +-import org.apache.parquet.schema.PrimitiveType; +-import org.apache.parquet.schema.Type; +-import org.apache.spark.sql.catalyst.InternalRow; +- +-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor> { +- +- private final MessageType parquetSchema; +- private final Schema icebergSchema; +- private final Map idToConstant; +- private final Function>, VectorizedReader> readerFactory; +- private final DeleteFilter deleteFilter; +- +- CometVectorizedReaderBuilder( +- Schema expectedSchema, +- MessageType parquetSchema, +- Map idToConstant, +- Function>, VectorizedReader> readerFactory, +- DeleteFilter deleteFilter) { +- this.parquetSchema = parquetSchema; +- this.icebergSchema = expectedSchema; +- this.idToConstant = idToConstant; +- this.readerFactory = readerFactory; +- this.deleteFilter = deleteFilter; +- } +- +- @Override +- public VectorizedReader message( +- Types.StructType expected, MessageType message, List> fieldReaders) { +- GroupType groupType = message.asGroupType(); +- Map> readersById = Maps.newHashMap(); +- List fields = groupType.getFields(); +- +- IntStream.range(0, fields.size()) +- .filter(pos -> fields.get(pos).getId() != null) +- .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); +- +- List icebergFields = +- expected != null ? 
expected.fields() : ImmutableList.of(); +- +- List> reorderedFields = +- Lists.newArrayListWithExpectedSize(icebergFields.size()); +- +- for (Types.NestedField field : icebergFields) { +- int id = field.fieldId(); +- VectorizedReader reader = readersById.get(id); +- if (idToConstant.containsKey(id)) { +- CometConstantColumnReader constantReader = +- new CometConstantColumnReader<>(idToConstant.get(id), field); +- reorderedFields.add(constantReader); +- } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { +- reorderedFields.add(new CometPositionColumnReader(field)); +- } else if (id == MetadataColumns.IS_DELETED.fieldId()) { +- CometColumnReader deleteReader = new CometDeleteColumnReader<>(field); +- reorderedFields.add(deleteReader); +- } else if (reader != null) { +- reorderedFields.add(reader); +- } else if (field.initialDefault() != null) { +- CometColumnReader constantReader = +- new CometConstantColumnReader<>(field.initialDefault(), field); +- reorderedFields.add(constantReader); +- } else if (field.isOptional()) { +- CometColumnReader constantReader = new CometConstantColumnReader<>(null, field); +- reorderedFields.add(constantReader); +- } else { +- throw new IllegalArgumentException( +- String.format("Missing required field: %s", field.name())); +- } +- } +- return vectorizedReader(reorderedFields); +- } +- +- protected VectorizedReader vectorizedReader(List> reorderedFields) { +- VectorizedReader reader = readerFactory.apply(reorderedFields); +- if (deleteFilter != null) { +- ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter); +- } +- return reader; +- } +- +- @Override +- public VectorizedReader struct( +- Types.StructType expected, GroupType groupType, List> fieldReaders) { +- if (expected != null) { +- throw new UnsupportedOperationException( +- "Vectorized reads are not supported yet for struct fields"); +- } +- return null; +- } +- +- @Override +- public VectorizedReader primitive( +- org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { +- +- if (primitive.getId() == null) { +- return null; +- } +- int parquetFieldId = primitive.getId().intValue(); +- ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); +- // Nested types not yet supported for vectorized reads +- if (desc.getMaxRepetitionLevel() > 0) { +- return null; +- } +- Types.NestedField icebergField = icebergSchema.findField(parquetFieldId); +- if (icebergField == null) { +- return null; +- } +- +- return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc); +- } +-} +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +index b523bc5bff..4aa274a0c7 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +@@ -70,22 +70,6 @@ public class VectorizedSparkParquetReaders { + deleteFilter)); + } + +- public static CometColumnarBatchReader buildCometReader( +- Schema expectedSchema, +- MessageType fileSchema, +- Map idToConstant, +- DeleteFilter deleteFilter) { +- return (CometColumnarBatchReader) +- TypeWithSchemaVisitor.visit( +- expectedSchema.asStruct(), +- fileSchema, +- new CometVectorizedReaderBuilder( +- expectedSchema, +- fileSchema, +- idToConstant, +- readers -> new 
CometColumnarBatchReader(readers, expectedSchema), +- deleteFilter)); +- } + + // enables unsafe memory access to avoid costly checks to see if index is within bounds + // as long as it is not configured explicitly (see BoundsChecking in Arrow) +diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +index 780e1750a5..25f253eede 100644 +--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java ++++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +@@ -34,7 +34,6 @@ import org.apache.iceberg.parquet.Parquet; + import org.apache.iceberg.relocated.com.google.common.collect.Sets; + import org.apache.iceberg.spark.OrcBatchReadConf; + import org.apache.iceberg.spark.ParquetBatchReadConf; +-import org.apache.iceberg.spark.ParquetReaderType; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; + import org.apache.iceberg.types.TypeUtil; +@@ -92,15 +91,9 @@ abstract class BaseBatchReader extends BaseReader { +- if (parquetConf.readerType() == ParquetReaderType.COMET) { +- return VectorizedSparkParquetReaders.buildCometReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } else { +- return VectorizedSparkParquetReaders.buildReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } +- }) ++ fileSchema -> ++ VectorizedSparkParquetReaders.buildReader( ++ requiredSchema, fileSchema, idToConstant, deleteFilter)) + .recordsPerBatch(parquetConf.batchSize()) + .filter(residual) + .caseSensitive(caseSensitive()) From f24616f253f4dfb2d77bd34d95c71f73b87657b6 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 23 Mar 2026 14:25:19 -0400 Subject: [PATCH 6/6] Fix iceberg-rust diffs after #3739. --- dev/diffs/iceberg-rust/1.8.1.diff | 794 ++++++++++++++++++++++++++++++ dev/diffs/iceberg-rust/1.9.1.diff | 778 +++++++++++++++++++++++++++++ 2 files changed, 1572 insertions(+) diff --git a/dev/diffs/iceberg-rust/1.8.1.diff b/dev/diffs/iceberg-rust/1.8.1.diff index 0b993c3898..8990c862c5 100644 --- a/dev/diffs/iceberg-rust/1.8.1.diff +++ b/dev/diffs/iceberg-rust/1.8.1.diff @@ -1695,3 +1695,797 @@ index 780e1750a5..25f253eede 100644 .recordsPerBatch(parquetConf.batchSize()) .filter(residual) .caseSensitive(caseSensitive()) +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 4794863ab1..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,150 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. 
You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.c.CometSchemaImporter; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. 
*/ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageReadStore, Map map, long size) { +- throw new UnsupportedOperationException("Not supported"); +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 1440e5d1d3..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,203 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData, long rowPosition) { +- setRowGroupInfo(pageStore, metaData); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if 
(hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); +- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index 047c96314b..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 6235bfe486..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index bcc0e514c2..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.util.List; +-import java.util.Map; +-import java.util.function.Function; +-import java.util.stream.IntStream; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +-import org.apache.iceberg.relocated.com.google.common.collect.Lists; +-import org.apache.iceberg.relocated.com.google.common.collect.Maps; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.schema.GroupType; +-import org.apache.parquet.schema.MessageType; +-import org.apache.parquet.schema.PrimitiveType; +-import org.apache.parquet.schema.Type; +-import org.apache.spark.sql.catalyst.InternalRow; +- +-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor> { +- +- private final MessageType parquetSchema; +- private final Schema icebergSchema; +- private final Map idToConstant; +- private final Function>, VectorizedReader> readerFactory; +- private final DeleteFilter deleteFilter; +- +- CometVectorizedReaderBuilder( +- Schema expectedSchema, +- MessageType parquetSchema, +- Map idToConstant, +- Function>, VectorizedReader> readerFactory, +- DeleteFilter deleteFilter) { +- this.parquetSchema = parquetSchema; +- this.icebergSchema = expectedSchema; +- this.idToConstant = idToConstant; +- this.readerFactory = readerFactory; +- this.deleteFilter = deleteFilter; +- } +- +- @Override +- public VectorizedReader message( +- Types.StructType expected, MessageType message, List> fieldReaders) { +- GroupType groupType = message.asGroupType(); +- Map> readersById = Maps.newHashMap(); +- List fields = groupType.getFields(); +- +- IntStream.range(0, fields.size()) +- .filter(pos -> fields.get(pos).getId() != null) +- .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); +- +- List icebergFields = +- expected != null ? 
expected.fields() : ImmutableList.of(); +- +- List> reorderedFields = +- Lists.newArrayListWithExpectedSize(icebergFields.size()); +- +- for (Types.NestedField field : icebergFields) { +- int id = field.fieldId(); +- VectorizedReader reader = readersById.get(id); +- if (idToConstant.containsKey(id)) { +- CometConstantColumnReader constantReader = +- new CometConstantColumnReader<>(idToConstant.get(id), field); +- reorderedFields.add(constantReader); +- } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { +- reorderedFields.add(new CometPositionColumnReader(field)); +- } else if (id == MetadataColumns.IS_DELETED.fieldId()) { +- CometColumnReader deleteReader = new CometDeleteColumnReader<>(field); +- reorderedFields.add(deleteReader); +- } else if (reader != null) { +- reorderedFields.add(reader); +- } else if (field.initialDefault() != null) { +- CometColumnReader constantReader = +- new CometConstantColumnReader<>(field.initialDefault(), field); +- reorderedFields.add(constantReader); +- } else if (field.isOptional()) { +- CometColumnReader constantReader = new CometConstantColumnReader<>(null, field); +- reorderedFields.add(constantReader); +- } else { +- throw new IllegalArgumentException( +- String.format("Missing required field: %s", field.name())); +- } +- } +- return vectorizedReader(reorderedFields); +- } +- +- protected VectorizedReader vectorizedReader(List> reorderedFields) { +- VectorizedReader reader = readerFactory.apply(reorderedFields); +- if (deleteFilter != null) { +- ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter); +- } +- return reader; +- } +- +- @Override +- public VectorizedReader struct( +- Types.StructType expected, GroupType groupType, List> fieldReaders) { +- if (expected != null) { +- throw new UnsupportedOperationException( +- "Vectorized reads are not supported yet for struct fields"); +- } +- return null; +- } +- +- @Override +- public VectorizedReader primitive( +- org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { +- +- if (primitive.getId() == null) { +- return null; +- } +- int parquetFieldId = primitive.getId().intValue(); +- ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); +- // Nested types not yet supported for vectorized reads +- if (desc.getMaxRepetitionLevel() > 0) { +- return null; +- } +- Types.NestedField icebergField = icebergSchema.findField(parquetFieldId); +- if (icebergField == null) { +- return null; +- } +- +- return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc); +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java ++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +@@ -70,22 +70,6 @@ + deleteFilter)); + } + +- public static CometColumnarBatchReader buildCometReader( +- Schema expectedSchema, +- MessageType fileSchema, +- Map idToConstant, +- DeleteFilter deleteFilter) { +- return (CometColumnarBatchReader) +- TypeWithSchemaVisitor.visit( +- expectedSchema.asStruct(), +- fileSchema, +- new CometVectorizedReaderBuilder( +- expectedSchema, +- fileSchema, +- idToConstant, +- readers -> new CometColumnarBatchReader(readers, expectedSchema), +- deleteFilter)); +- } + + // enables 
unsafe memory access to avoid costly checks to see if index is within bounds + // as long as it is not configured explicitly (see BoundsChecking in Arrow) +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java ++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +@@ -34,7 +34,6 @@ + import org.apache.iceberg.relocated.com.google.common.collect.Sets; + import org.apache.iceberg.spark.OrcBatchReadConf; + import org.apache.iceberg.spark.ParquetBatchReadConf; +-import org.apache.iceberg.spark.ParquetReaderType; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; + import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; + import org.apache.iceberg.types.TypeUtil; +@@ -92,15 +91,9 @@ + .project(requiredSchema) + .split(start, length) + .createBatchedReaderFunc( +- fileSchema -> { +- if (parquetConf.readerType() == ParquetReaderType.COMET) { +- return VectorizedSparkParquetReaders.buildCometReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } else { +- return VectorizedSparkParquetReaders.buildReader( +- requiredSchema, fileSchema, idToConstant, deleteFilter); +- } +- }) ++ fileSchema -> ++ VectorizedSparkParquetReaders.buildReader( ++ requiredSchema, fileSchema, idToConstant, deleteFilter)) + .recordsPerBatch(parquetConf.batchSize()) + .filter(residual) + .caseSensitive(caseSensitive()) diff --git a/dev/diffs/iceberg-rust/1.9.1.diff b/dev/diffs/iceberg-rust/1.9.1.diff index 67bd9fc88c..7a24dcf42b 100644 --- a/dev/diffs/iceberg-rust/1.9.1.diff +++ b/dev/diffs/iceberg-rust/1.9.1.diff @@ -1670,3 +1670,781 @@ index 780e1750a5..25f253eede 100644 .recordsPerBatch(parquetConf.batchSize()) .filter(residual) .caseSensitive(caseSensitive()) +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java +deleted file mode 100644 +index 16159dcbdf..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnReader.java ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.ColumnReader; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.comet.parquet.Utils; +-import org.apache.comet.shaded.arrow.c.CometSchemaImporter; +-import org.apache.comet.shaded.arrow.memory.RootAllocator; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.parquet.column.page.PageReader; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +-import org.apache.spark.sql.vectorized.ColumnVector; +- +-class CometColumnReader implements VectorizedReader { +- // use the Comet default batch size +- public static final int DEFAULT_BATCH_SIZE = 8192; +- +- private final ColumnDescriptor descriptor; +- private final DataType sparkType; +- +- // The delegated ColumnReader from Comet side +- private AbstractColumnReader delegate; +- private boolean initialized = false; +- private int batchSize = DEFAULT_BATCH_SIZE; +- private CometSchemaImporter importer; +- +- CometColumnReader(DataType sparkType, ColumnDescriptor descriptor) { +- this.sparkType = sparkType; +- this.descriptor = descriptor; +- } +- +- CometColumnReader(Types.NestedField field) { +- DataType dataType = SparkSchemaUtil.convert(field.type()); +- StructField structField = new StructField(field.name(), dataType, false, Metadata.empty()); +- this.sparkType = dataType; +- this.descriptor = TypeUtil.convertToParquet(structField); +- } +- +- public AbstractColumnReader delegate() { +- return delegate; +- } +- +- void setDelegate(AbstractColumnReader delegate) { +- this.delegate = delegate; +- } +- +- void setInitialized(boolean initialized) { +- this.initialized = initialized; +- } +- +- public int batchSize() { +- return batchSize; +- } +- +- /** +- * This method is to initialized/reset the CometColumnReader. This needs to be called for each row +- * group after readNextRowGroup, so a new dictionary encoding can be set for each of the new row +- * groups. +- */ +- public void reset() { +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- +- this.importer = new CometSchemaImporter(new RootAllocator()); +- this.delegate = Utils.getColumnReader(sparkType, descriptor, importer, batchSize, false, false); +- this.initialized = true; +- } +- +- public ColumnDescriptor descriptor() { +- return descriptor; +- } +- +- /** Returns the Spark data type for this column. */ +- public DataType sparkType() { +- return sparkType; +- } +- +- /** +- * Set the page reader to be 'pageReader'. +- * +- *
<p>
NOTE: this should be called before reading a new Parquet column chunk, and after {@link +- * CometColumnReader#reset} is called. +- */ +- public void setPageReader(PageReader pageReader) throws IOException { +- Preconditions.checkState(initialized, "Invalid state: 'reset' should be called first"); +- ((ColumnReader) delegate).setPageReader(pageReader); +- } +- +- @Override +- public void close() { +- // close resources on native side +- if (importer != null) { +- importer.close(); +- } +- +- if (delegate != null) { +- delegate.close(); +- } +- } +- +- @Override +- public void setBatchSize(int size) { +- this.batchSize = size; +- } +- +- @Override +- public ColumnVector read(ColumnVector reuse, int numRowsToRead) { +- throw new UnsupportedOperationException("Not supported"); +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java +deleted file mode 100644 +index 04ac69476a..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometColumnarBatchReader.java ++++ /dev/null +@@ -1,197 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. +- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.io.IOException; +-import java.io.UncheckedIOException; +-import java.util.List; +-import java.util.Map; +-import org.apache.comet.parquet.AbstractColumnReader; +-import org.apache.comet.parquet.BatchReader; +-import org.apache.iceberg.Schema; +-import org.apache.iceberg.data.DeleteFilter; +-import org.apache.iceberg.parquet.VectorizedReader; +-import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +-import org.apache.iceberg.spark.SparkSchemaUtil; +-import org.apache.iceberg.util.Pair; +-import org.apache.parquet.column.page.PageReadStore; +-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +-import org.apache.parquet.hadoop.metadata.ColumnPath; +-import org.apache.spark.sql.catalyst.InternalRow; +-import org.apache.spark.sql.vectorized.ColumnVector; +-import org.apache.spark.sql.vectorized.ColumnarBatch; +- +-/** +- * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized +- * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors +- * populated via delegated read calls to {@link CometColumnReader VectorReader(s)}. 
+- */ +-@SuppressWarnings("checkstyle:VisibilityModifier") +-class CometColumnarBatchReader implements VectorizedReader { +- +- private final CometColumnReader[] readers; +- private final boolean hasIsDeletedColumn; +- +- // The delegated BatchReader on the Comet side does the real work of loading a batch of rows. +- // The Comet BatchReader contains an array of ColumnReader. There is no need to explicitly call +- // ColumnReader.readBatch; instead, BatchReader.nextBatch will be called, which underneath calls +- // ColumnReader.readBatch. The only exception is DeleteColumnReader, because at the time of +- // calling BatchReader.nextBatch, the isDeleted value is not yet available, so +- // DeleteColumnReader.readBatch must be called explicitly later, after the isDeleted value is +- // available. +- private final BatchReader delegate; +- private DeleteFilter deletes = null; +- private long rowStartPosInBatch = 0; +- +- CometColumnarBatchReader(List> readers, Schema schema) { +- this.readers = +- readers.stream().map(CometColumnReader.class::cast).toArray(CometColumnReader[]::new); +- this.hasIsDeletedColumn = +- readers.stream().anyMatch(reader -> reader instanceof CometDeleteColumnReader); +- +- AbstractColumnReader[] abstractColumnReaders = new AbstractColumnReader[readers.size()]; +- this.delegate = new BatchReader(abstractColumnReaders); +- delegate.setSparkSchema(SparkSchemaUtil.convert(schema)); +- } +- +- @Override +- public void setRowGroupInfo( +- PageReadStore pageStore, Map metaData) { +- for (int i = 0; i < readers.length; i++) { +- try { +- if (!(readers[i] instanceof CometConstantColumnReader) +- && !(readers[i] instanceof CometPositionColumnReader) +- && !(readers[i] instanceof CometDeleteColumnReader)) { +- readers[i].reset(); +- readers[i].setPageReader(pageStore.getPageReader(readers[i].descriptor())); +- } +- } catch (IOException e) { +- throw new UncheckedIOException("Failed to setRowGroupInfo for Comet vectorization", e); +- } +- } +- +- for (int i = 0; i < readers.length; i++) { +- delegate.getColumnReaders()[i] = this.readers[i].delegate(); +- } +- +- this.rowStartPosInBatch = +- pageStore +- .getRowIndexOffset() +- .orElseThrow( +- () -> +- new IllegalArgumentException( +- "PageReadStore does not contain row index offset")); +- } +- +- public void setDeleteFilter(DeleteFilter deleteFilter) { +- this.deletes = deleteFilter; +- } +- +- @Override +- public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { +- ColumnarBatch columnarBatch = new ColumnBatchLoader(numRowsToRead).loadDataToColumnBatch(); +- rowStartPosInBatch += numRowsToRead; +- return columnarBatch; +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.setBatchSize(batchSize); +- } +- } +- } +- +- @Override +- public void close() { +- for (CometColumnReader reader : readers) { +- if (reader != null) { +- reader.close(); +- } +- } +- } +- +- private class ColumnBatchLoader { +- private final int batchSize; +- +- ColumnBatchLoader(int numRowsToRead) { +- Preconditions.checkArgument( +- numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); +- this.batchSize = numRowsToRead; +- } +- +- ColumnarBatch loadDataToColumnBatch() { +- ColumnVector[] vectors = readDataToColumnVectors(); +- int numLiveRows = batchSize; +- +- if (hasIsDeletedColumn) { +- boolean[] isDeleted = buildIsDeleted(vectors); +- readDeletedColumn(vectors, isDeleted); +- } else { +- Pair pair = buildRowIdMapping(vectors); 
+- if (pair != null) { +- int[] rowIdMapping = pair.first(); +- numLiveRows = pair.second(); +- for (int i = 0; i < vectors.length; i++) { +- vectors[i] = new ColumnVectorWithFilter(vectors[i], rowIdMapping); +- } +- } +- } +- +- if (deletes != null && deletes.hasEqDeletes()) { +- vectors = ColumnarBatchUtil.removeExtraColumns(deletes, vectors); +- } +- +- ColumnarBatch batch = new ColumnarBatch(vectors); +- batch.setNumRows(numLiveRows); +- return batch; +- } +- +- private boolean[] buildIsDeleted(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildIsDeleted(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- private Pair buildRowIdMapping(ColumnVector[] vectors) { +- return ColumnarBatchUtil.buildRowIdMapping(vectors, deletes, rowStartPosInBatch, batchSize); +- } +- +- ColumnVector[] readDataToColumnVectors() { +- ColumnVector[] columnVectors = new ColumnVector[readers.length]; +- // Fetch rows for all readers in the delegate +- delegate.nextBatch(batchSize); +- for (int i = 0; i < readers.length; i++) { +- columnVectors[i] = readers[i].delegate().currentBatch(); +- } +- +- return columnVectors; +- } +- +- void readDeletedColumn(ColumnVector[] columnVectors, boolean[] isDeleted) { +- for (int i = 0; i < readers.length; i++) { +- if (readers[i] instanceof CometDeleteColumnReader) { +- CometDeleteColumnReader deleteColumnReader = new CometDeleteColumnReader<>(isDeleted); +- deleteColumnReader.setBatchSize(batchSize); +- deleteColumnReader.delegate().readBatch(batchSize); +- columnVectors[i] = deleteColumnReader.delegate().currentBatch(); +- } +- } +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java +deleted file mode 100644 +index 047c96314b..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometConstantColumnReader.java ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import java.math.BigDecimal; +-import java.nio.ByteBuffer; +-import org.apache.comet.parquet.ConstantColumnReader; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataType; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Decimal; +-import org.apache.spark.sql.types.DecimalType; +-import org.apache.spark.unsafe.types.UTF8String; +- +-class CometConstantColumnReader extends CometColumnReader { +- +- CometConstantColumnReader(T value, Types.NestedField field) { +- super(field); +- // use delegate to set constant value on the native side to be consumed by native execution. +- setDelegate( +- new ConstantColumnReader(sparkType(), descriptor(), convertToSparkValue(value), false)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private Object convertToSparkValue(T value) { +- DataType dataType = sparkType(); +- // Match the value to Spark internal type if necessary +- if (dataType == DataTypes.StringType && value instanceof String) { +- // the internal type for StringType is UTF8String +- return UTF8String.fromString((String) value); +- } else if (dataType instanceof DecimalType && value instanceof BigDecimal) { +- // the internal type for DecimalType is Decimal +- return Decimal.apply((BigDecimal) value); +- } else if (dataType == DataTypes.BinaryType && value instanceof ByteBuffer) { +- // the internal type for DecimalType is byte[] +- // Iceberg default value should always use HeapBufferBuffer, so calling ByteBuffer.array() +- // should be safe. +- return ((ByteBuffer) value).array(); +- } else { +- return value; +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java +deleted file mode 100644 +index 6235bfe486..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometDeleteColumnReader.java ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.comet.parquet.TypeUtil; +-import org.apache.iceberg.MetadataColumns; +-import org.apache.iceberg.types.Types; +-import org.apache.spark.sql.types.DataTypes; +-import org.apache.spark.sql.types.Metadata; +-import org.apache.spark.sql.types.StructField; +- +-class CometDeleteColumnReader extends CometColumnReader { +- CometDeleteColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new DeleteColumnReader()); +- } +- +- CometDeleteColumnReader(boolean[] isDeleted) { +- super(MetadataColumns.IS_DELETED); +- setDelegate(new DeleteColumnReader(isDeleted)); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class DeleteColumnReader extends MetadataColumnReader { +- private boolean[] isDeleted; +- +- DeleteColumnReader() { +- super( +- DataTypes.BooleanType, +- TypeUtil.convertToParquet( +- new StructField("_deleted", DataTypes.BooleanType, false, Metadata.empty())), +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- this.isDeleted = new boolean[0]; +- } +- +- DeleteColumnReader(boolean[] isDeleted) { +- this(); +- this.isDeleted = isDeleted; +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set isDeleted on the native side to be consumed by native execution +- Native.setIsDeleted(nativeHandle, isDeleted); +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java +deleted file mode 100644 +index bcc0e514c2..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometPositionColumnReader.java ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */ +-package org.apache.iceberg.spark.data.vectorized; +- +-import org.apache.comet.parquet.MetadataColumnReader; +-import org.apache.comet.parquet.Native; +-import org.apache.iceberg.types.Types; +-import org.apache.parquet.column.ColumnDescriptor; +-import org.apache.spark.sql.types.DataTypes; +- +-class CometPositionColumnReader extends CometColumnReader { +- CometPositionColumnReader(Types.NestedField field) { +- super(field); +- setDelegate(new PositionColumnReader(descriptor())); +- } +- +- @Override +- public void setBatchSize(int batchSize) { +- super.setBatchSize(batchSize); +- delegate().setBatchSize(batchSize); +- setInitialized(true); +- } +- +- private static class PositionColumnReader extends MetadataColumnReader { +- /** The current position value of the column that are used to initialize this column reader. */ +- private long position; +- +- PositionColumnReader(ColumnDescriptor descriptor) { +- super( +- DataTypes.LongType, +- descriptor, +- false /* useDecimal128 = false */, +- false /* isConstant = false */); +- } +- +- @Override +- public void readBatch(int total) { +- Native.resetBatch(nativeHandle); +- // set position on the native side to be consumed by native execution +- Native.setPosition(nativeHandle, position, total); +- position += total; +- +- super.readBatch(total); +- } +- } +-} +diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java +deleted file mode 100644 +index d36f1a7274..0000000000 +--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/CometVectorizedReaderBuilder.java ++++ /dev/null +@@ -1,147 +0,0 @@ +-/* +- * Licensed to the Apache Software Foundation (ASF) under one +- * or more contributor license agreements. See the NOTICE file +- * distributed with this work for additional information +- * regarding copyright ownership. The ASF licenses this file +- * to you under the Apache License, Version 2.0 (the +- * "License"); you may not use this file except in compliance +- * with the License. You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, +- * software distributed under the License is distributed on an +- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +- * KIND, either express or implied. See the License for the +- * specific language governing permissions and limitations +- * under the License. 
+- */
+-package org.apache.iceberg.spark.data.vectorized;
+-
+-import java.util.List;
+-import java.util.Map;
+-import java.util.function.Function;
+-import java.util.stream.IntStream;
+-import org.apache.iceberg.MetadataColumns;
+-import org.apache.iceberg.Schema;
+-import org.apache.iceberg.data.DeleteFilter;
+-import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
+-import org.apache.iceberg.parquet.VectorizedReader;
+-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+-import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+-import org.apache.iceberg.spark.SparkSchemaUtil;
+-import org.apache.iceberg.types.Types;
+-import org.apache.parquet.column.ColumnDescriptor;
+-import org.apache.parquet.schema.GroupType;
+-import org.apache.parquet.schema.MessageType;
+-import org.apache.parquet.schema.PrimitiveType;
+-import org.apache.parquet.schema.Type;
+-import org.apache.spark.sql.catalyst.InternalRow;
+-
+-class CometVectorizedReaderBuilder extends TypeWithSchemaVisitor<VectorizedReader<?>> {
+-
+-  private final MessageType parquetSchema;
+-  private final Schema icebergSchema;
+-  private final Map<Integer, ?> idToConstant;
+-  private final Function<List<VectorizedReader<?>>, VectorizedReader<?>> readerFactory;
+-  private final DeleteFilter<InternalRow> deleteFilter;
+-
+-  CometVectorizedReaderBuilder(
+-      Schema expectedSchema,
+-      MessageType parquetSchema,
+-      Map<Integer, ?> idToConstant,
+-      Function<List<VectorizedReader<?>>, VectorizedReader<?>> readerFactory,
+-      DeleteFilter<InternalRow> deleteFilter) {
+-    this.parquetSchema = parquetSchema;
+-    this.icebergSchema = expectedSchema;
+-    this.idToConstant = idToConstant;
+-    this.readerFactory = readerFactory;
+-    this.deleteFilter = deleteFilter;
+-  }
+-
+-  @Override
+-  public VectorizedReader<?> message(
+-      Types.StructType expected, MessageType message, List<VectorizedReader<?>> fieldReaders) {
+-    GroupType groupType = message.asGroupType();
+-    Map<Integer, VectorizedReader<?>> readersById = Maps.newHashMap();
+-    List<Type> fields = groupType.getFields();
+-
+-    IntStream.range(0, fields.size())
+-        .filter(pos -> fields.get(pos).getId() != null)
+-        .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos)));
+-
+-    List<Types.NestedField> icebergFields =
+-        expected != null ? expected.fields() : ImmutableList.of();
+-
+-    List<VectorizedReader<?>> reorderedFields =
+-        Lists.newArrayListWithExpectedSize(icebergFields.size());
+-
+-    for (Types.NestedField field : icebergFields) {
+-      int id = field.fieldId();
+-      VectorizedReader<?> reader = readersById.get(id);
+-      if (idToConstant.containsKey(id)) {
+-        CometConstantColumnReader<?> constantReader =
+-            new CometConstantColumnReader<>(idToConstant.get(id), field);
+-        reorderedFields.add(constantReader);
+-      } else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
+-        reorderedFields.add(new CometPositionColumnReader(field));
+-      } else if (id == MetadataColumns.IS_DELETED.fieldId()) {
+-        CometColumnReader deleteReader = new CometDeleteColumnReader<>(field);
+-        reorderedFields.add(deleteReader);
+-      } else if (reader != null) {
+-        reorderedFields.add(reader);
+-      } else if (field.initialDefault() != null) {
+-        CometColumnReader constantReader =
+-            new CometConstantColumnReader<>(field.initialDefault(), field);
+-        reorderedFields.add(constantReader);
+-      } else if (field.isOptional()) {
+-        CometColumnReader constantReader = new CometConstantColumnReader<>(null, field);
+-        reorderedFields.add(constantReader);
+-      } else {
+-        throw new IllegalArgumentException(
+-            String.format("Missing required field: %s", field.name()));
+-      }
+-    }
+-    return vectorizedReader(reorderedFields);
+-  }
+-
+-  protected VectorizedReader<?> vectorizedReader(List<VectorizedReader<?>> reorderedFields) {
+-    VectorizedReader<?> reader = readerFactory.apply(reorderedFields);
+-    if (deleteFilter != null) {
+-      ((CometColumnarBatchReader) reader).setDeleteFilter(deleteFilter);
+-    }
+-    return reader;
+-  }
+-
+-  @Override
+-  public VectorizedReader<?> struct(
+-      Types.StructType expected, GroupType groupType, List<VectorizedReader<?>> fieldReaders) {
+-    if (expected != null) {
+-      throw new UnsupportedOperationException(
+-          "Vectorized reads are not supported yet for struct fields");
+-    }
+-    return null;
+-  }
+-
+-  @Override
+-  public VectorizedReader<?> primitive(
+-      org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) {
+-
+-    if (primitive.getId() == null) {
+-      return null;
+-    }
+-    int parquetFieldId = primitive.getId().intValue();
+-    ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath());
+-    // Nested types not yet supported for vectorized reads
+-    if (desc.getMaxRepetitionLevel() > 0) {
+-      return null;
+-    }
+-    Types.NestedField icebergField = icebergSchema.findField(parquetFieldId);
+-    if (icebergField == null) {
+-      return null;
+-    }
+-
+-    return new CometColumnReader(SparkSchemaUtil.convert(icebergField.type()), desc);
+-  }
+-}
+diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java
+@@ -70,22 +70,6 @@
+                 deleteFilter));
+   }
+
+-  public static CometColumnarBatchReader buildCometReader(
+-      Schema expectedSchema,
+-      MessageType fileSchema,
+-      Map<Integer, ?> idToConstant,
+-      DeleteFilter<InternalRow> deleteFilter) {
+-    return (CometColumnarBatchReader)
+-        TypeWithSchemaVisitor.visit(
+-            expectedSchema.asStruct(),
+-            fileSchema,
+-            new CometVectorizedReaderBuilder(
+-                expectedSchema,
+-                fileSchema,
+-                idToConstant,
+-                readers -> new CometColumnarBatchReader(readers, expectedSchema),
+-                deleteFilter));
+-  }
+
+   // enables unsafe memory access to avoid costly checks to see if index is within bounds
+   // as long as it is not configured explicitly (see BoundsChecking in Arrow)
+diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java
+--- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java
++++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java
+@@ -34,7 +34,6 @@
+ import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+ import org.apache.iceberg.spark.OrcBatchReadConf;
+ import org.apache.iceberg.spark.ParquetBatchReadConf;
+-import org.apache.iceberg.spark.ParquetReaderType;
+ import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders;
+ import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
+ import org.apache.iceberg.types.TypeUtil;
+@@ -92,15 +91,9 @@
+         .project(requiredSchema)
+         .split(start, length)
+         .createBatchedReaderFunc(
+-            fileSchema -> {
+-              if (parquetConf.readerType() == ParquetReaderType.COMET) {
+-                return VectorizedSparkParquetReaders.buildCometReader(
+-                    requiredSchema, fileSchema, idToConstant, deleteFilter);
+-              } else {
+-                return VectorizedSparkParquetReaders.buildReader(
+-                    requiredSchema, fileSchema, idToConstant, deleteFilter);
+-              }
+-            })
++            fileSchema ->
++                VectorizedSparkParquetReaders.buildReader(
++                    requiredSchema, fileSchema, idToConstant, deleteFilter))
+         .recordsPerBatch(parquetConf.batchSize())
+         .filter(residual)
+         .caseSensitive(caseSensitive())