From e541a7174c4ce187960f7db4f7c3755e5710170e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 10 Dec 2025 20:55:47 +0100 Subject: [PATCH 001/100] given a parquet file return data from a certain modification time --- .../xtable/parquet/ParquetDataManager.java | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java new file mode 100644 index 000000000..7cf950950 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.parquet; + +import java.io.IOException; +import java.time.Instant; + +import lombok.Builder; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +@Builder +public class ParquetDataManager { + private static final String TIME_FIELD = "modified_at"; + public Path outputFile; + private ParquetReader reader; + private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); + + private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) { + ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file); + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + try (ParquetWriter writer = + new ParquetWriter( + outputFile, + new GroupWriteSupport(), + parquetFileConfig.getCodec(), + (int) parquetFileConfig.getRowGroupSize(), + pageSize, + pageSize, // dictionaryPageSize + true, // enableDictionary + false, // enableValidation + ParquetWriter.DEFAULT_WRITER_VERSION, + conf)) { + Group currentGroup = null; + while ((currentGroup = (Group) reader.read()) != null) { + writer.write(currentGroup); + } + } catch (Exception e) { + + e.printStackTrace(); + } + return outputFile; + } + + public long convertTimeField(Type timeFieldType, int timeIndex, Group record) { + PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType(); + long timestampValue = 
record.getLong(timeIndex, 0); + + // Check the logical type to determine the unit: + LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation(); + // check the cases of Nanos and millis as well + if (logicalType != null + && logicalType.equals( + LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) { + // convert microseconds to milliseconds + timestampValue = timestampValue / 1000; + } + // continue case INT96 and BINARY (time as string) + return timestampValue; + } + + public ParquetReader readParquetDataGroupsFromTime( + String filePath, Configuration conf, Instant modifTime) { + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + MessageType schema = + metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema(); + Group record = null; + + // get the index of the modified_at field + int timeIndex = schema.getFieldIndex(TIME_FIELD); + Type timeFieldType = null; + long recordTime = 0L; + int isAfterModificationTime = 0; + while ((reader.read()) != null) { + // TODO check if TIME_FIELD needs to be extracted or can use default one + timeFieldType = schema.getType(TIME_FIELD); + recordTime = convertTimeField(timeFieldType, timeIndex, record); + isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime); + if (isAfterModificationTime >= 0) { + break; + } + } + // write the remaining reader using a writer + + } catch (IOException e) { + e.printStackTrace(); + } + + return reader; + } +} + +class ParquetFileConfig { + private final MessageType schema; + private final long rowGroupSize; + private final CompressionCodecName codec; + + public ParquetFileConfig(Configuration conf, Path file) { + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { + throw new IllegalStateException("Baseline Parquet file has no row groups."); + } + long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); + this.schema = schema; + this.rowGroupSize = rowGroupSize; + this.codec = codec; + } + + public MessageType getSchema() { + return schema; + } + + public long getRowGroupSize() { + return rowGroupSize; + } + + public CompressionCodecName getCodec() { + return codec; + } +} From 15e282a04a56d6d28cbfb6f3d3fb9b9afe040417 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 13 Dec 2025 18:20:00 +0100 Subject: [PATCH 002/100] create the path based on the partition then inject the file to append into the parquet table --- .../xtable/parquet/ParquetDataManager.java | 135 ++++++------------ .../xtable/parquet/ParquetFileConfig.java | 57 ++++++++ 2 files changed, 100 insertions(+), 92 deletions(-) create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 7cf950950..a80ab8a61 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -20,6 +20,9 @@ import java.io.IOException; import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import 
java.util.UUID; import lombok.Builder; @@ -30,22 +33,51 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; @Builder public class ParquetDataManager { - private static final String TIME_FIELD = "modified_at"; - public Path outputFile; - private ParquetReader reader; private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) { - ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file); + public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) { + ParquetReader reader = null; + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + } catch (IOException e) { + e.printStackTrace(); + } + return reader; + } + + private Path appendNewParquetFile( + Configuration conf, + String rootPath, + Path fileToAppend, + String partitionColumn, + Instant modifTime) { + Path finalFile = null; + // construct the file path to inject into the existing partitioned file + String partitionValue = + DateTimeFormatter.ISO_LOCAL_DATE.format( + modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); + String partitionDir = partitionColumn + partitionValue; + String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; + Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); + // return its reader for convenience of writing + ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); + // append/write it in the right partition + finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); + return finalFile; + } + + private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { + ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); + return parquetFileConfig; + } + + private Path writeNewParquetFile( + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) { + ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; try (ParquetWriter writer = new ParquetWriter( @@ -69,85 +101,4 @@ private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path } return outputFile; } - - public long convertTimeField(Type timeFieldType, int timeIndex, Group record) { - PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType(); - long timestampValue = record.getLong(timeIndex, 0); - - // Check the logical type to determine the unit: - LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation(); - // check the cases of Nanos and millis as well - if (logicalType != null - && logicalType.equals( - LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) { - // convert microseconds to milliseconds - timestampValue = timestampValue / 1000; - } - // continue case INT96 and BINARY (time as string) - return timestampValue; - } - - public ParquetReader 
readParquetDataGroupsFromTime( - String filePath, Configuration conf, Instant modifTime) { - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - MessageType schema = - metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema(); - Group record = null; - - // get the index of the modified_at field - int timeIndex = schema.getFieldIndex(TIME_FIELD); - Type timeFieldType = null; - long recordTime = 0L; - int isAfterModificationTime = 0; - while ((reader.read()) != null) { - // TODO check if TIME_FIELD needs to be extracted or can use default one - timeFieldType = schema.getType(TIME_FIELD); - recordTime = convertTimeField(timeFieldType, timeIndex, record); - isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime); - if (isAfterModificationTime >= 0) { - break; - } - } - // write the remaining reader using a writer - - } catch (IOException e) { - e.printStackTrace(); - } - - return reader; - } -} - -class ParquetFileConfig { - private final MessageType schema; - private final long rowGroupSize; - private final CompressionCodecName codec; - - public ParquetFileConfig(Configuration conf, Path file) { - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); - MessageType schema = metadata.getFileMetaData().getSchema(); - if (metadata.getBlocks().isEmpty()) { - throw new IllegalStateException("Baseline Parquet file has no row groups."); - } - long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); - CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); - this.schema = schema; - this.rowGroupSize = rowGroupSize; - this.codec = codec; - } - - public MessageType getSchema() { - return schema; - } - - public long getRowGroupSize() { - return rowGroupSize; - } - - public CompressionCodecName getCodec() { - return codec; - } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java new file mode 100644 index 000000000..6cdf1d4f3 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.parquet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +class ParquetFileConfig { + private final MessageType schema; + private final long rowGroupSize; + private final CompressionCodecName codec; + + public ParquetFileConfig(Configuration conf, Path file) { + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { + throw new IllegalStateException("Baseline Parquet file has no row groups."); + } + long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); + this.schema = schema; + this.rowGroupSize = rowGroupSize; + this.codec = codec; + } + + public MessageType getSchema() { + return schema; + } + + public long getRowGroupSize() { + return rowGroupSize; + } + + public CompressionCodecName getCodec() { + return codec; + } +} From 2ee71c938d8f1eed47d83a2f00cbd47bfabee448 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 14 Dec 2025 21:19:42 +0100 Subject: [PATCH 003/100] Handle case of path construction with file partitioned over many fields, interfacing with ConversionSource --- .../xtable/parquet/ParquetDataManager.java | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a80ab8a61..9a38c775a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -22,7 +22,9 @@ import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; +import java.util.List; import java.util.UUID; +import java.util.stream.Collectors; import lombok.Builder; @@ -34,6 +36,11 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.stat.PartitionValue; +import org.apache.xtable.model.stat.Range; +import org.apache.xtable.model.storage.InternalDataFile; + @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); @@ -49,18 +56,37 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // partition fields are already computed, given a parquet file InternalDataFile must be derived + // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( Configuration conf, String rootPath, - Path fileToAppend, - String partitionColumn, - Instant modifTime) { + InternalDataFile internalParquetFile, + List partitionFields) { Path finalFile = null; + String partitionDir = ""; + List partitionValues = internalParquetFile.getPartitionValues(); + Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified()); + Path fileToAppend = new Path(internalParquetFile.toString()); // construct the file path to inject into the existing partitioned file - String partitionValue = - 
DateTimeFormatter.ISO_LOCAL_DATE.format( - modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - String partitionDir = partitionColumn + partitionValue; + if (partitionValues == null || partitionValues.isEmpty()) { + String partitionValue = + DateTimeFormatter.ISO_LOCAL_DATE.format( + modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); + partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; + } else { + // handle multiple partitioning case (year and month etc.) + partitionDir = + partitionValues.stream() + .map( + pv -> { + Range epochValueObject = pv.getRange(); + // epochValueObject is always sure to be long + String valueStr = String.valueOf(epochValueObject.getMaxValue()); + return pv.getPartitionField() + "=" + valueStr; + }) + .collect(Collectors.joining("/")); + } String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing From 6032e5f62aa74347a3403ed544320d4fff918d07 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 14 Dec 2025 22:03:44 +0100 Subject: [PATCH 004/100] test append Parquet file into table init --- .../parquet/TestParquetDataManager.java | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java new file mode 100644 index 000000000..2365b5fef --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.parquet; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestParquetDataManager { + @TempDir static java.nio.file.Path tempDir = Paths.get("./"); + + @Test + public void testAppendParquetFile() throws IOException { + SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + + List data = + Arrays.asList( + RowFactory.create(101, "A", 2025, 12), + RowFactory.create(102, "B", 2025, 12), + RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7)); + + Dataset df = spark.createDataFrame(data, schema); + java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test"); + File file = path.toFile(); + String outputPath = file.getPath(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // TODO create an InternalDataFile and Partition Fields for testing purposes + + // TODO test appendNewParquetFile() + + // TODO validate the final table + spark.stop(); + } +} From f6fdc72be71ea937de7a06e0d95aa6fa893879c2 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 15 Dec 2025 21:05:06 +0100 Subject: [PATCH 005/100] add function to test schema equivalence before appending --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 9a38c775a..f0e8a22cd 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -56,6 +56,12 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // check required before appending the file + private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); + ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); + return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( From a94c3f36c9587cfe62ff7330ef5537e03aff8751 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 16 Dec 2025 12:59:02 +0100 Subject: [PATCH 006/100] construct path to inject to based on partitions --- .../xtable/parquet/ParquetDataManager.java | 175 +++++++++++++++--- 
.../parquet/TestParquetDataManager.java | 9 +- 2 files changed, 151 insertions(+), 33 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index f0e8a22cd..a19d8f585 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,16 +19,18 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.Instant; -import java.time.ZoneId; +import java.time.*; import java.time.format.DateTimeFormatter; -import java.util.List; -import java.util.UUID; +import java.time.temporal.ChronoUnit; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import lombok.Builder; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetReader; @@ -36,10 +38,7 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.xtable.model.schema.InternalPartitionField; -import org.apache.xtable.model.stat.PartitionValue; -import org.apache.xtable.model.stat.Range; -import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.iceberg.PartitionField; @Builder public class ParquetDataManager { @@ -56,48 +55,170 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // TODO check if footer of added file can cause problems (catalog) when reading the full table + // after appending // check required before appending the file private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } + + Instant parsePartitionDirToStartTime(String partitionDir, List partitionFields) { + Map partitionValues = new HashMap<>(); + String[] parts = partitionDir.split("/"); + for (String part : parts) { + String[] keyValue = part.split("="); + if (keyValue.length == 2) { + partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]); + } + } + int year = 1970; + int month = 1; + int day = 1; + int hour = 0; + int minute = 0; + int second = 0; + + for (PartitionField field : partitionFields) { + String fieldName = field.name().toLowerCase(); + String value = partitionValues.get(fieldName); + + if (value != null) { + try { + int intValue = Integer.parseInt(value); + + switch (fieldName) { + case "year": + year = intValue; + break; + case "month": + month = intValue; + break; + case "day": + day = intValue; + break; + case "hour": + hour = intValue; + break; + case "minute": + minute = intValue; + break; + case "second": + second = intValue; + break; + default: + break; + } + } catch (NumberFormatException e) { + System.err.println( + "Warning: Invalid number format for partition field '" + fieldName + "': " + value); + } + } + } + + try { + LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second); + return localDateTime.toInstant(ZoneOffset.UTC); + } catch (java.time.DateTimeException e) { + throw new 
IllegalArgumentException( + "Invalid partition directory date components: " + partitionDir, e); + } + } + + private ChronoUnit getGranularityUnit(List partitionFields) { + if (partitionFields == null || partitionFields.isEmpty()) { + return ChronoUnit.DAYS; + } + // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') + String lastFieldName = partitionFields.get(partitionFields.size() - 1).name(); + + if (lastFieldName.equalsIgnoreCase("year")) { + return ChronoUnit.YEARS; + } else if (lastFieldName.equalsIgnoreCase("month")) { + return ChronoUnit.MONTHS; + } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) { + return ChronoUnit.DAYS; + } else if (lastFieldName.equalsIgnoreCase("hour")) { + return ChronoUnit.HOURS; + } else if (lastFieldName.equalsIgnoreCase("minute")) { + return ChronoUnit.MINUTES; + } else { + return ChronoUnit.DAYS; + } + } + + private String findTargetPartitionFolder( + Instant modifTime, + List partitionFields, + // could be retrieved from getParquetFiles() of ParquetConversionSource + Stream parquetFiles) + throws IOException { + + long modifTimeMillis = modifTime.toEpochMilli(); + final ZoneId ZONE = ZoneId.systemDefault(); + ChronoUnit partitionUnit = getGranularityUnit(partitionFields); + List allFolders = + parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); + + // sort the folders by their start time + allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields))); + + for (int i = 0; i < allFolders.size(); i++) { + String currentFolder = allFolders.get(i); + + // start time of the current folder + Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields); + long currentStartMillis = currentStartTime.toEpochMilli(); + + // determine the end time (which is the start time of the next logical partition) + long nextStartMillis; + if (i + 1 < allFolders.size()) { + nextStartMillis = + parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli(); + } else { + ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); + ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); + nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); + } + if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { + return currentFolder; + } + } + return ""; + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( Configuration conf, String rootPath, - InternalDataFile internalParquetFile, - List partitionFields) { + FileStatus parquetFile, + Stream parquetFiles, + List partitionFields) { Path finalFile = null; String partitionDir = ""; - List partitionValues = internalParquetFile.getPartitionValues(); - Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified()); - Path fileToAppend = new Path(internalParquetFile.toString()); + Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); + Path fileToAppend = parquetFile.getPath(); // construct the file path to inject into the existing partitioned file - if (partitionValues == null || partitionValues.isEmpty()) { + if (partitionFields == null || partitionFields.isEmpty()) { String partitionValue = DateTimeFormatter.ISO_LOCAL_DATE.format( modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = 
partitionFields.get(0).getSourceField().getName() + partitionValue; + partitionDir = partitionFields.get(0).name() + partitionValue; } else { - // handle multiple partitioning case (year and month etc.) - partitionDir = - partitionValues.stream() - .map( - pv -> { - Range epochValueObject = pv.getRange(); - // epochValueObject is always sure to be long - String valueStr = String.valueOf(epochValueObject.getMaxValue()); - return pv.getPartitionField() + "=" + valueStr; - }) - .collect(Collectors.joining("/")); + // handle multiple partitioning case (year and month etc.), find the target partition dir + try { + partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles); + } catch (IOException e) { + e.printStackTrace(); + } } + // construct the path String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); - // append/write it in the right partition + // then inject/append/write it in the right partition finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); return finalFile; } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java index 2365b5fef..9db52a70d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -18,8 +18,8 @@ package org.apache.xtable.parquet; -import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.List; @@ -32,10 +32,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; public class TestParquetDataManager { - @TempDir static java.nio.file.Path tempDir = Paths.get("./"); @Test public void testAppendParquetFile() throws IOException { @@ -59,9 +57,8 @@ public void testAppendParquetFile() throws IOException { RowFactory.create(301, "D", 2024, 7)); Dataset df = spark.createDataFrame(data, schema); - java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test"); - File file = path.toFile(); - String outputPath = file.getPath(); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); + String outputPath = fixedPath.toFile().getName(); df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); From f8bdbfe2ff26cb816e8d57f566f3de5aaf6b6831 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 17 Dec 2025 00:25:01 +0100 Subject: [PATCH 007/100] fix imports --- .../xtable/parquet/ParquetDataManager.java | 32 ++++++++++++------- ...Manager.java => ITParquetDataManager.java} | 2 +- 2 files changed, 22 insertions(+), 12 deletions(-) rename xtable-core/src/test/java/org/apache/xtable/parquet/{TestParquetDataManager.java => ITParquetDataManager.java} (98%) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a19d8f585..23c590642 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ 
b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,10 +19,18 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.*; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; -import java.util.*; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -38,7 +46,7 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.iceberg.PartitionField; +import org.apache.xtable.model.schema.InternalPartitionField; @Builder public class ParquetDataManager { @@ -64,7 +72,8 @@ private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fi return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } - Instant parsePartitionDirToStartTime(String partitionDir, List partitionFields) { + Instant parsePartitionDirToStartTime( + String partitionDir, List partitionFields) { Map partitionValues = new HashMap<>(); String[] parts = partitionDir.split("/"); for (String part : parts) { @@ -80,8 +89,8 @@ Instant parsePartitionDirToStartTime(String partitionDir, List p int minute = 0; int second = 0; - for (PartitionField field : partitionFields) { - String fieldName = field.name().toLowerCase(); + for (InternalPartitionField field : partitionFields) { + String fieldName = field.getSourceField().getName().toLowerCase(); String value = partitionValues.get(fieldName); if (value != null) { @@ -126,12 +135,13 @@ Instant parsePartitionDirToStartTime(String partitionDir, List p } } - private ChronoUnit getGranularityUnit(List partitionFields) { + private ChronoUnit getGranularityUnit(List partitionFields) { if (partitionFields == null || partitionFields.isEmpty()) { return ChronoUnit.DAYS; } // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') - String lastFieldName = partitionFields.get(partitionFields.size() - 1).name(); + String lastFieldName = + partitionFields.get(partitionFields.size() - 1).getSourceField().getName(); if (lastFieldName.equalsIgnoreCase("year")) { return ChronoUnit.YEARS; @@ -150,7 +160,7 @@ private ChronoUnit getGranularityUnit(List partitionFields) { private String findTargetPartitionFolder( Instant modifTime, - List partitionFields, + List partitionFields, // could be retrieved from getParquetFiles() of ParquetConversionSource Stream parquetFiles) throws IOException { @@ -194,7 +204,7 @@ private Path appendNewParquetFile( String rootPath, FileStatus parquetFile, Stream parquetFiles, - List partitionFields) { + List partitionFields) { Path finalFile = null; String partitionDir = ""; Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); @@ -204,7 +214,7 @@ private Path appendNewParquetFile( String partitionValue = DateTimeFormatter.ISO_LOCAL_DATE.format( modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = partitionFields.get(0).name() + partitionValue; + partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; } else { // handle multiple partitioning case (year and month etc.), find the target partition dir try { diff --git 
a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java similarity index 98% rename from xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java rename to xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 9db52a70d..30d044578 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -33,7 +33,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; -public class TestParquetDataManager { +public class ITParquetDataManager { @Test public void testAppendParquetFile() throws IOException { From c04a983733bc67614c224caca696333ba96867eb Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 17 Dec 2025 20:57:24 +0100 Subject: [PATCH 008/100] refactoring (lombok, logs, javadocs and function and approach commenting) --- .../xtable/parquet/ParquetDataManager.java | 32 +++++++++++++---- .../xtable/parquet/ParquetFileConfig.java | 35 ++++++++----------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 23c590642..ca0496684 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -35,6 +35,7 @@ import java.util.stream.Stream; import lombok.Builder; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -48,17 +49,27 @@ import org.apache.xtable.model.schema.InternalPartitionField; +/** + * Manages Parquet file operations including reading, writing, and partition discovery and path + * construction. + * + *
<p>
This class provides functions to handle Parquet metadata, validate schemas during appends, and + * calculate target partition directories based on file modification times and defined partition + * fields. + */ +@Log4j2 @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) { + public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { ParquetReader reader = null; Path file = new Path(filePath); try { reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); } catch (IOException e) { - e.printStackTrace(); + log.error("Unexpected error during Parquet read: {}", filePath, e); + throw new IOException("Unexpected error reading Parquet file", e); } return reader; } @@ -158,6 +169,9 @@ private ChronoUnit getGranularityUnit(List partitionFiel } } + // find the target partition by comparing the modficationTime of the file to add with the current + // tables + // if the modifTime falls between two partitions then we know it belongs to the first private String findTargetPartitionFolder( Instant modifTime, List partitionFields, @@ -167,6 +181,7 @@ private String findTargetPartitionFolder( long modifTimeMillis = modifTime.toEpochMilli(); final ZoneId ZONE = ZoneId.systemDefault(); + // find the partition granularity (days, hours...) ChronoUnit partitionUnit = getGranularityUnit(partitionFields); List allFolders = parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); @@ -189,6 +204,7 @@ private String findTargetPartitionFolder( } else { ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); + // to evaluate the partition date value and make comparable with the modifTime in Instant nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); } if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { @@ -204,7 +220,8 @@ private Path appendNewParquetFile( String rootPath, FileStatus parquetFile, Stream parquetFiles, - List partitionFields) { + List partitionFields) + throws IOException { Path finalFile = null; String partitionDir = ""; Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); @@ -227,7 +244,7 @@ private Path appendNewParquetFile( String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing - ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); + ParquetReader reader = getParquetReader(fileToAppend.getName(), conf); // then inject/append/write it in the right partition finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); return finalFile; @@ -239,7 +256,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp } private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) { + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) + throws IOException { ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; try (ParquetWriter writer = @@ -259,8 +277,8 @@ private Path writeNewParquetFile( writer.write(currentGroup); } } catch (Exception e) { - - 
e.printStackTrace(); + log.error("Unexpected error during Parquet write: {}", outputFile, e); + throw new IOException("Failed to complete Parquet write operation", e); } return outputFile; } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 6cdf1d4f3..e4b645f5d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,40 +18,33 @@ package org.apache.xtable.parquet; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; +@Getter +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) class ParquetFileConfig { - private final MessageType schema; - private final long rowGroupSize; - private final CompressionCodecName codec; + MessageType schema; + long rowGroupSize; + CompressionCodecName codec; public ParquetFileConfig(Configuration conf, Path file) { ParquetMetadata metadata = ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); - MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { - throw new IllegalStateException("Baseline Parquet file has no row groups."); + throw new IllegalStateException("Parquet file contains no row groups."); } - long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); - CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); - this.schema = schema; - this.rowGroupSize = rowGroupSize; - this.codec = codec; - } - - public MessageType getSchema() { - return schema; - } - - public long getRowGroupSize() { - return rowGroupSize; - } - public CompressionCodecName getCodec() { - return codec; + this.schema = metadata.getFileMetaData().getSchema(); + this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } From 5f2541e9c81d91b8b2a1cc3fe986457a7e3bb96c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 18:03:56 +0100 Subject: [PATCH 009/100] use appendFile to append a file into a table while tracking the appends using the FileStatus' modifTime attribute --- .../xtable/parquet/ParquetDataManager.java | 139 ++++++++++++------ 1 file changed, 93 insertions(+), 46 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index ca0496684..19777be81 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -39,13 +39,20 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; import 
org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.hadoop.util.HadoopOutputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; import org.apache.xtable.model.schema.InternalPartitionField; @@ -61,27 +68,8 @@ @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - - public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { - ParquetReader reader = null; - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - } catch (IOException e) { - log.error("Unexpected error during Parquet read: {}", filePath, e); - throw new IOException("Unexpected error reading Parquet file", e); - } - return reader; - } - - // TODO check if footer of added file can cause problems (catalog) when reading the full table - // after appending - // check required before appending the file - private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { - ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); - ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); - return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); - } + private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB + private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB Instant parsePartitionDirToStartTime( String partitionDir, List partitionFields) { @@ -213,6 +201,34 @@ private String findTargetPartitionFolder( } return ""; } + + private Path writeNewParquetFile( + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) + throws IOException { + ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + try (ParquetWriter writer = + new ParquetWriter( + outputFile, + new GroupWriteSupport(), + parquetFileConfig.getCodec(), + (int) parquetFileConfig.getRowGroupSize(), + pageSize, + pageSize, // dictionaryPageSize + true, // enableDictionary + false, // enableValidation + ParquetWriter.DEFAULT_WRITER_VERSION, + conf)) { + Group currentGroup = null; + while ((currentGroup = (Group) reader.read()) != null) { + writer.write(currentGroup); + } + } catch (Exception e) { + log.error("Unexpected error during Parquet write: {}", outputFile, e); + throw new IOException("Failed to complete Parquet write operation", e); + } + return outputFile; + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( @@ -250,36 +266,67 @@ private Path appendNewParquetFile( return finalFile; } + public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { + ParquetReader reader = null; + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + } catch (IOException e) { + log.error("Unexpected error during Parquet read: {}", filePath, e); + throw new IOException("Unexpected error reading Parquet file", e); + } + return reader; + } + + /* Alternative Approach (without path construction) using Parquet API to append a file */ + + // after appending 
check required before appending the file + private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); + ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); + return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); + } + private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) - throws IOException { - ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); - int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; - try (ParquetWriter writer = - new ParquetWriter( - outputFile, - new GroupWriteSupport(), - parquetFileConfig.getCodec(), - (int) parquetFileConfig.getRowGroupSize(), - pageSize, - pageSize, // dictionaryPageSize - true, // enableDictionary - false, // enableValidation - ParquetWriter.DEFAULT_WRITER_VERSION, - conf)) { - Group currentGroup = null; - while ((currentGroup = (Group) reader.read()) != null) { - writer.write(currentGroup); - } - } catch (Exception e) { - log.error("Unexpected error during Parquet write: {}", outputFile, e); - throw new IOException("Failed to complete Parquet write operation", e); + // Method2 to append a file into a table + public void mergeParquetFiles( + Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { + Configuration conf = new Configuration(); + ParquetFileWriter writer = + new ParquetFileWriter( + HadoopOutputFile.fromPath(outputPath, conf), + schema, + ParquetFileWriter.Mode.CREATE, + DEFAULT_BLOCK_SIZE, + 0); + // write the initial table with the appended file to add into the outputPath + writer.start(); + HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); + if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { + writer.appendFile(inputFile); + writer.appendFile(inputFileToAppend); } - return outputFile; + Map combinedMeta = new HashMap<>(); + // track the append date and save it in the footer + int appendCount = + Integer.parseInt( + ParquetFileReader.readFooter(conf, filePath) + .getFileMetaData() + .getKeyValueMetaData() + .getOrDefault("total_appends", "0")); + String newKey = "append_date_" + (appendCount + 1); + // get the equivalent fileStatus from the file-to-append Path + FileSystem fs = FileSystem.get(conf); + FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save its modification time (for later sync related retrieval) in the metadata of the output + // table + combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); + combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + writer.end(combinedMeta); } } From 47e707688d3ced9ee34c477c988224dc5c663ff4 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 19:01:06 +0100 Subject: [PATCH 010/100] find the files that satisfy to the time condition --- .../xtable/parquet/ParquetDataManager.java | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 19777be81..777b829be 
100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -26,11 +26,7 @@ import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -47,8 +43,8 @@ import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.apache.parquet.io.InputFile; @@ -231,7 +227,7 @@ private Path writeNewParquetFile( } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) - private Path appendNewParquetFile( + private void appendNewParquetFile( Configuration conf, String rootPath, FileStatus parquetFile, @@ -259,26 +255,12 @@ private Path appendNewParquetFile( // construct the path String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); - // return its reader for convenience of writing - ParquetReader reader = getParquetReader(fileToAppend.getName(), conf); // then inject/append/write it in the right partition - finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); - return finalFile; + MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); + AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); } - public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { - ParquetReader reader = null; - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - } catch (IOException e) { - log.error("Unexpected error during Parquet read: {}", filePath, e); - throw new IOException("Unexpected error reading Parquet file", e); - } - return reader; - } - - /* Alternative Approach (without path construction) using Parquet API to append a file */ + /* Use Parquet API to append to a file */ // after appending check required before appending the file private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { @@ -292,8 +274,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - // Method2 to append a file into a table - public void mergeParquetFiles( + // append a file into a table + public void AppendNewParquetFiles( Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); ParquetFileWriter writer = @@ -329,4 +311,35 @@ public void mergeParquetFiles( combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); writer.end(combinedMeta); } + // Find and retrieve the file paths that satisfy the time condition + // Each partition folder contains one merged_file in turn containing many appends + public List 
findFilesAfterModifTime( + Configuration conf, Path directoryPath, long targetModifTime) throws IOException { + List results = new ArrayList<>(); + FileSystem fs = directoryPath.getFileSystem(conf); + + FileStatus[] statuses = + fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + + for (FileStatus status : statuses) { + Path filePath = status.getPath(); + + try { + ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath); + Map meta = footer.getFileMetaData().getKeyValueMetaData(); + + int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0")); + if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) { + results.add(filePath); + break; + } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { + continue; + } + } catch (Exception e) { + log.error("Could not read metadata for: {}", filePath, e); + throw new IOException("Could not read metadata for", e); + } + } + return results; + } } From fbb09ecbc284c7c1eaa96a22934b91134f0e9750 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 19:16:18 +0100 Subject: [PATCH 011/100] treat appends as separate files to add in the target partition folder --- .../xtable/parquet/ParquetDataManager.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 777b829be..cc1f473d0 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -34,10 +34,7 @@ import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.*; import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetFileWriter; @@ -257,7 +254,15 @@ private void appendNewParquetFile( Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // then inject/append/write it in the right partition MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); - AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); + appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); + + // OR + /*FileSystem fs = outputFile.getFileSystem(conf); + if (!fs.exists(outputFile.getParent())) { + fs.mkdirs(outputFile.getParent()); + } + // copy a seperate file into the target partition folder + FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/ } /* Use Parquet API to append to a file */ @@ -275,7 +280,7 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp } // append a file into a table - public void AppendNewParquetFiles( + public void appendNewParquetFiles( Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); ParquetFileWriter writer = @@ -335,11 +340,17 @@ public List findFilesAfterModifTime( } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { continue; } + // OR + /*if (status.getPath().getName().endsWith(".parquet") && + status.getModificationTime() > targetModifTime) { + 
results.add(status.getPath()); + }*/ } catch (Exception e) { log.error("Could not read metadata for: {}", filePath, e); throw new IOException("Could not read metadata for", e); } } + return results; } } From fe19a6055c92079a938ff2feb83d105e1b6c720e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 20:10:35 +0100 Subject: [PATCH 012/100] update approach: selective block compaction --- .../xtable/parquet/ParquetDataManager.java | 295 +++++------------- .../xtable/parquet/ParquetFileConfig.java | 2 +- 2 files changed, 73 insertions(+), 224 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index cc1f473d0..563c6ca3d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,36 +19,23 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; -import java.time.temporal.ChronoUnit; import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; import lombok.Builder; import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; -import org.apache.parquet.example.data.Group; +import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetReader; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.apache.parquet.io.InputFile; import org.apache.parquet.schema.MessageType; -import org.apache.xtable.model.schema.InternalPartitionField; - /** * Manages Parquet file operations including reading, writing, and partition discovery and path * construction. 
@@ -64,207 +51,6 @@ public class ParquetDataManager { private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB - Instant parsePartitionDirToStartTime( - String partitionDir, List partitionFields) { - Map partitionValues = new HashMap<>(); - String[] parts = partitionDir.split("/"); - for (String part : parts) { - String[] keyValue = part.split("="); - if (keyValue.length == 2) { - partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]); - } - } - int year = 1970; - int month = 1; - int day = 1; - int hour = 0; - int minute = 0; - int second = 0; - - for (InternalPartitionField field : partitionFields) { - String fieldName = field.getSourceField().getName().toLowerCase(); - String value = partitionValues.get(fieldName); - - if (value != null) { - try { - int intValue = Integer.parseInt(value); - - switch (fieldName) { - case "year": - year = intValue; - break; - case "month": - month = intValue; - break; - case "day": - day = intValue; - break; - case "hour": - hour = intValue; - break; - case "minute": - minute = intValue; - break; - case "second": - second = intValue; - break; - default: - break; - } - } catch (NumberFormatException e) { - System.err.println( - "Warning: Invalid number format for partition field '" + fieldName + "': " + value); - } - } - } - - try { - LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second); - return localDateTime.toInstant(ZoneOffset.UTC); - } catch (java.time.DateTimeException e) { - throw new IllegalArgumentException( - "Invalid partition directory date components: " + partitionDir, e); - } - } - - private ChronoUnit getGranularityUnit(List partitionFields) { - if (partitionFields == null || partitionFields.isEmpty()) { - return ChronoUnit.DAYS; - } - // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') - String lastFieldName = - partitionFields.get(partitionFields.size() - 1).getSourceField().getName(); - - if (lastFieldName.equalsIgnoreCase("year")) { - return ChronoUnit.YEARS; - } else if (lastFieldName.equalsIgnoreCase("month")) { - return ChronoUnit.MONTHS; - } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) { - return ChronoUnit.DAYS; - } else if (lastFieldName.equalsIgnoreCase("hour")) { - return ChronoUnit.HOURS; - } else if (lastFieldName.equalsIgnoreCase("minute")) { - return ChronoUnit.MINUTES; - } else { - return ChronoUnit.DAYS; - } - } - - // find the target partition by comparing the modficationTime of the file to add with the current - // tables - // if the modifTime falls between two partitions then we know it belongs to the first - private String findTargetPartitionFolder( - Instant modifTime, - List partitionFields, - // could be retrieved from getParquetFiles() of ParquetConversionSource - Stream parquetFiles) - throws IOException { - - long modifTimeMillis = modifTime.toEpochMilli(); - final ZoneId ZONE = ZoneId.systemDefault(); - // find the partition granularity (days, hours...) 
- ChronoUnit partitionUnit = getGranularityUnit(partitionFields); - List allFolders = - parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); - - // sort the folders by their start time - allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields))); - - for (int i = 0; i < allFolders.size(); i++) { - String currentFolder = allFolders.get(i); - - // start time of the current folder - Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields); - long currentStartMillis = currentStartTime.toEpochMilli(); - - // determine the end time (which is the start time of the next logical partition) - long nextStartMillis; - if (i + 1 < allFolders.size()) { - nextStartMillis = - parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli(); - } else { - ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); - ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); - // to evaluate the partition date value and make comparable with the modifTime in Instant - nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); - } - if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { - return currentFolder; - } - } - return ""; - } - - private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) - throws IOException { - ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); - int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; - try (ParquetWriter writer = - new ParquetWriter( - outputFile, - new GroupWriteSupport(), - parquetFileConfig.getCodec(), - (int) parquetFileConfig.getRowGroupSize(), - pageSize, - pageSize, // dictionaryPageSize - true, // enableDictionary - false, // enableValidation - ParquetWriter.DEFAULT_WRITER_VERSION, - conf)) { - Group currentGroup = null; - while ((currentGroup = (Group) reader.read()) != null) { - writer.write(currentGroup); - } - } catch (Exception e) { - log.error("Unexpected error during Parquet write: {}", outputFile, e); - throw new IOException("Failed to complete Parquet write operation", e); - } - return outputFile; - } - // partition fields are already computed, given a parquet file InternalDataFile must be derived - // (e.g., using createInternalDataFileFromParquetFile()) - private void appendNewParquetFile( - Configuration conf, - String rootPath, - FileStatus parquetFile, - Stream parquetFiles, - List partitionFields) - throws IOException { - Path finalFile = null; - String partitionDir = ""; - Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); - Path fileToAppend = parquetFile.getPath(); - // construct the file path to inject into the existing partitioned file - if (partitionFields == null || partitionFields.isEmpty()) { - String partitionValue = - DateTimeFormatter.ISO_LOCAL_DATE.format( - modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; - } else { - // handle multiple partitioning case (year and month etc.), find the target partition dir - try { - partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles); - } catch (IOException e) { - e.printStackTrace(); - } - } - // construct the path - String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; - Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); - // then inject/append/write it in 
the right partition - MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); - appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); - - // OR - /*FileSystem fs = outputFile.getFileSystem(conf); - if (!fs.exists(outputFile.getParent())) { - fs.mkdirs(outputFile.getParent()); - } - // copy a seperate file into the target partition folder - FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/ - } - /* Use Parquet API to append to a file */ // after appending check required before appending the file @@ -279,23 +65,25 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - // append a file into a table - public void appendNewParquetFiles( - Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { + // append a file into a table (merges two files into one .parquet under a partition folder) + public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) + throws IOException { Configuration conf = new Configuration(); + long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize(); ParquetFileWriter writer = new ParquetFileWriter( - HadoopOutputFile.fromPath(outputPath, conf), + HadoopOutputFile.fromPath(filePath, conf), + // HadoopOutputFile.fromPath(outputPath, conf), schema, ParquetFileWriter.Mode.CREATE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath writer.start(); - HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - writer.appendFile(inputFile); + // writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); @@ -310,12 +98,73 @@ public void appendNewParquetFiles( // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); - // save its modification time (for later sync related retrieval) in the metadata of the output + // save block indexes and modification time (for later sync related retrieval) in the metadata + // of the output // table combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + combinedMeta.put( + "index_start_block_of_append_" + String.valueOf(appendCount + 1), + String.valueOf(firstBlockIndex)); writer.end(combinedMeta); } + // selective compaction of parquet blocks + public List formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime) + throws IOException { + List finalPaths = new ArrayList<>(); + FileSystem fs = directoryPath.getFileSystem(conf); + + FileStatus[] statuses = + fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + + for (FileStatus status : statuses) { + + Path filePath = status.getPath(); + ParquetMetadata bigFileFooter = + ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER); + MessageType schema = bigFileFooter.getFileMetaData().getSchema(); + int totalAppends = + Integer.parseInt( + bigFileFooter + .getFileMetaData() + .getKeyValueMetaData() + .getOrDefault("total_appends", "0")); + for (int i = 1; i < totalAppends; i++) { + int startBlock = + Integer.valueOf( + bigFileFooter 
+ .getFileMetaData() + .getKeyValueMetaData() + .get("index_start_block_of_append_" + totalAppends)); + int endBlock = + Integer.valueOf( + bigFileFooter + .getFileMetaData() + .getKeyValueMetaData() + .get("index_end_block_of_append_" + (totalAppends + 1))); + Path targetSyncFilePath = + new Path( + status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); + try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { + writer.start(); + for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { + BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); + if (j >= startBlock + && j <= endBlock + && status.getModificationTime() >= targetModifTime) { + List blockList = + Collections.singletonList(blockMetadata); + FSDataInputStream targetInputStream = fs.open(status.getPath()); + writer.appendRowGroups(targetInputStream, blockList, false); + } + } + writer.end(new HashMap<>()); + finalPaths.add(targetSyncFilePath); + } + } + } + return finalPaths; + } // Find and retrieve the file paths that satisfy the time condition // Each partition folder contains one merged_file in turn containing many appends public List findFilesAfterModifTime( diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index e4b645f5d..64c9e9cfc 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) { } this.schema = metadata.getFileMetaData().getSchema(); - this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize() this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } From da7f300b18172b21f968ba4583d2fbe98dc0488c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 20:50:01 +0100 Subject: [PATCH 013/100] update approach: added a basic test to check data selection using modificationTime selector --- .../xtable/parquet/ParquetDataManager.java | 41 +------------ .../xtable/parquet/ITParquetDataManager.java | 58 +++++++++++++++++-- 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 563c6ca3d..75f3c4f80 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -109,8 +109,8 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType writer.end(combinedMeta); } // selective compaction of parquet blocks - public List formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime) - throws IOException { + public static List formNewTargetFiles( + Configuration conf, Path directoryPath, long targetModifTime) throws IOException { List finalPaths = new ArrayList<>(); FileSystem fs = directoryPath.getFileSystem(conf); @@ -165,41 +165,4 @@ public List formNewTargetFiles(Configuration conf, Path directoryPath, lon } return finalPaths; } - // Find and retrieve the file paths that satisfy the time condition - // Each partition folder contains one merged_file in turn containing many appends - public List findFilesAfterModifTime( - 
Configuration conf, Path directoryPath, long targetModifTime) throws IOException { - List results = new ArrayList<>(); - FileSystem fs = directoryPath.getFileSystem(conf); - - FileStatus[] statuses = - fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); - - for (FileStatus status : statuses) { - Path filePath = status.getPath(); - - try { - ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath); - Map meta = footer.getFileMetaData().getKeyValueMetaData(); - - int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0")); - if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) { - results.add(filePath); - break; - } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { - continue; - } - // OR - /*if (status.getPath().getName().endsWith(".parquet") && - status.getModificationTime() > targetModifTime) { - results.add(status.getPath()); - }*/ - } catch (Exception e) { - log.error("Could not read metadata for: {}", filePath, e); - throw new IOException("Could not read metadata for", e); - } - } - - return results; - } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 30d044578..c084eb6df 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,12 +18,17 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -39,7 +44,7 @@ public class ITParquetDataManager { public void testAppendParquetFile() throws IOException { SparkSession spark = SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); - + Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( new StructField[] { @@ -62,11 +67,56 @@ public void testAppendParquetFile() throws IOException { df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - // TODO create an InternalDataFile and Partition Fields for testing purposes + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = + new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test"); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the file + updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); + long newModifTime = System.currentTimeMillis() - 5000; + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); + + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = new 
org.apache.hadoop.fs.Path(hdfsPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + List resultingFiles = + ParquetDataManager.formNewTargetFiles( + conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime); + // check if resultingFiles contains the append data only (through the partition names) + for (org.apache.hadoop.fs.Path p : resultingFiles) { + String pathString = p.toString(); + // should be TRUE + boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); - // TODO test appendNewParquetFile() + // should be FALSE + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } + // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) - // TODO validate the final table spark.stop(); } + + private void updateModificationTimeRecursive( + FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { + org.apache.hadoop.fs.RemoteIterator it = + fs.listFiles(path, true); + while (it.hasNext()) { + org.apache.hadoop.fs.LocatedFileStatus status = it.next(); + if (status.getPath().getName().endsWith(".parquet")) { + fs.setTimes(status.getPath(), time, -1); + } + } + } } From a8730b7c33fd4aabe9066a855b630bfb0e08bdda Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 21:18:28 +0100 Subject: [PATCH 014/100] fix append based on partition value --- .../xtable/parquet/ParquetDataManager.java | 30 +++++++++++++++---- .../xtable/parquet/ParquetFileConfig.java | 4 +-- .../xtable/parquet/ITParquetDataManager.java | 2 ++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 75f3c4f80..6585fda41 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,25 +65,45 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } + public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { + Configuration conf = new Configuration(); + + // e.g., "year=2024/month=12" + String targetPartition = getPartitionPath(targetPath); + String sourcePartition = getPartitionPath(sourcePath); + + if (!targetPartition.equals(sourcePartition)) { + throw new IllegalArgumentException("Partition Mismatch! Cannot merge " + + sourcePartition + " into " + targetPartition); + } + + // append files within the same partition foldr + appendNewParquetFiles(targetPath, sourcePath, schema); + } + + private String getPartitionPath(Path path) { + // Returns the parent directory name (e.g., country=US) + // For nested partitions, you might need path.getParent().getParent()... 
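+ // For illustration only (hypothetical two-level layout such as "year=2026/month=12"),
+ // a nested partition path could be rebuilt from the last two directory names:
+ // path.getParent().getParent().getName() + "/" + path.getParent().getName()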
+ return path.getParent().getName(); + } // append a file into a table (merges two files into one .parquet under a partition folder) public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); - long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize(); + long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); ParquetFileWriter writer = new ParquetFileWriter( HadoopOutputFile.fromPath(filePath, conf), - // HadoopOutputFile.fromPath(outputPath, conf), schema, ParquetFileWriter.Mode.CREATE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath writer.start(); - // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - // writer.appendFile(inputFile); + writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); @@ -105,7 +125,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); combinedMeta.put( "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex)); + String.valueOf(firstBlockIndex+1)); writer.end(combinedMeta); } // selective compaction of parquet blocks diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 64c9e9cfc..33ee08d38 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -32,7 +32,7 @@ @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) class ParquetFileConfig { MessageType schema; - long rowGroupSize; + long rowGroupIndex; CompressionCodecName codec; public ParquetFileConfig(Configuration conf, Path file) { @@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) { } this.schema = metadata.getFileMetaData().getSchema(); - this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize() + this.rowGroupIndex = metadata.getBlocks().size(); this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index c084eb6df..3aff654a1 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -79,6 +79,8 @@ public void testAppendParquetFile() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); + // TODO create the folders manually for a new partition value as appendFile works only within the same partition value + long newModifTime = System.currentTimeMillis() - 5000; List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); From d19ccbf2ebe776f69dbf65e560c8bcf4433f8599 Mon Sep 17 00:00:00 2001 From: 
Selim Soufargi Date: Fri, 2 Jan 2026 22:46:38 +0100 Subject: [PATCH 015/100] fix test with basic example where partitions are not considered --- .../xtable/parquet/ParquetDataManager.java | 9 ++++--- .../xtable/parquet/ITParquetDataManager.java | 26 ++++++++----------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 6585fda41..01fb6b75f 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,7 +65,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { + public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) + throws IOException { Configuration conf = new Configuration(); // e.g., "year=2024/month=12" @@ -73,8 +74,8 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy String sourcePartition = getPartitionPath(sourcePath); if (!targetPartition.equals(sourcePartition)) { - throw new IllegalArgumentException("Partition Mismatch! Cannot merge " + - sourcePartition + " into " + targetPartition); + throw new IllegalArgumentException( + "Partition Mismatch! Cannot merge " + sourcePartition + " into " + targetPartition); } // append files within the same partition foldr @@ -125,7 +126,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); combinedMeta.put( "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex+1)); + String.valueOf(firstBlockIndex + 1)); writer.end(combinedMeta); } // selective compaction of parquet blocks diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3aff654a1..61a9a9cf5 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,7 +18,6 @@ package org.apache.xtable.parquet; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; @@ -55,11 +54,9 @@ public void testAppendParquetFile() throws IOException { }); List data = - Arrays.asList( - RowFactory.create(101, "A", 2025, 12), - RowFactory.create(102, "B", 2025, 12), - RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7)); + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); + /* RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7));*/ Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); @@ -69,20 +66,20 @@ public void testAppendParquetFile() throws IOException { // test find files to sync long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = - new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test"); + org.apache.hadoop.fs.Path hdfsPath = new 
org.apache.hadoop.fs.Path(outputPath); FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); // set the modification time to the file updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); // create new file to append using Spark List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - // TODO create the folders manually for a new partition value as appendFile works only within the same partition value + // TODO create the folders manually for a new partition value as appendFile works only within + // the same partition value long newModifTime = System.currentTimeMillis() - 5000; - List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); + List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); @@ -91,19 +88,18 @@ public void testAppendParquetFile() throws IOException { } } List resultingFiles = - ParquetDataManager.formNewTargetFiles( - conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime); + ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); // should be TRUE - boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); + boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); // should be FALSE boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) From aecb204ede4ad15851e1c403866e5cb88366be2a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 23:06:36 +0100 Subject: [PATCH 016/100] fix test with basic example where partitions are not considered2 --- .../xtable/parquet/ITParquetDataManager.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 61a9a9cf5..ad5a41f2a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -60,7 +60,8 @@ public void testAppendParquetFile() throws IOException { Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); - String outputPath = fixedPath.toFile().getName(); + String outputPath = fixedPath.toString(); + ; df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); @@ -79,25 +80,30 @@ public void testAppendParquetFile() throws IOException { // the same partition value long newModifTime = 
System.currentTimeMillis() - 5000; - List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - - for (String partition : newPartitions) { + // TODO many partitions case + // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); + /* for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); if (fs.exists(partitionPath)) { updateModificationTimeRecursive(fs, partitionPath, newModifTime); } - } + }*/ + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); + updateModificationTimeRecursive(fs, partitionPath, newModifTime); List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE + // should be TRUE (test for many partitions) boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - // should be FALSE + // should be FALSE (test for many partitions) boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); - + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } From 0ec8cbb7ed740cbe72687d02878742167bca9ac8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 23:22:34 +0100 Subject: [PATCH 017/100] fix test with basic example where partitions are not considered3 --- .../xtable/parquet/ParquetDataManager.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 01fb6b75f..e342cdacb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -122,11 +122,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType // save block indexes and modification time (for later sync related retrieval) in the metadata // of the output // table - combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); - combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + int currentAppendIdx = appendCount + 1; + long startBlock = firstBlockIndex; + long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); + long endBlock = startBlock + blocksAdded; + + combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); + combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); + combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock)); combinedMeta.put( - "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex + 1)); + "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); } // selective compaction of parquet blocks @@ -156,13 +161,13 @@ public static List formNewTargetFiles( 
bigFileFooter .getFileMetaData() .getKeyValueMetaData() - .get("index_start_block_of_append_" + totalAppends)); + .get("index_start_block_of_append_" + i)); int endBlock = Integer.valueOf( bigFileFooter .getFileMetaData() .getKeyValueMetaData() - .get("index_end_block_of_append_" + (totalAppends + 1))); + .get("index_end_block_of_append_" + i)); Path targetSyncFilePath = new Path( status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); From 9cb75dfd13c1180b38095e6d3ab5b0f80d97f2c8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 11:29:34 +0100 Subject: [PATCH 018/100] test with time of last append is now --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index ad5a41f2a..ac2a94887 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -79,7 +79,7 @@ public void testAppendParquetFile() throws IOException { // TODO create the folders manually for a new partition value as appendFile works only within // the same partition value - long newModifTime = System.currentTimeMillis() - 5000; + long newModifTime = System.currentTimeMillis() - 50000; // TODO many partitions case // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); /* for (String partition : newPartitions) { @@ -90,7 +90,7 @@ public void testAppendParquetFile() throws IOException { }*/ org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - updateModificationTimeRecursive(fs, partitionPath, newModifTime); + // updateModificationTimeRecursive(fs, partitionPath, newModifTime); List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) From 9e125f242ccd56fa1770ae27dbd6c0e0db4bc3e1 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 12:07:27 +0100 Subject: [PATCH 019/100] test appendFile with Parquet: TODO test with multiple partitions 1) append and 2) filter for sync --- .../xtable/parquet/ParquetDataManager.java | 14 +-- .../xtable/parquet/ITParquetDataManager.java | 101 +++++++++++++++++- 2 files changed, 106 insertions(+), 9 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index e342cdacb..5e8bb25bb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -54,13 +54,14 @@ public class ParquetDataManager { /* Use Parquet API to append to a file */ // after appending check required before appending the file - private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + private static boolean checkIfSchemaIsSame( + Configuration conf, Path fileToAppend, Path fileFromTable) { ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } - private 
ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { + private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } @@ -88,7 +89,7 @@ private String getPartitionPath(Path path) { return path.getParent().getName(); } // append a file into a table (merges two files into one .parquet under a partition folder) - public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) + public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); @@ -133,15 +134,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put( "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); + return filePath; } // selective compaction of parquet blocks public static List formNewTargetFiles( - Configuration conf, Path directoryPath, long targetModifTime) throws IOException { + Configuration conf, Path partitionPath, long targetModifTime) throws IOException { List finalPaths = new ArrayList<>(); - FileSystem fs = directoryPath.getFileSystem(conf); + FileSystem fs = partitionPath.getFileSystem(conf); FileStatus[] statuses = - fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet")); for (FileStatus status : statuses) { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index ac2a94887..b26460a32 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -28,6 +28,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -40,9 +44,9 @@ public class ITParquetDataManager { @Test - public void testAppendParquetFile() throws IOException { + public void testFormParquetFileSinglePartition() throws IOException { SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate(); Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -107,7 +111,98 @@ public void testAppendParquetFile() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) + spark.stop(); + } + + @Test + public void testAppendParquetFileSinglePartition() throws IOException { + SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + Configuration conf = 
spark.sparkContext().hadoopConfiguration(); + MessageType schemaParquet = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("id") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("value") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("year") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("month") + .named("parquet_schema"); + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); + /* RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7));*/ + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + // update modifTime for file to append + updateModificationTimeRecursive( + fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); + org.apache.hadoop.fs.Path outputFile = + ParquetDataManager.appendNewParquetFiles( + new org.apache.hadoop.fs.Path(outputPath), + new org.apache.hadoop.fs.Path(finalAppendFilePath), + schemaParquet); + + // TODO many partitions case + // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); + /* for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + }*/ + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); + // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + + // test whether the appended data can be selectively filtered for incr sync (e.g. 
get me the + // data to sync only) + List resultingFiles = + ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime); + // check if resultingFiles contains the append data only (through the partition names) + for (org.apache.hadoop.fs.Path p : resultingFiles) { + String pathString = p.toString(); + // should be TRUE (test for many partitions) + boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); + + // should be FALSE (test for many partitions) + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } spark.stop(); } From 233ca770baccb5a6fd00d800bafb1029a7cb5720 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 13:10:02 +0100 Subject: [PATCH 020/100] merge recursively one partition files --- .../xtable/parquet/ParquetDataManager.java | 47 ++++++++++++++++++- .../xtable/parquet/ITParquetDataManager.java | 5 +- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 5e8bb25bb..6384f5c01 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -88,7 +88,7 @@ private String getPartitionPath(Path path) { // For nested partitions, you might need path.getParent().getParent()... 
return path.getParent().getName(); } - // append a file into a table (merges two files into one .parquet under a partition folder) + // append a file (merges two files into one .parquet under a partition folder) public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); @@ -193,4 +193,49 @@ public static List formNewTargetFiles( } return finalPaths; } + + public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema) + throws IOException { + Configuration conf = new Configuration(); + FileSystem fs = sourcePath.getFileSystem(conf); + + List targetFiles = collectParquetFiles(fs, targetPath); + + List sourceFiles = collectParquetFiles(fs, sourcePath); + + if (targetFiles.isEmpty()) { + throw new IOException("Target directory contains no parquet files to append to."); + } + if (sourceFiles.isEmpty()) { + return targetPath; + } + Path masterTargetFile = targetFiles.get(0); + + for (int i = 1; i < targetFiles.size(); i++) { + appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); + } + for (Path sourceFile : sourceFiles) { + appendNewParquetFiles(masterTargetFile, sourceFile, schema); + } + + return masterTargetFile; + } + + private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { + List parquetFiles = new ArrayList<>(); + if (!fs.exists(root)) return parquetFiles; + + if (fs.getFileStatus(root).isDirectory()) { + RemoteIterator it = fs.listFiles(root, true); + while (it.hasNext()) { + Path p = it.next().getPath(); + if (p.getName().endsWith(".parquet")) { + parquetFiles.add(p); + } + } + } else { + parquetFiles.add(root); + } + return parquetFiles; + } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index b26460a32..3d7619cf3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -167,11 +167,14 @@ public void testAppendParquetFileSinglePartition() throws IOException { // update modifTime for file to append updateModificationTimeRecursive( fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); + // recursively append all files under a specific partition path (e.g. 
"year=2026/month=12") + // (assuming that outputPath and finalAppendFilePath are same partitions for different files) org.apache.hadoop.fs.Path outputFile = - ParquetDataManager.appendNewParquetFiles( + ParquetDataManager.appendPartitionedData( new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); + // can create big file which can be split if reaches a certain threshold size // TODO many partitions case // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); From b4cba5a1c358689a8953e7b8d7916d2d2d51abc7 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 13:39:56 +0100 Subject: [PATCH 021/100] fix paths for files to append --- .../main/java/org/apache/xtable/parquet/ParquetDataManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 6384f5c01..da88a50e2 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -97,7 +97,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa new ParquetFileWriter( HadoopOutputFile.fromPath(filePath, conf), schema, - ParquetFileWriter.Mode.CREATE, + ParquetFileWriter.Mode.OVERWRITE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath From a564b29a7a11659b9ac647996cebfe1f7a34ca51 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 14:08:27 +0100 Subject: [PATCH 022/100] fix bug of appending file path --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index da88a50e2..fbf546d37 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -93,9 +93,10 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); + Path tempPath = new Path(filePath.getParent(), "." 
+ filePath.getName() + ".tmp"); ParquetFileWriter writer = new ParquetFileWriter( - HadoopOutputFile.fromPath(filePath, conf), + HadoopOutputFile.fromPath(tempPath, conf), schema, ParquetFileWriter.Mode.OVERWRITE, DEFAULT_BLOCK_SIZE, @@ -134,6 +135,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa combinedMeta.put( "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); + fs.delete(filePath, false); + fs.rename(tempPath, filePath); return filePath; } // selective compaction of parquet blocks From d1ceafba5cef0f8a916a994d44e74bbecfb45f64 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 14:38:15 +0100 Subject: [PATCH 023/100] fix bug of schema --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3d7619cf3..6c490bcbb 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -119,6 +119,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { SparkSession spark = SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); Configuration conf = spark.sparkContext().hadoopConfiguration(); + // In testAppendParquetFileSinglePartition MessageType schemaParquet = Types.buildMessage() .required(PrimitiveType.PrimitiveTypeName.INT32) @@ -126,10 +127,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { .required(PrimitiveType.PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.stringType()) .named("value") - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("year") - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("month") .named("parquet_schema"); StructType schema = DataTypes.createStructType( From 649250fbfb939367ce4108e7e6bef267911859b8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 15:10:27 +0100 Subject: [PATCH 024/100] make sure returned files are non empty when iterating in the partitions --- .../org/apache/xtable/parquet/ParquetDataManager.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index fbf546d37..2650e4791 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -226,14 +226,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { List parquetFiles = new ArrayList<>(); + Configuration conf = fs.getConf(); if (!fs.exists(root)) return parquetFiles; if (fs.getFileStatus(root).isDirectory()) { RemoteIterator it = fs.listFiles(root, true); while (it.hasNext()) { Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet")) { - parquetFiles.add(p); + if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) { + ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); + if (!footer.getBlocks().isEmpty()) { + parquetFiles.add(p); + } } } } else { From 
24ff828b03af7d03cdecb3556fbe71254802d682 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 15:32:58 +0100 Subject: [PATCH 025/100] make sure returned files are non empty when iterating in the partitions 2 --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 2650e4791..991879d05 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -233,9 +233,11 @@ private static List collectParquetFiles(FileSystem fs, Path root) throws I RemoteIterator it = fs.listFiles(root, true); while (it.hasNext()) { Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) { + if (p.getName().endsWith(".parquet") + && !p.toString().contains("/.") + && !p.toString().contains("/_")) { ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); - if (!footer.getBlocks().isEmpty()) { + if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) { parquetFiles.add(p); } } From 463d2eea37664ce1aab0414c8ac366f665abb1e6 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 16:20:21 +0100 Subject: [PATCH 026/100] discard empty files when appending --- .../org/apache/xtable/parquet/ParquetDataManager.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 991879d05..942bf95e8 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -215,9 +215,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa Path masterTargetFile = targetFiles.get(0); for (int i = 1; i < targetFiles.size(); i++) { + long len = fs.getFileStatus(targetFiles.get(i)).getLen(); + System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len); appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); } for (Path sourceFile : sourceFiles) { + ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile); + if (sourceFooter.getBlocks().isEmpty()) { + System.out.println("SKIPPING: " + sourceFile + " has no data."); + continue; + } + long len = fs.getFileStatus(sourceFile).getLen(); + System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); appendNewParquetFiles(masterTargetFile, sourceFile, schema); } From 71d1c34f97f12db45840277a7f7f4cfa095e8222 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 16:45:59 +0100 Subject: [PATCH 027/100] discard empty files when appending --- .../apache/xtable/parquet/ParquetDataManager.java | 13 ++++++++++++- .../apache/xtable/parquet/ITParquetDataManager.java | 7 ++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 942bf95e8..76efa93fb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ 
-212,7 +212,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa if (sourceFiles.isEmpty()) { return targetPath; } - Path masterTargetFile = targetFiles.get(0); + Path masterTargetFile = null; + for (Path p : targetFiles) { + ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p); + if (!metadata.getBlocks().isEmpty()) { + masterTargetFile = p; + break; + } + } + + if (masterTargetFile == null) { + throw new IOException("Target directory contains no files with valid data blocks."); + } for (int i = 1; i < targetFiles.size(); i++) { long len = fs.getFileStatus(targetFiles.get(i)).getLen(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 6c490bcbb..bd419c885 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -159,7 +159,12 @@ public void testAppendParquetFileSinglePartition() throws IOException { List futureDataToSync = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); long newModifTime = System.currentTimeMillis() - 50000; // update modifTime for file to append updateModificationTimeRecursive( From 3319a91429eec91faeaabb3dce4d0eac95b5b3a5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 17:08:36 +0100 Subject: [PATCH 028/100] one spark session for both tests --- .../xtable/parquet/ITParquetDataManager.java | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index bd419c885..d12e3016d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -39,14 +39,22 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; public class ITParquetDataManager { + private static SparkSession spark; + + @BeforeAll + public static void setup() { + spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); + } @Test public void testFormParquetFileSinglePartition() throws IOException { - SparkSession spark = - SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate(); + /*SparkSession spark = + SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -111,13 +119,13 @@ public void testFormParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - spark.stop(); + // spark.stop(); } @Test public void 
testAppendParquetFileSinglePartition() throws IOException { - SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + /*SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition MessageType schemaParquet = @@ -209,7 +217,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - spark.stop(); + // spark.stop(); } private void updateModificationTimeRecursive( @@ -223,4 +231,11 @@ private void updateModificationTimeRecursive( } } } + + @AfterAll + public static void tearDown() { + if (spark != null) { + spark.stop(); + } + } } From 013ffe4c000b68992460679180eb448ef63c1974 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 17:41:31 +0100 Subject: [PATCH 029/100] bug fix --- .../xtable/parquet/ParquetDataManager.java | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 76efa93fb..471925c1d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -93,6 +93,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); + ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath); + Map existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData(); Path tempPath = new Path(filePath.getParent(), "." 
+ filePath.getName() + ".tmp"); ParquetFileWriter writer = new ParquetFileWriter( @@ -110,14 +112,11 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); + if (existingMeta != null) { + combinedMeta.putAll(existingMeta); + } // track the append date and save it in the footer - int appendCount = - Integer.parseInt( - ParquetFileReader.readFooter(conf, filePath) - .getFileMetaData() - .getKeyValueMetaData() - .getOrDefault("total_appends", "0")); - String newKey = "append_date_" + (appendCount + 1); + int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); @@ -127,7 +126,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa int currentAppendIdx = appendCount + 1; long startBlock = firstBlockIndex; long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); - long endBlock = startBlock + blocksAdded; + long endBlock = startBlock + blocksAdded - 1; combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); @@ -160,7 +159,7 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .getOrDefault("total_appends", "0")); - for (int i = 1; i < totalAppends; i++) { + for (int i = 1; i <= totalAppends; i++) { int startBlock = Integer.valueOf( bigFileFooter From b939a0ac1229ca6d8fd5cec695d98a0f31c5d395 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 18:03:43 +0100 Subject: [PATCH 030/100] bug fix --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 471925c1d..bf0035630 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -172,9 +172,10 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .get("index_end_block_of_append_" + i)); - Path targetSyncFilePath = - new Path( - status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); + String newFileName = + String.format( + "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); + Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { From b6d8ddccb49eb83e687f327fee9e3e80f356421c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 18:36:47 +0100 Subject: [PATCH 031/100] cleanups + TODO: partitions match and merge --- .../apache/xtable/parquet/ParquetDataManager.java | 6 ++---- .../apache/xtable/parquet/ITParquetDataManager.java | 12 ------------ 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index bf0035630..7263612e6 100644 --- 
a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,10 +65,10 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - + // TODO use this method to match the partition folders and merge them one by one (if one exists + // but not in other file then create it) public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { - Configuration conf = new Configuration(); // e.g., "year=2024/month=12" String targetPartition = getPartitionPath(targetPath); @@ -84,8 +84,6 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy } private String getPartitionPath(Path path) { - // Returns the parent directory name (e.g., country=US) - // For nested partitions, you might need path.getParent().getParent()... return path.getParent().getName(); } // append a file (merges two files into one .parquet under a partition folder) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index d12e3016d..420618b92 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -53,8 +53,6 @@ public static void setup() { @Test public void testFormParquetFileSinglePartition() throws IOException { - /*SparkSession spark = - SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -67,9 +65,6 @@ public void testFormParquetFileSinglePartition() throws IOException { List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - /* RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7));*/ - Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); String outputPath = fixedPath.toString(); @@ -119,13 +114,10 @@ public void testFormParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - // spark.stop(); } @Test public void testAppendParquetFileSinglePartition() throws IOException { - /*SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition MessageType schemaParquet = @@ -146,8 +138,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { }); List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - /* RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7));*/ Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); @@ -216,8 +206,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // 
assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - - // spark.stop(); } private void updateModificationTimeRecursive( From c18ab1c23a16d12accb00e40842261e4931778a8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 15:57:08 +0100 Subject: [PATCH 032/100] added test for many partitions, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 46 ++++++ .../xtable/parquet/ITParquetDataManager.java | 149 +++++++++++++----- 2 files changed, 156 insertions(+), 39 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 7263612e6..cfd3d66c3 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -265,4 +265,50 @@ private static List collectParquetFiles(FileSystem fs, Path root) throws I } return parquetFiles; } + + public static List mergeDatasetsByPartition( + Path targetRoot, Path sourceRoot, MessageType schema) throws IOException { + List finalPartitionPaths = new ArrayList<>(); + Configuration conf = new Configuration(); + FileSystem fs = targetRoot.getFileSystem(conf); + + Map sourcePartitions = mapPartitions(fs, sourceRoot); + + Map targetPartitions = mapPartitions(fs, targetRoot); + + for (Map.Entry entry : sourcePartitions.entrySet()) { + String partitionKey = entry.getKey(); + Path sourcePartitionPath = entry.getValue(); + + if (targetPartitions.containsKey(partitionKey)) { + Path targetPartitionPath = targetPartitions.get(partitionKey); + + finalPartitionPaths.add( + appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema)); + } else { + // TODO logic for when a partition exists in source but not in target + // simply merge the files in the respective partitions using appendPartitionedData() that + // takes only one Path/dir + + } + } + return finalPartitionPaths; + } + + private static Map mapPartitions(FileSystem fs, Path root) throws IOException { + Map partitionMap = new HashMap<>(); + RemoteIterator it = fs.listFiles(root, true); + + while (it.hasNext()) { + Path filePath = it.next().getPath(); + if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) { + Path parent = filePath.getParent(); + String relative = parent.toString().replace(root.toString(), ""); + if (relative.startsWith("/")) relative = relative.substring(1); + + partitionMap.put(relative, parent); + } + } + return partitionMap; + } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 420618b92..fe2bcc069 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,11 +18,13 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -41,6 +43,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; 
public class ITParquetDataManager { @@ -52,6 +55,7 @@ public static void setup() { } @Test + @Disabled("This test already passed CI") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -68,8 +72,6 @@ public void testFormParquetFileSinglePartition() throws IOException { Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); String outputPath = fixedPath.toString(); - ; - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync @@ -83,40 +85,26 @@ public void testFormParquetFileSinglePartition() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - // TODO create the folders manually for a new partition value as appendFile works only within - // the same partition value long newModifTime = System.currentTimeMillis() - 50000; - // TODO many partitions case - // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - /* for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - }*/ - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE (test for many partitions) - boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - // should be FALSE (test for many partitions) - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + boolean isNewData = pathString.contains("year=2026"); + long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } } @Test + @Disabled("This test already passed CI") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition @@ -174,19 +162,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); - // can create big file which can be split if reaches a certain threshold size - - // TODO many partitions case - // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - /* for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, 
partitionPath, newModifTime); - } - }*/ - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + // TODO can create big file which can be split if reaches a certain threshold size // test whether the appended data can be selectively filtered for incr sync (e.g. get me the // data to sync only) @@ -195,16 +171,111 @@ public void testAppendParquetFileSinglePartition() throws IOException { // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE (test for many partitions) - boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - - // should be FALSE (test for many partitions) - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + boolean isNewData = pathString.contains("year=2026"); long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } + } + + @Test + public void testAppendParquetFileMultiplePartition() throws IOException { + Configuration conf = spark.sparkContext().hadoopConfiguration(); + + MessageType schemaParquet = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("id") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("value") + .named("parquet_schema"); + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + // update modifTime for file to append + // many partitions case + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, targetModifTime); + } + } + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11)); + Dataset dfToSync = 
spark.createDataFrame(futureDataToSync, schema); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + // recursively append all files under a specific partition path (e.g. "year=2026/month=12") + // (assuming that outputPath and finalAppendFilePath are same partitions for different files) + // the result contains all partition paths with the parquet files compacted + List outputPaths = + ParquetDataManager.mergeDatasetsByPartition( + new org.apache.hadoop.fs.Path(outputPath), + new org.apache.hadoop.fs.Path(finalAppendFilePath), + schemaParquet); + // TODO can create big file which can be split if reaches a certain threshold size + // implement bin-packing function for large files + + // test whether the appended data can be selectively filtered for incr sync (e.g. get me the + // data to sync only) + List> finalPaths = new ArrayList<>(); + for (org.apache.hadoop.fs.Path resPath : outputPaths) { + List resultingFiles = + ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime); + finalPaths.add(resultingFiles); + } + // check if resultingFiles contains the append data only + for (List fPaths : finalPaths) { + for (org.apache.hadoop.fs.Path p : fPaths) { + String pathString = p.toString(); + // should be TRUE + boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); + // should be FALSE + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } } } From b7c613e0def55e32b121e65ceef45da241b9d5e3 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 17:39:04 +0100 Subject: [PATCH 033/100] selecting data bug fix, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index cfd3d66c3..9fd6fd925 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -170,6 +170,9 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .get("index_end_block_of_append_" + i)); + long modifTime = + Long.parseLong( + bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i)); String newFileName = String.format( "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); @@ -178,9 +181,7 @@ public static List formNewTargetFiles( writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - if (j >= startBlock - && j <= endBlock - && 
status.getModificationTime() >= targetModifTime) { + if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) { List blockList = Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); From 2a75f4925daedc63f6186fc34033478e16ba5621 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 17:59:33 +0100 Subject: [PATCH 034/100] run all tests, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index fe2bcc069..de49a1d86 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -55,7 +55,6 @@ public static void setup() { } @Test - @Disabled("This test already passed CI") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -104,7 +103,6 @@ public void testFormParquetFileSinglePartition() throws IOException { } @Test - @Disabled("This test already passed CI") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition From f1538b07e0dc79ec9e36d2cc437f4209c01a0f3d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 18:49:33 +0100 Subject: [PATCH 035/100] run all tests, TODO integrate functions into ParquetConversionSource --- .../apache/xtable/parquet/ITParquetDataManager.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index de49a1d86..0049af0bb 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -69,7 +69,7 @@ public void testFormParquetFileSinglePartition() throws IOException { List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0"); String outputPath = fixedPath.toString(); df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); @@ -126,8 +126,8 @@ public void testAppendParquetFileSinglePartition() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); @@ -201,8 +201,8 @@ public void testAppendParquetFileMultiplePartition() 
throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); From cdedeaecf8a467229edf758166f62e74ddb25bab Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 19:08:47 +0100 Subject: [PATCH 036/100] spotless:apply, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 1 - 1 file changed, 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 0049af0bb..e47ddcf11 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -43,7 +43,6 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; public class ITParquetDataManager { From e87312031363e60f20029c7e6827414ea6420fb5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 23:50:19 +0100 Subject: [PATCH 037/100] bug fixes, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 48 +++++++++---------- .../xtable/parquet/ITParquetDataManager.java | 27 +++++++++-- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 9fd6fd925..bac6a5e74 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,27 +65,7 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - // TODO use this method to match the partition folders and merge them one by one (if one exists - // but not in other file then create it) - public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) - throws IOException { - - // e.g., "year=2024/month=12" - String targetPartition = getPartitionPath(targetPath); - String sourcePartition = getPartitionPath(sourcePath); - - if (!targetPartition.equals(sourcePartition)) { - throw new IllegalArgumentException( - "Partition Mismatch! 
Cannot merge " + sourcePartition + " into " + targetPartition); - } - - // append files within the same partition foldr - appendNewParquetFiles(targetPath, sourcePath, schema); - } - - private String getPartitionPath(Path path) { - return path.getParent().getName(); - } + // TODO add safe guards for possible empty parquet files // append a file (merges two files into one .parquet under a partition folder) public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { @@ -105,6 +85,12 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa writer.start(); HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); + // get the equivalent fileStatus from the file-to-append Path + FileSystem fs = FileSystem.get(conf); + FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save modif Time (not needed for real case scenario) + // long prevModifTime = fileStatus.getModificationTime(); + if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); @@ -115,9 +101,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa } // track the append date and save it in the footer int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); - // get the equivalent fileStatus from the file-to-append Path - FileSystem fs = FileSystem.get(conf); - FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save block indexes and modification time (for later sync related retrieval) in the metadata // of the output // table @@ -133,9 +117,13 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); fs.delete(filePath, false); + // restore modifTime not needed actually (only for test purposes) + // the append happens here so the time must NOT updated manually + // fs.setTimes(tempPath, prevModifTime, -1); fs.rename(tempPath, filePath); return filePath; } + // TODO add safe guards for possible empty parquet files // selective compaction of parquet blocks public static List formNewTargetFiles( Configuration conf, Path partitionPath, long targetModifTime) throws IOException { @@ -178,6 +166,7 @@ public static List formNewTargetFiles( "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { + boolean wasDataAppended = false; writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); @@ -186,10 +175,17 @@ public static List formNewTargetFiles( Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); writer.appendRowGroups(targetInputStream, blockList, false); + wasDataAppended = true; } } + // manually restore the modifTime after using appendRowGroups() writer.end(new HashMap<>()); - finalPaths.add(targetSyncFilePath); + if (wasDataAppended) { + fs.setTimes(targetSyncFilePath, modifTime, -1); + finalPaths.add(targetSyncFilePath); + } else { + fs.delete(targetSyncFilePath, false); + } } } } @@ -236,7 +232,9 @@ public static Path appendPartitionedData(Path targetPath, Path 
sourcePath, Messa continue; } long len = fs.getFileStatus(sourceFile).getLen(); + long len_2 = fs.getFileStatus(masterTargetFile).getLen(); System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); + System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2); appendNewParquetFiles(masterTargetFile, sourceFile, schema); } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index e47ddcf11..83d2c0a91 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -43,6 +43,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; public class ITParquetDataManager { @@ -54,6 +55,7 @@ public static void setup() { } @Test + @Disabled("") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -102,6 +104,7 @@ public void testFormParquetFileSinglePartition() throws IOException { } @Test + @Disabled("") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition @@ -197,7 +200,11 @@ public void testAppendParquetFileMultiplePartition() throws IOException { DataTypes.createStructField("month", DataTypes.IntegerType, false) }); List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); + Arrays.asList( + RowFactory.create(100, "A", 2026, 12), + RowFactory.create(101, "AA", 2026, 12), + RowFactory.create(102, "CB", 2027, 11), + RowFactory.create(103, "BA", 2027, 11)); Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); @@ -205,7 +212,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync long targetModifTime = System.currentTimeMillis() - 360000; @@ -224,10 +231,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException { } // create new file to append using Spark List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11)); + Arrays.asList( + RowFactory.create(101, "A", 2026, 12), + RowFactory.create(301, "D", 2027, 11), + RowFactory.create(302, "DA", 2027, 11)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync - .coalesce(1) + .coalesce(1) // since we are exec this test on low data volume .write() .partitionBy("year", "month") .mode("overwrite") @@ -248,6 +258,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException { new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); + // reset time FOR TESTING PURPOSES ONLY + // (in real, newModifTime time is the appending time as set by the appendRow() in the + // DataManager) + // THIS IS 
NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED + for (org.apache.hadoop.fs.Path oPath : outputPaths) { + fs.setTimes(oPath, newModifTime, -1); + } // TODO can create big file which can be split if reaches a certain threshold size // implement bin-packing function for large files @@ -269,7 +286,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } From 4d0f245c02512840d16c1567a40201efea7ca678 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 5 Jan 2026 09:51:17 +0100 Subject: [PATCH 038/100] bug fixes, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index bac6a5e74..e0f8daee2 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -88,9 +88,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); - // save modif Time (not needed for real case scenario) - // long prevModifTime = fileStatus.getModificationTime(); - if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); @@ -106,9 +103,23 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa // of the output // table int currentAppendIdx = appendCount + 1; - long startBlock = firstBlockIndex; - long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); - long endBlock = startBlock + blocksAdded - 1; + List initialBlocks = existingFooter.getBlocks(); + long totalInitialCompressedSize = 0; + // first the blocks start at 0 + for (BlockMetaData block : initialBlocks) { + totalInitialCompressedSize += block.getCompressedSize(); + } + long startBlock = 0; + if (appendCount > 0) { + startBlock = totalInitialCompressedSize; + } + + List addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks(); + long totalAddedCompressedSize = 0; + for (BlockMetaData block : addedBlocks) { + totalAddedCompressedSize += block.getCompressedSize(); + } + long endBlock = totalInitialCompressedSize + totalAddedCompressedSize; combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); @@ -117,9 +128,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); fs.delete(filePath, false); - // restore modifTime not needed actually (only for 
test purposes) - // the append happens here so the time must NOT updated manually - // fs.setTimes(tempPath, prevModifTime, -1); fs.rename(tempPath, filePath); return filePath; } @@ -170,7 +178,11 @@ public static List formNewTargetFiles( writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) { + long currentBlockStart = blockMetadata.getStartingPos(); + long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize(); + if (currentBlockStart >= startBlock + && currentBlockEnd <= endBlock + && modifTime >= targetModifTime) { List blockList = Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); From 219656e02194c807c93868e7ab42c96d4e6fc136 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 22:29:29 +0100 Subject: [PATCH 039/100] handle lightweight metadata in the conversionSource --- .../parquet/ParquetConversionSource.java | 70 ++-- .../xtable/parquet/ParquetDataManager.java | 298 ++---------------- .../xtable/parquet/ParquetFileConfig.java | 20 +- .../xtable/parquet/ITParquetDataManager.java | 275 ---------------- 4 files changed, 90 insertions(+), 573 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index e7eff9616..26d758026 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -56,6 +56,10 @@ public class ParquetConversionSource implements ConversionSource { private static final ParquetSchemaExtractor schemaExtractor = ParquetSchemaExtractor.getInstance(); + private static final ParquetMetadataExtractor metadataExtractor = + ParquetMetadataExtractor.getInstance(); + private static final ParquetDataManager parquetDataManagerExtractor = + ParquetDataManager.getInstance(); private static final ParquetMetadataExtractor parquetMetadataExtractor = ParquetMetadataExtractor.getInstance(); @@ -69,7 +73,7 @@ public class ParquetConversionSource implements ConversionSource { private final String basePath; @NonNull private final Configuration hadoopConf; - private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) { + private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) { ParquetMetadata parquetMetadata = parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata); @@ -87,15 +91,41 @@ private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) .layoutStrategy(dataLayoutStrategy) .partitioningFields(partitionFields) .readSchema(schema) - .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime())) + .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime())) .build(); } + public Stream getConfigsFromStream( + Stream fileStream, Configuration conf) { + + return fileStream.map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + 
.modifTime(fileStatus.getModificationTime()) + .rowGroupIndex(0L) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); + } + @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - LocatedFileStatus file = getParquetFileAt(parquetFiles, modificationTime); + Stream parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf); + ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime); return createInternalTableFromFile(file); } @@ -156,7 +186,10 @@ public TableChange getTableChangeForCommit(Long modificationTime) { parquetFiles .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime) .collect(Collectors.toList()); - InternalTable internalTable = getMostRecentTable(parquetFiles); + List tableChangesAfterMetadata = + parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( + hadoopConf, parquetFiles, modificationTime); + InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (FileStatus tableStatus : tableChangesAfter) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus); addedInternalDataFiles.add(currentDataFile); @@ -168,15 +201,15 @@ public TableChange getTableChangeForCommit(Long modificationTime) { .build(); } - private InternalTable getMostRecentTable(Stream parquetFiles) { - LocatedFileStatus latestFile = getMostRecentParquetFile(parquetFiles); + private InternalTable getMostRecentTable(Stream parquetFiles) { + ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - return getMostRecentTable(parquetFiles); + return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf)); } /** @@ -189,29 +222,30 @@ public InternalSnapshot getCurrentSnapshot() { // to avoid consume the stream call the method twice to return the same stream of parquet files Stream internalDataFiles = getInternalDataFiles(getParquetFiles(hadoopConf, basePath)); - InternalTable table = getMostRecentTable(getParquetFiles(hadoopConf, basePath)); + Stream parquetFilesMetadata = + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + InternalTable table = getMostRecentTable(parquetFilesMetadata); return InternalSnapshot.builder() .table(table) .sourceIdentifier( - getCommitIdentifier( - getMostRecentParquetFile(getParquetFiles(hadoopConf, basePath)) - .getModificationTime())) + getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } - private LocatedFileStatus getMostRecentParquetFile(Stream parquetFiles) { + private ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { return parquetFiles - .max(Comparator.comparing(FileStatus::getModificationTime)) + .max(Comparator.comparing(ParquetFileConfig::getModifTime)) .orElseThrow(() -> new IllegalStateException("No files found")); } - private LocatedFileStatus getParquetFileAt( - Stream parquetFiles, long modificationTime) { - return parquetFiles - .filter(fileStatus -> fileStatus.getModificationTime() == modificationTime) + private ParquetFileConfig getParquetFileAt( + Stream parquetConfigs, long targetTime) { + + return 
parquetConfigs + .filter(config -> config.getModifTime() >= targetTime) .findFirst() - .orElseThrow(() -> new IllegalStateException("No file found at " + modificationTime)); + .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } private Stream getParquetFiles(Configuration hadoopConf, String basePath) { diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index e0f8daee2..5d09352ea 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -18,23 +18,14 @@ package org.apache.xtable.parquet; -import java.io.IOException; import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; -import lombok.Builder; import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; -import org.apache.parquet.format.converter.ParquetMetadataConverter; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.hadoop.util.HadoopOutputFile; -import org.apache.parquet.io.InputFile; -import org.apache.parquet.schema.MessageType; /** * Manages Parquet file operations including reading, writing, and partition discovery and path @@ -45,281 +36,30 @@ * fields. */ @Log4j2 -@Builder public class ParquetDataManager { - private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB - private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB + public static final ParquetDataManager INSTANCE = new ParquetDataManager(); - /* Use Parquet API to append to a file */ - - // after appending check required before appending the file - private static boolean checkIfSchemaIsSame( - Configuration conf, Path fileToAppend, Path fileFromTable) { - ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); - ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); - return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); - } - - private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { - ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); - return parquetFileConfig; - } - // TODO add safe guards for possible empty parquet files - // append a file (merges two files into one .parquet under a partition folder) - public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) - throws IOException { - Configuration conf = new Configuration(); - long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); - ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath); - Map existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData(); - Path tempPath = new Path(filePath.getParent(), "." 
+ filePath.getName() + ".tmp"); - ParquetFileWriter writer = - new ParquetFileWriter( - HadoopOutputFile.fromPath(tempPath, conf), - schema, - ParquetFileWriter.Mode.OVERWRITE, - DEFAULT_BLOCK_SIZE, - 0); - // write the initial table with the appended file to add into the outputPath - writer.start(); - HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); - InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); - // get the equivalent fileStatus from the file-to-append Path - FileSystem fs = FileSystem.get(conf); - FileStatus fileStatus = fs.getFileStatus(fileToAppend); - if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - writer.appendFile(inputFile); - writer.appendFile(inputFileToAppend); - } - Map combinedMeta = new HashMap<>(); - if (existingMeta != null) { - combinedMeta.putAll(existingMeta); - } - // track the append date and save it in the footer - int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); - - // save block indexes and modification time (for later sync related retrieval) in the metadata - // of the output - // table - int currentAppendIdx = appendCount + 1; - List initialBlocks = existingFooter.getBlocks(); - long totalInitialCompressedSize = 0; - // first the blocks start at 0 - for (BlockMetaData block : initialBlocks) { - totalInitialCompressedSize += block.getCompressedSize(); - } - long startBlock = 0; - if (appendCount > 0) { - startBlock = totalInitialCompressedSize; - } - - List addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks(); - long totalAddedCompressedSize = 0; - for (BlockMetaData block : addedBlocks) { - totalAddedCompressedSize += block.getCompressedSize(); - } - long endBlock = totalInitialCompressedSize + totalAddedCompressedSize; - - combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); - combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); - combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock)); - combinedMeta.put( - "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); - writer.end(combinedMeta); - fs.delete(filePath, false); - fs.rename(tempPath, filePath); - return filePath; - } - // TODO add safe guards for possible empty parquet files - // selective compaction of parquet blocks - public static List formNewTargetFiles( - Configuration conf, Path partitionPath, long targetModifTime) throws IOException { - List finalPaths = new ArrayList<>(); - FileSystem fs = partitionPath.getFileSystem(conf); - - FileStatus[] statuses = - fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet")); - - for (FileStatus status : statuses) { - - Path filePath = status.getPath(); - ParquetMetadata bigFileFooter = - ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER); - MessageType schema = bigFileFooter.getFileMetaData().getSchema(); - int totalAppends = - Integer.parseInt( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .getOrDefault("total_appends", "0")); - for (int i = 1; i <= totalAppends; i++) { - int startBlock = - Integer.valueOf( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .get("index_start_block_of_append_" + i)); - int endBlock = - Integer.valueOf( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .get("index_end_block_of_append_" + i)); - long modifTime = - Long.parseLong( - 
bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i)); - String newFileName = - String.format( - "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); - Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); - try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { - boolean wasDataAppended = false; - writer.start(); - for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { - BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - long currentBlockStart = blockMetadata.getStartingPos(); - long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize(); - if (currentBlockStart >= startBlock - && currentBlockEnd <= endBlock - && modifTime >= targetModifTime) { - List blockList = - Collections.singletonList(blockMetadata); - FSDataInputStream targetInputStream = fs.open(status.getPath()); - writer.appendRowGroups(targetInputStream, blockList, false); - wasDataAppended = true; - } - } - // manually restore the modifTime after using appendRowGroups() - writer.end(new HashMap<>()); - if (wasDataAppended) { - fs.setTimes(targetSyncFilePath, modifTime, -1); - finalPaths.add(targetSyncFilePath); - } else { - fs.delete(targetSyncFilePath, false); - } - } - } - } - return finalPaths; - } - - public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema) - throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = sourcePath.getFileSystem(conf); - - List targetFiles = collectParquetFiles(fs, targetPath); - - List sourceFiles = collectParquetFiles(fs, sourcePath); - - if (targetFiles.isEmpty()) { - throw new IOException("Target directory contains no parquet files to append to."); - } - if (sourceFiles.isEmpty()) { - return targetPath; - } - Path masterTargetFile = null; - for (Path p : targetFiles) { - ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p); - if (!metadata.getBlocks().isEmpty()) { - masterTargetFile = p; - break; - } - } - - if (masterTargetFile == null) { - throw new IOException("Target directory contains no files with valid data blocks."); - } - - for (int i = 1; i < targetFiles.size(); i++) { - long len = fs.getFileStatus(targetFiles.get(i)).getLen(); - System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len); - appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); - } - for (Path sourceFile : sourceFiles) { - ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile); - if (sourceFooter.getBlocks().isEmpty()) { - System.out.println("SKIPPING: " + sourceFile + " has no data."); - continue; - } - long len = fs.getFileStatus(sourceFile).getLen(); - long len_2 = fs.getFileStatus(masterTargetFile).getLen(); - System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); - System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2); - appendNewParquetFiles(masterTargetFile, sourceFile, schema); - } - - return masterTargetFile; - } - - private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { - List parquetFiles = new ArrayList<>(); - Configuration conf = fs.getConf(); - if (!fs.exists(root)) return parquetFiles; - - if (fs.getFileStatus(root).isDirectory()) { - RemoteIterator it = fs.listFiles(root, true); - while (it.hasNext()) { - Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet") - && !p.toString().contains("/.") - && 
!p.toString().contains("/_")) { - ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); - if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) { - parquetFiles.add(p); - } - } - } - } else { - parquetFiles.add(root); - } - return parquetFiles; + public static ParquetDataManager getInstance() { + return INSTANCE; } - public static List mergeDatasetsByPartition( - Path targetRoot, Path sourceRoot, MessageType schema) throws IOException { - List finalPartitionPaths = new ArrayList<>(); - Configuration conf = new Configuration(); - FileSystem fs = targetRoot.getFileSystem(conf); - - Map sourcePartitions = mapPartitions(fs, sourceRoot); - - Map targetPartitions = mapPartitions(fs, targetRoot); - - for (Map.Entry entry : sourcePartitions.entrySet()) { - String partitionKey = entry.getKey(); - Path sourcePartitionPath = entry.getValue(); + public List getParquetFilesMetadataInRange( + Configuration conf, Stream parquetFiles, long startTime, long endTime) { - if (targetPartitions.containsKey(partitionKey)) { - Path targetPartitionPath = targetPartitions.get(partitionKey); - - finalPartitionPaths.add( - appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema)); - } else { - // TODO logic for when a partition exists in source but not in target - // simply merge the files in the respective partitions using appendPartitionedData() that - // takes only one Path/dir - - } - } - return finalPartitionPaths; + return parquetFiles + .filter( + file -> + file.getModificationTime() >= startTime && file.getModificationTime() <= endTime) + .map(file -> new ParquetFileConfig(conf, file.getPath())) + .collect(Collectors.toList()); } - private static Map mapPartitions(FileSystem fs, Path root) throws IOException { - Map partitionMap = new HashMap<>(); - RemoteIterator it = fs.listFiles(root, true); - - while (it.hasNext()) { - Path filePath = it.next().getPath(); - if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) { - Path parent = filePath.getParent(); - String relative = parent.toString().replace(root.toString(), ""); - if (relative.startsWith("/")) relative = relative.substring(1); + public List getParquetFilesMetadataAfterTime( + Configuration conf, Stream parquetFiles, long syncTime) { - partitionMap.put(relative, parent); - } - } - return partitionMap; + return parquetFiles + .filter(file -> file.getModificationTime() >= syncTime) + .map(file -> new ParquetFileConfig(conf, file.getPath())) + .collect(Collectors.toList()); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 33ee08d38..9e7755011 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,31 +18,49 @@ package org.apache.xtable.parquet; +import java.io.IOException; + import lombok.AccessLevel; +import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +@Builder class ParquetFileConfig { MessageType schema; + ParquetMetadata metadata; long 
rowGroupIndex; + long modifTime; + long size; CompressionCodecName codec; + Path path; public ParquetFileConfig(Configuration conf, Path file) { + long modifTime = -1L; ParquetMetadata metadata = ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); if (metadata.getBlocks().isEmpty()) { throw new IllegalStateException("Parquet file contains no row groups."); } - + try { + modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime(); + } catch (IOException e) { + e.printStackTrace(); + } + this.path = file; + this.modifTime = modifTime; + this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); + this.metadata = metadata; this.schema = metadata.getFileMetaData().getSchema(); this.rowGroupIndex = metadata.getBlocks().size(); this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 83d2c0a91..8d8a16a2d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,33 +18,9 @@ package org.apache.xtable.parquet; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; public class ITParquetDataManager { private static SparkSession spark; @@ -54,257 +30,6 @@ public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); } - @Test - @Disabled("") - public void testFormParquetFileSinglePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - - List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0"); - String outputPath = fixedPath.toString(); - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - 
org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the file - updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); - // create new file to append using Spark - List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - - long newModifTime = System.currentTimeMillis() - 50000; - - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); - // check if resultingFiles contains the append data only (through the partition names) - for (org.apache.hadoop.fs.Path p : resultingFiles) { - String pathString = p.toString(); - - boolean isNewData = pathString.contains("year=2026"); - - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - } - } - - @Test - @Disabled("") - public void testAppendParquetFileSinglePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - // In testAppendParquetFileSinglePartition - MessageType schemaParquet = - Types.buildMessage() - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("id") - .required(PrimitiveType.PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.stringType()) - .named("value") - .named("parquet_schema"); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1"); - String outputPath = fixedPath.toString(); - String finalAppendFilePath = appendFilePath.toString(); - - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the table file - updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); - // create new file to append using Spark - List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync - .coalesce(1) - .write() - .partitionBy("year", "month") - .mode("overwrite") - .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; - // update modifTime for file to append - updateModificationTimeRecursive( - fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); - // recursively append all files under a specific 
partition path (e.g. "year=2026/month=12") - // (assuming that outputPath and finalAppendFilePath are same partitions for different files) - org.apache.hadoop.fs.Path outputFile = - ParquetDataManager.appendPartitionedData( - new org.apache.hadoop.fs.Path(outputPath), - new org.apache.hadoop.fs.Path(finalAppendFilePath), - schemaParquet); - // TODO can create big file which can be split if reaches a certain threshold size - - // test whether the appended data can be selectively filtered for incr sync (e.g. get me the - // data to sync only) - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime); - // check if resultingFiles contains the append data only (through the partition names) - for (org.apache.hadoop.fs.Path p : resultingFiles) { - String pathString = p.toString(); - boolean isNewData = pathString.contains("year=2026"); - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - } - } - - @Test - public void testAppendParquetFileMultiplePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - - MessageType schemaParquet = - Types.buildMessage() - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("id") - .required(PrimitiveType.PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.stringType()) - .named("value") - .named("parquet_schema"); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - List data = - Arrays.asList( - RowFactory.create(100, "A", 2026, 12), - RowFactory.create(101, "AA", 2026, 12), - RowFactory.create(102, "CB", 2027, 11), - RowFactory.create(103, "BA", 2027, 11)); - - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); - String outputPath = fixedPath.toString(); - String finalAppendFilePath = appendFilePath.toString(); - - df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the table file - // update modifTime for file to append - // many partitions case - List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(outputPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, targetModifTime); - } - } - // create new file to append using Spark - List futureDataToSync = - Arrays.asList( - RowFactory.create(101, "A", 2026, 12), - RowFactory.create(301, "D", 2027, 11), - RowFactory.create(302, "DA", 2027, 11)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync - .coalesce(1) // since we 
are exec this test on low data volume - .write() - .partitionBy("year", "month") - .mode("overwrite") - .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - } - // recursively append all files under a specific partition path (e.g. "year=2026/month=12") - // (assuming that outputPath and finalAppendFilePath are same partitions for different files) - // the result contains all partition paths with the parquet files compacted - List outputPaths = - ParquetDataManager.mergeDatasetsByPartition( - new org.apache.hadoop.fs.Path(outputPath), - new org.apache.hadoop.fs.Path(finalAppendFilePath), - schemaParquet); - // reset time FOR TESTING PURPOSES ONLY - // (in real, newModifTime time is the appending time as set by the appendRow() in the - // DataManager) - // THIS IS NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED - for (org.apache.hadoop.fs.Path oPath : outputPaths) { - fs.setTimes(oPath, newModifTime, -1); - } - // TODO can create big file which can be split if reaches a certain threshold size - // implement bin-packing function for large files - - // test whether the appended data can be selectively filtered for incr sync (e.g. get me the - // data to sync only) - List> finalPaths = new ArrayList<>(); - for (org.apache.hadoop.fs.Path resPath : outputPaths) { - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime); - finalPaths.add(resultingFiles); - } - // check if resultingFiles contains the append data only - for (List fPaths : finalPaths) { - for (org.apache.hadoop.fs.Path p : fPaths) { - String pathString = p.toString(); - // should be TRUE - boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); - // should be FALSE - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); - } - } - } - - private void updateModificationTimeRecursive( - FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { - org.apache.hadoop.fs.RemoteIterator it = - fs.listFiles(path, true); - while (it.hasNext()) { - org.apache.hadoop.fs.LocatedFileStatus status = it.next(); - if (status.getPath().getName().endsWith(".parquet")) { - fs.setTimes(status.getPath(), time, -1); - } - } - } - @AfterAll public static void tearDown() { if (spark != null) { From ff809d7b6aa294b234ec679f693dc8f058d25898 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 22:36:47 +0100 Subject: [PATCH 040/100] fix CI --- .../main/java/org/apache/xtable/parquet/ParquetFileConfig.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 9e7755011..7a038124a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ 
b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -21,6 +21,7 @@ import java.io.IOException; import lombok.AccessLevel; +import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; @@ -35,6 +36,7 @@ @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @Builder +@AllArgsConstructor class ParquetFileConfig { MessageType schema; ParquetMetadata metadata; From e06368f9d92a3ac4553352dd01e65e1464c7cc39 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:05:15 +0100 Subject: [PATCH 041/100] fix CI, refactoring and isIncrementalSyncSafeFrom implementation --- .../parquet/ParquetConversionSource.java | 78 ++++++++++++++----- 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 26d758026..935a5c942 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -24,11 +24,11 @@ import java.nio.file.Paths; import java.time.Instant; import java.util.*; -import java.util.stream.Collectors; import java.util.stream.Stream; import lombok.Builder; import lombok.NonNull; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; @@ -52,6 +52,7 @@ import org.apache.xtable.spi.extractor.ConversionSource; @Builder +@Log4j2 public class ParquetConversionSource implements ConversionSource { private static final ParquetSchemaExtractor schemaExtractor = @@ -129,13 +130,13 @@ public InternalTable getTable(Long modificationTime) { return createInternalTableFromFile(file); } - private Stream getInternalDataFiles(Stream parquetFiles) { + private Stream getInternalDataFiles(Stream parquetFiles) { return parquetFiles.map( file -> InternalDataFile.builder() .physicalPath(file.getPath().toString()) .fileFormat(FileFormat.APACHE_PARQUET) - .fileSizeBytes(file.getLen()) + .fileSizeBytes(file.getSize()) .partitionValues( partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( @@ -144,14 +145,14 @@ private Stream getInternalDataFiles(Stream hadoopConf, file.getPath()), file.getPath().toString())), HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) - .lastModified(file.getModificationTime()) + .lastModified(file.getModifTime()) .columnStats( parquetStatsExtractor.getColumnStatsForaFile( parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()))) .build()); } - private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parquetFile) { + private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) { return InternalDataFile.builder() .physicalPath(parquetFile.getPath().toString()) .partitionValues( @@ -162,8 +163,8 @@ private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parque hadoopConf, parquetFile.getPath()), parquetFile.getPath().toString())), basePath)) - .lastModified(parquetFile.getModificationTime()) - .fileSizeBytes(parquetFile.getLen()) + .lastModified(parquetFile.getModifTime()) + .fileSizeBytes(parquetFile.getSize()) .columnStats( parquetStatsExtractor.getColumnStatsForaFile( parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath()))) @@ -182,16 +183,15 @@ public TableChange getTableChangeForCommit(Long 
modificationTime) { Stream parquetFiles = getParquetFiles(hadoopConf, basePath); Set addedInternalDataFiles = new HashSet<>(); - List tableChangesAfter = - parquetFiles - .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime) - .collect(Collectors.toList()); + List filesMetadata = + parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( + hadoopConf, parquetFiles, modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( hadoopConf, parquetFiles, modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); - for (FileStatus tableStatus : tableChangesAfter) { - InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus); + for (ParquetFileConfig fileMetadata : filesMetadata) { + InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); addedInternalDataFiles.add(currentDataFile); } @@ -219,16 +219,18 @@ public InternalTable getCurrentTable() { */ @Override public InternalSnapshot getCurrentSnapshot() { - // to avoid consume the stream call the method twice to return the same stream of parquet files Stream internalDataFiles = - getInternalDataFiles(getParquetFiles(hadoopConf, basePath)); - Stream parquetFilesMetadata = - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); - InternalTable table = getMostRecentTable(parquetFilesMetadata); + getInternalDataFiles( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + InternalTable table = + getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); return InternalSnapshot.builder() .table(table) .sourceIdentifier( - getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime())) + getCommitIdentifier( + getMostRecentParquetFile( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + .getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } @@ -262,8 +264,42 @@ private Stream getParquetFiles(Configuration hadoopConf, Stri } @Override - public boolean isIncrementalSyncSafeFrom(Instant instant) { - return false; + public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { + Stream parquetFilesMetadata = + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + LongSummaryStatistics stats = + parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); + + if (stats.getCount() == 0) { + log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName); + return false; + } + + long earliestModTime = stats.getMin(); + long latestModTime = stats.getMax(); + + if (timeInMillis.toEpochMilli() > latestModTime) { + log.warn( + "Instant {} is in the future relative to the data. Latest file time: {}", + timeInMillis.toEpochMilli(), + Instant.ofEpochMilli(latestModTime)); + return false; + } + + if (earliestModTime > timeInMillis.toEpochMilli()) { + log.warn( + "Incremental sync is not safe. Earliest available metadata (time={}) is newer " + + "than requested instant {}. 
Data history has been truncated.", + Instant.ofEpochMilli(earliestModTime), + timeInMillis.toEpochMilli()); + return false; + } + + log.info( + "Incremental sync is safe from instant {} for table {}", + timeInMillis.toEpochMilli(), + tableName); + return true; } @Override From 7cdccd01e9012b6af74ce6b88d171555d784d80a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:44:36 +0100 Subject: [PATCH 042/100] added test --- .../xtable/parquet/ITParquetDataManager.java | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 8d8a16a2d..db17958ac 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,18 +18,132 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { private static SparkSession spark; + private ParquetConversionSource conversionSource; @BeforeAll public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); } + @Test + public void testAppendParquetFileMultiplePartition() throws IOException { + + Configuration conf = spark.sparkContext().hadoopConfiguration(); + + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList( + RowFactory.create(100, "A", 2026, 12), + RowFactory.create(101, "AA", 2026, 12), + RowFactory.create(102, "CB", 2027, 11), + RowFactory.create(103, "BA", 2027, 11)); + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + // update modifTime for file to append + // many partitions case + List newPartitions = 
Arrays.asList("year=2026/month=12", "year=2027/month=11"); + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, targetModifTime); + } + } + // create new file to append using Spark + List futureDataToSync = + Arrays.asList( + RowFactory.create(101, "A", 2026, 12), + RowFactory.create(301, "D", 2027, 11), + RowFactory.create(302, "DA", 2027, 11)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); + dfWithNewTimes + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("append") + .parquet(outputPath); + fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); + // conversionSource operations + InternalTable result = conversionSource.getTable(newModifTime); + assertEquals(newModifTime, result.getLatestCommitTime()); + assertNotNull(result); + assertEquals("parquet_table_test_2", result.getName()); + assertEquals(TableFormat.PARQUET, result.getTableFormat()); + assertNotNull(result.getReadSchema()); + } + + private void updateModificationTimeRecursive( + FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { + org.apache.hadoop.fs.RemoteIterator it = + fs.listFiles(path, true); + while (it.hasNext()) { + org.apache.hadoop.fs.LocatedFileStatus status = it.next(); + if (status.getPath().getName().endsWith(".parquet")) { + fs.setTimes(status.getPath(), time, -1); + } + } + } + @AfterAll public static void tearDown() { if (spark != null) { From b919146226e12768522e27ae4c0c3e1af43cbe35 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:52:08 +0100 Subject: [PATCH 043/100] added test: init conversionSource instance --- .../org/apache/xtable/parquet/ITParquetDataManager.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index db17958ac..30aa2f6e2 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -124,6 +124,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .parquet(outputPath); fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations + conversionSource = + ParquetConversionSource.builder() + .tableName("parquet_table_test_2") + .basePath(fixedPath.toString()) + .hadoopConf(new Configuration()) // Required due to @NonNull + .partitionValueExtractor(null) + .partitionSpecExtractor(null) + .build(); InternalTable result = conversionSource.getTable(newModifTime); assertEquals(newModifTime, result.getLatestCommitTime()); assertNotNull(result); From 8ff5aa63d6fb7aaefe6fa8888ea78afa2defe60c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 00:36:34 +0100 
Subject: [PATCH 044/100] added test: init conversionSource instance: fix error --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 30aa2f6e2..192e7ab3b 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -127,8 +127,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { conversionSource = ParquetConversionSource.builder() .tableName("parquet_table_test_2") - .basePath(fixedPath.toString()) - .hadoopConf(new Configuration()) // Required due to @NonNull + .basePath(fixedPath.toAbsolutePath().toUri().toString()) + .hadoopConf(new Configuration()) .partitionValueExtractor(null) .partitionSpecExtractor(null) .build(); From db9d7aba0816c8d472b98344818f6526f7683b85 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 02:44:29 +0100 Subject: [PATCH 045/100] added test: fix error --- .../parquet/ParquetConversionSource.java | 13 ++-- .../xtable/parquet/ITParquetDataManager.java | 67 ++++++++++++++++--- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 935a5c942..7cc66dcea 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -159,15 +159,11 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( partitionValueExtractor.extractSchemaForParquetPartitions( - parquetMetadataExtractor.readParquetMetadata( - hadoopConf, parquetFile.getPath()), - parquetFile.getPath().toString())), + parquetFile.getMetadata(), parquetFile.getPath().toString())), basePath)) .lastModified(parquetFile.getModifTime()) .fileSizeBytes(parquetFile.getSize()) - .columnStats( - parquetStatsExtractor.getColumnStatsForaFile( - parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath()))) + .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) .build(); } @@ -180,15 +176,14 @@ public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync syncIns @Override public TableChange getTableChangeForCommit(Long modificationTime) { - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); Set addedInternalDataFiles = new HashSet<>(); List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, parquetFiles, modificationTime); + hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, parquetFiles, modificationTime); + hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java 
b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 192e7ab3b..15bccd721 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -23,11 +23,15 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; +import java.time.Instant; import java.util.Arrays; import java.util.List; +import java.util.Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -39,16 +43,24 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { private static SparkSession spark; - private ParquetConversionSource conversionSource; + private static ParquetConversionSourceProvider conversionSourceProvider; + public static final String PARTITION_FIELD_SPEC_CONFIG = + "xtable.parquet.source.partition_field_spec_config"; @BeforeAll public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); + conversionSourceProvider = new ParquetConversionSourceProvider(); + Configuration hadoopConf = new Configuration(); + hadoopConf.set("fs.defaultFS", "file:///"); + conversionSourceProvider.init(hadoopConf); } @Test @@ -80,13 +92,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); // set the modification time to the table file // update modifTime for file to append // many partitions case List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + long targetModifTime = System.currentTimeMillis() - 360000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(outputPath, partition); @@ -107,14 +120,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .partitionBy("year", "month") .mode("overwrite") .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; + /*long newModifTime = System.currentTimeMillis() - 50000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); if (fs.exists(partitionPath)) { updateModificationTimeRecursive(fs, partitionPath, newModifTime); } - } + }*/ Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); dfWithNewTimes .coalesce(1) @@ -124,20 +137,52 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .parquet(outputPath); fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations - conversionSource = - ParquetConversionSource.builder() - .tableName("parquet_table_test_2") + Properties sourceProperties = new Properties(); + String 
partitionConfig = "id:MONTH:year=yyyy/month=MM"; + sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); + SourceTable tableConfig = + SourceTable.builder() + .name("parquet_table_test_2") .basePath(fixedPath.toAbsolutePath().toUri().toString()) - .hadoopConf(new Configuration()) - .partitionValueExtractor(null) - .partitionSpecExtractor(null) + .additionalProperties(sourceProperties) + .formatName(TableFormat.PARQUET) .build(); + + ParquetConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + + long newModifTime = System.currentTimeMillis() - 50000; + + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + + RemoteIterator it = fs.listFiles(partitionPath, false); + while (it.hasNext()) { + LocatedFileStatus fileStatus = it.next(); + + if (fileStatus.getModificationTime() > newModifTime) { + fs.setTimes(fileStatus.getPath(), newModifTime, -1); + } else { + + fs.setTimes(fileStatus.getPath(), targetModifTime, -1); + } + } + + fs.setTimes(partitionPath, newModifTime, -1); + } + InternalTable result = conversionSource.getTable(newModifTime); - assertEquals(newModifTime, result.getLatestCommitTime()); + assertEquals( + Instant.ofEpochMilli(newModifTime).toString(), result.getLatestCommitTime().toString()); assertNotNull(result); assertEquals("parquet_table_test_2", result.getName()); assertEquals(TableFormat.PARQUET, result.getTableFormat()); assertNotNull(result.getReadSchema()); + // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); + InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // assertNotNull(changes); + assertNotNull(snapshot); } private void updateModificationTimeRecursive( From 324d703207ea0c89615711ccfd92fbd85cfab39c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 12:43:59 +0100 Subject: [PATCH 046/100] more tests + bug fixes and reformatting --- .../org/apache/xtable/parquet/ParquetConversionSource.java | 7 ++++++- .../org/apache/xtable/parquet/ITParquetDataManager.java | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 7cc66dcea..24b86cac5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -160,7 +160,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionSpecExtractor.spec( partitionValueExtractor.extractSchemaForParquetPartitions( parquetFile.getMetadata(), parquetFile.getPath().toString())), - basePath)) + HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) .lastModified(parquetFile.getModifTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) @@ -191,6 +191,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { } return TableChange.builder() + .sourceIdentifier( + getCommitIdentifier( + getMostRecentParquetFile( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + .getModifTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) .build(); diff --git 
a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 15bccd721..90f28808f 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -46,6 +46,7 @@ import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.TableChange; import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { @@ -179,10 +180,10 @@ public void testAppendParquetFileMultiplePartition() throws IOException { assertEquals("parquet_table_test_2", result.getName()); assertEquals(TableFormat.PARQUET, result.getTableFormat()); assertNotNull(result.getReadSchema()); - // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - // assertNotNull(changes); assertNotNull(snapshot); + TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); + assertNotNull(changes); } private void updateModificationTimeRecursive( From 3c453e69355f4ad0c0d2f487d45ca912fd620f54 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 13:17:28 +0100 Subject: [PATCH 047/100] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 855894ec2..be4c56dcb 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ 0.2.0-SNAPSHOT 2025-01-01T00:00:00Z - 8 + 17 3.13.0 1.11.4 2.22.0 From e4c0b4cc2470d5c425193a85c87d95c552e170ad Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 13:20:42 +0100 Subject: [PATCH 048/100] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index be4c56dcb..855894ec2 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ 0.2.0-SNAPSHOT 2025-01-01T00:00:00Z - 17 + 8 3.13.0 1.11.4 2.22.0 From 4315282bb23b0caec4aac5b508f631065c75bd47 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 15:53:08 +0100 Subject: [PATCH 049/100] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 855894ec2..ba5205b65 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 3.2.12.Final + 3.2.9.Final 4.9.3 0.16 From 14179291503de7ad51b48657f4b8c0023c054a34 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 16:18:49 +0100 Subject: [PATCH 050/100] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ba5205b65..739ac5cb8 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 3.2.9.Final + 2.16.12.Final 4.9.3 0.16 From 7620c013a93c00da6ac9f783eaa62c61b8bd271d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 16:46:16 +0100 Subject: [PATCH 051/100] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 739ac5cb8..855894ec2 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 2.16.12.Final + 3.2.12.Final 4.9.3 0.16 From cd66151d9e1fa9368c385e42c747bb2f4ef75fea Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 17 Jan 2026 00:35:00 +0100 Subject: [PATCH 052/100] CI fix --- .../parquet/ParquetConversionSource.java | 114 
++++++------------ .../xtable/parquet/ParquetDataManager.java | 61 ++++++++++ .../xtable/parquet/ITParquetDataManager.java | 10 +- 3 files changed, 98 insertions(+), 87 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 24b86cac5..ad5422e0a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -18,10 +18,6 @@ package org.apache.xtable.parquet; -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.nio.file.Paths; import java.time.Instant; import java.util.*; import java.util.stream.Stream; @@ -33,11 +29,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.functional.RemoteIterators; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; -import org.apache.xtable.exception.ReadException; import org.apache.xtable.hudi.*; import org.apache.xtable.hudi.HudiPathUtils; import org.apache.xtable.model.*; @@ -96,37 +90,15 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) .build(); } - public Stream getConfigsFromStream( - Stream fileStream, Configuration conf) { - - return fileStream.map( - fileStatus -> { - Path path = fileStatus.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(fileStatus.getLen()) - .modifTime(fileStatus.getModificationTime()) - .rowGroupIndex(0L) - .codec( - metadata.getBlocks().isEmpty() - ? 
null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - }); - } - @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - Stream parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf); - ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime); + Stream parquetFiles = + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + Stream parquetFilesMetadata = + parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf); + ParquetFileConfig file = + parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime); return createInternalTableFromFile(file); } @@ -180,10 +152,14 @@ public TableChange getTableChangeForCommit(Long modificationTime) { List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); + hadoopConf, + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); + hadoopConf, + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); @@ -193,8 +169,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { return TableChange.builder() .sourceIdentifier( getCommitIdentifier( - getMostRecentParquetFile( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + parquetDataManagerExtractor + .getMostRecentParquetFile( + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + hadoopConf)) .getModifTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) @@ -202,71 +181,48 @@ public TableChange getTableChangeForCommit(Long modificationTime) { } private InternalTable getMostRecentTable(Stream parquetFiles) { - ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles); + ParquetFileConfig latestFile = + parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf)); + Stream parquetFiles = + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + return getMostRecentTable( + parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf)); } - /** - * get current snapshot - * - * @return - */ @Override public InternalSnapshot getCurrentSnapshot() { Stream internalDataFiles = getInternalDataFiles( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); InternalTable table = - getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + getMostRecentTable( + 
parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); return InternalSnapshot.builder() .table(table) .sourceIdentifier( getCommitIdentifier( - getMostRecentParquetFile( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + parquetDataManagerExtractor + .getMostRecentParquetFile( + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + hadoopConf)) .getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } - private ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { - return parquetFiles - .max(Comparator.comparing(ParquetFileConfig::getModifTime)) - .orElseThrow(() -> new IllegalStateException("No files found")); - } - - private ParquetFileConfig getParquetFileAt( - Stream parquetConfigs, long targetTime) { - - return parquetConfigs - .filter(config -> config.getModifTime() >= targetTime) - .findFirst() - .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); - } - - private Stream getParquetFiles(Configuration hadoopConf, String basePath) { - try { - FileSystem fs = FileSystem.get(hadoopConf); - URI uriBasePath = new URI(basePath); - String parentPath = Paths.get(uriBasePath).toString(); - RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); - return RemoteIterators.toList(iterator).stream() - .filter(file -> file.getPath().getName().endsWith("parquet")); - } catch (IOException | URISyntaxException e) { - throw new ReadException("Unable to read files from file system", e); - } - } - @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { Stream parquetFilesMetadata = - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf); LongSummaryStatistics stats = parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 5d09352ea..a018e037c 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -18,6 +18,10 @@ package org.apache.xtable.parquet; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -26,6 +30,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; +import org.apache.hadoop.util.functional.RemoteIterators; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; + +import org.apache.xtable.exception.ReadException; /** * Manages Parquet file operations including reading, writing, and partition discovery and path @@ -43,6 +51,59 @@ public static ParquetDataManager getInstance() { return INSTANCE; } + public ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { + return parquetFiles + .max(Comparator.comparing(ParquetFileConfig::getModifTime)) + .orElseThrow(() -> new IllegalStateException("No files found")); + } + + public ParquetFileConfig getParquetFileAt( + Stream parquetConfigs, long targetTime) { + + return parquetConfigs + .filter(config 
-> config.getModifTime() >= targetTime) + .findFirst() + .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); + } + + public Stream getParquetFiles(Configuration hadoopConf, String basePath) { + try { + FileSystem fs = FileSystem.get(hadoopConf); + URI uriBasePath = new URI(basePath); + String parentPath = Paths.get(uriBasePath).toString(); + RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); + return RemoteIterators.toList(iterator).stream() + .filter(file -> file.getPath().getName().endsWith("parquet")); + } catch (IOException | URISyntaxException e) { + throw new ReadException("Unable to read files from file system", e); + } + } + + public Stream getConfigsFromStream( + Stream fileStream, Configuration conf) { + + return fileStream.map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + .modifTime(fileStatus.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); + } + public List getParquetFilesMetadataInRange( Configuration conf, Stream parquetFiles, long startTime, long endTime) { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 90f28808f..3aed202e3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -121,14 +121,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .partitionBy("year", "month") .mode("overwrite") .parquet(finalAppendFilePath); - /*long newModifTime = System.currentTimeMillis() - 50000; - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - }*/ + Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); dfWithNewTimes .coalesce(1) @@ -139,6 +132,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations Properties sourceProperties = new Properties(); + // TODO use timestamp col instead (as done in ITParquetConversionSource) String partitionConfig = "id:MONTH:year=yyyy/month=MM"; sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); SourceTable tableConfig = From 4e4c5cba21ae6396f9626f27fe60139d7313815d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 18 Jan 2026 21:24:12 +0100 Subject: [PATCH 053/100] merge ready fixes & tests + refactoring & reformatting --- .../xtable/parquet/ParquetDataManager.java | 8 +++----- .../xtable/parquet/ITParquetDataManager.java | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a018e037c..ff59f156b 100644 --- 
a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -36,12 +36,10 @@ import org.apache.xtable.exception.ReadException; /** - * Manages Parquet file operations including reading, writing, and partition discovery and path - * construction. + * Manages Parquet File's Metadata * - *
<p>
This class provides functions to handle Parquet metadata, validate schemas during appends, and - * calculate target partition directories based on file modification times and defined partition - * fields. + *
<p>
This class provides functions to handle Parquet metadata, creating metadata objects from + * parquet files and filtering the files based on the modification times. */ @Log4j2 public class ParquetDataManager { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3aed202e3..bb32a5024 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,6 +18,7 @@ package org.apache.xtable.parquet; +import static org.apache.spark.sql.functions.expr; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; @@ -84,12 +85,12 @@ public void testAppendParquetFileMultiplePartition() throws IOException { RowFactory.create(102, "CB", 2027, 11), RowFactory.create(103, "BA", 2027, 11)); - Dataset df = spark.createDataFrame(data, schema); + Dataset dfInit = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); - + Dataset df = dfInit.withColumn("full_date", expr("make_date(year, month, 1)")); df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync @@ -101,6 +102,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { // many partitions case List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); long targetModifTime = System.currentTimeMillis() - 360000; + long newModifTime = System.currentTimeMillis() - 50000; + long testTime = System.currentTimeMillis() - 90000; // between two prev times for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(outputPath, partition); @@ -114,7 +117,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11), RowFactory.create(302, "DA", 2027, 11)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + Dataset dfToSyncInit = spark.createDataFrame(futureDataToSync, schema); + Dataset dfToSync = dfToSyncInit.withColumn("full_date", expr("make_date(year, month, 1)")); dfToSync .coalesce(1) .write() @@ -132,8 +136,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations Properties sourceProperties = new Properties(); - // TODO use timestamp col instead (as done in ITParquetConversionSource) - String partitionConfig = "id:MONTH:year=yyyy/month=MM"; + String partitionConfig = "full_date:MONTH:year=yyyy/month=MM"; sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); SourceTable tableConfig = SourceTable.builder() @@ -146,7 +149,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { ParquetConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); - long newModifTime = System.currentTimeMillis() - 50000; + // long newModifTime = System.currentTimeMillis() - 50000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = @@ -178,6 +181,10 @@ public void 
testAppendParquetFileMultiplePartition() throws IOException { assertNotNull(snapshot); TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); assertNotNull(changes); + Instant instantBeforeFirstSnapshot = + Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); + assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime); + assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); } private void updateModificationTimeRecursive( From 212e50a941ae18b6cb7823d4675f1a0c79fa0c96 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 19 Jan 2026 21:44:53 +0100 Subject: [PATCH 054/100] optimization: read metadata once for target file, read parquet files once, reformatting --- .../parquet/ParquetConversionSource.java | 111 +++++++++--------- .../xtable/parquet/ParquetDataManager.java | 100 ++++++++++------ .../xtable/parquet/ParquetFileConfig.java | 25 ++-- .../xtable/parquet/ITParquetDataManager.java | 2 +- 4 files changed, 134 insertions(+), 104 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index ad5422e0a..10eccdf40 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -68,6 +68,16 @@ public class ParquetConversionSource implements ConversionSource { private final String basePath; @NonNull private final Configuration hadoopConf; + private List parquetFiles; + + // helper method to ensure files are loaded once + private List getOrLoadParquetFiles() { + if (this.parquetFiles == null) { + this.parquetFiles = parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + } + return this.parquetFiles; + } + private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) { ParquetMetadata parquetMetadata = parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); @@ -86,42 +96,38 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) .layoutStrategy(dataLayoutStrategy) .partitioningFields(partitionFields) .readSchema(schema) - .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime())) + .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime())) .build(); } @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - Stream parquetFiles = - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); - Stream parquetFilesMetadata = - parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf); - ParquetFileConfig file = - parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime); + LocatedFileStatus parquetFile = + parquetDataManagerExtractor.getParquetDataFileAt(getOrLoadParquetFiles(), modificationTime); + ParquetFileConfig file = parquetDataManagerExtractor.getConfigFromFile(parquetFile, hadoopConf); return createInternalTableFromFile(file); } private Stream getInternalDataFiles(Stream parquetFiles) { return parquetFiles.map( - file -> - InternalDataFile.builder() - .physicalPath(file.getPath().toString()) - .fileFormat(FileFormat.APACHE_PARQUET) - .fileSizeBytes(file.getSize()) - .partitionValues( - partitionValueExtractor.extractPartitionValues( - partitionSpecExtractor.spec( - partitionValueExtractor.extractSchemaForParquetPartitions( - 
parquetMetadataExtractor.readParquetMetadata( - hadoopConf, file.getPath()), - file.getPath().toString())), - HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) - .lastModified(file.getModifTime()) - .columnStats( - parquetStatsExtractor.getColumnStatsForaFile( - parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()))) - .build()); + file -> { + ParquetMetadata metadata = + parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()); + return InternalDataFile.builder() + .physicalPath(file.getPath().toString()) + .fileFormat(FileFormat.APACHE_PARQUET) + .fileSizeBytes(file.getSize()) + .partitionValues( + partitionValueExtractor.extractPartitionValues( + partitionSpecExtractor.spec( + partitionValueExtractor.extractSchemaForParquetPartitions( + metadata, file.getPath().toString())), + HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) + .lastModified(file.getModificationTime()) + .columnStats(parquetStatsExtractor.getColumnStatsForaFile(metadata)) + .build(); + }); } private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) { @@ -133,7 +139,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionValueExtractor.extractSchemaForParquetPartitions( parquetFile.getMetadata(), parquetFile.getPath().toString())), HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) - .lastModified(parquetFile.getModifTime()) + .lastModified(parquetFile.getModificationTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) .build(); @@ -152,15 +158,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - modificationTime); + hadoopConf, getOrLoadParquetFiles(), modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - modificationTime); - InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); + hadoopConf, getOrLoadParquetFiles(), modificationTime); + InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); addedInternalDataFiles.add(currentDataFile); @@ -170,50 +172,44 @@ public TableChange getTableChangeForCommit(Long modificationTime) { .sourceIdentifier( getCommitIdentifier( parquetDataManagerExtractor - .getMostRecentParquetFile( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - hadoopConf)) - .getModifTime())) + .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) + .getModificationTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) .build(); } - private InternalTable getMostRecentTable(Stream parquetFiles) { + private InternalTable getMostRecentTable( + List parquetFiles, Configuration conf) { + ParquetFileConfig latestFile = + parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles, conf); + return createInternalTableFromFile(latestFile); + } + + private InternalTable 
getMostRecentTableConfig(Stream parquetFiles) { ParquetFileConfig latestFile = - parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles); + parquetDataManagerExtractor.getMostRecentParquetFileConfig(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { - Stream parquetFiles = - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); - return getMostRecentTable( - parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf)); + return getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); } @Override public InternalSnapshot getCurrentSnapshot() { Stream internalDataFiles = getInternalDataFiles( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); - InternalTable table = - getMostRecentTable( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); + parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf)); + InternalTable table = getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); return InternalSnapshot.builder() .table(table) .sourceIdentifier( getCommitIdentifier( parquetDataManagerExtractor - .getMostRecentParquetFile( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - hadoopConf)) - .getModifTime())) + .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) + .getModificationTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } @@ -221,10 +217,9 @@ public InternalSnapshot getCurrentSnapshot() { @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { Stream parquetFilesMetadata = - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf); + parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf); LongSummaryStatistics stats = - parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); + parquetFilesMetadata.mapToLong(ParquetFileConfig::getModificationTime).summaryStatistics(); if (stats.getCount() == 0) { log.warn("No parquet files found in table {}. 
Incremental sync is not possible.", tableName); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index ff59f156b..c6372c198 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -49,57 +49,89 @@ public static ParquetDataManager getInstance() { return INSTANCE; } - public ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { + public ParquetFileConfig getMostRecentParquetFile( + List parquetFiles, Configuration conf) { + LocatedFileStatus file = + parquetFiles.stream() + .max(Comparator.comparing(LocatedFileStatus::getModificationTime)) + .orElseThrow(() -> new IllegalStateException("No files found")); + return getConfigFromFile(file, conf); + } + + public ParquetFileConfig getMostRecentParquetFileConfig(Stream parquetFiles) { return parquetFiles - .max(Comparator.comparing(ParquetFileConfig::getModifTime)) + .max(Comparator.comparing(ParquetFileConfig::getModificationTime)) .orElseThrow(() -> new IllegalStateException("No files found")); } - public ParquetFileConfig getParquetFileAt( - Stream parquetConfigs, long targetTime) { + public LocatedFileStatus getParquetDataFileAt( + List parquetFiles, long targetTime) { - return parquetConfigs - .filter(config -> config.getModifTime() >= targetTime) + return parquetFiles.stream() + .filter(file -> file.getModificationTime() >= targetTime) .findFirst() .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } - public Stream getParquetFiles(Configuration hadoopConf, String basePath) { + public ParquetFileConfig getConfigFromFile(LocatedFileStatus file, Configuration conf) { + + Path path = file.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(file.getLen()) + .modificationTime(file.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + } + + public List getParquetFiles(Configuration hadoopConf, String basePath) { try { FileSystem fs = FileSystem.get(hadoopConf); URI uriBasePath = new URI(basePath); String parentPath = Paths.get(uriBasePath).toString(); RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); return RemoteIterators.toList(iterator).stream() - .filter(file -> file.getPath().getName().endsWith("parquet")); + .filter(file -> file.getPath().getName().endsWith("parquet")) + .collect(Collectors.toList()); } catch (IOException | URISyntaxException e) { throw new ReadException("Unable to read files from file system", e); } } public Stream getConfigsFromStream( - Stream fileStream, Configuration conf) { - - return fileStream.map( - fileStatus -> { - Path path = fileStatus.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(fileStatus.getLen()) - .modifTime(fileStatus.getModificationTime()) - .rowGroupIndex(metadata.getBlocks().size()) - .codec( - metadata.getBlocks().isEmpty() - ? 
null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - }); + List fileStream, Configuration conf) { + + return fileStream.stream() + .map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + .modificationTime(fileStatus.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); } public List getParquetFilesMetadataInRange( @@ -109,16 +141,16 @@ public List getParquetFilesMetadataInRange( .filter( file -> file.getModificationTime() >= startTime && file.getModificationTime() <= endTime) - .map(file -> new ParquetFileConfig(conf, file.getPath())) + .map(file -> new ParquetFileConfig(conf, file)) .collect(Collectors.toList()); } public List getParquetFilesMetadataAfterTime( - Configuration conf, Stream parquetFiles, long syncTime) { + Configuration conf, List parquetFiles, long syncTime) { - return parquetFiles + return parquetFiles.stream() .filter(file -> file.getModificationTime() >= syncTime) - .map(file -> new ParquetFileConfig(conf, file.getPath())) + .map(file -> new ParquetFileConfig(conf, file)) .collect(Collectors.toList()); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 7a038124a..ec40a79f5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,21 +18,24 @@ package org.apache.xtable.parquet; -import java.io.IOException; - import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; +import org.apache.xtable.exception.ReadException; + +@Log4j2 @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @Builder @@ -41,26 +44,26 @@ class ParquetFileConfig { MessageType schema; ParquetMetadata metadata; long rowGroupIndex; - long modifTime; + long modificationTime; long size; CompressionCodecName codec; Path path; - public ParquetFileConfig(Configuration conf, Path file) { - long modifTime = -1L; + public ParquetFileConfig(Configuration conf, FileStatus file) { + long modificationTime = -1L; ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file.getPath()); if (metadata.getBlocks().isEmpty()) { throw new IllegalStateException("Parquet file contains no row groups."); } try { - modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime(); - } catch (IOException e) { - e.printStackTrace(); + modificationTime = file.getModificationTime(); + } catch (ReadException e) { + log.warn("File reading error: " + 
e.getMessage()); } - this.path = file; - this.modifTime = modifTime; + this.path = file.getPath(); + this.modificationTime = modificationTime; this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); this.metadata = metadata; this.schema = metadata.getFileMetaData().getSchema(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index bb32a5024..08e11ed93 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -183,7 +183,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { assertNotNull(changes); Instant instantBeforeFirstSnapshot = Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); - assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime); + assertEquals(instantBeforeFirstSnapshot.toEpochMilli(), newModifTime); assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); } From 5ea0b5a2630802d6646a4ee468f850ae6911bb91 Mon Sep 17 00:00:00 2001 From: Timothy Brown Date: Sun, 1 Feb 2026 21:55:48 -0500 Subject: [PATCH 055/100] Lazily fetch info, move state into ParquetDataManager --- .../parquet/ParquetConversionSource.java | 85 +++++++--------- .../xtable/parquet/ParquetDataManager.java | 97 ++++--------------- .../xtable/parquet/ParquetFileConfig.java | 73 -------------- .../xtable/parquet/ParquetFileInfo.java | 51 ++++++++++ .../xtable/parquet/ITParquetDataManager.java | 9 +- 5 files changed, 110 insertions(+), 205 deletions(-) delete mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileInfo.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 10eccdf40..3ef6e85b9 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -45,16 +45,11 @@ import org.apache.xtable.model.storage.InternalDataFile; import org.apache.xtable.spi.extractor.ConversionSource; -@Builder @Log4j2 public class ParquetConversionSource implements ConversionSource { private static final ParquetSchemaExtractor schemaExtractor = ParquetSchemaExtractor.getInstance(); - private static final ParquetMetadataExtractor metadataExtractor = - ParquetMetadataExtractor.getInstance(); - private static final ParquetDataManager parquetDataManagerExtractor = - ParquetDataManager.getInstance(); private static final ParquetMetadataExtractor parquetMetadataExtractor = ParquetMetadataExtractor.getInstance(); @@ -67,18 +62,24 @@ public class ParquetConversionSource implements ConversionSource { private final String tableName; private final String basePath; @NonNull private final Configuration hadoopConf; - - private List parquetFiles; - - // helper method to ensure files are loaded once - private List getOrLoadParquetFiles() { - if (this.parquetFiles == null) { - this.parquetFiles = parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); - } - return this.parquetFiles; + private final ParquetDataManager parquetDataManager; + + @Builder + ParquetConversionSource( + String tableName, + String 
basePath, + @NonNull Configuration hadoopConf, + ParquetPartitionValueExtractor partitionValueExtractor, + PathBasedPartitionSpecExtractor partitionSpecExtractor) { + this.tableName = tableName; + this.basePath = basePath; + this.hadoopConf = hadoopConf; + this.partitionValueExtractor = partitionValueExtractor; + this.partitionSpecExtractor = partitionSpecExtractor; + this.parquetDataManager = new ParquetDataManager(hadoopConf, basePath); } - private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) { + private InternalTable createInternalTableFromFile(ParquetFileInfo latestFile) { ParquetMetadata parquetMetadata = parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata); @@ -103,13 +104,12 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - LocatedFileStatus parquetFile = - parquetDataManagerExtractor.getParquetDataFileAt(getOrLoadParquetFiles(), modificationTime); - ParquetFileConfig file = parquetDataManagerExtractor.getConfigFromFile(parquetFile, hadoopConf); + LocatedFileStatus parquetFile = parquetDataManager.getParquetDataFileAt(modificationTime); + ParquetFileInfo file = new ParquetFileInfo(hadoopConf, parquetFile); return createInternalTableFromFile(file); } - private Stream getInternalDataFiles(Stream parquetFiles) { + private Stream getInternalDataFiles(Stream parquetFiles) { return parquetFiles.map( file -> { ParquetMetadata metadata = @@ -130,7 +130,7 @@ private Stream getInternalDataFiles(Stream }); } - private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) { + private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo parquetFile) { return InternalDataFile.builder() .physicalPath(parquetFile.getPath().toString()) .partitionValues( @@ -155,15 +155,11 @@ public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync syncIns @Override public TableChange getTableChangeForCommit(Long modificationTime) { Set addedInternalDataFiles = new HashSet<>(); - - List filesMetadata = - parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getOrLoadParquetFiles(), modificationTime); - List tableChangesAfterMetadata = - parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getOrLoadParquetFiles(), modificationTime); - InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterMetadata.stream()); - for (ParquetFileConfig fileMetadata : filesMetadata) { + List tableChangesAfterModificationTime = + parquetDataManager.getParquetFilesMetadataAfterTime(hadoopConf, modificationTime); + InternalTable internalTable = + getMostRecentTableConfig(tableChangesAfterModificationTime.stream()); + for (ParquetFileInfo fileMetadata : tableChangesAfterModificationTime) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); addedInternalDataFiles.add(currentDataFile); } @@ -171,55 +167,46 @@ public TableChange getTableChangeForCommit(Long modificationTime) { return TableChange.builder() .sourceIdentifier( getCommitIdentifier( - parquetDataManagerExtractor - .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) - .getModificationTime())) + parquetDataManager.getMostRecentParquetFile().getModificationTime())) .tableAsOfChange(internalTable) 
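        // The diff built below carries additions only: every parquet file whose modification time is
        // at or after the requested commit time is surfaced as an added file, and this source reports
        // no removals or updates.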
.filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) .build(); } - private InternalTable getMostRecentTable( - List parquetFiles, Configuration conf) { - ParquetFileConfig latestFile = - parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles, conf); + private InternalTable getMostRecentTable() { + ParquetFileInfo latestFile = parquetDataManager.getMostRecentParquetFile(); return createInternalTableFromFile(latestFile); } - private InternalTable getMostRecentTableConfig(Stream parquetFiles) { - ParquetFileConfig latestFile = - parquetDataManagerExtractor.getMostRecentParquetFileConfig(parquetFiles); + private InternalTable getMostRecentTableConfig(Stream parquetFiles) { + ParquetFileInfo latestFile = parquetDataManager.getMostRecentParquetFileConfig(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { - return getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); + return getMostRecentTable(); } @Override public InternalSnapshot getCurrentSnapshot() { Stream internalDataFiles = - getInternalDataFiles( - parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf)); - InternalTable table = getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); + getInternalDataFiles(parquetDataManager.getCurrentFileInfo()); + InternalTable table = getMostRecentTable(); return InternalSnapshot.builder() .table(table) .sourceIdentifier( getCommitIdentifier( - parquetDataManagerExtractor - .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) - .getModificationTime())) + parquetDataManager.getMostRecentParquetFile().getModificationTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { - Stream parquetFilesMetadata = - parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf); + Stream parquetFilesMetadata = parquetDataManager.getCurrentFileInfo(); LongSummaryStatistics stats = - parquetFilesMetadata.mapToLong(ParquetFileConfig::getModificationTime).summaryStatistics(); + parquetFilesMetadata.mapToLong(ParquetFileInfo::getModificationTime).summaryStatistics(); if (stats.getCount() == 0) { log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index c6372c198..3e1ea5081 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -26,12 +26,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import lombok.Getter; +import lombok.RequiredArgsConstructor; import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import org.apache.hadoop.util.functional.RemoteIterators; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.xtable.exception.ReadException; @@ -42,59 +43,37 @@ * parquet files and filtering the files based on the modification times. 
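 *
 * <p>Illustrative usage sketch only; {@code hadoopConf} and {@code basePath} are assumed to be
 * supplied by the enclosing conversion source:
 *
 * <pre>{@code
 * ParquetDataManager manager = new ParquetDataManager(hadoopConf, basePath);
 * ParquetFileInfo latest = manager.getMostRecentParquetFile();
 * long lastCommitMillis = latest.getModificationTime();
 * }</pre>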
*/ @Log4j2 +@RequiredArgsConstructor public class ParquetDataManager { - public static final ParquetDataManager INSTANCE = new ParquetDataManager(); + private final Configuration hadoopConf; + private final String basePath; - public static ParquetDataManager getInstance() { - return INSTANCE; - } + @Getter(lazy = true) + private final List parquetFiles = loadParquetFiles(hadoopConf, basePath); - public ParquetFileConfig getMostRecentParquetFile( - List parquetFiles, Configuration conf) { + ParquetFileInfo getMostRecentParquetFile() { LocatedFileStatus file = - parquetFiles.stream() + getParquetFiles().stream() .max(Comparator.comparing(LocatedFileStatus::getModificationTime)) .orElseThrow(() -> new IllegalStateException("No files found")); - return getConfigFromFile(file, conf); + return new ParquetFileInfo(hadoopConf, file); } - public ParquetFileConfig getMostRecentParquetFileConfig(Stream parquetFiles) { + ParquetFileInfo getMostRecentParquetFileConfig(Stream parquetFiles) { return parquetFiles - .max(Comparator.comparing(ParquetFileConfig::getModificationTime)) + .max(Comparator.comparing(ParquetFileInfo::getModificationTime)) .orElseThrow(() -> new IllegalStateException("No files found")); } - public LocatedFileStatus getParquetDataFileAt( - List parquetFiles, long targetTime) { + public LocatedFileStatus getParquetDataFileAt(long targetTime) { - return parquetFiles.stream() + return getParquetFiles().stream() .filter(file -> file.getModificationTime() >= targetTime) .findFirst() .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } - public ParquetFileConfig getConfigFromFile(LocatedFileStatus file, Configuration conf) { - - Path path = file.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(file.getLen()) - .modificationTime(file.getModificationTime()) - .rowGroupIndex(metadata.getBlocks().size()) - .codec( - metadata.getBlocks().isEmpty() - ? null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - } - - public List getParquetFiles(Configuration hadoopConf, String basePath) { + private List loadParquetFiles(Configuration hadoopConf, String basePath) { try { FileSystem fs = FileSystem.get(hadoopConf); URI uriBasePath = new URI(basePath); @@ -108,49 +87,15 @@ public List getParquetFiles(Configuration hadoopConf, String } } - public Stream getConfigsFromStream( - List fileStream, Configuration conf) { - - return fileStream.stream() - .map( - fileStatus -> { - Path path = fileStatus.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(fileStatus.getLen()) - .modificationTime(fileStatus.getModificationTime()) - .rowGroupIndex(metadata.getBlocks().size()) - .codec( - metadata.getBlocks().isEmpty() - ? 
null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - }); + Stream getCurrentFileInfo() { + return getParquetFiles().stream() + .map(fileStatus -> new ParquetFileInfo(hadoopConf, fileStatus)); } - public List getParquetFilesMetadataInRange( - Configuration conf, Stream parquetFiles, long startTime, long endTime) { - - return parquetFiles - .filter( - file -> - file.getModificationTime() >= startTime && file.getModificationTime() <= endTime) - .map(file -> new ParquetFileConfig(conf, file)) - .collect(Collectors.toList()); - } - - public List getParquetFilesMetadataAfterTime( - Configuration conf, List parquetFiles, long syncTime) { - - return parquetFiles.stream() + List getParquetFilesMetadataAfterTime(Configuration conf, long syncTime) { + return getParquetFiles().stream() .filter(file -> file.getModificationTime() >= syncTime) - .map(file -> new ParquetFileConfig(conf, file)) + .map(file -> new ParquetFileInfo(conf, file)) .collect(Collectors.toList()); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java deleted file mode 100644 index ec40a79f5..000000000 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.xtable.parquet; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.experimental.FieldDefaults; -import lombok.extern.log4j.Log4j2; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.schema.MessageType; - -import org.apache.xtable.exception.ReadException; - -@Log4j2 -@Getter -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -@Builder -@AllArgsConstructor -class ParquetFileConfig { - MessageType schema; - ParquetMetadata metadata; - long rowGroupIndex; - long modificationTime; - long size; - CompressionCodecName codec; - Path path; - - public ParquetFileConfig(Configuration conf, FileStatus file) { - long modificationTime = -1L; - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file.getPath()); - - if (metadata.getBlocks().isEmpty()) { - throw new IllegalStateException("Parquet file contains no row groups."); - } - try { - modificationTime = file.getModificationTime(); - } catch (ReadException e) { - log.warn("File reading error: " + e.getMessage()); - } - this.path = file.getPath(); - this.modificationTime = modificationTime; - this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); - this.metadata = metadata; - this.schema = metadata.getFileMetaData().getSchema(); - this.rowGroupIndex = metadata.getBlocks().size(); - this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); - } -} diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileInfo.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileInfo.java new file mode 100644 index 000000000..539349edb --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileInfo.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.parquet; + +import lombok.Getter; +import lombok.Value; +import lombok.extern.log4j.Log4j2; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; + +@Log4j2 +@Value +class ParquetFileInfo { + @Getter(lazy = true) + ParquetMetadata metadata = getMetadataInternal(); + + long modificationTime; + long size; + Path path; + Configuration conf; + + public ParquetFileInfo(Configuration conf, FileStatus file) { + this.modificationTime = file.getModificationTime(); + this.path = file.getPath(); + this.size = file.getLen(); + this.conf = conf; + } + + private ParquetMetadata getMetadataInternal() { + return ParquetMetadataExtractor.readParquetMetadata(conf, path); + } +} diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 08e11ed93..0d140c764 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -149,8 +149,6 @@ public void testAppendParquetFileMultiplePartition() throws IOException { ParquetConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); - // long newModifTime = System.currentTimeMillis() - 50000; - for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(outputPath, partition); @@ -162,11 +160,9 @@ public void testAppendParquetFileMultiplePartition() throws IOException { if (fileStatus.getModificationTime() > newModifTime) { fs.setTimes(fileStatus.getPath(), newModifTime, -1); } else { - fs.setTimes(fileStatus.getPath(), targetModifTime, -1); } } - fs.setTimes(partitionPath, newModifTime, -1); } @@ -189,10 +185,9 @@ public void testAppendParquetFileMultiplePartition() throws IOException { private void updateModificationTimeRecursive( FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { - org.apache.hadoop.fs.RemoteIterator it = - fs.listFiles(path, true); + RemoteIterator it = fs.listFiles(path, true); while (it.hasNext()) { - org.apache.hadoop.fs.LocatedFileStatus status = it.next(); + LocatedFileStatus status = it.next(); if (status.getPath().getName().endsWith(".parquet")) { fs.setTimes(status.getPath(), time, -1); } From 64a5a2dcded503c80c158198b95d57cdb4da5476 Mon Sep 17 00:00:00 2001 From: Timothy Brown Date: Wed, 11 Feb 2026 20:33:12 -0500 Subject: [PATCH 056/100] add TestParquetDataManager, simplify data manager methods --- .../parquet/ParquetConversionSource.java | 48 +- .../xtable/parquet/ParquetDataManager.java | 43 +- .../parquet/ITParquetConversionSource.java | 129 +++++ .../xtable/parquet/ITParquetDataManager.java | 203 -------- .../parquet/TestParquetDataManager.java | 453 ++++++++++++++++++ 5 files changed, 627 insertions(+), 249 deletions(-) delete mode 100644 xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java create mode 100644 xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 3ef6e85b9..5768dac03 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ 
b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -19,7 +19,12 @@ package org.apache.xtable.parquet; import java.time.Instant; -import java.util.*; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.OptionalLong; +import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; import lombok.Builder; @@ -104,8 +109,7 @@ private InternalTable createInternalTableFromFile(ParquetFileInfo latestFile) { @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - LocatedFileStatus parquetFile = parquetDataManager.getParquetDataFileAt(modificationTime); - ParquetFileInfo file = new ParquetFileInfo(hadoopConf, parquetFile); + ParquetFileInfo file = parquetDataManager.getParquetDataFileAt(modificationTime); return createInternalTableFromFile(file); } @@ -154,15 +158,15 @@ public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync syncIns @Override public TableChange getTableChangeForCommit(Long modificationTime) { - Set addedInternalDataFiles = new HashSet<>(); + List tableChangesAfterModificationTime = - parquetDataManager.getParquetFilesMetadataAfterTime(hadoopConf, modificationTime); + parquetDataManager.getParquetFilesMetadataAfterTime(modificationTime); InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterModificationTime.stream()); - for (ParquetFileInfo fileMetadata : tableChangesAfterModificationTime) { - InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); - addedInternalDataFiles.add(currentDataFile); - } + Set addedInternalDataFiles = + tableChangesAfterModificationTime.stream() + .map(this::createInternalDataFileFromParquetFile) + .collect(Collectors.toSet()); return TableChange.builder() .sourceIdentifier( @@ -179,7 +183,10 @@ private InternalTable getMostRecentTable() { } private InternalTable getMostRecentTableConfig(Stream parquetFiles) { - ParquetFileInfo latestFile = parquetDataManager.getMostRecentParquetFileConfig(parquetFiles); + ParquetFileInfo latestFile = + parquetFiles + .max(Comparator.comparing(ParquetFileInfo::getModificationTime)) + .orElseThrow(() -> new IllegalStateException("No files found")); return createInternalTableFromFile(latestFile); } @@ -205,35 +212,26 @@ public InternalSnapshot getCurrentSnapshot() { @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { Stream parquetFilesMetadata = parquetDataManager.getCurrentFileInfo(); - LongSummaryStatistics stats = - parquetFilesMetadata.mapToLong(ParquetFileInfo::getModificationTime).summaryStatistics(); + OptionalLong earliestModTimeOpt = + parquetFilesMetadata.mapToLong(ParquetFileInfo::getModificationTime).min(); - if (stats.getCount() == 0) { + if (!earliestModTimeOpt.isPresent()) { log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName); return false; } - long earliestModTime = stats.getMin(); - long latestModTime = stats.getMax(); - - if (timeInMillis.toEpochMilli() > latestModTime) { - log.warn( - "Instant {} is in the future relative to the data. Latest file time: {}", - timeInMillis.toEpochMilli(), - Instant.ofEpochMilli(latestModTime)); - return false; - } + long earliestModTime = earliestModTimeOpt.getAsLong(); if (earliestModTime > timeInMillis.toEpochMilli()) { log.warn( "Incremental sync is not safe. Earliest available metadata (time={}) is newer " - + "than requested instant {}. 
Data history has been truncated.", + + "than requested instant {}.", Instant.ofEpochMilli(earliestModTime), timeInMillis.toEpochMilli()); return false; } - log.info( + log.debug( "Incremental sync is safe from instant {} for table {}", timeInMillis.toEpochMilli(), tableName); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 3e1ea5081..84cb46690 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -20,12 +20,11 @@ import java.io.IOException; import java.net.URI; -import java.net.URISyntaxException; -import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; +import lombok.AccessLevel; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.extern.log4j.Log4j2; @@ -47,9 +46,20 @@ public class ParquetDataManager { private final Configuration hadoopConf; private final String basePath; + private final FileSystem fileSystem; - @Getter(lazy = true) - private final List parquetFiles = loadParquetFiles(hadoopConf, basePath); + public ParquetDataManager(Configuration hadoopConf, String basePath) { + this.hadoopConf = hadoopConf; + this.basePath = basePath; + try { + this.fileSystem = FileSystem.get(URI.create(basePath), hadoopConf); + } catch (IOException e) { + throw new ReadException("Unable to initialize file system for base path: " + basePath, e); + } + } + + @Getter(value = AccessLevel.PRIVATE, lazy = true) + private final List parquetFiles = loadParquetFiles(); ParquetFileInfo getMostRecentParquetFile() { LocatedFileStatus file = @@ -59,30 +69,21 @@ ParquetFileInfo getMostRecentParquetFile() { return new ParquetFileInfo(hadoopConf, file); } - ParquetFileInfo getMostRecentParquetFileConfig(Stream parquetFiles) { - return parquetFiles - .max(Comparator.comparing(ParquetFileInfo::getModificationTime)) - .orElseThrow(() -> new IllegalStateException("No files found")); - } - - public LocatedFileStatus getParquetDataFileAt(long targetTime) { - + ParquetFileInfo getParquetDataFileAt(long targetTime) { return getParquetFiles().stream() .filter(file -> file.getModificationTime() >= targetTime) - .findFirst() + .min(Comparator.comparing(LocatedFileStatus::getModificationTime)) + .map(file -> new ParquetFileInfo(hadoopConf, file)) .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } - private List loadParquetFiles(Configuration hadoopConf, String basePath) { + private List loadParquetFiles() { try { - FileSystem fs = FileSystem.get(hadoopConf); - URI uriBasePath = new URI(basePath); - String parentPath = Paths.get(uriBasePath).toString(); - RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); + RemoteIterator iterator = fileSystem.listFiles(new Path(basePath), true); return RemoteIterators.toList(iterator).stream() .filter(file -> file.getPath().getName().endsWith("parquet")) .collect(Collectors.toList()); - } catch (IOException | URISyntaxException e) { + } catch (IOException e) { throw new ReadException("Unable to read files from file system", e); } } @@ -92,10 +93,10 @@ Stream getCurrentFileInfo() { .map(fileStatus -> new ParquetFileInfo(hadoopConf, fileStatus)); } - List getParquetFilesMetadataAfterTime(Configuration conf, long syncTime) { + List getParquetFilesMetadataAfterTime(long syncTime) { return getParquetFiles().stream() 
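        // getParquetFiles() is the Lombok-generated lazy getter backed by loadParquetFiles(), so the
        // recursive listing under basePath runs at most once per ParquetDataManager instance;
        // subsequent calls only re-stream the cached list.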
.filter(file -> file.getModificationTime() >= syncTime) - .map(file -> new ParquetFileInfo(conf, file)) + .map(file -> new ParquetFileInfo(hadoopConf, file)) .collect(Collectors.toList()); } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 6dda9db11..ace3f4173 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -18,16 +18,21 @@ package org.apache.xtable.parquet; +import static org.apache.spark.sql.functions.expr; import static org.apache.xtable.GenericTable.getTableName; import static org.apache.xtable.model.storage.TableFormat.*; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; import java.sql.Timestamp; import java.time.Duration; +import java.time.Instant; import java.util.*; import java.util.Arrays; import java.util.Collections; @@ -43,6 +48,10 @@ import lombok.Builder; import lombok.Value; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; @@ -52,6 +61,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -67,6 +77,10 @@ import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.conversion.TargetTable; import org.apache.xtable.hudi.HudiTestUtil; +import org.apache.xtable.model.InternalSnapshot; +import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.TableChange; +import org.apache.xtable.model.storage.TableFormat; import org.apache.xtable.model.sync.SyncMode; public class ITParquetConversionSource { @@ -350,6 +364,121 @@ public void testFilePartitionedData(TableFormatPartitionDataHolder tableFormatPa } } + @Test + public void testIncrementalSyncWithMultiplePartitions() throws IOException { + + Configuration conf = sparkSession.sparkContext().hadoopConfiguration(); + + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList( + RowFactory.create(100, "A", 2026, 12), + RowFactory.create(101, "AA", 2026, 12), + RowFactory.create(102, "CB", 2027, 11), + RowFactory.create(103, "BA", 2027, 11)); + + Dataset dfInit = sparkSession.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); + String outputPath = fixedPath.toString(); + Dataset df = dfInit.withColumn("full_date", expr("make_date(year, month, 
1)")); + df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + FileSystem fs = FileSystem.get(fixedPath.toUri(), conf); + // set the modification time to the table file + // update modificationTime for file to append + // many partitions case + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + long targetModificationTime = System.currentTimeMillis() - 360000; + long newModificationTime = System.currentTimeMillis() - 50000; + long testTime = System.currentTimeMillis() - 90000; // between two prev times + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, targetModificationTime); + } + } + // create new file to append using Spark + List futureDataToSync = + Arrays.asList( + RowFactory.create(101, "A", 2026, 12), + RowFactory.create(301, "D", 2027, 11), + RowFactory.create(302, "DA", 2027, 11)); + Dataset dfToSyncInit = sparkSession.createDataFrame(futureDataToSync, schema); + Dataset dfToSync = dfToSyncInit.withColumn("full_date", expr("make_date(year, month, 1)")); + dfToSync.coalesce(1).write().partitionBy("year", "month").mode("append").parquet(outputPath); + + // conversionSource operations + Properties sourceProperties = new Properties(); + String partitionConfig = "full_date:MONTH:year=yyyy/month=MM"; + sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); + SourceTable tableConfig = + SourceTable.builder() + .name("parquet_table_test_2") + .basePath(fixedPath.toAbsolutePath().toUri().toString()) + .additionalProperties(sourceProperties) + .formatName(TableFormat.PARQUET) + .build(); + + ParquetConversionSourceProvider conversionSourceProvider = + new ParquetConversionSourceProvider(); + conversionSourceProvider.init(conf); + ParquetConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + + RemoteIterator it = fs.listFiles(partitionPath, false); + while (it.hasNext()) { + LocatedFileStatus fileStatus = it.next(); + + if (fileStatus.getModificationTime() > newModificationTime) { + fs.setTimes(fileStatus.getPath(), newModificationTime, -1); + } else { + fs.setTimes(fileStatus.getPath(), targetModificationTime, -1); + } + } + fs.setTimes(partitionPath, newModificationTime, -1); + } + + InternalTable result = conversionSource.getTable(newModificationTime); + assertEquals( + Instant.ofEpochMilli(newModificationTime).toString(), + result.getLatestCommitTime().toString()); + assertEquals("parquet_table_test_2", result.getName()); + assertEquals(TableFormat.PARQUET, result.getTableFormat()); + assertNotNull(result.getReadSchema()); + InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + assertNotNull(snapshot); + TableChange changes = conversionSource.getTableChangeForCommit(newModificationTime); + assertNotNull(changes); + Instant instantBeforeFirstSnapshot = + Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); + assertEquals(instantBeforeFirstSnapshot.toEpochMilli(), newModificationTime); + assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); + } + + private void updateModificationTimeRecursive( + FileSystem fs, 
org.apache.hadoop.fs.Path path, long time) throws IOException { + RemoteIterator it = fs.listFiles(path, true); + while (it.hasNext()) { + LocatedFileStatus status = it.next(); + if (status.getPath().getName().endsWith(".parquet")) { + fs.setTimes(status.getPath(), time, -1); + } + } + } + private void checkDatasetEquivalenceWithFilter( String sourceFormat, GenericTable sourceTable, diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java deleted file mode 100644 index 0d140c764..000000000 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.xtable.parquet; - -import static org.apache.spark.sql.functions.expr; -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.Instant; -import java.util.Arrays; -import java.util.List; -import java.util.Properties; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import org.apache.xtable.conversion.SourceTable; -import org.apache.xtable.model.InternalSnapshot; -import org.apache.xtable.model.InternalTable; -import org.apache.xtable.model.TableChange; -import org.apache.xtable.model.storage.TableFormat; - -public class ITParquetDataManager { - private static SparkSession spark; - private static ParquetConversionSourceProvider conversionSourceProvider; - public static final String PARTITION_FIELD_SPEC_CONFIG = - "xtable.parquet.source.partition_field_spec_config"; - - @BeforeAll - public static void setup() { - spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); - conversionSourceProvider = new ParquetConversionSourceProvider(); - Configuration hadoopConf = new Configuration(); - hadoopConf.set("fs.defaultFS", "file:///"); - conversionSourceProvider.init(hadoopConf); - } - - @Test - public void testAppendParquetFileMultiplePartition() throws IOException { - - Configuration conf = spark.sparkContext().hadoopConfiguration(); - - StructType schema = - DataTypes.createStructType( - new 
StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - List data = - Arrays.asList( - RowFactory.create(100, "A", 2026, 12), - RowFactory.create(101, "AA", 2026, 12), - RowFactory.create(102, "CB", 2027, 11), - RowFactory.create(103, "BA", 2027, 11)); - - Dataset dfInit = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); - String outputPath = fixedPath.toString(); - String finalAppendFilePath = appendFilePath.toString(); - Dataset df = dfInit.withColumn("full_date", expr("make_date(year, month, 1)")); - df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the table file - // update modifTime for file to append - // many partitions case - List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); - long targetModifTime = System.currentTimeMillis() - 360000; - long newModifTime = System.currentTimeMillis() - 50000; - long testTime = System.currentTimeMillis() - 90000; // between two prev times - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(outputPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, targetModifTime); - } - } - // create new file to append using Spark - List futureDataToSync = - Arrays.asList( - RowFactory.create(101, "A", 2026, 12), - RowFactory.create(301, "D", 2027, 11), - RowFactory.create(302, "DA", 2027, 11)); - Dataset dfToSyncInit = spark.createDataFrame(futureDataToSync, schema); - Dataset dfToSync = dfToSyncInit.withColumn("full_date", expr("make_date(year, month, 1)")); - dfToSync - .coalesce(1) - .write() - .partitionBy("year", "month") - .mode("overwrite") - .parquet(finalAppendFilePath); - - Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); - dfWithNewTimes - .coalesce(1) - .write() - .partitionBy("year", "month") - .mode("append") - .parquet(outputPath); - fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); - // conversionSource operations - Properties sourceProperties = new Properties(); - String partitionConfig = "full_date:MONTH:year=yyyy/month=MM"; - sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); - SourceTable tableConfig = - SourceTable.builder() - .name("parquet_table_test_2") - .basePath(fixedPath.toAbsolutePath().toUri().toString()) - .additionalProperties(sourceProperties) - .formatName(TableFormat.PARQUET) - .build(); - - ParquetConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(outputPath, partition); - - RemoteIterator it = fs.listFiles(partitionPath, false); - while (it.hasNext()) { - LocatedFileStatus fileStatus = it.next(); - - if (fileStatus.getModificationTime() > newModifTime) { - fs.setTimes(fileStatus.getPath(), newModifTime, -1); - } 
else { - fs.setTimes(fileStatus.getPath(), targetModifTime, -1); - } - } - fs.setTimes(partitionPath, newModifTime, -1); - } - - InternalTable result = conversionSource.getTable(newModifTime); - assertEquals( - Instant.ofEpochMilli(newModifTime).toString(), result.getLatestCommitTime().toString()); - assertNotNull(result); - assertEquals("parquet_table_test_2", result.getName()); - assertEquals(TableFormat.PARQUET, result.getTableFormat()); - assertNotNull(result.getReadSchema()); - InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - assertNotNull(snapshot); - TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); - assertNotNull(changes); - Instant instantBeforeFirstSnapshot = - Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); - assertEquals(instantBeforeFirstSnapshot.toEpochMilli(), newModifTime); - assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); - } - - private void updateModificationTimeRecursive( - FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { - RemoteIterator it = fs.listFiles(path, true); - while (it.hasNext()) { - LocatedFileStatus status = it.next(); - if (status.getPath().getName().endsWith(".parquet")) { - fs.setTimes(status.getPath(), time, -1); - } - } - } - - @AfterAll - public static void tearDown() { - if (spark != null) { - spark.stop(); - } - } -} diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java new file mode 100644 index 000000000..5d68497c9 --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -0,0 +1,453 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.parquet; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.FileTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.RemoteIterator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class TestParquetDataManager { + + private static final Configuration CONF = new Configuration(); + + // Helper method to create a mock LocatedFileStatus + private static LocatedFileStatus createMockFileStatus(String name, long modTime) { + LocatedFileStatus mockStatus = mock(LocatedFileStatus.class); + org.apache.hadoop.fs.Path mockPath = mock(org.apache.hadoop.fs.Path.class); + when(mockPath.getName()).thenReturn(name); + when(mockStatus.getPath()).thenReturn(mockPath); + when(mockStatus.getModificationTime()).thenReturn(modTime); + return mockStatus; + } + + private static class StubbedRemoteIterator implements RemoteIterator { + private final Iterator files; + + public StubbedRemoteIterator(List files) { + this.files = files.iterator(); + } + + @Override + public boolean hasNext() { + return files.hasNext(); + } + + @Override + public LocatedFileStatus next() { + return files.next(); + } + } + + // Helper method to create a mock FileSystem + private static FileSystem createMockFileSystem(RemoteIterator iterator) + throws IOException { + FileSystem mockFs = mock(FileSystem.class); + when(mockFs.listFiles(any(org.apache.hadoop.fs.Path.class), eq(true))).thenReturn(iterator); + return mockFs; + } + + @Test + void testGetMostRecentParquetFile_withMultipleFiles() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 3000L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 2000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + ParquetFileInfo result = manager.getMostRecentParquetFile(); + + assertNotNull(result); + assertEquals(3000L, result.getModificationTime()); + } + + @Test + void testGetMostRecentParquetFile_noFiles() throws IOException { + RemoteIterator iterator = new StubbedRemoteIterator(new ArrayList<>()); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + IllegalStateException exception = + assertThrows(IllegalStateException.class, 
manager::getMostRecentParquetFile); + assertEquals("No files found", exception.getMessage()); + } + + @Test + void testGetParquetDataFileAt_exactMatch() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + ParquetFileInfo result = manager.getParquetDataFileAt(2000L); + + assertNotNull(result); + assertEquals(2000L, result.getModificationTime()); + } + + @Test + void testGetParquetDataFileAt_firstAfterTime() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2500L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + ParquetFileInfo result = manager.getParquetDataFileAt(2000L); + + assertNotNull(result); + assertEquals(2500L, result.getModificationTime()); + } + + @Test + void testGetParquetDataFileAt_multipleAfterTime() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2500L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + ParquetFileInfo result = manager.getParquetDataFileAt(2000L); + + assertNotNull(result); + assertEquals(2500L, result.getModificationTime()); + } + + @Test + void testGetParquetDataFileAt_noMatch() throws IOException { + RemoteIterator iterator = new StubbedRemoteIterator(Collections.emptyList()); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + IllegalStateException exception = + assertThrows(IllegalStateException.class, () -> manager.getParquetDataFileAt(5000L)); + assertTrue(exception.getMessage().contains("No file found at or after 5000")); + } + + @Test + void testGetParquetDataFileAt_allBefore() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + IllegalStateException exception = + assertThrows(IllegalStateException.class, () -> manager.getParquetDataFileAt(3000L)); + assertTrue(exception.getMessage().contains("No file found at or after 3000")); + } + + @Test + void testGetCurrentFileInfo_multipleFiles() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + 
LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + Stream result = manager.getCurrentFileInfo(); + + assertNotNull(result); + List fileList = result.collect(Collectors.toList()); + assertEquals(3, fileList.size()); + assertEquals(1000L, fileList.get(0).getModificationTime()); + assertEquals(2000L, fileList.get(1).getModificationTime()); + assertEquals(3000L, fileList.get(2).getModificationTime()); + } + + @Test + void testGetCurrentFileInfo_emptyList() throws IOException { + RemoteIterator iterator = new StubbedRemoteIterator(Collections.emptyList()); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + Stream result = manager.getCurrentFileInfo(); + + assertNotNull(result); + assertEquals(0, result.count()); + } + + @Test + void testGetCurrentFileInfo_streamCharacteristics() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + Stream result = manager.getCurrentFileInfo(); + + assertNotNull(result); + assertTrue(result.allMatch(info -> info.getModificationTime() > 0)); + } + + @Test + void testGetParquetFilesMetadataAfterTime_someMatch() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + List result = manager.getParquetFilesMetadataAfterTime(2000L); + + assertNotNull(result); + assertEquals(2, result.size()); + assertEquals(2000L, result.get(0).getModificationTime()); + assertEquals(3000L, result.get(1).getModificationTime()); + } + + @Test + void testGetParquetFilesMetadataAfterTime_allMatch() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 2000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + List result = manager.getParquetFilesMetadataAfterTime(1000L); + + assertNotNull(result); + assertEquals(2, result.size()); + } + + @Test + void testGetParquetFilesMetadataAfterTime_noneMatch() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + List result = 
manager.getParquetFilesMetadataAfterTime(5000L); + + assertNotNull(result); + assertEquals(0, result.size()); + } + + @Test + void testGetParquetFilesMetadataAfterTime_exactTimeMatch() throws IOException { + LocatedFileStatus file1 = createMockFileStatus("file1.parquet", 1000L); + LocatedFileStatus file2 = createMockFileStatus("file2.parquet", 2000L); + LocatedFileStatus file3 = createMockFileStatus("file3.parquet", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(file1, file2, file3)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + List result = manager.getParquetFilesMetadataAfterTime(2000L); + + assertNotNull(result); + assertEquals(2, result.size()); + assertEquals(2000L, result.get(0).getModificationTime()); + assertEquals(3000L, result.get(1).getModificationTime()); + } + + @Test + void testGetParquetFiles_caching() throws IOException { + LocatedFileStatus file = createMockFileStatus("file.parquet", 1000L); + RemoteIterator iterator = + new StubbedRemoteIterator(Collections.singletonList(file)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + // No filesystem access should happen yet + verify(mockFs, never()).listFiles(any(org.apache.hadoop.fs.Path.class), anyBoolean()); + + // Access multiple times + manager.getCurrentFileInfo(); + manager.getMostRecentParquetFile(); + manager.getParquetFilesMetadataAfterTime(0L); + + // Verify filesystem was accessed only once + verify(mockFs, times(1)).listFiles(any(org.apache.hadoop.fs.Path.class), anyBoolean()); + } + + @Test + void testOnlyParquetFilesIncluded() throws IOException { + LocatedFileStatus parquetFile = createMockFileStatus("data.parquet", 1000L); + LocatedFileStatus txtFile = createMockFileStatus("readme.txt", 2000L); + LocatedFileStatus jsonFile = createMockFileStatus("config.json", 3000L); + + RemoteIterator iterator = + new StubbedRemoteIterator(Arrays.asList(parquetFile, txtFile, jsonFile)); + FileSystem mockFs = createMockFileSystem(iterator); + + ParquetDataManager manager = new ParquetDataManager(CONF, "test-path", mockFs); + + List result = manager.getParquetFilesMetadataAfterTime(0L); + + assertEquals(1, result.size()); + assertEquals(1000L, result.get(0).getModificationTime()); + } + + @Test + void testWithRealFileSystem_multipleFiles(@TempDir Path tempDir) throws IOException { + // Create multiple parquet files with different modification times + Path file1 = tempDir.resolve("data1.parquet"); + Path file2 = tempDir.resolve("data2.parquet"); + Path file3 = tempDir.resolve("data3.parquet"); + + Files.createFile(file1); + Files.createFile(file2); + Files.createFile(file3); + + // Set different modification times + Files.setLastModifiedTime(file1, FileTime.fromMillis(1000L)); + Files.setLastModifiedTime(file2, FileTime.fromMillis(3000L)); + Files.setLastModifiedTime(file3, FileTime.fromMillis(2000L)); + + ParquetDataManager manager = new ParquetDataManager(CONF, tempDir.toString()); + + ParquetFileInfo mostRecent = manager.getMostRecentParquetFile(); + assertNotNull(mostRecent); + assertEquals(3000L, mostRecent.getModificationTime()); + + List afterTime = manager.getParquetFilesMetadataAfterTime(2000L); + assertEquals(2, afterTime.size()); + } + + @Test + void testWithRealFileSystem_nestedDirectories(@TempDir Path tempDir) throws IOException { + // Create nested directory structure + Path subDir1 = 
tempDir.resolve("subdir1"); + Path subDir2 = tempDir.resolve("subdir2"); + Files.createDirectories(subDir1); + Files.createDirectories(subDir2); + + // Create parquet files in different directories + Path file1 = tempDir.resolve("root.parquet"); + Path file2 = subDir1.resolve("nested1.parquet"); + Path file3 = subDir2.resolve("nested2.parquet"); + + Files.createFile(file1); + Files.createFile(file2); + Files.createFile(file3); + + Files.setLastModifiedTime(file1, FileTime.fromMillis(1000L)); + Files.setLastModifiedTime(file2, FileTime.fromMillis(2000L)); + Files.setLastModifiedTime(file3, FileTime.fromMillis(3000L)); + ParquetDataManager manager = new ParquetDataManager(CONF, tempDir.toString()); + + List allFiles = manager.getParquetFilesMetadataAfterTime(0L); + assertEquals(3, allFiles.size()); + + ParquetFileInfo mostRecent = manager.getMostRecentParquetFile(); + assertEquals(3000L, mostRecent.getModificationTime()); + } + + @Test + void testWithRealFileSystem_mixedFileTypes(@TempDir Path tempDir) throws IOException { + // Create mix of parquet and non-parquet files + Path parquetFile1 = tempDir.resolve("data1.parquet"); + Path parquetFile2 = tempDir.resolve("data2.parquet"); + Path txtFile = tempDir.resolve("readme.txt"); + Path jsonFile = tempDir.resolve("config.json"); + + Files.createFile(parquetFile1); + Files.createFile(parquetFile2); + Files.createFile(txtFile); + Files.createFile(jsonFile); + + Files.setLastModifiedTime(parquetFile1, FileTime.fromMillis(1000L)); + Files.setLastModifiedTime(parquetFile2, FileTime.fromMillis(2000L)); + Files.setLastModifiedTime(txtFile, FileTime.fromMillis(3000L)); + Files.setLastModifiedTime(jsonFile, FileTime.fromMillis(4000L)); + + ParquetDataManager manager = new ParquetDataManager(CONF, tempDir.toString()); + + List allFiles = manager.getParquetFilesMetadataAfterTime(0L); + // Only parquet files should be included + assertEquals(2, allFiles.size()); + + ParquetFileInfo mostRecent = manager.getMostRecentParquetFile(); + // Most recent parquet file, not the txt or json + assertEquals(2000L, mostRecent.getModificationTime()); + } +} From 2135ccf2b87b75bc23b3e23670f346ec8f29058c Mon Sep 17 00:00:00 2001 From: Timothy Brown Date: Wed, 11 Feb 2026 21:28:30 -0500 Subject: [PATCH 057/100] make IT parameterized, add incremental sync --- .../parquet/ITParquetConversionSource.java | 188 ++++++------------ 1 file changed, 59 insertions(+), 129 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index ace3f4173..4c0ed2ab7 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -114,27 +114,19 @@ public static void teardown() { sparkSession = null; } } - // delimiter must be / and not - or any other one - private static Stream provideArgsForFilePartitionTesting() { - String partitionConfig = // "timestamp:YEAR:year=yyyy"; - "timestamp:MONTH:year=yyyy/month=MM"; // or "timestamp:YEAR:year=yyyy", or // - // timestamp:DAY:year=yyyy/month=MM/day=dd - return Stream.of( - Arguments.of( - buildArgsForPartition( - PARQUET, Arrays.asList(ICEBERG, DELTA, HUDI), partitionConfig, partitionConfig))); - } private static TableFormatPartitionDataHolder buildArgsForPartition( String sourceFormat, List targetFormats, String hudiPartitionConfig, - String xTablePartitionConfig) { + String 
xTablePartitionConfig, + SyncMode syncMode) { return TableFormatPartitionDataHolder.builder() .sourceTableFormat(sourceFormat) .targetTableFormats(targetFormats) .hudiSourceConfig(Optional.ofNullable(hudiPartitionConfig)) .xTablePartitionConfig(xTablePartitionConfig) + .syncMode(syncMode) .build(); } @@ -179,12 +171,12 @@ private static ConversionConfig getTableSyncConfig( .build(); } - private static Stream provideArgsForFileNonPartitionTesting() { - String partitionConfig = null; - return Stream.of( - Arguments.of( - buildArgsForPartition( - PARQUET, Arrays.asList(ICEBERG, DELTA, HUDI), partitionConfig, partitionConfig))); + private static Stream provideArgsForSyncTesting() { + List partitionConfigs = Arrays.asList(null, "timestamp:MONTH:year=yyyy/month=MM"); + return partitionConfigs.stream().flatMap(partitionConfig -> + Arrays.stream(SyncMode.values()).map(syncMode -> Arguments.of( + buildArgsForPartition( + PARQUET, Arrays.asList(ICEBERG, DELTA, HUDI), partitionConfig, partitionConfig, syncMode)))); } private ConversionSourceProvider getConversionSourceProvider(String sourceTableFormat) { @@ -199,9 +191,8 @@ private ConversionSourceProvider getConversionSourceProvider(String sourceTab } @ParameterizedTest - @MethodSource("provideArgsForFileNonPartitionTesting") - public void testFileNonPartitionedData( - TableFormatPartitionDataHolder tableFormatPartitionDataHolder) throws URISyntaxException { + @MethodSource("provideArgsForSyncTesting") + void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { String tableName = getTableName(); String sourceTableFormat = tableFormatPartitionDataHolder.getSourceTableFormat(); List targetTableFormats = tableFormatPartitionDataHolder.getTargetTableFormats(); @@ -235,17 +226,15 @@ public void testFileNonPartitionedData( new MetadataBuilder().putString("precision", "millis").build()) }); Dataset df = sparkSession.createDataFrame(data, schema); - String dataPath = tempDir.toAbsolutePath().toString() + "/non_partitioned_data"; - df.write().mode(SaveMode.Overwrite).parquet(dataPath); - GenericTable table; - table = - GenericTable.getInstance( - tableName, Paths.get(dataPath), sparkSession, jsc, sourceTableFormat, false); - try (GenericTable tableToClose = table) { + String dataPath = tempDir.toAbsolutePath() + (xTablePartitionConfig == null ? 
"/non_partitioned_data_" : "/partitioned_data_") + tableFormatPartitionDataHolder.getSyncMode(); + writeData(df, dataPath, xTablePartitionConfig); + boolean isPartitioned = xTablePartitionConfig != null; + try (GenericTable table = GenericTable.getInstance( + tableName, Paths.get(dataPath), sparkSession, jsc, sourceTableFormat, isPartitioned)) { ConversionConfig conversionConfig = getTableSyncConfig( sourceTableFormat, - SyncMode.FULL, + tableFormatPartitionDataHolder.getSyncMode(), tableName, table, targetTableFormats, @@ -254,77 +243,11 @@ public void testFileNonPartitionedData( ConversionController conversionController = new ConversionController(jsc.hadoopConfiguration()); conversionController.sync(conversionConfig, conversionSourceProvider); - checkDatasetEquivalenceWithFilter(sourceTableFormat, tableToClose, targetTableFormats, false); - } - } - - @ParameterizedTest - @MethodSource("provideArgsForFilePartitionTesting") - public void testFilePartitionedData(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) - throws URISyntaxException { - String tableName = getTableName(); - String sourceTableFormat = tableFormatPartitionDataHolder.getSourceTableFormat(); - List targetTableFormats = tableFormatPartitionDataHolder.getTargetTableFormats(); - String xTablePartitionConfig = tableFormatPartitionDataHolder.getXTablePartitionConfig(); - ConversionSourceProvider conversionSourceProvider = - getConversionSourceProvider(sourceTableFormat); - // create the data - List data = - Arrays.asList( - RowFactory.create(1, "Alice", true, 30.1, new Timestamp(System.currentTimeMillis())), - RowFactory.create( - 2, "Bob", false, 24.6, new Timestamp(System.currentTimeMillis() + 1000)), - RowFactory.create( - 3, "Charlie", true, 35.2, new Timestamp(System.currentTimeMillis() + 2000)), - RowFactory.create( - 4, "David", false, 29.5, new Timestamp(System.currentTimeMillis() + 3000)), - RowFactory.create( - 5, "Eve", true, 22.2, new Timestamp(System.currentTimeMillis() + 4000))); + checkDatasetEquivalenceWithFilter(sourceTableFormat, table, targetTableFormats, isPartitioned); - schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("name", DataTypes.StringType, false), - DataTypes.createStructField("hasSiblings", DataTypes.BooleanType, false), - DataTypes.createStructField("age", DataTypes.DoubleType, false), - DataTypes.createStructField( - "timestamp", - DataTypes.TimestampType, - false, - new MetadataBuilder().putString("precision", "millis").build()) - }); - Dataset df = sparkSession.createDataFrame(data, schema); - String dataPathPart = tempDir.toAbsolutePath() + "/partitioned_data"; - df.withColumn("year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))) - .withColumn( - "month", - functions.date_format(functions.col("timestamp").cast(DataTypes.TimestampType), "MM")) - .write() - .mode(SaveMode.Overwrite) - .partitionBy("year", "month") - .parquet(dataPathPart); - GenericTable table; - table = - GenericTable.getInstance( - tableName, Paths.get(dataPathPart), sparkSession, jsc, sourceTableFormat, true); - try (GenericTable tableToClose = table) { - ConversionConfig conversionConfig = - getTableSyncConfig( - sourceTableFormat, - SyncMode.FULL, - tableName, - table, - targetTableFormats, - xTablePartitionConfig, - null); - ConversionController conversionController = - new ConversionController(jsc.hadoopConfiguration()); - conversionController.sync(conversionConfig, 
conversionSourceProvider); - checkDatasetEquivalenceWithFilter(sourceTableFormat, tableToClose, targetTableFormats, true); // update the current parquet file data with another attribute the sync again List dataToAppend = - Arrays.asList( + Collections.singletonList( RowFactory.create( 10, "BobAppended", @@ -333,34 +256,41 @@ public void testFilePartitionedData(TableFormatPartitionDataHolder tableFormatPa new Timestamp(System.currentTimeMillis() + 1500))); Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); - dfAppend - .withColumn( - "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))) - .withColumn( + writeData(dfAppend, dataPath, xTablePartitionConfig); + + ConversionConfig conversionConfigAppended = + getTableSyncConfig( + sourceTableFormat, + tableFormatPartitionDataHolder.getSyncMode(), + tableName, + table, + targetTableFormats, + xTablePartitionConfig, + null); + ConversionController conversionControllerAppended = + new ConversionController(jsc.hadoopConfiguration()); + conversionControllerAppended.sync(conversionConfigAppended, conversionSourceProvider); + checkDatasetEquivalenceWithFilter(sourceTableFormat, table, targetTableFormats, isPartitioned); + } + } + + private void writeData(Dataset df, String dataPath, String partitionConfig) { + if (partitionConfig != null) { + // extract partition columns from config + String[] partitionCols = Arrays.stream(partitionConfig.split(":")[2].split("/")).map(s -> s.split("=")[0]).toArray(String[]::new); + // add partition columns to dataframe + for (String partitionCol : partitionCols) { + if (partitionCol.equals("year")) { + df = df.withColumn("year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + } else if (partitionCol.equals("month")) { + df = df.withColumn( "month", - functions.date_format(functions.col("timestamp").cast(DataTypes.TimestampType), "MM")) - .write() - .mode(SaveMode.Append) - .partitionBy("year", "month") - .parquet(dataPathPart); - GenericTable tableAppend; - tableAppend = - GenericTable.getInstance( - tableName, Paths.get(dataPathPart), sparkSession, jsc, sourceTableFormat, true); - try (GenericTable tableToCloseAppended = tableAppend) { - ConversionConfig conversionConfigAppended = - getTableSyncConfig( - sourceTableFormat, - SyncMode.FULL, - tableName, - tableAppend, - targetTableFormats, - xTablePartitionConfig, - null); - ConversionController conversionControllerAppended = - new ConversionController(jsc.hadoopConfiguration()); - conversionControllerAppended.sync(conversionConfigAppended, conversionSourceProvider); + functions.date_format(functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } } + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + } else { + df.write().mode(SaveMode.Append).parquet(dataPath); } } @@ -483,8 +413,7 @@ private void checkDatasetEquivalenceWithFilter( String sourceFormat, GenericTable sourceTable, List targetFormats, - boolean isPartitioned) - throws URISyntaxException { + boolean isPartitioned) { checkDatasetEquivalence( sourceFormat, sourceTable, @@ -502,8 +431,7 @@ private void checkDatasetEquivalence( List targetFormats, Map> targetOptions, Integer expectedCount, - boolean isPartitioned) - throws URISyntaxException { + boolean isPartitioned) { Dataset sourceRows = sparkSession .read() @@ -511,7 +439,8 @@ private void checkDatasetEquivalence( .options(sourceOptions) .option("recursiveFileLookup", "true") .option("pathGlobFilter", "*.parquet") - 
.parquet(sourceTable.getDataPath()); + .parquet(sourceTable.getDataPath()) + .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat = targetFormats.stream() .collect( @@ -530,7 +459,8 @@ private void checkDatasetEquivalence( .read() .options(finalTargetOptions) .format(targetFormat.toLowerCase()) - .load(sourceTable.getDataPath()); + .load(sourceTable.getDataPath()) + .orderBy("id"); })); String[] selectColumnsArr = schema.fieldNames(); @@ -543,7 +473,6 @@ private void checkDatasetEquivalence( String format = entry.getKey(); Dataset targetRows = entry.getValue(); - targetRows.show(); List dataset2Rows = targetRows.selectExpr(selectColumnsArr).toJSON().collectAsList(); @@ -587,5 +516,6 @@ private static class TableFormatPartitionDataHolder { String xTablePartitionConfig; Optional hudiSourceConfig; String filter; + SyncMode syncMode; } } From e76ac21ebd40edef172c4cf0f8990e6fb505626b Mon Sep 17 00:00:00 2001 From: Timothy Brown Date: Wed, 11 Feb 2026 21:52:52 -0500 Subject: [PATCH 058/100] fix schema extraction to fix snapshot sync --- .../parquet/ParquetConversionSource.java | 22 ++----------------- .../ParquetPartitionValueExtractor.java | 4 ++-- .../xtable/parquet/ParquetStatsExtractor.java | 3 +-- .../parquet/ITParquetConversionSource.java | 2 +- 4 files changed, 6 insertions(+), 25 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 5768dac03..6f1649ef2 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -114,24 +114,7 @@ public InternalTable getTable(Long modificationTime) { } private Stream getInternalDataFiles(Stream parquetFiles) { - return parquetFiles.map( - file -> { - ParquetMetadata metadata = - parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()); - return InternalDataFile.builder() - .physicalPath(file.getPath().toString()) - .fileFormat(FileFormat.APACHE_PARQUET) - .fileSizeBytes(file.getSize()) - .partitionValues( - partitionValueExtractor.extractPartitionValues( - partitionSpecExtractor.spec( - partitionValueExtractor.extractSchemaForParquetPartitions( - metadata, file.getPath().toString())), - HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) - .lastModified(file.getModificationTime()) - .columnStats(parquetStatsExtractor.getColumnStatsForaFile(metadata)) - .build(); - }); + return parquetFiles.map(this::createInternalDataFileFromParquetFile); } private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo parquetFile) { @@ -140,8 +123,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo p .partitionValues( partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( - partitionValueExtractor.extractSchemaForParquetPartitions( - parquetFile.getMetadata(), parquetFile.getPath().toString())), + partitionValueExtractor.extractSchemaForParquetPartitions(parquetFile.getMetadata())), HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) .lastModified(parquetFile.getModificationTime()) .fileSizeBytes(parquetFile.getSize()) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetPartitionValueExtractor.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetPartitionValueExtractor.java index ab4ceaf48..26ac67ff0 
100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetPartitionValueExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetPartitionValueExtractor.java @@ -131,8 +131,8 @@ public static ParquetPartitionValueExtractor getInstance() { return INSTANCE; } - public InternalSchema extractSchemaForParquetPartitions(ParquetMetadata footer, String path) { + public InternalSchema extractSchemaForParquetPartitions(ParquetMetadata footer) { MessageType parquetSchema = parquetMetadataExtractor.getSchema(footer); - return schemaExtractor.toInternalSchema(parquetSchema, path); + return schemaExtractor.toInternalSchema(parquetSchema, ""); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java index a1393b980..547d86ff6 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java @@ -140,8 +140,7 @@ public static InternalDataFile toInternalDataFile(Configuration hadoopConf, Path partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( partitionValueExtractor.extractSchemaForParquetPartitions( - parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()), - file.getPath().toString())), + parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()))), parentPath.toString()); return InternalDataFile.builder() .physicalPath(parentPath.toString()) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 4c0ed2ab7..fd86d094d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -295,7 +295,7 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } @Test - public void testIncrementalSyncWithMultiplePartitions() throws IOException { + void testIncrementalSyncWithMultiplePartitions() throws IOException { Configuration conf = sparkSession.sparkContext().hadoopConfiguration(); From 9901c1b7de4237d28c6e1c39950dd8a3847b3e80 Mon Sep 17 00:00:00 2001 From: Timothy Brown Date: Wed, 11 Feb 2026 22:11:56 -0500 Subject: [PATCH 059/100] add optimizations --- .../parquet/ParquetConversionSource.java | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 6f1649ef2..6af40c968 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -32,22 +32,24 @@ import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; -import org.apache.xtable.hudi.*; import org.apache.xtable.hudi.HudiPathUtils; -import org.apache.xtable.model.*; +import org.apache.xtable.hudi.PathBasedPartitionSpecExtractor; import org.apache.xtable.model.CommitsBacklog; import org.apache.xtable.model.InstantsForIncrementalSync; +import 
org.apache.xtable.model.InternalSnapshot; +import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.TableChange; import org.apache.xtable.model.schema.InternalPartitionField; import org.apache.xtable.model.schema.InternalSchema; -import org.apache.xtable.model.storage.*; -import org.apache.xtable.model.storage.FileFormat; +import org.apache.xtable.model.storage.DataLayoutStrategy; import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.xtable.model.storage.InternalFilesDiff; +import org.apache.xtable.model.storage.PartitionFileGroup; +import org.apache.xtable.model.storage.TableFormat; import org.apache.xtable.spi.extractor.ConversionSource; @Log4j2 @@ -85,8 +87,7 @@ public class ParquetConversionSource implements ConversionSource { } private InternalTable createInternalTableFromFile(ParquetFileInfo latestFile) { - ParquetMetadata parquetMetadata = - parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); + ParquetMetadata parquetMetadata = latestFile.getMetadata(); MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata); InternalSchema schema = schemaExtractor.toInternalSchema(parquetSchema, ""); List partitionFields = partitionSpecExtractor.spec(schema); @@ -113,18 +114,14 @@ public InternalTable getTable(Long modificationTime) { return createInternalTableFromFile(file); } - private Stream getInternalDataFiles(Stream parquetFiles) { - return parquetFiles.map(this::createInternalDataFileFromParquetFile); + private Stream getInternalDataFiles(Stream parquetFiles, InternalSchema schema) { + return parquetFiles.map(file -> createInternalDataFileFromParquetFile(file, schema)); } - private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo parquetFile) { + private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo parquetFile, InternalSchema schema) { return InternalDataFile.builder() .physicalPath(parquetFile.getPath().toString()) - .partitionValues( - partitionValueExtractor.extractPartitionValues( - partitionSpecExtractor.spec( - partitionValueExtractor.extractSchemaForParquetPartitions(parquetFile.getMetadata())), - HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) + .partitionValues(partitionValueExtractor.extractPartitionValues(partitionSpecExtractor.spec(schema), HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) .lastModified(parquetFile.getModificationTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) @@ -147,7 +144,7 @@ public TableChange getTableChangeForCommit(Long modificationTime) { getMostRecentTableConfig(tableChangesAfterModificationTime.stream()); Set addedInternalDataFiles = tableChangesAfterModificationTime.stream() - .map(this::createInternalDataFileFromParquetFile) + .map(file -> createInternalDataFileFromParquetFile(file, internalTable.getReadSchema())) .collect(Collectors.toSet()); return TableChange.builder() @@ -179,9 +176,9 @@ public InternalTable getCurrentTable() { @Override public InternalSnapshot getCurrentSnapshot() { - Stream internalDataFiles = - getInternalDataFiles(parquetDataManager.getCurrentFileInfo()); InternalTable table = getMostRecentTable(); + Stream internalDataFiles = + getInternalDataFiles(parquetDataManager.getCurrentFileInfo(), table.getReadSchema()); return InternalSnapshot.builder() .table(table) .sourceIdentifier( From a2bb6c38ad4c21d8640844f1a676c7b6017aee4d Mon Sep 17 00:00:00 2001 
From: Selim Soufargi Date: Fri, 13 Feb 2026 23:16:42 +0100 Subject: [PATCH 060/100] fix for ParquetDataManager Test (mock()) --- .../parquet/ParquetConversionSource.java | 11 ++- .../xtable/parquet/ParquetDataManager.java | 4 +- .../parquet/ITParquetConversionSource.java | 72 ++++++++++++++----- .../parquet/TestParquetDataManager.java | 4 +- 4 files changed, 66 insertions(+), 25 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 6af40c968..356fe8f60 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -114,14 +114,19 @@ public InternalTable getTable(Long modificationTime) { return createInternalTableFromFile(file); } - private Stream getInternalDataFiles(Stream parquetFiles, InternalSchema schema) { + private Stream getInternalDataFiles( + Stream parquetFiles, InternalSchema schema) { return parquetFiles.map(file -> createInternalDataFileFromParquetFile(file, schema)); } - private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileInfo parquetFile, InternalSchema schema) { + private InternalDataFile createInternalDataFileFromParquetFile( + ParquetFileInfo parquetFile, InternalSchema schema) { return InternalDataFile.builder() .physicalPath(parquetFile.getPath().toString()) - .partitionValues(partitionValueExtractor.extractPartitionValues(partitionSpecExtractor.spec(schema), HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) + .partitionValues( + partitionValueExtractor.extractPartitionValues( + partitionSpecExtractor.spec(schema), + HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) .lastModified(parquetFile.getModificationTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 84cb46690..c1db5ee48 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.net.URI; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -52,7 +53,8 @@ public ParquetDataManager(Configuration hadoopConf, String basePath) { this.hadoopConf = hadoopConf; this.basePath = basePath; try { - this.fileSystem = FileSystem.get(URI.create(basePath), hadoopConf); + URI uri = Paths.get(basePath).toUri(); + this.fileSystem = FileSystem.get(uri, hadoopConf); } catch (IOException e) { throw new ReadException("Unable to initialize file system for base path: " + basePath, e); } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index fd86d094d..6f72671ad 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -27,7 +27,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; -import java.net.URISyntaxException; import 
java.nio.file.Path; import java.nio.file.Paths; import java.sql.Timestamp; @@ -99,6 +98,7 @@ public static void setupOnce() { sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); } @@ -142,11 +142,12 @@ private static ConversionConfig getTableSyncConfig( if (partitionConfig != null) { sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); } + String absolutePath = new java.io.File(java.net.URI.create(table.getDataPath())).getPath(); SourceTable sourceTable = SourceTable.builder() .name(tableName) .formatName(sourceTableFormat) - .basePath(table.getBasePath()) + .basePath(absolutePath) .dataPath(table.getDataPath()) .additionalProperties(sourceProperties) .build(); @@ -159,7 +160,7 @@ private static ConversionConfig getTableSyncConfig( .name(tableName) .formatName(formatName) // set the metadata path to the data path as the default (required by Hudi) - .basePath(table.getDataPath()) + .basePath(absolutePath) .metadataRetention(metadataRetention) .build()) .collect(Collectors.toList()); @@ -173,10 +174,19 @@ private static ConversionConfig getTableSyncConfig( private static Stream provideArgsForSyncTesting() { List partitionConfigs = Arrays.asList(null, "timestamp:MONTH:year=yyyy/month=MM"); - return partitionConfigs.stream().flatMap(partitionConfig -> - Arrays.stream(SyncMode.values()).map(syncMode -> Arguments.of( - buildArgsForPartition( - PARQUET, Arrays.asList(ICEBERG, DELTA, HUDI), partitionConfig, partitionConfig, syncMode)))); + return partitionConfigs.stream() + .flatMap( + partitionConfig -> + Arrays.stream(SyncMode.values()) + .map( + syncMode -> + Arguments.of( + buildArgsForPartition( + PARQUET, + Arrays.asList(ICEBERG, DELTA, HUDI), + partitionConfig, + partitionConfig, + syncMode)))); } private ConversionSourceProvider getConversionSourceProvider(String sourceTableFormat) { @@ -226,11 +236,23 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new MetadataBuilder().putString("precision", "millis").build()) }); Dataset df = sparkSession.createDataFrame(data, schema); - String dataPath = tempDir.toAbsolutePath() + (xTablePartitionConfig == null ? "/non_partitioned_data_" : "/partitioned_data_") + tableFormatPartitionDataHolder.getSyncMode(); + // String dataPath = tempDir.toAbsolutePath() + (xTablePartitionConfig == null ? + // "/non_partitioned_data_" : "/partitioned_data_") + + // tableFormatPartitionDataHolder.getSyncMode(); + String dataPath = + tempDir + .resolve( + (xTablePartitionConfig == null ? 
"non_partitioned_data_" : "partitioned_data_") + + tableFormatPartitionDataHolder.getSyncMode()) + .toString(); + writeData(df, dataPath, xTablePartitionConfig); boolean isPartitioned = xTablePartitionConfig != null; - try (GenericTable table = GenericTable.getInstance( - tableName, Paths.get(dataPath), sparkSession, jsc, sourceTableFormat, isPartitioned)) { + + java.nio.file.Path pathForXTable = java.nio.file.Paths.get(dataPath); + try (GenericTable table = + GenericTable.getInstance( + tableName, pathForXTable, sparkSession, jsc, sourceTableFormat, isPartitioned)) { ConversionConfig conversionConfig = getTableSyncConfig( sourceTableFormat, @@ -243,7 +265,8 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { ConversionController conversionController = new ConversionController(jsc.hadoopConfiguration()); conversionController.sync(conversionConfig, conversionSourceProvider); - checkDatasetEquivalenceWithFilter(sourceTableFormat, table, targetTableFormats, isPartitioned); + checkDatasetEquivalenceWithFilter( + sourceTableFormat, table, targetTableFormats, isPartitioned); // update the current parquet file data with another attribute the sync again List dataToAppend = @@ -270,22 +293,30 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { ConversionController conversionControllerAppended = new ConversionController(jsc.hadoopConfiguration()); conversionControllerAppended.sync(conversionConfigAppended, conversionSourceProvider); - checkDatasetEquivalenceWithFilter(sourceTableFormat, table, targetTableFormats, isPartitioned); + checkDatasetEquivalenceWithFilter( + sourceTableFormat, table, targetTableFormats, isPartitioned); } } private void writeData(Dataset df, String dataPath, String partitionConfig) { if (partitionConfig != null) { // extract partition columns from config - String[] partitionCols = Arrays.stream(partitionConfig.split(":")[2].split("/")).map(s -> s.split("=")[0]).toArray(String[]::new); + String[] partitionCols = + Arrays.stream(partitionConfig.split(":")[2].split("/")) + .map(s -> s.split("=")[0]) + .toArray(String[]::new); // add partition columns to dataframe for (String partitionCol : partitionCols) { if (partitionCol.equals("year")) { - df = df.withColumn("year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + df = + df.withColumn( + "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); } else if (partitionCol.equals("month")) { - df = df.withColumn( - "month", - functions.date_format(functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + df = + df.withColumn( + "month", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); } } df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); @@ -316,8 +347,10 @@ void testIncrementalSyncWithMultiplePartitions() throws IOException { Dataset dfInit = sparkSession.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); - String outputPath = fixedPath.toString(); + // String outputPath = fixedPath.toString(); Dataset df = dfInit.withColumn("full_date", expr("make_date(year, month, 1)")); + String outputPath = + new java.io.File("target/fixed-parquet-data/parquet_table_test_2").getAbsolutePath(); df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync @@ -350,10 +383,11 @@ void testIncrementalSyncWithMultiplePartitions() throws 
IOException { Properties sourceProperties = new Properties(); String partitionConfig = "full_date:MONTH:year=yyyy/month=MM"; sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); + SourceTable tableConfig = SourceTable.builder() .name("parquet_table_test_2") - .basePath(fixedPath.toAbsolutePath().toUri().toString()) + .basePath(fixedPath.toAbsolutePath().toString()) // removed toUri() .additionalProperties(sourceProperties) .formatName(TableFormat.PARQUET) .build(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java index 5d68497c9..7a0b0ce30 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -24,7 +24,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; @@ -61,6 +60,7 @@ private static LocatedFileStatus createMockFileStatus(String name, long modTime) when(mockPath.getName()).thenReturn(name); when(mockStatus.getPath()).thenReturn(mockPath); when(mockStatus.getModificationTime()).thenReturn(modTime); + when(mockStatus.getLen()).thenReturn(1024L); return mockStatus; } @@ -86,7 +86,7 @@ public LocatedFileStatus next() { private static FileSystem createMockFileSystem(RemoteIterator iterator) throws IOException { FileSystem mockFs = mock(FileSystem.class); - when(mockFs.listFiles(any(org.apache.hadoop.fs.Path.class), eq(true))).thenReturn(iterator); + when(mockFs.listFiles(any(org.apache.hadoop.fs.Path.class), anyBoolean())).thenReturn(iterator); return mockFs; } From 2d23f3194b942c48c1cb8b7c798e6bac82e4246c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 00:09:41 +0100 Subject: [PATCH 061/100] before resyncing delete the first synced files' metadata --- .../parquet/ITParquetConversionSource.java | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 6f72671ad..a07c0cbb5 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -200,6 +200,33 @@ private ConversionSourceProvider getConversionSourceProvider(String sourceTab } } + private void cleanupTargetMetadata(String dataPath, List formats) { + for (String format : formats) { + String metadataFolder = ""; + switch (format.toUpperCase()) { + case "ICEBERG": + metadataFolder = "metadata"; + break; + case "DELTA": + metadataFolder = "_delta_log"; + break; + case "HUDI": + metadataFolder = ".hoodie"; + break; + } + if (!metadataFolder.isEmpty()) { + try { + org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath, metadataFolder); + org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); + if (fs.exists(path)) { + fs.delete(path, true); + } + } catch (IOException e) { + } + } + } + } + @ParameterizedTest @MethodSource("provideArgsForSyncTesting") void testSync(TableFormatPartitionDataHolder
tableFormatPartitionDataHolder) { @@ -236,9 +263,6 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new MetadataBuilder().putString("precision", "millis").build()) }); Dataset df = sparkSession.createDataFrame(data, schema); - // String dataPath = tempDir.toAbsolutePath() + (xTablePartitionConfig == null ? - // "/non_partitioned_data_" : "/partitioned_data_") + - // tableFormatPartitionDataHolder.getSyncMode(); String dataPath = tempDir .resolve( @@ -280,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, From ceb924c5d60dfad41fdd0f13611cfe8f644c44da Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 20:24:07 +0100 Subject: [PATCH 062/100] in order to append parquet files, perform union then spark overwrite (not append) --- .../parquet/ITParquetConversionSource.java | 86 ++++++++----------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index a07c0cbb5..6b7ea369d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -200,33 +200,6 @@ private ConversionSourceProvider getConversionSourceProvider(String sourceTab } } - private void cleanupTargetMetadata(String dataPath, List formats) { - for (String format : formats) { - String metadataFolder = ""; - switch (format.toUpperCase()) { - case "ICEBERG": - metadataFolder = "metadata"; - break; - case "DELTA": - metadataFolder = "_delta_log"; - break; - case "HUDI": - metadataFolder = ".hoodie"; - break; - } - if (!metadataFolder.isEmpty()) { - try { - org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath, metadataFolder); - org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); - if (fs.exists(path)) { - fs.delete(path, true); - } - } catch (IOException e) { - } - } - } - } - @ParameterizedTest @MethodSource("provideArgsForSyncTesting") void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { @@ -304,7 +277,6 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -323,29 +295,45 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { - if (partitionConfig != null) { - // extract partition columns from config - String[] partitionCols = - Arrays.stream(partitionConfig.split(":")[2].split("/")) - .map(s -> s.split("=")[0]) - .toArray(String[]::new); - // add partition columns to dataframe - for (String partitionCol : partitionCols) { - if (partitionCol.equals("year")) { - df = - df.withColumn( - "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); - } else if (partitionCol.equals("month")) { - df = - 
df.withColumn( - "month", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + try { + org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath); + org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); + + if (partitionConfig != null) { + // extract partition columns from config + String[] partitionCols = + Arrays.stream(partitionConfig.split(":")[2].split("/")) + .map(s -> s.split("=")[0]) + .toArray(String[]::new); + // add partition columns to dataframe + for (String partitionCol : partitionCols) { + if (partitionCol.equals("year")) { + df = + df.withColumn( + "year", + functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + } else if (partitionCol.equals("month")) { + df = + df.withColumn( + "month", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } + } + if (fs.exists(new org.apache.hadoop.fs.Path(dataPath))) { + Dataset existingData = sparkSession.read().parquet(dataPath); + df = existingData.unionByName(df); } + df.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); + } else { + if (fs.exists(new org.apache.hadoop.fs.Path(dataPath))) { + Dataset existingData = sparkSession.read().parquet(dataPath); + df = existingData.unionByName(df); + } + df.write().mode(SaveMode.Overwrite).parquet(dataPath); } - df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); - } else { - df.write().mode(SaveMode.Append).parquet(dataPath); + } catch (IOException ioException) { + ioException.printStackTrace(); } } From 5b0a9fdb9334e835492a07b885800dcd3faaf23d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 20:53:58 +0100 Subject: [PATCH 063/100] solving directory issue when overwriting data using spark --- .../parquet/ITParquetConversionSource.java | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 6b7ea369d..13203c453 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -296,16 +296,16 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { private void writeData(Dataset df, String dataPath, String partitionConfig) { try { - org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath); - org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); + org.apache.hadoop.fs.Path finalPath = new org.apache.hadoop.fs.Path(dataPath); + org.apache.hadoop.fs.FileSystem fs = finalPath.getFileSystem(jsc.hadoopConfiguration()); + String[] partitionCols = null; if (partitionConfig != null) { - // extract partition columns from config - String[] partitionCols = + partitionCols = Arrays.stream(partitionConfig.split(":")[2].split("/")) .map(s -> s.split("=")[0]) .toArray(String[]::new); - // add partition columns to dataframe + for (String partitionCol : partitionCols) { if (partitionCol.equals("year")) { df = @@ -318,22 +318,34 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) "month", functions.date_format( functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } else if (partitionCol.equals("day")) { + df = + df.withColumn( + "day", + 
functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - if (fs.exists(new org.apache.hadoop.fs.Path(dataPath))) { - Dataset existingData = sparkSession.read().parquet(dataPath); - df = existingData.unionByName(df); - } - df.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); + } + + if (fs.exists(finalPath)) { + Dataset existingData = sparkSession.read().parquet(dataPath); + df = existingData.unionByName(df); + } + + String tempPath = dataPath + "_temp_" + System.currentTimeMillis(); + org.apache.spark.sql.DataFrameWriter writer = df.write().mode(SaveMode.Overwrite); + + if (partitionCols != null && partitionCols.length > 0) { + writer.partitionBy(partitionCols).parquet(tempPath); } else { - if (fs.exists(new org.apache.hadoop.fs.Path(dataPath))) { - Dataset existingData = sparkSession.read().parquet(dataPath); - df = existingData.unionByName(df); - } - df.write().mode(SaveMode.Overwrite).parquet(dataPath); + writer.parquet(tempPath); } + fs.delete(finalPath, true); + fs.rename(new org.apache.hadoop.fs.Path(tempPath), finalPath); + } catch (IOException ioException) { - ioException.printStackTrace(); + throw new RuntimeException("Data refresh failed for: " + dataPath, ioException); } } From 80e1927ef5ad6d66ae2cd219c82c66660ddcfd3e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 21:14:01 +0100 Subject: [PATCH 064/100] solving directory issue when overwriting data using spark2 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 13203c453..05b3b5f9c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -329,7 +329,7 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } if (fs.exists(finalPath)) { - Dataset existingData = sparkSession.read().parquet(dataPath); + Dataset existingData = sparkSession.read().parquet(dataPath + "/**/*.parquet"); df = existingData.unionByName(df); } From 0aedd70c2130fe28d56b09629e7ab4e5976aad4a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 21:33:28 +0100 Subject: [PATCH 065/100] solving directory issue when overwriting data using spark3 --- .../apache/xtable/parquet/ITParquetConversionSource.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 05b3b5f9c..9c9f2944b 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -329,7 +329,12 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } if (fs.exists(finalPath)) { - Dataset existingData = sparkSession.read().parquet(dataPath + "/**/*.parquet"); + Dataset existingData = + sparkSession + .read() + .option("recursiveFileLookup", "true") + .option("pathGlobFilter", "*.parquet") + .parquet(dataPath); df = existingData.unionByName(df); } From 5cbeeeb292b991f1990a3a6b14aced31ce717734 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 
21:56:16 +0100 Subject: [PATCH 066/100] solving directory issue when overwriting data using spark4 --- .../parquet/ITParquetConversionSource.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 9c9f2944b..3d20f0e19 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -329,12 +329,18 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } if (fs.exists(finalPath)) { - Dataset existingData = - sparkSession - .read() - .option("recursiveFileLookup", "true") - .option("pathGlobFilter", "*.parquet") - .parquet(dataPath); + Dataset existingData; + if (partitionCols != null && partitionCols.length > 0) { + existingData = + sparkSession + .read() + .option("basePath", dataPath) + .parquet(dataPath + "/" + partitionCols[0] + "=*"); + } else { + existingData = + sparkSession.read().option("pathGlobFilter", "*.parquet").parquet(dataPath); + } + df = existingData.unionByName(df); } From 13e5a6540f822f7aa4429c26234e0c1323d4bd5d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 22:16:39 +0100 Subject: [PATCH 067/100] solving directory issue when overwriting data using spark5 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 3d20f0e19..7052bc82c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -335,7 +335,7 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) sparkSession .read() .option("basePath", dataPath) - .parquet(dataPath + "/" + partitionCols[0] + "=*"); + .parquet(dataPath + "/*=*"); } else { existingData = sparkSession.read().option("pathGlobFilter", "*.parquet").parquet(dataPath); From fa776465cb2bd8e862498842d93452ea27acedbd Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 22:37:14 +0100 Subject: [PATCH 068/100] solving directory issue when overwriting data using spark6 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 7052bc82c..cf292b0b2 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -352,7 +352,9 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } else { writer.parquet(tempPath); } - fs.delete(finalPath, true); + if (fs.exists(finalPath)) { + fs.delete(finalPath, true); + } fs.rename(new org.apache.hadoop.fs.Path(tempPath), finalPath); } catch (IOException ioException) { From e53fa4b5fb0222ae809201ce1bed8406a0884df5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 23:00:41 +0100 Subject: [PATCH 069/100] solving directory issue when 
overwriting data using spark7 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index cf292b0b2..e407d1880 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -335,6 +335,8 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) sparkSession .read() .option("basePath", dataPath) + .option("pathGlobFilter", "*.parquet") + .option("recursiveFileLookup", "true") .parquet(dataPath + "/*=*"); } else { existingData = From a6b75d59e867d1456e71ffe02caa553f17e2f093 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 14 Feb 2026 23:26:02 +0100 Subject: [PATCH 070/100] revert writeData changes --- .../parquet/ITParquetConversionSource.java | 119 ++++++++---------- 1 file changed, 55 insertions(+), 64 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index e407d1880..24bb85913 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -200,6 +200,33 @@ private ConversionSourceProvider getConversionSourceProvider(String sourceTab } } + private void cleanupTargetMetadata(String dataPath, List formats) { + for (String format : formats) { + String metadataFolder = ""; + switch (format.toUpperCase()) { + case "ICEBERG": + metadataFolder = "metadata"; + break; + case "DELTA": + metadataFolder = "_delta_log"; + break; + case "HUDI": + metadataFolder = ".hoodie"; + break; + } + if (!metadataFolder.isEmpty()) { + try { + org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath, metadataFolder); + org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); + if (fs.exists(path)) { + fs.delete(path, true); + } + } catch (IOException e) { + } + } + } + } + @ParameterizedTest @MethodSource("provideArgsForSyncTesting") void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { @@ -277,6 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -295,72 +323,35 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { - try { - org.apache.hadoop.fs.Path finalPath = new org.apache.hadoop.fs.Path(dataPath); - org.apache.hadoop.fs.FileSystem fs = finalPath.getFileSystem(jsc.hadoopConfiguration()); - - String[] partitionCols = null; - if (partitionConfig != null) { - partitionCols = - Arrays.stream(partitionConfig.split(":")[2].split("/")) - .map(s -> s.split("=")[0]) - .toArray(String[]::new); - - for (String partitionCol : partitionCols) { - if (partitionCol.equals("year")) { - df = - df.withColumn( - "year", - functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); - } else if 
(partitionCol.equals("month")) { - df = - df.withColumn( - "month", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); - } else if (partitionCol.equals("day")) { - df = - df.withColumn( - "day", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); - } - } - } - - if (fs.exists(finalPath)) { - Dataset existingData; - if (partitionCols != null && partitionCols.length > 0) { - existingData = - sparkSession - .read() - .option("basePath", dataPath) - .option("pathGlobFilter", "*.parquet") - .option("recursiveFileLookup", "true") - .parquet(dataPath + "/*=*"); - } else { - existingData = - sparkSession.read().option("pathGlobFilter", "*.parquet").parquet(dataPath); + if (partitionConfig != null) { + // extract partition columns from config + String[] partitionCols = + Arrays.stream(partitionConfig.split(":")[2].split("/")) + .map(s -> s.split("=")[0]) + .toArray(String[]::new); + // add partition columns to dataframe + for (String partitionCol : partitionCols) { + if (partitionCol.equals("year")) { + df = + df.withColumn( + "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + } else if (partitionCol.equals("month")) { + df = + df.withColumn( + "month", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } else if (partitionCol.equals("day")) { + df = + df.withColumn( + "day", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } - - df = existingData.unionByName(df); - } - - String tempPath = dataPath + "_temp_" + System.currentTimeMillis(); - org.apache.spark.sql.DataFrameWriter writer = df.write().mode(SaveMode.Overwrite); - - if (partitionCols != null && partitionCols.length > 0) { - writer.partitionBy(partitionCols).parquet(tempPath); - } else { - writer.parquet(tempPath); - } - if (fs.exists(finalPath)) { - fs.delete(finalPath, true); } - fs.rename(new org.apache.hadoop.fs.Path(tempPath), finalPath); - - } catch (IOException ioException) { - throw new RuntimeException("Data refresh failed for: " + dataPath, ioException); + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + } else { + df.write().mode(SaveMode.Append).parquet(dataPath); } } From bdc9f408f3f4f16c73e582990a5e21c997f5688d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 12:56:19 +0100 Subject: [PATCH 071/100] solving directory issue when overwriting data using spark8 --- .../parquet/ITParquetConversionSource.java | 86 +++++++++++++------ 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 24bb85913..ca41bf178 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, targetTableFormats); + // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -323,35 +323,65 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } 
private void writeData(Dataset df, String dataPath, String partitionConfig) { - if (partitionConfig != null) { - // extract partition columns from config - String[] partitionCols = - Arrays.stream(partitionConfig.split(":")[2].split("/")) - .map(s -> s.split("=")[0]) - .toArray(String[]::new); - // add partition columns to dataframe - for (String partitionCol : partitionCols) { - if (partitionCol.equals("year")) { - df = - df.withColumn( - "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); - } else if (partitionCol.equals("month")) { - df = - df.withColumn( - "month", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); - } else if (partitionCol.equals("day")) { - df = - df.withColumn( - "day", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); + try { + org.apache.hadoop.fs.Path finalPath = new org.apache.hadoop.fs.Path(dataPath); + org.apache.hadoop.fs.FileSystem fs = finalPath.getFileSystem(jsc.hadoopConfiguration()); + + String[] partitionCols = null; + if (partitionConfig != null) { + partitionCols = + Arrays.stream(partitionConfig.split(":")[2].split("/")) + .map(s -> s.split("=")[0]) + .toArray(String[]::new); + + for (String partitionCol : partitionCols) { + if (partitionCol.equals("year")) { + df = + df.withColumn( + "year", + functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + } else if (partitionCol.equals("month")) { + df = + df.withColumn( + "month", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } else if (partitionCol.equals("day")) { + df = + df.withColumn( + "day", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); + } } } - df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); - } else { - df.write().mode(SaveMode.Append).parquet(dataPath); + + if (fs.exists(finalPath)) { + Dataset existingData; + if (partitionCols != null && partitionCols.length > 0) { + existingData = sparkSession.read().option("basePath", dataPath).parquet(dataPath); + } else { + existingData = sparkSession.read().parquet(dataPath); + } + + df = existingData.unionByName(df); + } + + String tempPath = dataPath + "_temp_" + System.currentTimeMillis(); + org.apache.spark.sql.DataFrameWriter writer = df.write().mode(SaveMode.Overwrite); + + if (partitionCols != null && partitionCols.length > 0) { + writer.partitionBy(partitionCols).parquet(tempPath); + } else { + writer.parquet(tempPath); + } + if (fs.exists(finalPath)) { + fs.delete(finalPath, true); + } + fs.rename(new org.apache.hadoop.fs.Path(tempPath), finalPath); + + } catch (IOException ioException) { + throw new RuntimeException("Data refresh failed for: " + dataPath, ioException); } } From 43d986d283c74e29dfbc21b914eb96e93f04d0d6 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 13:20:41 +0100 Subject: [PATCH 072/100] solving directory issue when overwriting data using spark9 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index ca41bf178..545b2ba42 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ 
-359,7 +359,7 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) if (fs.exists(finalPath)) { Dataset existingData; if (partitionCols != null && partitionCols.length > 0) { - existingData = sparkSession.read().option("basePath", dataPath).parquet(dataPath); + existingData = sparkSession.read().option("basePath", dataPath).parquet(dataPath + "/*=*"); } else { existingData = sparkSession.read().parquet(dataPath); } From 5f67a0fd1e8247dbe5f88c29054b094c9a1b2e32 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 13:43:09 +0100 Subject: [PATCH 073/100] solving directory issue when overwriting data using spark9 --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 545b2ba42..90aad869a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -345,7 +345,7 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) df.withColumn( "month", functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + functions.col("timestamp").cast(DataTypes.TimestampType), "MM").cast(DataTypes.StringType)); } else if (partitionCol.equals("day")) { df = df.withColumn( From dae07b7e8760c9c7d3507fa52e3f5defed2baebc Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 14:09:16 +0100 Subject: [PATCH 074/100] revert changes --- .../parquet/ITParquetConversionSource.java | 86 ++++++------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 90aad869a..24bb85913 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - // cleanupTargetMetadata(dataPath, targetTableFormats); + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -323,65 +323,35 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { - try { - org.apache.hadoop.fs.Path finalPath = new org.apache.hadoop.fs.Path(dataPath); - org.apache.hadoop.fs.FileSystem fs = finalPath.getFileSystem(jsc.hadoopConfiguration()); - - String[] partitionCols = null; - if (partitionConfig != null) { - partitionCols = - Arrays.stream(partitionConfig.split(":")[2].split("/")) - .map(s -> s.split("=")[0]) - .toArray(String[]::new); - - for (String partitionCol : partitionCols) { - if (partitionCol.equals("year")) { - df = - df.withColumn( - "year", - functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); - } else if (partitionCol.equals("month")) { - df = - df.withColumn( - "month", - functions.date_format( - 
functions.col("timestamp").cast(DataTypes.TimestampType), "MM").cast(DataTypes.StringType)); - } else if (partitionCol.equals("day")) { - df = - df.withColumn( - "day", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); - } - } - } - - if (fs.exists(finalPath)) { - Dataset existingData; - if (partitionCols != null && partitionCols.length > 0) { - existingData = sparkSession.read().option("basePath", dataPath).parquet(dataPath + "/*=*"); - } else { - existingData = sparkSession.read().parquet(dataPath); + if (partitionConfig != null) { + // extract partition columns from config + String[] partitionCols = + Arrays.stream(partitionConfig.split(":")[2].split("/")) + .map(s -> s.split("=")[0]) + .toArray(String[]::new); + // add partition columns to dataframe + for (String partitionCol : partitionCols) { + if (partitionCol.equals("year")) { + df = + df.withColumn( + "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); + } else if (partitionCol.equals("month")) { + df = + df.withColumn( + "month", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } else if (partitionCol.equals("day")) { + df = + df.withColumn( + "day", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } - - df = existingData.unionByName(df); } - - String tempPath = dataPath + "_temp_" + System.currentTimeMillis(); - org.apache.spark.sql.DataFrameWriter writer = df.write().mode(SaveMode.Overwrite); - - if (partitionCols != null && partitionCols.length > 0) { - writer.partitionBy(partitionCols).parquet(tempPath); - } else { - writer.parquet(tempPath); - } - if (fs.exists(finalPath)) { - fs.delete(finalPath, true); - } - fs.rename(new org.apache.hadoop.fs.Path(tempPath), finalPath); - - } catch (IOException ioException) { - throw new RuntimeException("Data refresh failed for: " + dataPath, ioException); + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + } else { + df.write().mode(SaveMode.Append).parquet(dataPath); } } From 9e0de0af81aac33b21e0f347d600cbe13ff09192 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 15:00:58 +0100 Subject: [PATCH 075/100] testing without cleaning metadata for the second sync --- .../parquet/ITParquetConversionSource.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 24bb85913..25bc7ea10 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -270,7 +270,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { + tableFormatPartitionDataHolder.getSyncMode()) .toString(); - writeData(df, dataPath, xTablePartitionConfig); + writeData(df, dataPath, xTablePartitionConfig, sourceTableFormat); boolean isPartitioned = xTablePartitionConfig != null; java.nio.file.Path pathForXTable = java.nio.file.Paths.get(dataPath); @@ -303,8 +303,13 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new Timestamp(System.currentTimeMillis() + 1500))); Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); - writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, 
targetTableFormats); + writeData(dfAppend, dataPath, xTablePartitionConfig, sourceTableFormat); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -322,7 +327,8 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } } - private void writeData(Dataset df, String dataPath, String partitionConfig) { + private void writeData( + Dataset df, String dataPath, String partitionConfig, String sourceTableFormat) { if (partitionConfig != null) { // extract partition columns from config String[] partitionCols = @@ -349,9 +355,13 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + df.write() + .format(sourceTableFormat) + .mode(SaveMode.Append) + .partitionBy(partitionCols) + .save(dataPath); } else { - df.write().mode(SaveMode.Append).parquet(dataPath); + df.write().format(sourceTableFormat).mode(SaveMode.Append).save(dataPath); } } From 8c1b8f69da34f36d31fa81687cfff57115504f92 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 15:29:52 +0100 Subject: [PATCH 076/100] testing without cleaning metadata for the second sync --- .../apache/xtable/parquet/ITParquetConversionSource.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 25bc7ea10..55827fa36 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -267,7 +267,9 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { tempDir .resolve( (xTablePartitionConfig == null ? 
"non_partitioned_data_" : "partitioned_data_") - + tableFormatPartitionDataHolder.getSyncMode()) + + tableFormatPartitionDataHolder.getSyncMode() + + "_" + + UUID.randomUUID().toString()) .toString(); writeData(df, dataPath, xTablePartitionConfig, sourceTableFormat); @@ -304,11 +306,6 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig, sourceTableFormat); - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( From 84bae497a4a262896001bbe34329256e0bcf2c0b Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 15:52:46 +0100 Subject: [PATCH 077/100] testing without cleaning metadata for the second sync --- .../parquet/ITParquetConversionSource.java | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 55827fa36..3b90b6d87 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -267,12 +267,10 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { tempDir .resolve( (xTablePartitionConfig == null ? "non_partitioned_data_" : "partitioned_data_") - + tableFormatPartitionDataHolder.getSyncMode() - + "_" - + UUID.randomUUID().toString()) + + tableFormatPartitionDataHolder.getSyncMode()) .toString(); - writeData(df, dataPath, xTablePartitionConfig, sourceTableFormat); + writeData(df, dataPath, xTablePartitionConfig, sourceTableFormat, true); boolean isPartitioned = xTablePartitionConfig != null; java.nio.file.Path pathForXTable = java.nio.file.Paths.get(dataPath); @@ -305,7 +303,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new Timestamp(System.currentTimeMillis() + 1500))); Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); - writeData(dfAppend, dataPath, xTablePartitionConfig, sourceTableFormat); + writeData(dfAppend, dataPath, xTablePartitionConfig, sourceTableFormat, false); // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( @@ -325,7 +323,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData( - Dataset df, String dataPath, String partitionConfig, String sourceTableFormat) { + Dataset df, String dataPath, String partitionConfig, String sourceTableFormat, boolean firstCall) { if (partitionConfig != null) { // extract partition columns from config String[] partitionCols = @@ -352,13 +350,27 @@ private void writeData( functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - df.write() - .format(sourceTableFormat) - .mode(SaveMode.Append) - .partitionBy(partitionCols) - .save(dataPath); + if (firstCall) { + df.write() + .format(sourceTableFormat) + .mode(SaveMode.Overwrite) + .partitionBy(partitionCols) + .save(dataPath); + } + else{ + df.write() + .format(sourceTableFormat) + .mode(SaveMode.Append) + .partitionBy(partitionCols) + .save(dataPath); + } } else { - 
df.write().format(sourceTableFormat).mode(SaveMode.Append).save(dataPath); + if (firstCall) { + df.write().format(sourceTableFormat).mode(SaveMode.Overwrite).save(dataPath); + } + else{ + df.write().format(sourceTableFormat).mode(SaveMode.Append).save(dataPath); + } } } From 6ce28a332418c9c8c13f86b7611f1ec9c72a9faf Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 16:33:01 +0100 Subject: [PATCH 078/100] testing without cleaning metadata for the second sync --- .../parquet/ITParquetConversionSource.java | 40 +++++++------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 3b90b6d87..44b6b5c0d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -270,7 +270,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { + tableFormatPartitionDataHolder.getSyncMode()) .toString(); - writeData(df, dataPath, xTablePartitionConfig, sourceTableFormat, true); + writeData(df, dataPath, xTablePartitionConfig); boolean isPartitioned = xTablePartitionConfig != null; java.nio.file.Path pathForXTable = java.nio.file.Paths.get(dataPath); @@ -303,8 +303,8 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new Timestamp(System.currentTimeMillis() + 1500))); Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); - writeData(dfAppend, dataPath, xTablePartitionConfig, sourceTableFormat, false); - // cleanupTargetMetadata(dataPath, targetTableFormats); + writeData(dfAppend, dataPath, xTablePartitionConfig); + //cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -322,8 +322,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } } - private void writeData( - Dataset df, String dataPath, String partitionConfig, String sourceTableFormat, boolean firstCall) { + private void writeData(Dataset df, String dataPath, String partitionConfig) { if (partitionConfig != null) { // extract partition columns from config String[] partitionCols = @@ -350,27 +349,18 @@ private void writeData( functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - if (firstCall) { - df.write() - .format(sourceTableFormat) - .mode(SaveMode.Overwrite) - .partitionBy(partitionCols) - .save(dataPath); - } - else{ - df.write() - .format(sourceTableFormat) - .mode(SaveMode.Append) - .partitionBy(partitionCols) - .save(dataPath); - } + Dataset existingData = sparkSession.read().parquet(dataPath); + Dataset combinedData = existingData.unionByName(df, true); + combinedData.write() + .mode(SaveMode.Overwrite) + .partitionBy(partitionCols) + .parquet(dataPath); + //df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); } else { - if (firstCall) { - df.write().format(sourceTableFormat).mode(SaveMode.Overwrite).save(dataPath); - } - else{ - df.write().format(sourceTableFormat).mode(SaveMode.Append).save(dataPath); - } + Dataset existingData = sparkSession.read().parquet(dataPath); + Dataset combinedData = existingData.unionByName(df, true); + //df.write().mode(SaveMode.Append).parquet(dataPath); + combinedData.write().mode(SaveMode.Overwrite).parquet(dataPath); } } From 
cf2564c6d6850e616984595c834cbdd011540912 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 16:49:30 +0100 Subject: [PATCH 079/100] testing without cleaning metadata for the second sync --- .../parquet/ITParquetConversionSource.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 44b6b5c0d..2690db335 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -349,16 +349,28 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - Dataset existingData = sparkSession.read().parquet(dataPath); - Dataset combinedData = existingData.unionByName(df, true); + Dataset combinedData; + try { + Dataset existingData = sparkSession.read().parquet(dataPath); + combinedData = existingData.unionByName(df, true); + } catch (Exception e) { + // If the path doesn't exist or is empty, the current DF is the "existing" data + combinedData = df; + } combinedData.write() .mode(SaveMode.Overwrite) .partitionBy(partitionCols) .parquet(dataPath); //df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); } else { - Dataset existingData = sparkSession.read().parquet(dataPath); - Dataset combinedData = existingData.unionByName(df, true); + Dataset combinedData; + try { + Dataset existingData = sparkSession.read().parquet(dataPath); + combinedData = existingData.unionByName(df, true); + } catch (Exception e) { + // If the path doesn't exist or is empty, the current DF is the "existing" data + combinedData = df; + } //df.write().mode(SaveMode.Append).parquet(dataPath); combinedData.write().mode(SaveMode.Overwrite).parquet(dataPath); } From 6cb87ee0258ef26731ef546a6d56b9ee5446c4c5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 17:10:05 +0100 Subject: [PATCH 080/100] revert changes --- .../parquet/ITParquetConversionSource.java | 27 +++---------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 2690db335..24bb85913 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - //cleanupTargetMetadata(dataPath, targetTableFormats); + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -349,30 +349,9 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - Dataset combinedData; - try { - Dataset existingData = sparkSession.read().parquet(dataPath); - combinedData = existingData.unionByName(df, true); - } catch (Exception e) { - // If the path doesn't exist or is empty, the current DF is the "existing" data - combinedData = df; 
- } - combinedData.write() - .mode(SaveMode.Overwrite) - .partitionBy(partitionCols) - .parquet(dataPath); - //df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); } else { - Dataset combinedData; - try { - Dataset existingData = sparkSession.read().parquet(dataPath); - combinedData = existingData.unionByName(df, true); - } catch (Exception e) { - // If the path doesn't exist or is empty, the current DF is the "existing" data - combinedData = df; - } - //df.write().mode(SaveMode.Append).parquet(dataPath); - combinedData.write().mode(SaveMode.Overwrite).parquet(dataPath); + df.write().mode(SaveMode.Append).parquet(dataPath); } } From 5c250934d1ad8f27d6bb34c6f93cac69a3dab5f6 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 17:43:54 +0100 Subject: [PATCH 081/100] revert changes --- .../apache/xtable/parquet/ITParquetConversionSource.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 24bb85913..f7683614e 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, targetTableFormats); + //cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -349,9 +349,9 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + df.write().mode("append").partitionBy(partitionCols).parquet(dataPath); } else { - df.write().mode(SaveMode.Append).parquet(dataPath); + df.write().mode("append").parquet(dataPath); } } From 17a313415baa14e7b1b9f2c7195dd8346065ec00 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 18:01:41 +0100 Subject: [PATCH 082/100] revert changes --- .../apache/xtable/parquet/ITParquetConversionSource.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index f7683614e..24bb85913 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - //cleanupTargetMetadata(dataPath, targetTableFormats); + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -349,9 +349,9 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - 
df.write().mode("append").partitionBy(partitionCols).parquet(dataPath); + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); } else { - df.write().mode("append").parquet(dataPath); + df.write().mode(SaveMode.Append).parquet(dataPath); } } From 5b0d8dc69f0405b4fde71ae5aaa9d234c558a30e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 20:27:04 +0100 Subject: [PATCH 083/100] use no tempDir --- .../xtable/parquet/ITParquetConversionSource.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 24bb85913..d3032b32d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -263,12 +263,18 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new MetadataBuilder().putString("precision", "millis").build()) }); Dataset df = sparkSession.createDataFrame(data, schema); - String dataPath = + + java.io.File baseDir = new java.io.File("target/fixed-parquet-data/parquet_table_test_1"); + String subFolder = (xTablePartitionConfig == null ? "non_partitioned_data_" : "partitioned_data_") + + tableFormatPartitionDataHolder.getSyncMode(); + + String dataPath = new java.io.File(baseDir, subFolder).getAbsolutePath(); + /*String dataPath = tempDir .resolve( (xTablePartitionConfig == null ? "non_partitioned_data_" : "partitioned_data_") + tableFormatPartitionDataHolder.getSyncMode()) - .toString(); + .toString();*/ writeData(df, dataPath, xTablePartitionConfig); boolean isPartitioned = xTablePartitionConfig != null; @@ -304,7 +310,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, targetTableFormats); + //cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, From 8db84a02bdb27970d2cb6be4b61be69b08d8d91b Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 15 Feb 2026 21:24:33 +0100 Subject: [PATCH 084/100] revert changes --- .../parquet/ITParquetConversionSource.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index d3032b32d..57d28b39a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -263,23 +263,17 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { new MetadataBuilder().putString("precision", "millis").build()) }); Dataset df = sparkSession.createDataFrame(data, schema); - - java.io.File baseDir = new java.io.File("target/fixed-parquet-data/parquet_table_test_1"); - String subFolder = (xTablePartitionConfig == null ? 
"non_partitioned_data_" : "partitioned_data_") - + tableFormatPartitionDataHolder.getSyncMode(); - - String dataPath = new java.io.File(baseDir, subFolder).getAbsolutePath(); - /*String dataPath = + String dataPath = tempDir .resolve( (xTablePartitionConfig == null ? "non_partitioned_data_" : "partitioned_data_") + tableFormatPartitionDataHolder.getSyncMode()) - .toString();*/ + .toString(); writeData(df, dataPath, xTablePartitionConfig); boolean isPartitioned = xTablePartitionConfig != null; - java.nio.file.Path pathForXTable = java.nio.file.Paths.get(dataPath); + Path pathForXTable = Paths.get(dataPath); try (GenericTable table = GenericTable.getInstance( tableName, pathForXTable, sparkSession, jsc, sourceTableFormat, isPartitioned)) { @@ -310,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - //cleanupTargetMetadata(dataPath, targetTableFormats); + cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -423,7 +417,7 @@ void testIncrementalSyncWithMultiplePartitions() throws IOException { SourceTable tableConfig = SourceTable.builder() .name("parquet_table_test_2") - .basePath(fixedPath.toAbsolutePath().toString()) // removed toUri() + .basePath(fixedPath.toAbsolutePath().toString()) .additionalProperties(sourceProperties) .formatName(TableFormat.PARQUET) .build(); From b8e0d16664e341a76f0046c9ae984f5f86eec40f Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 21 Feb 2026 21:36:36 +0100 Subject: [PATCH 085/100] use union() to append data for the second sync() --- .../parquet/ITParquetConversionSource.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 57d28b39a..21d94013a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -323,13 +323,14 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { + + String[] partitionCols = new String[0]; if (partitionConfig != null) { - // extract partition columns from config - String[] partitionCols = + partitionCols = Arrays.stream(partitionConfig.split(":")[2].split("/")) .map(s -> s.split("=")[0]) .toArray(String[]::new); - // add partition columns to dataframe + for (String partitionCol : partitionCols) { if (partitionCol.equals("year")) { df = @@ -349,9 +350,20 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); + } + + Dataset finalDf; + try { + Dataset existingDf = sparkSession.read().parquet(dataPath); + finalDf = existingDf.unionByName(df); + } catch (Exception e) { + finalDf = df; + } + + if (partitionCols.length > 0) { + finalDf.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); } else { - df.write().mode(SaveMode.Append).parquet(dataPath); + finalDf.write().mode(SaveMode.Overwrite).parquet(dataPath); } } From 
fdaeb4d186586967930dd88535e080384e3e7e5f Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 21 Feb 2026 21:58:52 +0100 Subject: [PATCH 086/100] use union() to append data for the second sync() --- .../parquet/ITParquetConversionSource.java | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 21d94013a..2b9c0a972 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -323,7 +323,6 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { - String[] partitionCols = new String[0]; if (partitionConfig != null) { partitionCols = @@ -342,24 +341,24 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) "month", functions.date_format( functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); - } else if (partitionCol.equals("day")) { - df = - df.withColumn( - "day", - functions.date_format( - functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } } - Dataset finalDf; + List combinedDataList = new ArrayList<>(); + try { - Dataset existingDf = sparkSession.read().parquet(dataPath); - finalDf = existingDf.unionByName(df); + + List existingData = sparkSession.read().parquet(dataPath).collectAsList(); + combinedDataList.addAll(existingData); } catch (Exception e) { - finalDf = df; } + combinedDataList.addAll(df.collectAsList()); + + StructType finalSchema = (partitionConfig != null) ? 
df.schema() : schema; + Dataset finalDf = sparkSession.createDataFrame(combinedDataList, finalSchema); + if (partitionCols.length > 0) { finalDf.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); } else { From b2f214d2ad67507a3749552209a24c4c4f7a332f Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 21 Feb 2026 22:19:56 +0100 Subject: [PATCH 087/100] use union() to append data for the second sync() --- .../apache/xtable/parquet/ITParquetConversionSource.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 2b9c0a972..d9edf731c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -349,7 +349,13 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) try { - List existingData = sparkSession.read().parquet(dataPath).collectAsList(); + List existingData = + sparkSession + .read() + .option("recursiveFileLookup", "true") + .option("pathGlobFilter", "*.parquet") + .parquet(dataPath) + .collectAsList(); combinedDataList.addAll(existingData); } catch (Exception e) { } From 913ac29ceb5a274005aabd4d5934aa3d2774d9ac Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 21 Feb 2026 22:42:07 +0100 Subject: [PATCH 088/100] use union() to append data for the second sync() --- .../parquet/ITParquetConversionSource.java | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index d9edf731c..5cef511e6 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -323,6 +323,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { + Dataset newDfWithPartitions = df; String[] partitionCols = new String[0]; if (partitionConfig != null) { partitionCols = @@ -332,12 +333,12 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) for (String partitionCol : partitionCols) { if (partitionCol.equals("year")) { - df = - df.withColumn( + newDfWithPartitions = + newDfWithPartitions.withColumn( "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); } else if (partitionCol.equals("month")) { - df = - df.withColumn( + newDfWithPartitions = + newDfWithPartitions.withColumn( "month", functions.date_format( functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); @@ -345,25 +346,18 @@ private void writeData(Dataset df, String dataPath, String partitionConfig) } } - List combinedDataList = new ArrayList<>(); - + Dataset combinedDf; try { - List existingData = - sparkSession - .read() - .option("recursiveFileLookup", "true") - .option("pathGlobFilter", "*.parquet") - .parquet(dataPath) - .collectAsList(); - combinedDataList.addAll(existingData); + Dataset existingDf = sparkSession.read().parquet(dataPath); + + combinedDf = existingDf.unionByName(newDfWithPartitions, true); } catch (Exception e) { + combinedDf = newDfWithPartitions; } - 
combinedDataList.addAll(df.collectAsList()); - - StructType finalSchema = (partitionConfig != null) ? df.schema() : schema; - Dataset finalDf = sparkSession.createDataFrame(combinedDataList, finalSchema); + List localRows = combinedDf.collectAsList(); + Dataset finalDf = sparkSession.createDataFrame(localRows, combinedDf.schema()); if (partitionCols.length > 0) { finalDf.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); From 9e243a16213c74e839907a0d1129e4e99416ee4b Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 21 Feb 2026 23:01:51 +0100 Subject: [PATCH 089/100] use union() to append data for the second sync() reverted --- .../parquet/ITParquetConversionSource.java | 41 +++++++------------ 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 5cef511e6..57d28b39a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -323,46 +323,35 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { } private void writeData(Dataset df, String dataPath, String partitionConfig) { - Dataset newDfWithPartitions = df; - String[] partitionCols = new String[0]; if (partitionConfig != null) { - partitionCols = + // extract partition columns from config + String[] partitionCols = Arrays.stream(partitionConfig.split(":")[2].split("/")) .map(s -> s.split("=")[0]) .toArray(String[]::new); - + // add partition columns to dataframe for (String partitionCol : partitionCols) { if (partitionCol.equals("year")) { - newDfWithPartitions = - newDfWithPartitions.withColumn( + df = + df.withColumn( "year", functions.year(functions.col("timestamp").cast(DataTypes.TimestampType))); } else if (partitionCol.equals("month")) { - newDfWithPartitions = - newDfWithPartitions.withColumn( + df = + df.withColumn( "month", functions.date_format( functions.col("timestamp").cast(DataTypes.TimestampType), "MM")); + } else if (partitionCol.equals("day")) { + df = + df.withColumn( + "day", + functions.date_format( + functions.col("timestamp").cast(DataTypes.TimestampType), "dd")); } } - } - - Dataset combinedDf; - try { - - Dataset existingDf = sparkSession.read().parquet(dataPath); - - combinedDf = existingDf.unionByName(newDfWithPartitions, true); - } catch (Exception e) { - combinedDf = newDfWithPartitions; - } - - List localRows = combinedDf.collectAsList(); - Dataset finalDf = sparkSession.createDataFrame(localRows, combinedDf.schema()); - - if (partitionCols.length > 0) { - finalDf.write().mode(SaveMode.Overwrite).partitionBy(partitionCols).parquet(dataPath); + df.write().mode(SaveMode.Append).partitionBy(partitionCols).parquet(dataPath); } else { - finalDf.write().mode(SaveMode.Overwrite).parquet(dataPath); + df.write().mode(SaveMode.Append).parquet(dataPath); } } From a7412c32bd5cefd6c606d911938ca1eb385239cd Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 16:51:57 +0100 Subject: [PATCH 090/100] fix second sync error --- .../apache/xtable/parquet/ITParquetConversionSource.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 57d28b39a..897dfb2c5 100644 --- 
a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -207,12 +207,6 @@ private void cleanupTargetMetadata(String dataPath, List formats) { case "ICEBERG": metadataFolder = "metadata"; break; - case "DELTA": - metadataFolder = "_delta_log"; - break; - case "HUDI": - metadataFolder = ".hoodie"; - break; } if (!metadataFolder.isEmpty()) { try { From c028112e0e2c54ce9e86a92a124ec35353432c04 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 16:58:38 +0100 Subject: [PATCH 091/100] fix second sync error --- .../apache/xtable/parquet/ITParquetConversionSource.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 897dfb2c5..4bafed49a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -207,6 +207,12 @@ private void cleanupTargetMetadata(String dataPath, List formats) { case "ICEBERG": metadataFolder = "metadata"; break; +// case "DELTA": +// metadataFolder = "_delta_log"; +// break; +// case "HUDI": +// metadataFolder = ".hoodie"; +// break; } if (!metadataFolder.isEmpty()) { try { From 089e8696fb6158f75b398558af690ca9182ab0e7 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:19:37 +0100 Subject: [PATCH 092/100] fix second sync error --- .../parquet/ITParquetConversionSource.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 4bafed49a..60425300c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -97,7 +97,7 @@ public static void setupOnce() { sparkConf.set("parquet.avro.write-old-list-structure", "false"); sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); - + sparkConf.set("parquet.summary.metadata.level", "NONE"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); @@ -207,12 +207,12 @@ private void cleanupTargetMetadata(String dataPath, List formats) { case "ICEBERG": metadataFolder = "metadata"; break; -// case "DELTA": -// metadataFolder = "_delta_log"; -// break; -// case "HUDI": -// metadataFolder = ".hoodie"; -// break; + case "DELTA": + metadataFolder = "_delta_log"; + break; + case "HUDI": + metadataFolder = ".hoodie"; + break; } if (!metadataFolder.isEmpty()) { try { @@ -304,7 +304,7 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - cleanupTargetMetadata(dataPath, targetTableFormats); + // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -501,8 +501,8 @@ private void 
checkDatasetEquivalence( .read() .schema(schema) .options(sourceOptions) - .option("recursiveFileLookup", "true") - .option("pathGlobFilter", "*.parquet") + // .option("recursiveFileLookup", "true") + // .option("pathGlobFilter", "*.parquet") .parquet(sourceTable.getDataPath()) .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat = From f1f8be1c251db62fed1fc25896a199141f047bcf Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:26:22 +0100 Subject: [PATCH 093/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 60425300c..1c6cb545b 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -501,8 +501,8 @@ private void checkDatasetEquivalence( .read() .schema(schema) .options(sourceOptions) - // .option("recursiveFileLookup", "true") - // .option("pathGlobFilter", "*.parquet") + .option("recursiveFileLookup", "true") + .option("pathGlobFilter", "*.parquet") .parquet(sourceTable.getDataPath()) .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat = From 6588d1d37c2fb150dcf3eedb6bd83c5f4ad42448 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:35:49 +0100 Subject: [PATCH 094/100] fix second sync error --- .../parquet/ITParquetConversionSource.java | 32 ++----------------- 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 1c6cb545b..a0a163446 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -97,7 +97,7 @@ public static void setupOnce() { sparkConf.set("parquet.avro.write-old-list-structure", "false"); sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); - sparkConf.set("parquet.summary.metadata.level", "NONE"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); @@ -200,33 +200,6 @@ private ConversionSourceProvider getConversionSourceProvider(String sourceTab } } - private void cleanupTargetMetadata(String dataPath, List formats) { - for (String format : formats) { - String metadataFolder = ""; - switch (format.toUpperCase()) { - case "ICEBERG": - metadataFolder = "metadata"; - break; - case "DELTA": - metadataFolder = "_delta_log"; - break; - case "HUDI": - metadataFolder = ".hoodie"; - break; - } - if (!metadataFolder.isEmpty()) { - try { - org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataPath, metadataFolder); - org.apache.hadoop.fs.FileSystem fs = path.getFileSystem(jsc.hadoopConfiguration()); - if (fs.exists(path)) { - fs.delete(path, true); - } - } catch (IOException e) { - } - } - } - } - @ParameterizedTest @MethodSource("provideArgsForSyncTesting") void 
testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { @@ -304,7 +277,6 @@ void testSync(TableFormatPartitionDataHolder tableFormatPartitionDataHolder) { Dataset dfAppend = sparkSession.createDataFrame(dataToAppend, schema); writeData(dfAppend, dataPath, xTablePartitionConfig); - // cleanupTargetMetadata(dataPath, targetTableFormats); ConversionConfig conversionConfigAppended = getTableSyncConfig( sourceTableFormat, @@ -501,7 +473,7 @@ private void checkDatasetEquivalence( .read() .schema(schema) .options(sourceOptions) - .option("recursiveFileLookup", "true") + // .option("recursiveFileLookup", "true") .option("pathGlobFilter", "*.parquet") .parquet(sourceTable.getDataPath()) .orderBy("id"); // order by id to ensure deterministic order for comparison From da9e401c1c19e7f00106ad13a4fde2b98288328a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:43:08 +0100 Subject: [PATCH 095/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index a0a163446..64e166e6c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -97,7 +97,7 @@ public static void setupOnce() { sparkConf.set("parquet.avro.write-old-list-structure", "false"); sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); - + sparkConf.set("parquet.summary.metadata.level","NONE"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); From 15567ab87584e2c22d41074a88ee0784ce5e390c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:50:58 +0100 Subject: [PATCH 096/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 1 + 1 file changed, 1 insertion(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index 64e166e6c..b1c955f87 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -98,6 +98,7 @@ public static void setupOnce() { sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); sparkConf.set("parquet.summary.metadata.level","NONE"); + sparkConf.set("spark.sql.sources.partitionDiscovery.enabled", "false") sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); From 119b68cd6b421a78e3f6b8b6e57cf72bf42d57da Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 17:51:16 +0100 Subject: [PATCH 097/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index b1c955f87..a26cbcee3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -98,7 +98,7 @@ public static void setupOnce() { sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); sparkConf.set("parquet.summary.metadata.level","NONE"); - sparkConf.set("spark.sql.sources.partitionDiscovery.enabled", "false") + sparkConf.set("spark.sql.sources.partitionDiscovery.enabled", "false"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); From ca49c63c1b93d7c19778bf5a13965c0b87318925 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 18:02:31 +0100 Subject: [PATCH 098/100] fix second sync error --- .../apache/xtable/parquet/ITParquetConversionSource.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index a26cbcee3..a532554ff 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -97,8 +97,6 @@ public static void setupOnce() { sparkConf.set("parquet.avro.write-old-list-structure", "false"); sparkConf.set("spark.sql.parquet.writeLegacyFormat", "false"); sparkConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS"); - sparkConf.set("parquet.summary.metadata.level","NONE"); - sparkConf.set("spark.sql.sources.partitionDiscovery.enabled", "false"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer"); sparkSession = SparkSession.builder().config(sparkConf).getOrCreate(); jsc = JavaSparkContext.fromSparkContext(sparkSession.sparkContext()); @@ -475,8 +473,8 @@ private void checkDatasetEquivalence( .schema(schema) .options(sourceOptions) // .option("recursiveFileLookup", "true") - .option("pathGlobFilter", "*.parquet") - .parquet(sourceTable.getDataPath()) + // .option("pathGlobFilter", "*.parquet") + .parquet(sourceTable.getDataPath() + "/*.parquet") .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat = targetFormats.stream() From 97111357cdcbb2973ac5523872c3ccd95c73817d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 18:11:23 +0100 Subject: [PATCH 099/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index a532554ff..a984e8d71 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -473,8 +473,8 @@ private void checkDatasetEquivalence( .schema(schema) .options(sourceOptions) // 
.option("recursiveFileLookup", "true") - // .option("pathGlobFilter", "*.parquet") - .parquet(sourceTable.getDataPath() + "/*.parquet") + .option("pathGlobFilter", "[!m]*") + .parquet(sourceTable.getDataPath()) .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat = targetFormats.stream() From a62a24c6e7ae6d0aff295e954cbafe27768092c6 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 22 Feb 2026 18:19:42 +0100 Subject: [PATCH 100/100] fix second sync error --- .../org/apache/xtable/parquet/ITParquetConversionSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java index a984e8d71..218ba4d75 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetConversionSource.java @@ -472,8 +472,8 @@ private void checkDatasetEquivalence( .read() .schema(schema) .options(sourceOptions) - // .option("recursiveFileLookup", "true") - .option("pathGlobFilter", "[!m]*") + .option("recursiveFileLookup", "true") + .option("pathGlobFilter", "*.parquet") .parquet(sourceTable.getDataPath()) .orderBy("id"); // order by id to ensure deterministic order for comparison Map> targetRowsByFormat =