From c97664fe02c2d862286e79e921d983ee02063b45 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 16:37:27 +0530 Subject: [PATCH 01/12] [HUDI-8371] Fixing column stats index with MDT for few scenarios --- .../HoodieBackedTableMetadataWriter.java | 54 ++- .../testutils/LogFileColStatsTestUtil.java | 96 ++++ .../org/apache/hudi/common/fs/FSUtils.java | 8 + .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../metadata/HoodieTableMetadataUtil.java | 155 ++++-- .../apache/hudi/common/fs/TestFSUtils.java | 4 + .../metadata/TestHoodieMetadataPayload.java | 295 ++++++++++++ .../metadata/TestHoodieTableMetadataUtil.java | 315 ++++++++++++ ...ap-rollback1-column-stats-index-table.json | 2 + ...w-bootstrap1-column-stats-index-table.json | 4 + ...w-bootstrap2-column-stats-index-table.json | 5 + .../cow-clean1-column-stats-index-table.json | 2 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 1 + ...ap-rollback1-column-stats-index-table.json | 2 + ...r-bootstrap1-column-stats-index-table.json | 3 + ...r-bootstrap2-column-stats-index-table.json | 5 + .../mor-clean1-column-stats-index-table.json | 2 + ...elete-block1-column-stats-index-table.json | 3 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 10 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 5 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 5 + .../functional/ColumnStatIndexTestBase.scala | 164 +++++-- .../functional/TestColumnStatsIndex.scala | 450 +++++++++++++++++- .../TestColumnStatsIndexWithSQL.scala | 26 +- 24 files changed, 1493 insertions(+), 125 deletions(-) create mode 100644 hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java create mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java create mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 
hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index d6e7a8f626ebe..68b02ad6d39ba 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -237,13 +237,13 @@ public List getEnabledPartitionTypes() { protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, Option inflightInstantTimestamp) throws IOException { HoodieTimer timer = HoodieTimer.start(); - List partitionsToInit = new ArrayList<>(MetadataPartitionType.values().length); + List metadataPartitionsToInit = new ArrayList<>(MetadataPartitionType.values().length); try { boolean exists = metadataTableExists(dataMetaClient); if (!exists) { // FILES partition is always required - partitionsToInit.add(MetadataPartitionType.FILES); + metadataPartitionsToInit.add(MetadataPartitionType.FILES); } // check if any of the enabled partition types needs to be initialized @@ -253,10 +253,10 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, LOG.info("Async metadata indexing disabled and following partitions already initialized: " + completedPartitions); this.enabledPartitionTypes.stream() .filter(p -> !completedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) - .forEach(partitionsToInit::add); + .forEach(metadataPartitionsToInit::add); } - if (partitionsToInit.isEmpty()) { + if (metadataPartitionsToInit.isEmpty()) { // No partitions left to initialize, since all the metadata enabled partitions are either initialized before // or current in the process of initialization. initMetadataReader(); @@ -266,13 +266,7 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. 
String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - - // Initialize partitions for the first time using data from the files on the file system - if (!initializeFromFilesystem(initializationTime, partitionsToInit, inflightInstantTimestamp)) { - LOG.error("Failed to initialize MDT from filesystem"); - return false; - } - + initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); return true; } catch (IOException e) { @@ -345,11 +339,12 @@ private boolean isBootstrapNeeded(Option latestMetadataInstant) { * @param partitionsToInit - List of MDT partitions to initialize * @param inflightInstantTimestamp - Current action instant responsible for this initialization */ - private boolean initializeFromFilesystem(String initializationTime, List partitionsToInit, + private void initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { - return false; + return; } + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -374,7 +369,7 @@ private boolean initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = listAllPartitionsFromMDT(initializationTime); + partitionInfoList = listAllPartitionsFromMDT(initializationTime, pendingDataInstants); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { @@ -424,8 +419,7 @@ private boolean initializeFromFilesystem(String initializationTime, List> initializeColumnStatsPartition(Map> partitionToFilesMap) { + // during initialization, we need stats for base and log files. 
HoodieData records = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords( - engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams()); + engineContext, Collections.emptyMap(), partitionToFilesMap, dataMetaClient, dataWriteConfig.isMetadataColumnStatsIndexEnabled(), + dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), + dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); final int fileGroupCount = dataWriteConfig.getMetadataConfig().getColumnStatsIndexFileGroupCount(); return Pair.of(fileGroupCount, records); @@ -571,6 +566,16 @@ private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Opti return false; } + private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { + // Initialize excluding the pending operations on the dataset + return dataMetaClient.getActiveTimeline() + .getInstantsAsStream().filter(i -> !i.isCompleted()) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); + } + private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) @@ -646,13 +651,14 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. */ - private List listAllPartitionsFromMDT(String initializationTime) throws IOException { - List dirinfoList = new LinkedList<>(); - List allPartitionPaths = metadata.getAllPartitionPaths().stream() + private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { + List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); - Map partitionFileMap = metadata.getAllFilesInPartitions(allPartitionPaths); + Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); + List dirinfoList = new ArrayList<>(partitionFileMap.size()); for (Map.Entry entry : partitionFileMap.entrySet()) { - dirinfoList.add(new DirectoryInfo(entry.getKey(), entry.getValue(), initializationTime)); + String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); + dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); } return dirinfoList; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java new file mode 100644 index 0000000000000..464ad5ddca1e4 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; + +import org.apache.avro.Schema; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.expressions.GenericRow; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.collectColumnRangeMetadata; + +/** + * Util methods used in tests to fetch col stats records for a log file. + */ +public class LogFileColStatsTestUtil { + + public static Option getLogFileColumnRangeMetadata(String filePath, HoodieTableMetaClient datasetMetaClient, String latestCommitTime, + List columnsToIndex, Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + List records = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withStorage(datasetMetaClient.getStorage()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(latestCommitTime) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(records::add) + .build(); + scanner.scan(); + if (records.isEmpty()) { + return Option.empty(); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(records, fieldsToIndex, filePath, writerSchemaOpt.get()); + List> columnRangeMetadataList = new ArrayList<>(columnRangeMetadataMap.values()); + return Option.of(getColStatsEntry(filePath, columnRangeMetadataList)); + } else { + throw new HoodieException("Writer schema needs to be set"); + } + } + + private static Row getColStatsEntry(String logFilePath, List> columnRangeMetadataList) { + Collections.sort(columnRangeMetadataList, (o1, o2) -> o1.getColumnName().compareTo(o2.getColumnName())); + Object[] values = new Object[(columnRangeMetadataList.size() * 3) + 2]; + values[0] = logFilePath.substring(logFilePath.lastIndexOf("/") + 1); + values[1] = columnRangeMetadataList.get(0).getValueCount(); + int counter = 2; + for (HoodieColumnRangeMetadata columnRangeMetadata: columnRangeMetadataList) { + values[counter++] = columnRangeMetadata.getValueCount(); + values[counter++] = columnRangeMetadata.getMinValue(); + values[counter++] = columnRangeMetadata.getMaxValue(); + } + return new GenericRow(values); + } + + public static Option getSchemaForTable(HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver schemaResolver = 
new TableSchemaResolver(metaClient);
+    return Option.of(schemaResolver.getTableAvroSchema());
+  }
+}
+
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
index 91c966d00a2bd..1e834e5a06dba 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
@@ -211,6 +211,14 @@ public static String getFileId(String fullFileName) {
     return fullFileName.split("_", 2)[0];
   }
 
+  /**
+   * @param filePath file path, which can be either an absolute path or just the partition path plus the file name
+   * @return the file name from the given path
+   */
+  public static String getFileNameFromPath(String filePath) {
+    return filePath.substring(filePath.lastIndexOf("/") + 1);
+  }
+
   /**
    * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
    */
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
index f62ec0febd578..99fe6c1ff54f2 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
@@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception
 
   @Override
   protected void processNextDeletedRecord(DeleteRecord deleteRecord) {
-    throw new IllegalStateException("Not expected to see delete records in this log-scan mode. Check Job Config");
+    // no-op
   }
 
   /**
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index 62b0232583293..f4ba94136b9c9 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -44,12 +44,14 @@
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
+import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
+import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -72,6 +74,7 @@
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.util.Lazy;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.avro.AvroTypeException;
 import org.apache.avro.LogicalTypes;
 import org.apache.avro.Schema;
@@ -119,6 +122,7 @@
 import static org.apache.hudi.common.config.HoodieCommonConfig.MAX_MEMORY_FOR_COMPACTION;
 import static org.apache.hudi.common.config.HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE;
 import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.MILLIS_INSTANT_ID_LENGTH;
+import static 
org.apache.hudi.common.fs.FSUtils.getFileNameFromPath; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; import static org.apache.hudi.common.util.ValidationUtils.checkState; import static org.apache.hudi.metadata.HoodieMetadataPayload.RECORD_INDEX_MISSING_FILEINDEX_FALLBACK; @@ -645,12 +649,8 @@ public static HoodieData convertMetadataToColumnStatsRecords(Hoodi return engineContext.parallelize(deleteFileList, parallelism) .flatMap(deleteFileInfoPair -> { String partitionPath = deleteFileInfoPair.getLeft(); - String filePath = deleteFileInfoPair.getRight(); - - if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - return getColumnStatsRecords(partitionPath, filePath, dataTableMetaClient, columnsToIndex, true).iterator(); - } - return Collections.emptyListIterator(); + String fileName = deleteFileInfoPair.getRight(); + return getColumnStatsRecords(partitionPath, fileName, dataTableMetaClient, columnsToIndex, true).iterator(); }); } @@ -858,12 +858,18 @@ public static HoodieData convertFilesToBloomFilterRecords(HoodieEn public static HoodieData convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, - MetadataRecordsGenerationParams recordsGenerationParams) { + HoodieTableMetaClient dataMetaClient, + boolean isColumnStatsIndexEnabled, + int columnStatsIndexParallelism, + List targetColumnsForColumnStatsIndex, + int maxReaderBufferSize) { + if (!isColumnStatsIndexEnabled) { + return engineContext.emptyHoodieData(); + } // Find the columns to index - HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); final List columnsToIndex = - getColumnsToIndex(recordsGenerationParams, - Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); + getColumnsToIndex(true, targetColumnsForColumnStatsIndex, + Lazy.lazily(() -> tryResolveSchemaForTable(dataMetaClient))); if (columnsToIndex.isEmpty()) { // In case there are no columns to index, bail return engineContext.emptyHoodieData(); @@ -875,18 +881,12 @@ public static HoodieData convertFilesToColumnStatsRecords(HoodieEn final List> partitionFileFlagTupleList = fetchPartitionFileInfoTriplets(partitionToDeletedFiles, partitionToAppendedFiles); // Create records MDT - int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), columnStatsIndexParallelism), 1); return engineContext.parallelize(partitionFileFlagTupleList, parallelism).flatMap(partitionFileFlagTuple -> { - final String partitionName = partitionFileFlagTuple.f0; + final String partitionPath = partitionFileFlagTuple.f0; final String filename = partitionFileFlagTuple.f1; final boolean isDeleted = partitionFileFlagTuple.f2; - if (!FSUtils.isBaseFile(new Path(filename)) || !filename.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - LOG.warn(String.format("Ignoring file %s as it is not a PARQUET file", filename)); - return Stream.empty().iterator(); - } - - final String filePathWithPartition = partitionName + "/" + filename; - return getColumnStatsRecords(partitionName, filePathWithPartition, dataTableMetaClient, columnsToIndex, isDeleted).iterator(); + return getColumnStatsRecords(partitionPath, filename, dataMetaClient, columnsToIndex, isDeleted, maxReaderBufferSize).iterator(); }); } @@ -1095,6 +1095,27 @@ private static List getColumnsToIndex(MetadataRecordsGenerationParams re 
.orElse(Collections.emptyList()); } + /** + * Get the list of columns for the table for column stats indexing + */ + private static List getColumnsToIndex(boolean isColumnStatsIndexEnabled, + List targetColumnsForColumnStatsIndex, + Lazy> lazyWriterSchemaOpt) { + checkState(isColumnStatsIndexEnabled); + + if (!targetColumnsForColumnStatsIndex.isEmpty()) { + return targetColumnsForColumnStatsIndex; + } + + Option writerSchemaOpt = lazyWriterSchemaOpt.get(); + return writerSchemaOpt + .map(writerSchema -> + writerSchema.getFields().stream() + .map(Schema.Field::name) + .collect(Collectors.toList())) + .orElse(Collections.emptyList()); + } + private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, HoodieTableMetaClient datasetMetaClient, List columnsToIndex) { @@ -1104,54 +1125,114 @@ private static Stream translateWriteStatToColumnStats(HoodieWriteS return HoodieMetadataPayload.createColumnStatsRecords(writeStat.getPartitionPath(), columnRangeMetadataList, false); } - return getColumnStatsRecords(writeStat.getPartitionPath(), writeStat.getPath(), datasetMetaClient, columnsToIndex, false); + String filePath = writeStat.getPath(); + return getColumnStatsRecords(writeStat.getPartitionPath(), getFileNameFromPath(filePath), datasetMetaClient, columnsToIndex, false); } private static Stream getColumnStatsRecords(String partitionPath, - String filePath, + String fileName, HoodieTableMetaClient datasetMetaClient, List columnsToIndex, boolean isDeleted) { - String filePartitionPath = filePath.startsWith("/") ? filePath.substring(1) : filePath; - String fileName = FSUtils.getFileName(filePath, partitionPath); + return getColumnStatsRecords(partitionPath, fileName, datasetMetaClient, columnsToIndex, isDeleted, -1); + } + + private static Stream getColumnStatsRecords(String partitionPath, + String fileName, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + boolean isDeleted, + int maxBufferSize) { if (isDeleted) { - // TODO we should delete records instead of stubbing them List> columnRangeMetadataList = columnsToIndex.stream() .map(entry -> HoodieColumnRangeMetadata.stub(fileName, entry)) .collect(Collectors.toList()); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, true); } - List> columnRangeMetadata = - readColumnRangeMetadataFrom(filePartitionPath, datasetMetaClient, columnsToIndex); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } - private static List> readColumnRangeMetadataFrom(String filePath, + private static List> readColumnRangeMetadataFrom(String partitionPath, + String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex) { + List columnsToIndex, + int maxBufferSize) { + String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? 
fileName + : partitionPath + "/" + fileName; try { - if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePath); - List> columnRangeMetadataList = - new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); - - return columnRangeMetadataList; + Path fullFilePath = new Path(datasetMetaClient.getBasePath(), partitionPathFileName); + if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); + } else if (FSUtils.isLogFile(fileName)) { + Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); + LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); + return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } - - LOG.warn("Column range index not supported for: " + filePath); + LOG.warn("Column range index not supported for: {}", partitionPathFileName); return Collections.emptyList(); } catch (Exception e) { // NOTE: In case reading column range metadata from individual file failed, // we simply fall back, in lieu of failing the whole task - LOG.error("Failed to fetch column range metadata for: " + filePath); + LOG.error("Failed to fetch column range metadata for: {}", partitionPathFileName); return Collections.emptyList(); } } + /** + * Read column range metadata from log file. + */ + @VisibleForTesting + protected static List> getLogFileColumnRangeMetadata(String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + // read log file records without merging + List hoodieRecords = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withFileSystem(datasetMetaClient.getFs()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(hoodieRecords::add) + .build(); + scanner.scan(); + if (hoodieRecords.isEmpty()) { + return Collections.emptyList(); + } + // Extract IndexedRecord from HoodieRecord to use with existing collectColumnRangeMetadata + List records = new ArrayList<>(); + for (HoodieRecord hoodieRecord : hoodieRecords) { + try { + Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); + if (insertValue.isPresent()) { + records.add(insertValue.get()); + } + } catch (IOException e) { + LOG.warn("Failed to get insert value for record: {}", e.getMessage()); + } + } + if (records.isEmpty()) { + return Collections.emptyList(); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(records, fieldsToIndex, getFileNameFromPath(filePath)); + return new ArrayList<>(columnRangeMetadataMap.values()); + } + return Collections.emptyList(); + } + /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision 
expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index 250304c7fd0ed..7e47274ab2cbe 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -251,6 +251,10 @@ public void tesLogFileName() { assertEquals(1, FSUtils.getTaskPartitionIdFromLogPath(rlPath)); assertEquals(0, FSUtils.getStageIdFromLogPath(rlPath)); assertEquals(1, FSUtils.getTaskAttemptIdFromLogPath(rlPath)); + + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/path/" + logFile)); + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/abc/def/path/" + logFile)); + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/" + logFile)); } @Test diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java new file mode 100644 index 0000000000000..ce2cae78342c8 --- /dev/null +++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests {@link HoodieMetadataPayload}. 
+ */ +public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + public static final String PARTITION_NAME = "2022/10/01"; + public static final String PARTITION_NAME2 = "2023/10/01"; + public static final String PARTITION_NAME3 = "2024/10/01"; + + @Test + public void testFileSystemMetadataPayloadMerging() { + Map firstCommitAddedFiles = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L) + ); + + HoodieRecord firstPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); + + Map secondCommitAddedFiles = createImmutableMap( + // NOTE: This is an append + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ); + + List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); + + HoodieRecord secondPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); + + HoodieMetadataPayload combinedPartitionFilesRecordPayload = + secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); + + HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ), + Collections.emptyList() + ).getData(); + + assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); + } + + @Test + public void testFileSystemMetadataPayloadMergingWithDeletions() { + Map addedFileMap = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L), + Pair.of("file4.parquet", 4000L) + ); + HoodieRecord additionRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, addedFileMap, Collections.emptyList()); + + List deletedFileList1 = new ArrayList<>(); + deletedFileList1.add("file1.parquet"); + deletedFileList1.add("file3.parquet"); + HoodieRecord deletionRecord1 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList1); + + List deletedFileList2 = new ArrayList<>(); + deletedFileList2.add("file1.parquet"); + deletedFileList2.add("file4.parquet"); + HoodieRecord deletionRecord2 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList2); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file4.parquet", 4000L) + ), + Collections.emptyList() + ).getData(), + deletionRecord1.getData().preCombine(additionRecord.getData()) + ); + + List expectedDeleteFileList = new ArrayList<>(); + expectedDeleteFileList.add("file1.parquet"); + expectedDeleteFileList.add("file3.parquet"); + expectedDeleteFileList.add("file4.parquet"); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + Collections.emptyMap(), + expectedDeleteFileList + ).getData(), + deletionRecord2.getData().preCombine(deletionRecord1.getData()) + ); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L) + ), + Collections.emptyList() + ).getData(), + 
deletionRecord2.getData().preCombine(deletionRecord1.getData()).preCombine(additionRecord.getData())
+    );
+
+    assertEquals(
+        HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME,
+            createImmutableMap(
+                Pair.of("file2.parquet", 2000L)
+            ),
+            Collections.singletonList("file1.parquet")
+        ).getData(),
+        deletionRecord2.getData().preCombine(deletionRecord1.getData().preCombine(additionRecord.getData()))
+    );
+
+    // let's delete all files
+    List<String> allDeletedFileList = new ArrayList<>();
+    allDeletedFileList.add("file1.parquet");
+    allDeletedFileList.add("file2.parquet");
+    allDeletedFileList.add("file3.parquet");
+    allDeletedFileList.add("file4.parquet");
+    HoodieRecord<HoodieMetadataPayload> allDeletionRecord =
+        HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), allDeletedFileList);
+
+    HoodieMetadataPayload combinedPayload = allDeletionRecord.getData().preCombine(additionRecord.getData());
+    assertEquals(HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), Collections.emptyList()).getData(), combinedPayload);
+    assertTrue(combinedPayload.filesystemMetadata.isEmpty());
+
+    // test the all-partitions record
+    HoodieRecord<HoodieMetadataPayload> allPartitionsRecord = HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME, PARTITION_NAME2, PARTITION_NAME3), false);
+    HoodieRecord<HoodieMetadataPayload> partitionDeletedRecord = HoodieMetadataPayload.createPartitionListRecord(Collections.singletonList(PARTITION_NAME), true);
+    // combine to ensure the deleted partition is not seen
+    HoodieMetadataPayload payload = partitionDeletedRecord.getData().preCombine(allPartitionsRecord.getData());
+    assertEquals(HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME2, PARTITION_NAME3), false).getData(),
+        payload);
+  }
+
+  @Test
+  public void testColumnStatsPayloadMerging() throws IOException {
+    String fileName = "file.parquet";
+    String targetColName = "c1";
+
+    HoodieColumnRangeMetadata<Comparable> c1Metadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456);
+
+    HoodieRecord<HoodieMetadataPayload> columnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false)
+            .findFirst().get();
+
+    ////////////////////////////////////////////////////////////////////////
+    // Case 1: Combining proper (non-deleted) records
+    ////////////////////////////////////////////////////////////////////////
+
+    // NOTE: A Column Stats record will only be merged in case an existing file is modified,
+    // which can only happen on storage schemes supporting appends
+    HoodieColumnRangeMetadata<Comparable> c1AppendedBlockMetadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345);
+
+    HoodieRecord<HoodieMetadataPayload> updatedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false)
+            .findFirst().get();
+
+    HoodieMetadataPayload combinedMetadataPayload =
+        columnStatsRecord.getData().preCombine(updatedColumnStatsRecord.getData());
+
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRangeMetadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801);
+
+    HoodieRecord<HoodieMetadataPayload> expectedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false)
+            .findFirst().get();
+
+    // Assert combined payload
+    assertEquals(combinedMetadataPayload, expectedColumnStatsRecord.getData());
+
+    Option<IndexedRecord> 
alternativelyCombinedMetadataPayloadAvro =
+        columnStatsRecord.getData().combineAndGetUpdateValue(updatedColumnStatsRecord.getData().getInsertValue(null).get(), null);
+
+    // Assert that using the legacy API yields the same value
+    assertEquals(combinedMetadataPayload.getInsertValue(null), alternativelyCombinedMetadataPayloadAvro);
+
+    ////////////////////////////////////////////////////////////////////////
+    // Case 2: Combining w/ deleted records
+    ////////////////////////////////////////////////////////////////////////
+
+    HoodieColumnRangeMetadata<Comparable> c1StubbedMetadata =
+        HoodieColumnRangeMetadata.stub(fileName, targetColName);
+
+    HoodieRecord<HoodieMetadataPayload> deletedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true)
+            .findFirst().get();
+
+    // NOTE: In this case, the deleted (tombstone) record will therefore delete
+    // the previous state of the record
+    HoodieMetadataPayload deletedCombinedMetadataPayload =
+        deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData());
+
+    assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload);
+    assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent());
+    assertTrue(deletedCombinedMetadataPayload.isDeleted());
+
+    // NOTE: In this case, the proper incoming record will overwrite the previously
+    // deleted record
+    HoodieMetadataPayload overwrittenCombinedMetadataPayload =
+        columnStatsRecord.getData().preCombine(deletedColumnStatsRecord.getData());
+
+    assertEquals(columnStatsRecord.getData(), overwrittenCombinedMetadataPayload);
+  }
+
+  @Test
+  public void testPartitionStatsPayloadMerging() {
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange1 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 5, 0, 10, 100, 200);
+    HoodieRecord<HoodieMetadataPayload> firstPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get();
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange2 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    HoodieRecord<HoodieMetadataPayload> updatedPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), false, false).findFirst().get();
+    HoodieMetadataPayload combinedPartitionStatsRecordPayload =
+        updatedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData());
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRange = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 8, 1, 25, 220, 450);
+    HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords(
+        PARTITION_NAME, Collections.singletonList(expectedColumnRange), false, false).findFirst().get().getData();
+    assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload);
+  }
+
+  @Test
+  public void testPartitionStatsPayloadMergingWithDelete() {
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange1 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 5, 0, 10, 100, 200);
+    HoodieRecord<HoodieMetadataPayload> firstPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get();
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange2 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    // create the delete payload
+    HoodieRecord<HoodieMetadataPayload> deletedPartitionStatsRecord 
=
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), true, false).findFirst().get();
+    // the deleted (tombstone) record will therefore delete the previous state of the record
+    HoodieMetadataPayload combinedPartitionStatsRecordPayload =
+        deletedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData());
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRange = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords(
+        PARTITION_NAME, Collections.singletonList(expectedColumnRange), true, false).findFirst().get().getData();
+    assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload);
+
+    // another update for the same key should overwrite the delete record
+    HoodieMetadataPayload overwrittenCombinedPartitionStatsRecordPayload =
+        firstPartitionStatsRecord.getData().preCombine(deletedPartitionStatsRecord.getData());
+    assertEquals(firstPartitionStatsRecord.getData(), overwrittenCombinedPartitionStatsRecordPayload);
+  }
+}
diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
new file mode 100644
index 0000000000000..9586171d97aa5
--- /dev/null
+++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.io.storage.HoodieFileWriter; +import org.apache.hudi.io.storage.HoodieFileWriterFactory; +import org.apache.hudi.storage.StoragePath; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieTableMetadataUtil extends HoodieCommonTestHarness { + + private static HoodieTestTable hoodieTestTable; + private static final List DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01"); + + @BeforeEach + public void setUp() throws IOException { + initMetaClient(); + initTestDataGenerator(DATE_PARTITIONS.toArray(new String[0])); + hoodieTestTable = HoodieTestTable.of(metaClient); + } + + @AfterEach + public void tearDown() throws IOException { + metaClient.getStorage().deleteDirectory(metaClient.getBasePath()); + cleanupTestDataGenerator(); + cleanMetaClient(); + } + + @Test + public void testReadRecordKeysFromBaseFilesWithEmptyPartitionBaseFilePairs() { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + List> partitionFileSlicePairs = Collections.emptyList(); + HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( + engineContext, + partitionFileSlicePairs, + false, + 1, + "activeModule", + metaClient, + EngineType.SPARK + ); + assertTrue(result.isEmpty()); + } + + @Test + public void testConvertFilesToPartitionStatsRecords() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant1 = "20230918120000000"; + hoodieTestTable = hoodieTestTable.addCommit(instant1); + String instant2 = "20230918121110000"; + hoodieTestTable = hoodieTestTable.addCommit(instant2); + List partitionInfoList = new ArrayList<>(); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. 
+ DATE_PARTITIONS.forEach(p -> { + try { + URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); + StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); + String fileId1 = UUID.randomUUID().toString(); + FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); + StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); + writeParquetFile( + instant1, + storagePath1, + dataGen.generateInsertsForPartition(instant1, 10, p), + metaClient, + engineContext); + HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); + fileSlice1.setBaseFile(baseFile1); + String fileId2 = UUID.randomUUID().toString(); + FileSlice fileSlice2 = new FileSlice(p, instant2, fileId2); + StoragePath storagePath2 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId2).toUri()); + writeParquetFile( + instant2, + storagePath2, + dataGen.generateInsertsForPartition(instant2, 10, p), + metaClient, + engineContext); + HoodieBaseFile baseFile2 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId2).toString()); + fileSlice2.setBaseFile(baseFile2); + partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( + p, + metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), + instant2, + Collections.emptySet())); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + List columnsToIndex = Arrays.asList("rider", "driver"); + HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( + engineContext, + partitionInfoList, + HoodieMetadataConfig.newBuilder().enable(true) + .withMetadataIndexColumnStats(true) + .withMetadataIndexPartitionStats(true) + .withColumnStatsIndexForColumns("rider,driver") + .withPartitionStatsIndexParallelism(1) + .build(), + metaClient, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); + // Validate the result. + validatePartitionStats(result, instant1, instant2); + } + + @Test + public void testReadRecordKeysFromBaseFilesWithValidRecords() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant = "20230918120000000"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + Set recordKeys = new HashSet<>(); + final List> partitionFileSlicePairs = new ArrayList<>(); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. + DATE_PARTITIONS.forEach(p -> { + try { + List hoodieRecords = dataGen.generateInsertsForPartition(instant, 10, p); + String fileId = UUID.randomUUID().toString(); + FileSlice fileSlice = new FileSlice(p, instant, fileId); + writeParquetFile( + instant, + new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId).toUri()), + hoodieRecords, + metaClient, + engineContext); + HoodieBaseFile baseFile = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId).toString(), fileId, instant, null); + fileSlice.setBaseFile(baseFile); + partitionFileSlicePairs.add(Pair.of(p, fileSlice)); + recordKeys.addAll(hoodieRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toSet())); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + // Call the method readRecordKeysFromBaseFiles with the created partitionBaseFilePairs. 
+ HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( + engineContext, + partitionFileSlicePairs, + false, + 1, + "activeModule", + metaClient, + EngineType.SPARK + ); + // Validate the result. + List records = result.collectAsList(); + assertEquals(30, records.size()); + assertEquals(MetadataPartitionType.RECORD_INDEX.getPartitionPath(), records.get(0).getPartitionPath()); + for (HoodieRecord record : records) { + assertTrue(recordKeys.contains(record.getRecordKey())); + } + } + + @Test + public void testGetLogFileColumnRangeMetadata() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant1 = "20230918120000000"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.INSERT); + commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS.toString()); + hoodieTestTable = hoodieTestTable.addCommit(instant1, Option.of(commitMetadata)); + String instant2 = "20230918121110000"; + hoodieTestTable = hoodieTestTable.addCommit(instant2); + List partitionInfoList = new ArrayList<>(); + List columnsToIndex = Arrays.asList("rider", "driver"); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. + DATE_PARTITIONS.forEach(p -> { + try { + URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); + StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); + String fileId1 = UUID.randomUUID().toString(); + // add only one parquet file in first file slice + FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); + StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); + writeParquetFile(instant1, storagePath1, dataGen.generateInsertsForPartition(instant1, 10, p), metaClient, engineContext); + HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); + fileSlice1.setBaseFile(baseFile1); + // add log file in second file slice with higher rider and driver values (which are concatenated with instant) + FileSlice fileSlice2 = new FileSlice(p, instant2, fileId1); + fileSlice2.setBaseFile(baseFile1); + StoragePath storagePath2 = new StoragePath(partitionMetadataPath.getParent(), hoodieTestTable.getLogFileNameById(fileId1, 1)); + writeLogFiles(new StoragePath(metaClient.getBasePath(), p), HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, dataGen.generateInsertsForPartition(instant2, 10, p), 1, + metaClient.getStorage(), new Properties(), fileId1, instant2); + fileSlice2.addLogFile(new HoodieLogFile(storagePath2.toUri().toString())); + partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( + p, + metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), + instant2, + Collections.emptySet())); + // NOTE: we need to set table config as we are not using write client explicitly and these configs are needed for log record reader + metaClient.getTableConfig().setValue(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); + metaClient.getTableConfig().setValue(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); + metaClient.getTableConfig().setValue(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); + List> columnRangeMetadataLogFile = HoodieTableMetadataUtil.getLogFileColumnRangeMetadata( + 
storagePath2.toString(), + metaClient, + columnsToIndex, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS), + HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue()); + // there must be two ranges for rider and driver + assertEquals(2, columnRangeMetadataLogFile.size()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + // collect partition stats, this will collect stats for log files as well + HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( + engineContext, + partitionInfoList, + HoodieMetadataConfig.newBuilder().enable(true) + .withMetadataIndexColumnStats(true) + .withMetadataIndexPartitionStats(true) + .withColumnStatsIndexForColumns("rider,driver") + .withPartitionStatsIndexParallelism(1) + .build(), + metaClient, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); + // Validate the result. + validatePartitionStats(result, instant1, instant2); + } + + private static void validatePartitionStats(HoodieData result, String instant1, String instant2) { + List records = result.collectAsList(); + // 3 partitions * 2 columns = 6 partition stats records + assertEquals(6, records.size()); + assertEquals(MetadataPartitionType.PARTITION_STATS.getPartitionPath(), records.get(0).getPartitionPath()); + ((HoodieMetadataPayload) result.collectAsList().get(0).getData()).getColumnStatMetadata().get().getColumnName(); + records.forEach(r -> { + HoodieMetadataPayload payload = (HoodieMetadataPayload) r.getData(); + assertTrue(payload.getColumnStatMetadata().isPresent()); + // instant1 < instant2 so instant1 should be in the min value and instant2 should be in the max value. + if (payload.getColumnStatMetadata().get().getColumnName().equals("rider")) { + assertEquals(String.format("{\"value\": \"rider-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); + assertEquals(String.format("{\"value\": \"rider-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); + } else if (payload.getColumnStatMetadata().get().getColumnName().equals("driver")) { + assertEquals(String.format("{\"value\": \"driver-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); + assertEquals(String.format("{\"value\": \"driver-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); + } + }); + } + + private static void writeParquetFile(String instant, + StoragePath path, + List records, + HoodieTableMetaClient metaClient, + HoodieLocalEngineContext engineContext) throws IOException { + HoodieFileWriter writer = HoodieFileWriterFactory.getFileWriter( + instant, + path, + metaClient.getStorage(), + metaClient.getTableConfig(), + HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, + engineContext.getTaskContextSupplier(), + HoodieRecord.HoodieRecordType.AVRO); + for (HoodieRecord record : records) { + writer.writeWithMetadata(record.getKey(), record, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS); + } + writer.close(); + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json new file mode 100644 index 0000000000000..83790766db25b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json @@ -0,0 +1,2 @@ 
+{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json new file mode 100644 index 0000000000000..75aa7ada3ad3e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json @@ -0,0 +1,4 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json new file mode 100644 index 0000000000000..9c52707a27d05 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json @@ -0,0 +1,5 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json new file mode 100644 index 0000000000000..a08dea39c0501 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..17e8f877c50bb --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1 @@ +{"c1":633,"c2":" 987sdk","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":0,"c6":"2020-01-01","c7":"NA==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json new file mode 100644 index 0000000000000..dcbf49b141f91 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 
980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json new file mode 100644 index 0000000000000..146097347e036 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json @@ -0,0 +1,3 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json new file mode 100644 index 0000000000000..6256be16c1ddf --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json @@ -0,0 +1,5 @@ 
+{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json new file mode 100644 index 0000000000000..8c7b1125314a4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 
980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json new file mode 100644 index 0000000000000..fc6c936c7871e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json @@ -0,0 +1,3 @@ +{"c1_nullCount":0,"c2_nullCount":0,"c3_nullCount":0,"c4_nullCount":0,"c5_nullCount":0,"c6_nullCount":0,"c7_nullCount":0,"c8_nullCount":0,"valueCount":0} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..35ae749ddc3fc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":323,"c2":" 980sdd","c3":null,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":326,"c2":" 981sde","c3":64.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":555,"c2":" 
982sdf","c3":153.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":556,"c2":" 983sdg","c3":246.427,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":562,"c2":" 984sdh","c3":977.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SA==","c8":9} +{"c1":619,"c2":" 985sdi","c3":230.320,"c4":"2021-11-19T23:34:44.180-08:00","c5":1000,"c6":"2020-02-13","c7":"QA==","c8":9} +{"c1":624,"c2":" 986sdj","c3":580.317,"c4":"2021-11-18T23:34:44.180-08:00","c5":-1,"c6":"2020-10-10","c7":"PQ==","c8":9} +{"c1":633,"c2":" 987sdk","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":-1000,"c6":"2020-01-01","c7":"NA==","c8":9} +{"c1":638,"c2":" 988sdl","c3":904.304,"c4":"2021-11-18T23:34:44.179-08:00","c5":20,"c6":"2020-08-25","c7":"MA==","c8":9} +{"c1":639,"c2":" 989sda","c3":0.300,"c4":"2021-11-18T23:34:44.179-08:00","c5":90,"c6":"2020-04-21","c7":"aa==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..5e04406cf2182 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,5 @@ +{"c1":323,"c2":" 980sdd","c3":10.00,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":326,"c2":" 981sde","c3":10000.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":555,"c2":" 982sdf","c3":2.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":556,"c2":" 983sdg","c3":0.001,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":562,"c2":" 984sdh","c3":5.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SA==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..a83a82d8b8bff --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,5 @@ +{"c1":323,"c2":" 980sdd","c3":200000.00,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Aj==","c8":9} +{"c1":326,"c2":" 981sde","c3":100.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AB==","c8":9} +{"c1":555,"c2":" 982sdf","c3":20.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rx==","c8":9} +{"c1":556,"c2":" 983sdg","c3":0.1,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qf==","c8":9} +{"c1":562,"c2":" 984sdh","c3":4.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SL==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index 6a9efb3371d89..ba29a4c36bf15 
100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -18,14 +18,18 @@ package org.apache.hudi.functional -import org.apache.hadoop.fs.{LocatedFileStatus, Path} +import org.apache.avro.Schema +import org.apache.hadoop.fs.Path import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.HoodieConversionUtils.toProperties +import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} -import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.model.{HoodieBaseFile, HoodieFileGroup, HoodieLogFile, HoodieTableType} import org.apache.hudi.common.table.HoodieTableMetaClient -import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase -import org.apache.hudi.testutils.HoodieSparkClientTestBase +import org.apache.hudi.common.table.view.FileSystemViewManager +import org.apache.hudi.config.HoodieCompactionConfig +import org.apache.hudi.functional.ColumnStatIndexTestBase.{ColumnStatsTestCase, ColumnStatsTestParams} +import org.apache.hudi.testutils.{HoodieSparkClientTestBase, LogFileColStatsTestUtil} import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.spark.sql._ import org.apache.spark.sql.functions.typedLit @@ -36,6 +40,9 @@ import org.junit.jupiter.params.provider.Arguments import java.math.BigInteger import java.sql.{Date, Timestamp} +import java.util +import java.util.List +import java.util.stream.Collectors import scala.collection.JavaConverters._ import scala.util.Random @@ -73,42 +80,39 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { cleanupSparkContexts() } - protected def doWriteAndValidateColumnStats(testCase: ColumnStatsTestCase, - metadataOpts: Map[String, String], - hudiOpts: Map[String, String], - dataSourcePath: String, - expectedColStatsSourcePath: String, - operation: String, - saveMode: SaveMode, - shouldValidate: Boolean = true): Unit = { - val sourceJSONTablePath = getClass.getClassLoader.getResource(dataSourcePath).toString + protected def doWriteAndValidateColumnStats(params: ColumnStatsTestParams): Unit = { + + val sourceJSONTablePath = getClass.getClassLoader.getResource(params.dataSourcePath).toString // NOTE: Schema here is provided for validation that the input date is in the appropriate format val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) + val writeOptions: Map[String, String] = params.hudiOpts ++ params.metadataOpts + inputDF .sort("c1") - .repartition(4, new Column("c1")) + .repartition(params.numPartitions, new Column("c1")) .write .format("hudi") - .options(hudiOpts) - .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) - .option(DataSourceWriteOptions.OPERATION.key, operation) - .mode(saveMode) + .options(writeOptions) + .option(DataSourceWriteOptions.OPERATION.key, params.operation) + .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), String.valueOf(params.parquetMaxFileSize)) + .option(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), String.valueOf(params.smallFileLimit)) + .mode(params.saveMode) .save(basePath) dfList = dfList :+ inputDF metaClient = HoodieTableMetaClient.reload(metaClient) - if (shouldValidate) { + if (params.shouldValidate) { // Currently, routine manually validating the column stats (by actually reading 
every column of every file) // only supports parquet files. Therefore we skip such validation when delta-log files are present, and only // validate in following cases: (1) COW: all operations; (2) MOR: insert only. - val shouldValidateColumnStatsManually = testCase.tableType == HoodieTableType.COPY_ON_WRITE || - operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + val shouldValidateColumnStatsManually = params.testCase.tableType == HoodieTableType.COPY_ON_WRITE || + params.operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) validateColumnStatsIndex( - testCase, metadataOpts, expectedColStatsSourcePath, shouldValidateColumnStatsManually) + params.testCase, params.metadataOpts, params.expectedColStatsSourcePath, shouldValidateColumnStatsManually, params.latestCompletedCommit) } } @@ -116,20 +120,19 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { includedCols: Seq[String], indexedCols: Seq[String], indexSchema: StructType): DataFrame = { - val files = { - val it = fs.listFiles(new Path(tablePath), true) - var seq = Seq[LocatedFileStatus]() - while (it.hasNext) { - seq = seq :+ it.next() - } - seq.filter(fs => fs.getPath.getName.endsWith(".parquet")) - } - - spark.createDataFrame( - files.flatMap(file => { - val df = spark.read.schema(sourceTableSchema).parquet(file.getPath.toString) + val metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build() + val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) + fsv.loadAllPartitions() + val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) + val baseFilesList = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) + val baseFiles = baseFilesList.stream() + .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala + + val baseFilesDf = spark.createDataFrame( + baseFiles.flatMap(file => { + val df = spark.read.schema(sourceTableSchema).parquet(file.toString) val exprs: Seq[String] = - s"'${typedLit(file.getPath.getName)}' AS file" +: + s"'${typedLit(file.getName)}' AS file" +: s"sum(1) AS valueCount" +: df.columns .filter(col => includedCols.contains(col)) @@ -157,12 +160,61 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { }).asJava, indexSchema ) + + if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { + baseFilesDf // COW table + } else { + val allLogFiles = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllFileSlices) + .flatMap(fileSlice => fileSlice.getLogFiles) + .collect(Collectors.toList[HoodieLogFile]) + if (allLogFiles.isEmpty) { + baseFilesDf // MOR table, but no log files. 
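+ // MOR with log files: per-log-file stats are computed through LogFileColStatsTestUtil below and unioned with the base-file stats, mirroring how the metadata writer indexes every file in a file slice.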
+ } else { + val colsToGenerateStats = indexedCols // limit the manually computed stats to the indexed columns + val writerSchemaOpt = LogFileColStatsTestUtil.getSchemaForTable(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + baseFilesDf.union(getColStatsFromLogFiles(allLogFiles, latestCompletedCommit, + scala.collection.JavaConverters.seqAsJavaList(colsToGenerateStats), + metaClient, + writerSchemaOpt, + HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue(), + indexSchema)) + } + } + } + + protected def getColStatsFromLogFiles(logFiles: List[HoodieLogFile], latestCommit: String, columnsToIndex: util.List[String], + datasetMetaClient: HoodieTableMetaClient, + writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], + maxBufferSize: Integer, + indexSchema: StructType): DataFrame = { + val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]](logFile => + getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) + ).filter(rowOpt => rowOpt.isPresent).map[Row](rowOpt => rowOpt.get()).collect(Collectors.toList[Row]) + spark.createDataFrame(colStatsEntries, indexSchema) + } + + protected def getColStatsFromLogFile(logFilePath: String, + latestCommit: String, + columnsToIndex: util.List[String], + datasetMetaClient: HoodieTableMetaClient, + writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], + maxBufferSize: Integer + ): org.apache.hudi.common.util.Option[Row] = { + LogFileColStatsTestUtil.getLogFileColumnRangeMetadata(logFilePath, datasetMetaClient, latestCommit, + columnsToIndex, writerSchemaOpt, maxBufferSize) } protected def validateColumnStatsIndex(testCase: ColumnStatsTestCase, - metadataOpts: Map[String, String], - expectedColStatsSourcePath: String, - validateColumnStatsManually: Boolean): Unit = { + metadataOpts: Map[String, String], + expectedColStatsSourcePath: String, + validateColumnStatsManually: Boolean, + latestCompletedCommit: String): Unit = { val metadataConfig = HoodieMetadataConfig.newBuilder() .fromProperties(toProperties(metadataOpts)) .build() @@ -178,7 +230,8 @@ } } val (expectedColStatsSchema, _) = composeIndexSchema(sourceTableSchema.fieldNames, indexedColumns, sourceTableSchema) - val validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue") + val validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue", "c3_maxValue", + "c3_minValue", "c5_maxValue", "c5_minValue") columnStatsIndex.loadTransposed(sourceTableSchema.fieldNames, testCase.shouldReadInMemory) { transposedColStatsDF => // Match against expected column stats table @@ -270,14 +323,41 @@ def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of(HoodieTableType.values().toStream.flatMap(tableType => Seq(Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = true)), - Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = false))) + Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = false)) + ) ): _*) } def testMetadataColumnStatsIndexParamsForMOR: java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of(
Seq(Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = true)), - Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = false))) - : _*) + Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = false)) + ) + : _*) } + + def testTableTypePartitionTypeParams: java.util.stream.Stream[Arguments] = { + java.util.stream.Stream.of( + Seq( + Arguments.arguments(HoodieTableType.COPY_ON_WRITE, "c8"), + // empty partition col represents non-partitioned table. + Arguments.arguments(HoodieTableType.COPY_ON_WRITE, ""), + Arguments.arguments(HoodieTableType.MERGE_ON_READ, "c8"), + Arguments.arguments(HoodieTableType.MERGE_ON_READ, "") + ) + : _*) + } + + case class ColumnStatsTestParams(testCase: ColumnStatsTestCase, + metadataOpts: Map[String, String], + hudiOpts: Map[String, String], + dataSourcePath: String, + expectedColStatsSourcePath: String, + operation: String, + saveMode: SaveMode, + shouldValidate: Boolean = true, + latestCompletedCommit: String = null, + numPartitions: Integer = 4, + parquetMaxFileSize: Integer = 10 * 1024, + smallFileLimit: Integer = 100 * 1024 * 1024) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index ac83cf81918bb..3702cc8f188f3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -19,7 +19,7 @@ package org.apache.hudi.functional import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties @@ -27,9 +27,15 @@ import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig, import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils -import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.config.{HoodieCleanConfig, HoodieCompactionConfig, HoodieWriteConfig} import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.view.FileSystemViewManager +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.DataSourceWriteOptions.PARTITIONPATH_FIELD +import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestParams import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, Literal, Or} @@ -63,17 +69,17 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" ) ++ metadataOpts - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = 
"index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Overwrite) + saveMode = SaveMode.Overwrite)) - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/another-input-table-json", expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding // deferred updates), diverging from COW @@ -83,13 +89,441 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { "index/colstats/mor-updated2-column-stats-index-table.json" } - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = expectedColStatsSourcePath, operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) } + @ParameterizedTest + @MethodSource(Array("testTableTypePartitionTypeParams")) + def testMetadataColumnStatsIndexInitializationWithUpserts(tableType: HoodieTableType, partitionCol : String): Unit = { + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "5" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // delete a subset of recs. this will add a delete log block for MOR table. 
+ doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/delete-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding + // deferred updates), diverging from COW + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap1-column-stats-index-table.json" + } + + metaClient = HoodieTableMetaClient.reload(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + + // lets validate that we have log files generated in case of MOR table + if (tableType == HoodieTableType.MERGE_ON_READ) { + val metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build() + val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) + fsv.loadAllPartitions() + val basePath2 = new Path(basePath) + val allPartitionPaths = fsv.getPartitionPaths + allPartitionPaths.forEach(partitionPath => { + val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) + assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.hasLogFiles).count() > 0) + }) + fsv.close() + } + + // updates a subset which are not deleted and enable col stats and validate bootstrap + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // trigger one more upsert and compaction (w/ MOR table) and validate. 
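+ // commonOpts sets INLINE_COMPACT_NUM_DELTA_COMMITS to 5, so this fifth commit (after the insert, two upserts and the delete) is expected to trigger inline compaction on the MOR table.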
+ val expectedColStatsSourcePath1 = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap2-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap2-column-stats-index-table.json" + } + + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update4-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath1, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @MethodSource(Array("testTableTypePartitionTypeParams")) + def testMetadataColumnStatsIndexInitializationWithRollbacks(tableType: HoodieTableType, partitionCol : String): Unit = { + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + "hoodie.write.markers.type" -> "DIRECT", + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + simulateFailureForLatestCommit(tableType, partitionCol) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding + // deferred updates), diverging from COW + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json" + } + + metaClient = HoodieTableMetaClient.reload(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + + // updates a subset which are not deleted and enable col stats and validate bootstrap + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + 
smallFileLimit = 0)) + + metaClient = HoodieTableMetaClient.reload(metaClient) + assertTrue(metaClient.getActiveTimeline.getRollbackTimeline.countInstants() > 0) + } + + def simulateFailureForLatestCommit(tableType: HoodieTableType, partitionCol: String) : Unit = { + // simulate failure for latest commit. + metaClient = HoodieTableMetaClient.reload(metaClient) + var baseFileName : String = null + var logFileName : String = null + val lastCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get() + if (tableType == HoodieTableType.MERGE_ON_READ) { + val dataFiles = if (StringUtils.isNullOrEmpty(partitionCol)) { + fs.listStatus(new Path(metaClient.getBasePath)).toSeq + } else { + fs.listStatus(new Path(metaClient.getBasePath + "/9")).toSeq + } + val logFileFileStatus = dataFiles.filter(fileStatus => fileStatus.getPath.getName.contains(".log")).head + logFileName = logFileFileStatus.getPath.getName + } else { + val dataFiles = if (StringUtils.isNullOrEmpty(partitionCol)) { + fs.listStatus(new Path(metaClient.getBasePath)).toSeq + } else { + fs.listStatus(new Path(metaClient.getBasePath + "/9")).toSeq + } + val baseFileFileStatus = dataFiles.filter(fileStatus => fileStatus.getPath.getName.contains(lastCompletedCommit.getTimestamp)).head + baseFileName = baseFileFileStatus.getPath.getName + } + + val latestCompletedFileName = lastCompletedCommit.getFileName + fs.delete(new Path(metaClient.getBasePath + "/.hoodie/" + latestCompletedFileName), false) + + // re-create marker for the deleted file. + if (tableType == HoodieTableType.MERGE_ON_READ) { + if (StringUtils.isNullOrEmpty(partitionCol)) { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/" + logFileName + ".marker.APPEND")).close() } + } else { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/9/" + logFileName + ".marker.APPEND")).close() } + } + } else { + if (StringUtils.isNullOrEmpty(partitionCol)) { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/" + baseFileName + ".marker.MERGE")).close() } + } else { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/9/" + baseFileName + ".marker.MERGE")).close() } + } + } + } + + @Test + def testMORDeleteBlocks(): Unit = { + val tableType: HoodieTableType = HoodieTableType.MERGE_ON_READ + val partitionCol = "c8" + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "5" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 
1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = "index/colstats/mor-delete-block1-column-stats-index-table.json" + + // delete a subset of recs. this will add a delete log block for MOR table. + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/delete-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @ValueSource(strings = Array("", "c8")) + def testColStatsWithCleanCOW(partitionCol: String): Unit = { + val tableType: HoodieTableType = HoodieTableType.COPY_ON_WRITE + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key() -> "1" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // updates 1 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-clean1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json" + } + + // updates 2 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @ValueSource(strings = Array("", "c8")) + def 
testColStatsWithCleanMOR(partitionCol: String): Unit = { + val tableType: HoodieTableType = HoodieTableType.MERGE_ON_READ + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key() -> "1", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "2" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // updates 1 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-clean1-column-stats-index-table.json" + } else { + "index/colstats/mor-clean1-column-stats-index-table.json" + } + + // updates 2 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + metaClient = HoodieTableMetaClient.reload(metaClient) + assertTrue(metaClient.getActiveTimeline.getCleanerTimeline.countInstants() > 0) + } @ParameterizedTest @EnumSource(classOf[HoodieTableType]) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala index 9c4099035b12d..d2b76b66c909f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala @@ -28,7 +28,7 @@ import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieTableType, Writ import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.table.timeline.HoodieInstant import org.apache.hudi.config.{HoodieCompactionConfig, HoodieIndexConfig, HoodieWriteConfig} -import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase +import 
org.apache.hudi.functional.ColumnStatIndexTestBase.{ColumnStatsTestCase, ColumnStatsTestParams} import org.apache.hudi.index.HoodieIndex.IndexType.INMEMORY import org.apache.hudi.metadata.HoodieMetadataFileSystemView import org.apache.hudi.util.JavaConversions @@ -89,12 +89,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { HoodieIndexConfig.INDEX_TYPE.key() -> INMEMORY.name() ) ++ metadataOpts - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Overwrite, - shouldValidate = false) + shouldValidate = false)) assertEquals(4, getLatestDataFilesCount(commonOpts)) assertEquals(0, getLatestDataFilesCount(commonOpts, includeLogFiles = false)) @@ -134,12 +134,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { verifyFileIndexAndSQLQueries(commonOpts, isTableDataSameAsAfterSecondInstant = true) // Add the last df back and verify the queries - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = "", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate = false) + shouldValidate = false)) verifyFileIndexAndSQLQueries(commonOpts, verifyFileCount = false) } @@ -196,27 +196,27 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { writeClient.scheduleCompaction(org.apache.hudi.common.util.Option.empty()) writeClient.close() - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = "", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate = false) + shouldValidate = false)) verifyFileIndexAndSQLQueries(commonOpts) } private def setupTable(testCase: ColumnStatsTestCase, metadataOpts: Map[String, String], commonOpts: Map[String, String], shouldValidate: Boolean): Unit = { - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Overwrite) + saveMode = SaveMode.Overwrite)) - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/another-input-table-json", expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding // deferred updates), diverging from COW @@ -226,12 +226,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { 
"index/colstats/mor-updated2-column-stats-index-table.json" } - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = expectedColStatsSourcePath, operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate) + shouldValidate)) } def verifyFileIndexAndSQLQueries(opts: Map[String, String], isTableDataSameAsAfterSecondInstant: Boolean = false, verifyFileCount: Boolean = true): Unit = { From 8d17a04be5dfef61533e17bce3bdaece3fcb62fb Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 16:54:01 +0530 Subject: [PATCH 02/12] Fix LogFileColStatsTestUtil to use IndexedRecord for collectColumnRangeMetadata Convert HoodieRecord list to IndexedRecord before calling collectColumnRangeMetadata, matching the 3-arg signature in 0.14.x (master's version accepted HoodieRecord + Schema). Co-Authored-By: Claude Sonnet 4.6 --- .../hudi/testutils/LogFileColStatsTestUtil.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java index 464ad5ddca1e4..2e0baaac74940 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; @@ -27,6 +28,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRow; @@ -53,7 +55,7 @@ public static Option getLogFileColumnRangeMetadata(String filePath, HoodieT .collect(Collectors.toList()); List records = new ArrayList<>(); HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() - .withStorage(datasetMetaClient.getStorage()) + .withFileSystem(datasetMetaClient.getFs()) .withBasePath(datasetMetaClient.getBasePath()) .withLogFilePaths(Collections.singletonList(filePath)) .withBufferSize(maxBufferSize) @@ -65,8 +67,15 @@ public static Option getLogFileColumnRangeMetadata(String filePath, HoodieT if (records.isEmpty()) { return Option.empty(); } + List indexedRecords = new ArrayList<>(); + for (HoodieRecord hoodieRecord : records) { + Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); + if (insertValue.isPresent()) { + indexedRecords.add(insertValue.get()); + } + } Map> columnRangeMetadataMap = - collectColumnRangeMetadata(records, fieldsToIndex, filePath, writerSchemaOpt.get()); + collectColumnRangeMetadata(indexedRecords, fieldsToIndex, filePath); List> columnRangeMetadataList = new ArrayList<>(columnRangeMetadataMap.values()); return Option.of(getColStatsEntry(filePath, columnRangeMetadataList)); } else { From a519ca16022929d61cf85f6748b6b52385df6fe1 Mon Sep 
17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 17:09:10 +0530 Subject: [PATCH 03/12] Fix Java 8 generics inference and FileSlice API compatibility - Replace Collector wildcard pattern with forEach+map in collectColumnRangeMetadata (HoodieTableMetadataUtil) and readRangeFromParquetMetadata (ParquetUtils) to fix Java 8 type inference failures - Replace FileSlice.hasLogFiles() with getLogFiles().findAny().isPresent() since hasLogFiles() doesn't exist in 0.14.x Co-Authored-By: Claude Sonnet 4.6 --- .../apache/hudi/common/util/ParquetUtils.java | 58 +++++++++---------- .../metadata/HoodieTableMetadataUtil.java | 44 +++++++------- .../functional/TestColumnStatsIndex.scala | 2 +- 3 files changed, 47 insertions(+), 57 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index de5572523c1eb..84a494fe29f50 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -61,7 +61,6 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; -import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -305,38 +304,33 @@ public List> readRangeFromParquetMetadata( ) { ParquetMetadata metadata = readMetadata(conf, parquetFilePath); - // NOTE: This collector has to have fully specialized generic type params since - // Java 1.8 struggles to infer them - Collector, ?, Map>>> groupingByCollector = - Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName); - // Collect stats from all individual Parquet blocks - Map>> columnToStatsListMap = - (Map>>) metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .collect(groupingByCollector); + Map>> columnToStatsListMap = new HashMap<>(); + metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. 
In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .forEach(crm -> columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm)); // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index f4ba94136b9c9..e3554bd977ff3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -64,6 +64,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -74,7 +75,6 @@ import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.util.Lazy; -import com.google.common.annotations.VisibleForTesting; import org.apache.avro.AvroTypeException; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; @@ -106,8 +106,6 @@ import java.util.Set; import java.util.UUID; import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -242,27 +240,25 @@ class ColumnStats { }); }); - Collector, ?, Map>> collector = - Collectors.toMap(colRangeMetadata -> colRangeMetadata.getColumnName(), Function.identity()); - - return (Map>) targetFields.stream() - .map(field -> { - ColumnStats colStats = allColumnStats.get(field.name()); - return HoodieColumnRangeMetadata.create( - filePath, - field.name(), - colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), - colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), - colStats == null ? 0 : colStats.nullCount, - colStats == null ? 0 : colStats.valueCount, - // NOTE: Size and compressed size statistics are set to 0 to make sure we're not - // mixing up those provided by Parquet with the ones from other encodings, - // since those are not directly comparable - 0, - 0 - ); - }) - .collect(collector); + Map> result = new HashMap<>(); + targetFields.forEach(field -> { + ColumnStats colStats = allColumnStats.get(field.name()); + HoodieColumnRangeMetadata rangeMetadata = HoodieColumnRangeMetadata.create( + filePath, + field.name(), + colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), + colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), + colStats == null ? 0 : colStats.nullCount, + colStats == null ? 
0 : colStats.valueCount, + // NOTE: Size and compressed size statistics are set to 0 to make sure we're not + // mixing up those provided by Parquet with the ones from other encodings, + // since those are not directly comparable + 0, + 0 + ); + result.put(rangeMetadata.getColumnName(), rangeMetadata); + }); + return result; } /** diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 3702cc8f188f3..7d0bacf03bf77 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -175,7 +175,7 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { val allPartitionPaths = fsv.getPartitionPaths allPartitionPaths.forEach(partitionPath => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.hasLogFiles).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.getLogFiles.findAny().isPresent).count() > 0) }) fsv.close() } From 03a878bec557bd75f999869db0841b2bc9d97cde Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 17:14:48 +0530 Subject: [PATCH 04/12] Fix Java 8 type inference in ParquetUtils flatMap chain Collect flatMap result to List before grouping to avoid raw type inference issue where Java 8 loses generic type parameter through the flatMap. Co-Authored-By: Claude Sonnet 4.6 --- .../apache/hudi/common/util/ParquetUtils.java | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 84a494fe29f50..058e4f2f50130 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -305,32 +305,39 @@ public List> readRangeFromParquetMetadata( ParquetMetadata metadata = readMetadata(conf, parquetFilePath); // Collect stats from all individual Parquet blocks + // NOTE: Intermediate collect to List is required since Java 1.8 cannot infer + // the generic type parameter through the flatMap chain + @SuppressWarnings("unchecked") + List> allBlockStats = (List>) (List) + metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? 
columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .collect(Collectors.toList()); Map>> columnToStatsListMap = new HashMap<>(); - metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .forEach(crm -> columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm)); + for (HoodieColumnRangeMetadata crm : allBlockStats) { + columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm); + } // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer From a7d9668e87ac3850d0e96687fce0fb6a8596e468 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 20:59:42 +0530 Subject: [PATCH 05/12] Fix conflicts --- .../HoodieBackedTableMetadataWriter.java | 26 +- .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../apache/hudi/common/util/ParquetUtils.java | 65 ++-- .../metadata/HoodieTableMetadataUtil.java | 147 ++------ .../MetadataRecordsGenerationParams.java | 11 +- .../metadata/TestHoodieMetadataPayload.java | 22 +- .../metadata/TestHoodieMetadataPayload.java | 295 ---------------- .../metadata/TestHoodieTableMetadataUtil.java | 315 ------------------ 8 files changed, 95 insertions(+), 788 deletions(-) delete mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java delete mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 68b02ad6d39ba..2f1ab37bf52b6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -344,7 +344,6 @@ private void initializeFromFilesystem(String initializationTime, List pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -369,7 +368,7 @@ private void initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = 
listAllPartitionsFromMDT(initializationTime, pendingDataInstants); + partitionInfoList = listAllPartitionsFromMDT(initializationTime); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { @@ -419,7 +418,7 @@ private void initializeFromFilesystem(String initializationTime, List> initializeColumnStatsPartition(Map> partitionToFilesMap) { // during initialization, we need stats for base and log files. HoodieData records = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords( - engineContext, Collections.emptyMap(), partitionToFilesMap, dataMetaClient, dataWriteConfig.isMetadataColumnStatsIndexEnabled(), - dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), - dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); + engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams()); final int fileGroupCount = dataWriteConfig.getMetadataConfig().getColumnStatsIndexFileGroupCount(); return Pair.of(fileGroupCount, records); @@ -566,16 +563,6 @@ private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Opti return false; } - private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { - // Initialize excluding the pending operations on the dataset - return dataMetaClient.getActiveTimeline() - .getInstantsAsStream().filter(i -> !i.isCompleted()) - // regular writers should not be blocked due to pending indexing action - .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toSet()); - } - private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) @@ -651,11 +638,11 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. 
*/ - private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { + private List listAllPartitionsFromMDT(String initializationTime) throws IOException { + List dirinfoList = new LinkedList<>(); List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); - List dirinfoList = new ArrayList<>(partitionFileMap.size()); for (Map.Entry entry : partitionFileMap.entrySet()) { String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); @@ -789,7 +776,8 @@ private MetadataRecordsGenerationParams getRecordsGenerationParams() { dataWriteConfig.isMetadataColumnStatsIndexEnabled(), dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), - dataWriteConfig.getColumnsEnabledForBloomFilterIndex()); + dataWriteConfig.getColumnsEnabledForBloomFilterIndex(), + dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 99fe6c1ff54f2..f62ec0febd578 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception @Override protected void processNextDeletedRecord(DeleteRecord deleteRecord) { - // no - op + throw new IllegalStateException("Not expected to see delete records in this log-scan mode. 
Check Job Config"); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 058e4f2f50130..de5572523c1eb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -61,6 +61,7 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; +import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -304,40 +305,38 @@ public List> readRangeFromParquetMetadata( ) { ParquetMetadata metadata = readMetadata(conf, parquetFilePath); + // NOTE: This collector has to have fully specialized generic type params since + // Java 1.8 struggles to infer them + Collector, ?, Map>>> groupingByCollector = + Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName); + // Collect stats from all individual Parquet blocks - // NOTE: Intermediate collect to List is required since Java 1.8 cannot infer - // the generic type parameter through the flatMap chain - @SuppressWarnings("unchecked") - List> allBlockStats = (List>) (List) - metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .collect(Collectors.toList()); - Map>> columnToStatsListMap = new HashMap<>(); - for (HoodieColumnRangeMetadata crm : allBlockStats) { - columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm); - } + Map>> columnToStatsListMap = + (Map>>) metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? 
columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .collect(groupingByCollector); // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index e3554bd977ff3..6c1de68e043cc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -44,14 +44,12 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; -import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -64,7 +62,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -106,6 +103,8 @@ import java.util.Set; import java.util.UUID; import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -240,25 +239,27 @@ class ColumnStats { }); }); - Map> result = new HashMap<>(); - targetFields.forEach(field -> { - ColumnStats colStats = allColumnStats.get(field.name()); - HoodieColumnRangeMetadata rangeMetadata = HoodieColumnRangeMetadata.create( - filePath, - field.name(), - colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), - colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), - colStats == null ? 0 : colStats.nullCount, - colStats == null ? 0 : colStats.valueCount, - // NOTE: Size and compressed size statistics are set to 0 to make sure we're not - // mixing up those provided by Parquet with the ones from other encodings, - // since those are not directly comparable - 0, - 0 - ); - result.put(rangeMetadata.getColumnName(), rangeMetadata); - }); - return result; + Collector, ?, Map>> collector = + Collectors.toMap(colRangeMetadata -> colRangeMetadata.getColumnName(), Function.identity()); + + return (Map>) targetFields.stream() + .map(field -> { + ColumnStats colStats = allColumnStats.get(field.name()); + return HoodieColumnRangeMetadata.create( + filePath, + field.name(), + colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), + colStats == null ? 
null : coerceToComparable(field.schema(), colStats.maxValue), + colStats == null ? 0 : colStats.nullCount, + colStats == null ? 0 : colStats.valueCount, + // NOTE: Size and compressed size statistics are set to 0 to make sure we're not + // mixing up those provided by Parquet with the ones from other encodings, + // since those are not directly comparable + 0, + 0 + ); + }) + .collect(collector); } /** @@ -854,18 +855,12 @@ public static HoodieData convertFilesToBloomFilterRecords(HoodieEn public static HoodieData convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, - HoodieTableMetaClient dataMetaClient, - boolean isColumnStatsIndexEnabled, - int columnStatsIndexParallelism, - List targetColumnsForColumnStatsIndex, - int maxReaderBufferSize) { - if (!isColumnStatsIndexEnabled) { - return engineContext.emptyHoodieData(); - } + MetadataRecordsGenerationParams recordsGenerationParams) { // Find the columns to index + HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); final List columnsToIndex = - getColumnsToIndex(true, targetColumnsForColumnStatsIndex, - Lazy.lazily(() -> tryResolveSchemaForTable(dataMetaClient))); + getColumnsToIndex(recordsGenerationParams, + Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); if (columnsToIndex.isEmpty()) { // In case there are no columns to index, bail return engineContext.emptyHoodieData(); @@ -877,12 +872,12 @@ public static HoodieData convertFilesToColumnStatsRecords(HoodieEn final List> partitionFileFlagTupleList = fetchPartitionFileInfoTriplets(partitionToDeletedFiles, partitionToAppendedFiles); // Create records MDT - int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), columnStatsIndexParallelism), 1); + int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); return engineContext.parallelize(partitionFileFlagTupleList, parallelism).flatMap(partitionFileFlagTuple -> { final String partitionPath = partitionFileFlagTuple.f0; final String filename = partitionFileFlagTuple.f1; final boolean isDeleted = partitionFileFlagTuple.f2; - return getColumnStatsRecords(partitionPath, filename, dataMetaClient, columnsToIndex, isDeleted, maxReaderBufferSize).iterator(); + return getColumnStatsRecords(partitionPath, filename, dataTableMetaClient, columnsToIndex, isDeleted, recordsGenerationParams.getMaxReaderBufferSize()).iterator(); }); } @@ -1091,27 +1086,6 @@ private static List getColumnsToIndex(MetadataRecordsGenerationParams re .orElse(Collections.emptyList()); } - /** - * Get the list of columns for the table for column stats indexing - */ - private static List getColumnsToIndex(boolean isColumnStatsIndexEnabled, - List targetColumnsForColumnStatsIndex, - Lazy> lazyWriterSchemaOpt) { - checkState(isColumnStatsIndexEnabled); - - if (!targetColumnsForColumnStatsIndex.isEmpty()) { - return targetColumnsForColumnStatsIndex; - } - - Option writerSchemaOpt = lazyWriterSchemaOpt.get(); - return writerSchemaOpt - .map(writerSchema -> - writerSchema.getFields().stream() - .map(Schema.Field::name) - .collect(Collectors.toList())) - .orElse(Collections.emptyList()); - } - private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, HoodieTableMetaClient datasetMetaClient, List columnsToIndex) { @@ -1147,8 +1121,9 @@ private static Stream getColumnStatsRecords(String partitionPath, return 
HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, true); } + List> columnRangeMetadata = - readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } @@ -1156,19 +1131,15 @@ private static Stream getColumnStatsRecords(String partitionPath, private static List> readColumnRangeMetadataFrom(String partitionPath, String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex, - int maxBufferSize) { + List columnsToIndex) { String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? fileName : partitionPath + "/" + fileName; try { - Path fullFilePath = new Path(datasetMetaClient.getBasePath(), partitionPathFileName); + Path fullFilePath = new Path(datasetMetaClient.getBasePathV2(), partitionPathFileName); if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); - } else if (FSUtils.isLogFile(fileName)) { - Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); - LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); - return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } + LOG.warn("Column range index not supported for: {}", partitionPathFileName); return Collections.emptyList(); } catch (Exception e) { @@ -1179,56 +1150,6 @@ private static List> readColumnRangeMetada } } - /** - * Read column range metadata from log file. 
- */ - @VisibleForTesting - protected static List> getLogFileColumnRangeMetadata(String filePath, - HoodieTableMetaClient datasetMetaClient, - List columnsToIndex, - Option writerSchemaOpt, - int maxBufferSize) throws IOException { - if (writerSchemaOpt.isPresent()) { - List fieldsToIndex = writerSchemaOpt.get().getFields().stream() - .filter(field -> columnsToIndex.contains(field.name())) - .collect(Collectors.toList()); - // read log file records without merging - List hoodieRecords = new ArrayList<>(); - HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() - .withFileSystem(datasetMetaClient.getFs()) - .withBasePath(datasetMetaClient.getBasePath()) - .withLogFilePaths(Collections.singletonList(filePath)) - .withBufferSize(maxBufferSize) - .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) - .withReaderSchema(writerSchemaOpt.get()) - .withLogRecordScannerCallback(hoodieRecords::add) - .build(); - scanner.scan(); - if (hoodieRecords.isEmpty()) { - return Collections.emptyList(); - } - // Extract IndexedRecord from HoodieRecord to use with existing collectColumnRangeMetadata - List records = new ArrayList<>(); - for (HoodieRecord hoodieRecord : hoodieRecords) { - try { - Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); - if (insertValue.isPresent()) { - records.add(insertValue.get()); - } - } catch (IOException e) { - LOG.warn("Failed to get insert value for record: {}", e.getMessage()); - } - } - if (records.isEmpty()) { - return Collections.emptyList(); - } - Map> columnRangeMetadataMap = - collectColumnRangeMetadata(records, fieldsToIndex, getFileNameFromPath(filePath)); - return new ArrayList<>(columnRangeMetadataMap.values()); - } - return Collections.emptyList(); - } - /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java index 72a8bf4cd26f8..00ffb1baa397c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java @@ -42,9 +42,11 @@ public class MetadataRecordsGenerationParams implements Serializable { private final int columnStatsIndexParallelism; private final List targetColumnsForColumnStatsIndex; private final List targetColumnsForBloomFilterIndex; + private final int maxReaderBufferSize; - MetadataRecordsGenerationParams(HoodieTableMetaClient dataMetaClient, List enabledPartitionTypes, String bloomFilterType, int bloomIndexParallelism, - boolean isColumnStatsIndexEnabled, int columnStatsIndexParallelism, List targetColumnsForColumnStatsIndex, List targetColumnsForBloomFilterIndex) { + MetadataRecordsGenerationParams(HoodieTableMetaClient dataMetaClient, List enabledPartitionTypes, String bloomFilterType, + int bloomIndexParallelism, boolean isColumnStatsIndexEnabled, int columnStatsIndexParallelism, List targetColumnsForColumnStatsIndex, + List targetColumnsForBloomFilterIndex, int maxReaderBufferSize) { this.dataMetaClient = dataMetaClient; this.enabledPartitionTypes = enabledPartitionTypes; this.bloomFilterType = bloomFilterType; @@ -53,6 +55,7 @@ public class MetadataRecordsGenerationParams 
implements Serializable { this.columnStatsIndexParallelism = columnStatsIndexParallelism; this.targetColumnsForColumnStatsIndex = targetColumnsForColumnStatsIndex; this.targetColumnsForBloomFilterIndex = targetColumnsForBloomFilterIndex; + this.maxReaderBufferSize = maxReaderBufferSize; } public HoodieTableMetaClient getDataMetaClient() { @@ -86,4 +89,8 @@ public List getTargetColumnsForColumnStatsIndex() { public List getSecondaryKeysForBloomFilterIndex() { return targetColumnsForBloomFilterIndex; } + + public int getMaxReaderBufferSize() { + return maxReaderBufferSize; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index cde9341f5cdf1..e2c989c92f582 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -34,16 +34,17 @@ import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Tests {@link HoodieMetadataPayload}. */ public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + public static final String PARTITION_NAME = "2022/10/01"; @Test public void testFileSystemMetadataPayloadMerging() { - String partitionName = "2022/10/01"; - Map firstCommitAddedFiles = createImmutableMap( Pair.of("file1.parquet", 1000L), Pair.of("file2.parquet", 2000L), @@ -51,7 +52,7 @@ public void testFileSystemMetadataPayloadMerging() { ); HoodieRecord firstPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, firstCommitAddedFiles, Collections.emptyList()); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); Map secondCommitAddedFiles = createImmutableMap( // NOTE: This is an append @@ -63,13 +64,13 @@ public void testFileSystemMetadataPayloadMerging() { List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); HoodieRecord secondPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, secondCommitAddedFiles, secondCommitDeletedFiles); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); HoodieMetadataPayload combinedPartitionFilesRecordPayload = secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, createImmutableMap( Pair.of("file2.parquet", 2000L), Pair.of("file3.parquet", 3333L), @@ -84,7 +85,6 @@ public void testFileSystemMetadataPayloadMerging() { @Test public void testColumnStatsPayloadMerging() throws IOException { - String partitionPath = "2022/10/01"; String fileName = "file.parquet"; String targetColName = "c1"; @@ -92,7 +92,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); HoodieRecord columnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1Metadata), false) + 
HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false) .findFirst().get(); //////////////////////////////////////////////////////////////////////// @@ -105,7 +105,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); HoodieRecord updatedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1AppendedBlockMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false) .findFirst().get(); HoodieMetadataPayload combinedMetadataPayload = @@ -115,7 +115,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); HoodieRecord expectedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(expectedColumnRangeMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false) .findFirst().get(); // Assert combined payload @@ -135,7 +135,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.stub(fileName, targetColName); HoodieRecord deletedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1StubbedMetadata), true) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true) .findFirst().get(); // NOTE: In this case, deleted (or tombstone) record will be therefore deleting @@ -144,6 +144,8 @@ public void testColumnStatsPayloadMerging() throws IOException { deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); + assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); + assertTrue(deletedCombinedMetadataPayload.isDeleted()); // NOTE: In this case, proper incoming record will be overwriting previously deleted // record diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java deleted file mode 100644 index ce2cae78342c8..0000000000000 --- a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.metadata; - -import org.apache.hudi.common.model.HoodieColumnRangeMetadata; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; - -import org.apache.avro.generic.IndexedRecord; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -/** - * Tests {@link HoodieMetadataPayload}. - */ -public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { - public static final String PARTITION_NAME = "2022/10/01"; - public static final String PARTITION_NAME2 = "2023/10/01"; - public static final String PARTITION_NAME3 = "2024/10/01"; - - @Test - public void testFileSystemMetadataPayloadMerging() { - Map firstCommitAddedFiles = createImmutableMap( - Pair.of("file1.parquet", 1000L), - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3000L) - ); - - HoodieRecord firstPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); - - Map secondCommitAddedFiles = createImmutableMap( - // NOTE: This is an append - Pair.of("file3.parquet", 3333L), - Pair.of("file4.parquet", 4000L), - Pair.of("file5.parquet", 5000L) - ); - - List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); - - HoodieRecord secondPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); - - HoodieMetadataPayload combinedPartitionFilesRecordPayload = - secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); - - HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3333L), - Pair.of("file4.parquet", 4000L), - Pair.of("file5.parquet", 5000L) - ), - Collections.emptyList() - ).getData(); - - assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); - } - - @Test - public void testFileSystemMetadataPayloadMergingWithDeletions() { - Map addedFileMap = createImmutableMap( - Pair.of("file1.parquet", 1000L), - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3000L), - Pair.of("file4.parquet", 4000L) - ); - HoodieRecord additionRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, addedFileMap, Collections.emptyList()); - - List deletedFileList1 = new ArrayList<>(); - deletedFileList1.add("file1.parquet"); - deletedFileList1.add("file3.parquet"); - HoodieRecord deletionRecord1 = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList1); - - List deletedFileList2 = new ArrayList<>(); - deletedFileList2.add("file1.parquet"); - deletedFileList2.add("file4.parquet"); - HoodieRecord deletionRecord2 = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList2); - - assertEquals( - 
HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L), - Pair.of("file4.parquet", 4000L) - ), - Collections.emptyList() - ).getData(), - deletionRecord1.getData().preCombine(additionRecord.getData()) - ); - - List expectedDeleteFileList = new ArrayList<>(); - expectedDeleteFileList.add("file1.parquet"); - expectedDeleteFileList.add("file3.parquet"); - expectedDeleteFileList.add("file4.parquet"); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - Collections.emptyMap(), - expectedDeleteFileList - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData()) - ); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L) - ), - Collections.emptyList() - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData()).preCombine(additionRecord.getData()) - ); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L) - ), - Collections.singletonList("file1.parquet") - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData().preCombine(additionRecord.getData())) - ); - - // lets delete all files - List allDeletedFileList = new ArrayList<>(); - allDeletedFileList.add("file1.parquet"); - allDeletedFileList.add("file2.parquet"); - allDeletedFileList.add("file3.parquet"); - allDeletedFileList.add("file4.parquet"); - HoodieRecord allDeletionRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), allDeletedFileList); - - HoodieMetadataPayload combinedPayload = allDeletionRecord.getData().preCombine(additionRecord.getData()); - assertEquals(HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), Collections.emptyList()).getData(), combinedPayload); - assertTrue(combinedPayload.filesystemMetadata.isEmpty()); - - // test all partition record - HoodieRecord allPartitionsRecord = HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME, PARTITION_NAME2, PARTITION_NAME3), false); - HoodieRecord partitionDeletedRecord = HoodieMetadataPayload.createPartitionListRecord(Collections.singletonList(PARTITION_NAME), true); - // combine to ensure the deleted partitions is not seen - HoodieMetadataPayload payload = partitionDeletedRecord.getData().preCombine(allPartitionsRecord.getData()); - assertEquals(HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME2, PARTITION_NAME3), false).getData(), - payload); - } - - @Test - public void testColumnStatsPayloadMerging() throws IOException { - String fileName = "file.parquet"; - String targetColName = "c1"; - - HoodieColumnRangeMetadata c1Metadata = - HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); - - HoodieRecord columnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false) - .findFirst().get(); - - //////////////////////////////////////////////////////////////////////// - // Case 1: Combining proper (non-deleted) records - //////////////////////////////////////////////////////////////////////// - - // NOTE: Column Stats record will only be merged in case existing file will be modified, - // which could only happen on storages schemes supporting appends - HoodieColumnRangeMetadata c1AppendedBlockMetadata = - 
HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); - - HoodieRecord updatedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false) - .findFirst().get(); - - HoodieMetadataPayload combinedMetadataPayload = - columnStatsRecord.getData().preCombine(updatedColumnStatsRecord.getData()); - - HoodieColumnRangeMetadata expectedColumnRangeMetadata = - HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); - - HoodieRecord expectedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false) - .findFirst().get(); - - // Assert combined payload - assertEquals(combinedMetadataPayload, expectedColumnStatsRecord.getData()); - - Option alternativelyCombinedMetadataPayloadAvro = - columnStatsRecord.getData().combineAndGetUpdateValue(updatedColumnStatsRecord.getData().getInsertValue(null).get(), null); - - // Assert that using legacy API yields the same value - assertEquals(combinedMetadataPayload.getInsertValue(null), alternativelyCombinedMetadataPayloadAvro); - - //////////////////////////////////////////////////////////////////////// - // Case 2: Combining w/ deleted records - //////////////////////////////////////////////////////////////////////// - - HoodieColumnRangeMetadata c1StubbedMetadata = - HoodieColumnRangeMetadata.stub(fileName, targetColName); - - HoodieRecord deletedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true) - .findFirst().get(); - - // NOTE: In this case, deleted (or tombstone) record will be therefore deleting - // previous state of the record - HoodieMetadataPayload deletedCombinedMetadataPayload = - deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); - - assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); - assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); - assertTrue(deletedCombinedMetadataPayload.isDeleted()); - - // NOTE: In this case, proper incoming record will be overwriting previously deleted - // record - HoodieMetadataPayload overwrittenCombinedMetadataPayload = - columnStatsRecord.getData().preCombine(deletedColumnStatsRecord.getData()); - - assertEquals(columnStatsRecord.getData(), overwrittenCombinedMetadataPayload); - } - - @Test - public void testPartitionStatsPayloadMerging() { - HoodieColumnRangeMetadata fileColumnRange1 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 5, 0, 10, 100, 200); - HoodieRecord firstPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get(); - HoodieColumnRangeMetadata fileColumnRange2 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - HoodieRecord updatedPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), false, false).findFirst().get(); - HoodieMetadataPayload combinedPartitionStatsRecordPayload = - updatedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData()); - HoodieColumnRangeMetadata expectedColumnRange = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 8, 1, 25, 220, 450); - HoodieMetadataPayload 
expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords( - PARTITION_NAME, Collections.singletonList(expectedColumnRange), false, false).findFirst().get().getData(); - assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload); - } - - @Test - public void testPartitionStatsPayloadMergingWithDelete() { - HoodieColumnRangeMetadata fileColumnRange1 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 5, 0, 10, 100, 200); - HoodieRecord firstPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get(); - HoodieColumnRangeMetadata fileColumnRange2 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - // create delete payload - HoodieRecord deletedPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), true, false).findFirst().get(); - // deleted (or tombstone) record will be therefore deleting previous state of the record - HoodieMetadataPayload combinedPartitionStatsRecordPayload = - deletedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData()); - HoodieColumnRangeMetadata expectedColumnRange = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords( - PARTITION_NAME, Collections.singletonList(expectedColumnRange), true, false).findFirst().get().getData(); - assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload); - - // another update for the same key should overwrite the delete record - HoodieMetadataPayload overwrittenCombinedPartitionStatsRecordPayload = - firstPartitionStatsRecord.getData().preCombine(deletedPartitionStatsRecord.getData()); - assertEquals(firstPartitionStatsRecord.getData(), overwrittenCombinedPartitionStatsRecordPayload); - } -} diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java deleted file mode 100644 index 9586171d97aa5..0000000000000 --- a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.hudi.metadata; - -import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.common.data.HoodieData; -import org.apache.hudi.common.engine.EngineType; -import org.apache.hudi.common.engine.HoodieLocalEngineContext; -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieColumnRangeMetadata; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.testutils.FileCreateUtils; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.io.storage.HoodieFileWriter; -import org.apache.hudi.io.storage.HoodieFileWriterFactory; -import org.apache.hudi.storage.StoragePath; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.net.URI; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestHoodieTableMetadataUtil extends HoodieCommonTestHarness { - - private static HoodieTestTable hoodieTestTable; - private static final List DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01"); - - @BeforeEach - public void setUp() throws IOException { - initMetaClient(); - initTestDataGenerator(DATE_PARTITIONS.toArray(new String[0])); - hoodieTestTable = HoodieTestTable.of(metaClient); - } - - @AfterEach - public void tearDown() throws IOException { - metaClient.getStorage().deleteDirectory(metaClient.getBasePath()); - cleanupTestDataGenerator(); - cleanMetaClient(); - } - - @Test - public void testReadRecordKeysFromBaseFilesWithEmptyPartitionBaseFilePairs() { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - List> partitionFileSlicePairs = Collections.emptyList(); - HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( - engineContext, - partitionFileSlicePairs, - false, - 1, - "activeModule", - metaClient, - EngineType.SPARK - ); - assertTrue(result.isEmpty()); - } - - @Test - public void testConvertFilesToPartitionStatsRecords() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant1 = "20230918120000000"; - hoodieTestTable = hoodieTestTable.addCommit(instant1); - String instant2 = "20230918121110000"; - hoodieTestTable = hoodieTestTable.addCommit(instant2); - List partitionInfoList = new ArrayList<>(); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. 
- DATE_PARTITIONS.forEach(p -> { - try { - URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); - StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); - String fileId1 = UUID.randomUUID().toString(); - FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); - StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); - writeParquetFile( - instant1, - storagePath1, - dataGen.generateInsertsForPartition(instant1, 10, p), - metaClient, - engineContext); - HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); - fileSlice1.setBaseFile(baseFile1); - String fileId2 = UUID.randomUUID().toString(); - FileSlice fileSlice2 = new FileSlice(p, instant2, fileId2); - StoragePath storagePath2 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId2).toUri()); - writeParquetFile( - instant2, - storagePath2, - dataGen.generateInsertsForPartition(instant2, 10, p), - metaClient, - engineContext); - HoodieBaseFile baseFile2 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId2).toString()); - fileSlice2.setBaseFile(baseFile2); - partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( - p, - metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), - instant2, - Collections.emptySet())); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - - List columnsToIndex = Arrays.asList("rider", "driver"); - HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( - engineContext, - partitionInfoList, - HoodieMetadataConfig.newBuilder().enable(true) - .withMetadataIndexColumnStats(true) - .withMetadataIndexPartitionStats(true) - .withColumnStatsIndexForColumns("rider,driver") - .withPartitionStatsIndexParallelism(1) - .build(), - metaClient, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); - // Validate the result. - validatePartitionStats(result, instant1, instant2); - } - - @Test - public void testReadRecordKeysFromBaseFilesWithValidRecords() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant = "20230918120000000"; - hoodieTestTable = hoodieTestTable.addCommit(instant); - Set recordKeys = new HashSet<>(); - final List> partitionFileSlicePairs = new ArrayList<>(); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. - DATE_PARTITIONS.forEach(p -> { - try { - List hoodieRecords = dataGen.generateInsertsForPartition(instant, 10, p); - String fileId = UUID.randomUUID().toString(); - FileSlice fileSlice = new FileSlice(p, instant, fileId); - writeParquetFile( - instant, - new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId).toUri()), - hoodieRecords, - metaClient, - engineContext); - HoodieBaseFile baseFile = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId).toString(), fileId, instant, null); - fileSlice.setBaseFile(baseFile); - partitionFileSlicePairs.add(Pair.of(p, fileSlice)); - recordKeys.addAll(hoodieRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toSet())); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - - // Call the method readRecordKeysFromBaseFiles with the created partitionBaseFilePairs. 
- HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( - engineContext, - partitionFileSlicePairs, - false, - 1, - "activeModule", - metaClient, - EngineType.SPARK - ); - // Validate the result. - List records = result.collectAsList(); - assertEquals(30, records.size()); - assertEquals(MetadataPartitionType.RECORD_INDEX.getPartitionPath(), records.get(0).getPartitionPath()); - for (HoodieRecord record : records) { - assertTrue(recordKeys.contains(record.getRecordKey())); - } - } - - @Test - public void testGetLogFileColumnRangeMetadata() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant1 = "20230918120000000"; - - HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); - commitMetadata.addMetadata("test", "test"); - commitMetadata.setOperationType(WriteOperationType.INSERT); - commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS.toString()); - hoodieTestTable = hoodieTestTable.addCommit(instant1, Option.of(commitMetadata)); - String instant2 = "20230918121110000"; - hoodieTestTable = hoodieTestTable.addCommit(instant2); - List partitionInfoList = new ArrayList<>(); - List columnsToIndex = Arrays.asList("rider", "driver"); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. - DATE_PARTITIONS.forEach(p -> { - try { - URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); - StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); - String fileId1 = UUID.randomUUID().toString(); - // add only one parquet file in first file slice - FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); - StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); - writeParquetFile(instant1, storagePath1, dataGen.generateInsertsForPartition(instant1, 10, p), metaClient, engineContext); - HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); - fileSlice1.setBaseFile(baseFile1); - // add log file in second file slice with higher rider and driver values (which are concatenated with instant) - FileSlice fileSlice2 = new FileSlice(p, instant2, fileId1); - fileSlice2.setBaseFile(baseFile1); - StoragePath storagePath2 = new StoragePath(partitionMetadataPath.getParent(), hoodieTestTable.getLogFileNameById(fileId1, 1)); - writeLogFiles(new StoragePath(metaClient.getBasePath(), p), HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, dataGen.generateInsertsForPartition(instant2, 10, p), 1, - metaClient.getStorage(), new Properties(), fileId1, instant2); - fileSlice2.addLogFile(new HoodieLogFile(storagePath2.toUri().toString())); - partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( - p, - metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), - instant2, - Collections.emptySet())); - // NOTE: we need to set table config as we are not using write client explicitly and these configs are needed for log record reader - metaClient.getTableConfig().setValue(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); - metaClient.getTableConfig().setValue(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); - metaClient.getTableConfig().setValue(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); - List> columnRangeMetadataLogFile = HoodieTableMetadataUtil.getLogFileColumnRangeMetadata( - 
storagePath2.toString(), - metaClient, - columnsToIndex, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS), - HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue()); - // there must be two ranges for rider and driver - assertEquals(2, columnRangeMetadataLogFile.size()); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - // collect partition stats, this will collect stats for log files as well - HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( - engineContext, - partitionInfoList, - HoodieMetadataConfig.newBuilder().enable(true) - .withMetadataIndexColumnStats(true) - .withMetadataIndexPartitionStats(true) - .withColumnStatsIndexForColumns("rider,driver") - .withPartitionStatsIndexParallelism(1) - .build(), - metaClient, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); - // Validate the result. - validatePartitionStats(result, instant1, instant2); - } - - private static void validatePartitionStats(HoodieData result, String instant1, String instant2) { - List records = result.collectAsList(); - // 3 partitions * 2 columns = 6 partition stats records - assertEquals(6, records.size()); - assertEquals(MetadataPartitionType.PARTITION_STATS.getPartitionPath(), records.get(0).getPartitionPath()); - ((HoodieMetadataPayload) result.collectAsList().get(0).getData()).getColumnStatMetadata().get().getColumnName(); - records.forEach(r -> { - HoodieMetadataPayload payload = (HoodieMetadataPayload) r.getData(); - assertTrue(payload.getColumnStatMetadata().isPresent()); - // instant1 < instant2 so instant1 should be in the min value and instant2 should be in the max value. - if (payload.getColumnStatMetadata().get().getColumnName().equals("rider")) { - assertEquals(String.format("{\"value\": \"rider-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); - assertEquals(String.format("{\"value\": \"rider-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); - } else if (payload.getColumnStatMetadata().get().getColumnName().equals("driver")) { - assertEquals(String.format("{\"value\": \"driver-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); - assertEquals(String.format("{\"value\": \"driver-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); - } - }); - } - - private static void writeParquetFile(String instant, - StoragePath path, - List records, - HoodieTableMetaClient metaClient, - HoodieLocalEngineContext engineContext) throws IOException { - HoodieFileWriter writer = HoodieFileWriterFactory.getFileWriter( - instant, - path, - metaClient.getStorage(), - metaClient.getTableConfig(), - HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, - engineContext.getTaskContextSupplier(), - HoodieRecord.HoodieRecordType.AVRO); - for (HoodieRecord record : records) { - writer.writeWithMetadata(record.getKey(), record, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS); - } - writer.close(); - } -} From 5419e5d296645a50c33113a8acc4ca8f10dfe315 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:42:01 +0530 Subject: [PATCH 06/12] Fix TestHoodieMetadataPayload tests --- .../org/apache/hudi/metadata/TestHoodieMetadataPayload.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index 
e2c989c92f582..715fb25eb3c4f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -144,8 +144,6 @@ public void testColumnStatsPayloadMerging() throws IOException { deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); - assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); - assertTrue(deletedCombinedMetadataPayload.isDeleted()); // NOTE: In this case, proper incoming record will be overwriting previously deleted // record From 099f0237e5e6cc92cb5301fce54ec684a3e8f442 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:48:24 +0530 Subject: [PATCH 07/12] fix scala issues --- .../apache/hudi/functional/ColumnStatIndexTestBase.scala | 6 +++--- .../org/apache/hudi/functional/TestColumnStatsIndex.scala | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index ba29a4c36bf15..1b81141516731 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -124,7 +124,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) fsv.loadAllPartitions() val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) - val baseFilesList = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) + val baseFilesList = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) val baseFiles = baseFilesList.stream() .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala @@ -164,7 +164,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { baseFilesDf // COW table } else { - val allLogFiles = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllFileSlices) + val allLogFiles = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllFileSlices) .flatMap(fileSlice => fileSlice.getLogFiles) .collect(Collectors.toList[HoodieLogFile]) if (allLogFiles.isEmpty) { @@ -188,7 +188,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], maxBufferSize: Integer, indexSchema: StructType): DataFrame = { - val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]](logFile => { + val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]]((logFile: HoodieLogFile) => { try { getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) } catch { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 7d0bacf03bf77..d5dec40e0ad38 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -24,7 +24,7 @@ import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig, HoodieStorageConfig} -import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.model.{FileSlice, HoodieTableType} import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.{HoodieCleanConfig, HoodieCompactionConfig, HoodieWriteConfig} @@ -173,9 +173,9 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { fsv.loadAllPartitions() val basePath2 = new Path(basePath) val allPartitionPaths = fsv.getPartitionPaths - allPartitionPaths.forEach(partitionPath => { + allPartitionPaths.forEach((partitionPath: Path) => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.getLogFiles.findAny().isPresent).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).filter((fileSlice: FileSlice) => fileSlice.getLogFiles.findAny().isPresent).count() > 0) }) fsv.close() } From 74b57ec58ecfcc91914ddd172c834de5f823ef5f Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:50:58 +0530 Subject: [PATCH 08/12] Fix checkstyle --- .../org/apache/hudi/metadata/TestHoodieMetadataPayload.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index 715fb25eb3c4f..4f022d7e0dafb 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -34,8 +34,6 @@ import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; /** * Tests {@link HoodieMetadataPayload}. 
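The payload tests touched above pin down a tombstone-aware merge rule: in preCombine the newer record wins outright whenever either side is a delete (a newer tombstone erases the older stats, and a newer valid record overwrites an older tombstone), while two valid records have their column ranges widened and their counts summed. The following is a minimal sketch of that rule, not Hudi's actual HoodieMetadataPayload API; StatsPayload and its fields are hypothetical stand-ins kept to the handful of stats the tests assert on.

// Sketch of the tombstone-aware preCombine rule exercised by the tests above.
// StatsPayload is an illustrative stand-in, not the real HoodieMetadataPayload.
final class StatsPayload {
  final long minValue;
  final long maxValue;
  final long valueCount;
  final boolean isDeleted;

  StatsPayload(long minValue, long maxValue, long valueCount, boolean isDeleted) {
    this.minValue = minValue;
    this.maxValue = maxValue;
    this.valueCount = valueCount;
    this.isDeleted = isDeleted;
  }

  // 'this' is the newer record for a key; 'older' is its previous state.
  StatsPayload preCombine(StatsPayload older) {
    if (this.isDeleted || older.isDeleted) {
      // The newer side wins: a tombstone erases older stats, and a fresh
      // valid record overwrites a previously deleted one.
      return this;
    }
    // Both sides valid: widen the value range and accumulate the counts.
    return new StatsPayload(
        Math.min(this.minValue, older.minValue),
        Math.max(this.maxValue, older.maxValue),
        this.valueCount + older.valueCount,
        false);
  }
}

Applied to testColumnStatsPayloadMerging, merging the appended block's range [0, 500] with value count 100 into the original range [0, 1000] with value count 1000 yields [0, 1000] with value count 1100, which is exactly the expected record the test builds.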
From d80a4b6c8a47f3f1aa8040e803ac29d0f76c00a7 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Sun, 15 Mar 2026 17:49:41 +0530 Subject: [PATCH 09/12] fix CI --- .../functional/ColumnStatIndexTestBase.scala | 19 ++++++++++--------- .../functional/TestColumnStatsIndex.scala | 4 ++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index 1b81141516731..cc9a2b8551a70 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -124,9 +124,9 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) fsv.loadAllPartitions() val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) - val baseFilesList = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) - val baseFiles = baseFilesList.stream() - .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala + val baseFiles = filegroupList.asScala + .flatMap(fileGroup => fileGroup.getAllBaseFiles.iterator().asScala) + .map(baseFile => new Path(baseFile.getPath)) val baseFilesDf = spark.createDataFrame( baseFiles.flatMap(file => { @@ -164,9 +164,10 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { baseFilesDf // COW table } else { - val allLogFiles = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllFileSlices) - .flatMap(fileSlice => fileSlice.getLogFiles) - .collect(Collectors.toList[HoodieLogFile]) + val allLogFiles = filegroupList.asScala + .flatMap(fileGroup => fileGroup.getAllFileSlices.iterator().asScala) + .flatMap(fileSlice => fileSlice.getLogFiles.iterator().asScala) + .toList.asJava if (allLogFiles.isEmpty) { baseFilesDf // MOR table, but no log files. 
} else { @@ -174,7 +175,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val writerSchemaOpt = LogFileColStatsTestUtil.getSchemaForTable(metaClient) val latestCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get().getTimestamp baseFilesDf.union(getColStatsFromLogFiles(allLogFiles, latestCompletedCommit, - scala.collection.JavaConverters.seqAsJavaList(colsToGenerateStats), + colsToGenerateStats.asJava, metaClient, writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue(), @@ -188,14 +189,14 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], maxBufferSize: Integer, indexSchema: StructType): DataFrame = { - val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]]((logFile: HoodieLogFile) => { + val colStatsEntries = logFiles.asScala.map(logFile => { try { getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) } catch { case e: Exception => throw e } - }).filter(rowOpt => rowOpt.isPresent).map[Row](rowOpt => rowOpt.get()).collect(Collectors.toList[Row]) + }).filter(rowOpt => rowOpt.isPresent).map(rowOpt => rowOpt.get()).toList.asJava spark.createDataFrame(colStatsEntries, indexSchema) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index d5dec40e0ad38..bb8bf8856eed3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -173,9 +173,9 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { fsv.loadAllPartitions() val basePath2 = new Path(basePath) val allPartitionPaths = fsv.getPartitionPaths - allPartitionPaths.forEach((partitionPath: Path) => { + allPartitionPaths.asScala.foreach(partitionPath => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter((fileSlice: FileSlice) => fileSlice.getLogFiles.findAny().isPresent).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).iterator().asScala.count(fileSlice => fileSlice.getLogFiles.findAny().isPresent) > 0) }) fsv.close() } From 82913a25f4b2180142e47799e5d60a5c34145248 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Tue, 17 Mar 2026 15:08:59 +0530 Subject: [PATCH 10/12] Fix tests in TestColumnStatsIndex --- .../HoodieBackedTableMetadataWriter.java | 62 +++++++++---------- .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../hudi/metadata/HoodieMetadataPayload.java | 18 +++--- .../metadata/HoodieTableMetadataUtil.java | 52 +++++++++++++++- 4 files changed, 92 insertions(+), 42 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 2f1ab37bf52b6..5d53fdb0f7650 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -341,9 +341,7 @@ private 
boolean isBootstrapNeeded(Option latestMetadataInstant) { */ private void initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { - if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { - return; - } + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -368,11 +366,11 @@ private void initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = listAllPartitionsFromMDT(initializationTime); + partitionInfoList = listAllPartitionsFromMDT(initializationTime, pendingDataInstants); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { - partitionInfoList = listAllPartitionsFromFilesystem(initializationTime); + partitionInfoList = listAllPartitionsFromFilesystem(initializationTime, pendingDataInstants); } else { // if auto initialization is disabled, we can return an empty list partitionInfoList = Collections.emptyList(); @@ -545,22 +543,14 @@ private Pair> initializeFilesPartition(List inflightInstantTimestamp) { - // We can only initialize if there are no pending operations on the dataset - List pendingDataInstant = dataMetaClient.getActiveTimeline() + private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { + // Initialize excluding the pending operations on the dataset + return dataMetaClient.getActiveTimeline() .getInstantsAsStream().filter(i -> !i.isCompleted()) - .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) // regular writers should not be blocked due to pending indexing action .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) - .collect(Collectors.toList()); - - if (!pendingDataInstant.isEmpty()) { - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); - LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " - + Arrays.toString(pendingDataInstant.toArray())); - return true; - } - return false; + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); } private HoodieTableMetaClient initializeMetaClient() throws IOException { @@ -582,7 +572,7 @@ private HoodieTableMetaClient initializeMetaClient() throws IOException { * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. 
*/ - private List listAllPartitionsFromFilesystem(String initializationTime) { + private List listAllPartitionsFromFilesystem(String initializationTime, Set pendingDataInstants) { List pathsToList = new LinkedList<>(); pathsToList.add(new SerializablePath(new CachingPath(dataWriteConfig.getBasePath()))); @@ -601,7 +591,7 @@ private List listAllPartitionsFromFilesystem(String initializatio List processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> { FileSystem fs = path.get().getFileSystem(conf.get()); String relativeDirPath = FSUtils.getRelativePartitionPath(serializableBasePath.get(), path.get()); - return new DirectoryInfo(relativeDirPath, fs.listStatus(path.get()), initializationTime); + return new DirectoryInfo(relativeDirPath, fs.listStatus(path.get()), initializationTime, pendingDataInstants); }, numDirsToList); pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size())); @@ -638,14 +628,14 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. */ - private List listAllPartitionsFromMDT(String initializationTime) throws IOException { + private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { List dirinfoList = new LinkedList<>(); List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); for (Map.Entry entry : partitionFileMap.entrySet()) { String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); - dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); + dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime, pendingDataInstants, false)); } return dirinfoList; } @@ -930,7 +920,7 @@ public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { // Restore requires the existing pipelines to be shutdown. So we can safely scan the dataset to find the current // list of files in the filesystem. - List dirInfoList = listAllPartitionsFromFilesystem(instantTime); + List dirInfoList = listAllPartitionsFromFilesystem(instantTime, Collections.emptySet()); Map dirInfoMap = dirInfoList.stream().collect(Collectors.toMap(DirectoryInfo::getRelativePath, Function.identity())); dirInfoList.clear(); @@ -1489,29 +1479,39 @@ static class DirectoryInfo implements Serializable { // Is this a hoodie partition private boolean isHoodiePartition = false; - public DirectoryInfo(String relativePath, FileStatus[] fileStatus, String maxInstantTime) { + public DirectoryInfo(String relativePath, FileStatus[] fileStatuses, String maxInstantTime, Set pendingDataInstants) { + this(relativePath, fileStatuses, maxInstantTime, pendingDataInstants, true); + } + + /** + * When files are directly fetched from Metadata table we do not need to validate HoodiePartitions. 
+ */ + public DirectoryInfo(String relativePath, FileStatus[] fileStatus, String maxInstantTime, Set pendingDataInstants, + boolean validateHoodiePartitions) { this.relativePath = relativePath; // Pre-allocate with the maximum length possible filenameToSizeMap = new HashMap<>(fileStatus.length); + // Presence of partition meta file implies this is a HUDI partition + // if input files are directly fetched from MDT, it may not contain the HoodiePartitionMetadata file. So, we can ignore the validation for isHoodiePartition. + isHoodiePartition = !validateHoodiePartitions || Arrays.stream(fileStatus).anyMatch(status -> status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)); for (FileStatus status : fileStatus) { - if (status.isDirectory()) { + // Do not attempt to search for more subdirectories inside directories that are partitions + if (!isHoodiePartition && status.isDirectory()) { // Ignore .hoodie directory as there cannot be any partitions inside it if (!status.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) { this.subDirectories.add(status.getPath()); } - } else if (status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) { - // Presence of partition meta file implies this is a HUDI partition - this.isHoodiePartition = true; - } else if (FSUtils.isDataFile(status.getPath())) { + } else if (isHoodiePartition && FSUtils.isDataFile(status.getPath())) { // Regular HUDI data file (base file or log file) String dataFileCommitTime = FSUtils.getCommitTime(status.getPath().getName()); - // Limit the file listings to files which were created before the maxInstant time. - if (HoodieTimeline.compareTimestamps(dataFileCommitTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, maxInstantTime)) { + // Limit the file listings to files which were created by successful commits before the maxInstant time. + if (!pendingDataInstants.contains(dataFileCommitTime) && HoodieTimeline.compareTimestamps(dataFileCommitTime, LESSER_THAN_OR_EQUALS, maxInstantTime)) { filenameToSizeMap.put(status.getPath().getName(), status.getLen()); } } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index f62ec0febd578..032aac8574eb1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception @Override protected void processNextDeletedRecord(DeleteRecord deleteRecord) { - throw new IllegalStateException("Not expected to see delete records in this log-scan mode. 
Check Job Config"); + // no op } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 8b637be447f0c..8de6e4e2f4a49 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -265,32 +265,34 @@ public HoodieMetadataPayload(Option recordOpt) { } private HoodieMetadataPayload(String key, int type, Map filesystemMetadata) { - this(key, type, filesystemMetadata, null, null, null); + this(key, type, filesystemMetadata, null, null, null, false); } private HoodieMetadataPayload(String key, HoodieMetadataBloomFilter metadataBloomFilter) { - this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null, null); + this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null, null, metadataBloomFilter.getIsDeleted()); } private HoodieMetadataPayload(String key, HoodieMetadataColumnStats columnStats) { - this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats, null); + this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats, null, columnStats.getIsDeleted()); } private HoodieMetadataPayload(String key, HoodieRecordIndexInfo recordIndexMetadata) { - this(key, METADATA_TYPE_RECORD_INDEX, null, null, null, recordIndexMetadata); + this(key, METADATA_TYPE_RECORD_INDEX, null, null, null, recordIndexMetadata, false); } protected HoodieMetadataPayload(String key, int type, - Map filesystemMetadata, - HoodieMetadataBloomFilter metadataBloomFilter, - HoodieMetadataColumnStats columnStats, - HoodieRecordIndexInfo recordIndexMetadata) { + Map filesystemMetadata, + HoodieMetadataBloomFilter metadataBloomFilter, + HoodieMetadataColumnStats columnStats, + HoodieRecordIndexInfo recordIndexMetadata, + boolean isDeletedRecord) { this.key = key; this.type = type; this.filesystemMetadata = filesystemMetadata; this.bloomFilterMetadata = metadataBloomFilter; this.columnStatMetadata = columnStats; this.recordIndexMetadata = recordIndexMetadata; + this.isDeletedRecord = isDeletedRecord; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 6c1de68e043cc..5f87b8e582435 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -50,6 +50,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -62,6 +63,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -100,6 +102,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; 
import java.util.UUID; import java.util.function.BiFunction; @@ -1123,7 +1126,7 @@ private static Stream getColumnStatsRecords(String partitionPath, } List> columnRangeMetadata = - readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } @@ -1131,13 +1134,18 @@ private static Stream getColumnStatsRecords(String partitionPath, private static List> readColumnRangeMetadataFrom(String partitionPath, String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex) { + List columnsToIndex, + int maxBufferSize) { String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? fileName : partitionPath + "/" + fileName; try { Path fullFilePath = new Path(datasetMetaClient.getBasePathV2(), partitionPathFileName); if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); + } else if (FSUtils.isLogFile(fileName)) { + Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); + LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); + return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } LOG.warn("Column range index not supported for: {}", partitionPathFileName); @@ -1150,6 +1158,46 @@ private static List> readColumnRangeMetada } } + /** + * Read column range metadata from log file. + */ + @VisibleForTesting + protected static List> getLogFileColumnRangeMetadata(String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + // read log file records without merging + List records = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withFileSystem(datasetMetaClient.getFs()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(records::add) + .build(); + scanner.scan(); + if (records.isEmpty()) { + return Collections.emptyList(); + } + + List indexedRecords = new ArrayList<>(); + for (HoodieRecord hoodieRecord : records) { + indexedRecords.add(hoodieRecord.toIndexedRecord(writerSchemaOpt.get(), new Properties()).get().getData()); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(indexedRecords, fieldsToIndex, getFileNameFromPath(filePath)); + return new ArrayList<>(columnRangeMetadataMap.values()); + } + return Collections.emptyList(); + } + /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type From b27240d78ef509c437ce9a087b58ae88be898fbd Mon Sep 17 00:00:00 2001 From: Vamsi Date: Wed, 18 Mar 
2026 15:22:24 +0530 Subject: [PATCH 11/12] Fix MDT Bootstrap tests --- .../HoodieBackedTableMetadataWriter.java | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 5d53fdb0f7650..bbd7427171b55 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -266,7 +266,11 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp); + if(!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { + LOG.error("Failed to initialize MDT from filesystem"); + return false; + } + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); return true; } catch (IOException e) { @@ -339,10 +343,13 @@ private boolean isBootstrapNeeded(Option latestMetadataInstant) { * @param partitionsToInit - List of MDT partitions to initialize * @param inflightInstantTimestamp - Current action instant responsible for this initialization */ - private void initializeFromFilesystem(String initializationTime, List partitionsToInit, + private boolean initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { - Set pendingDataInstants = getPendingDataInstants(dataMetaClient); + if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { + return false; + } + Set pendingDataInstants = inflightInstantTimestamp.map(Collections::singleton).orElse(Collections.emptySet()); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); if (!filesPartitionAvailable) { @@ -434,6 +441,8 @@ private void initializeFromFilesystem(String initializationTime, List getPendingDataInstants(HoodieTableMetaClient dataMetaClient) .collect(Collectors.toSet()); } + private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Option inflightInstantTimestamp) { + // We can only initialize if there are no pending operations on the dataset + List pendingDataInstant = dataMetaClient.getActiveTimeline() + .getInstantsAsStream().filter(i -> !i.isCompleted()) + .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) + .collect(Collectors.toList()); + + if (!pendingDataInstant.isEmpty()) { + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); + LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " + + 
Arrays.toString(pendingDataInstant.toArray())); + return true; + } + return false; + } + private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) From 6abac329f76d21a5b8b7afaa6e7888d26fb8dbc6 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Tue, 24 Mar 2026 21:17:35 +0530 Subject: [PATCH 12/12] Fix checkstyle and tests --- .../apache/hudi/metadata/HoodieBackedTableMetadataWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 0a0473c8052a6..139d661b7e2e9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -266,7 +266,7 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - if(!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { + if (!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { LOG.error("Failed to initialize MDT from filesystem"); return false; } @@ -349,7 +349,7 @@ private boolean initializeFromFilesystem(String initializationTime, List pendingDataInstants = inflightInstantTimestamp.map(Collections::singleton).orElse(Collections.emptySet()); + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); if (!filesPartitionAvailable) {
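Taken together, the last three patches keep the pending-instant guard for MDT initialization but also thread the set of pending data instant timestamps down to DirectoryInfo, so files written by in-flight commits are skipped file-by-file during the initial listing. Below is a compact sketch of that predicate under illustrative names; the real check lives in HoodieBackedTableMetadataWriter.DirectoryInfo and uses FSUtils.getCommitTime and HoodieTimeline.compareTimestamps.

import java.util.Set;

// Sketch of the per-file filter applied while listing partitions for MDT
// initialization. Names are illustrative stand-ins for the patched logic.
final class InitialListingFilter {

  // fileCommitTime stands in for FSUtils.getCommitTime(fileName). Hudi
  // instant times are fixed-width timestamps, so plain string comparison
  // mirrors compareTimestamps with LESSER_THAN_OR_EQUALS.
  static boolean includeFile(String fileCommitTime,
                             Set<String> pendingDataInstants,
                             String maxInstantTime) {
    return !pendingDataInstants.contains(fileCommitTime)
        && fileCommitTime.compareTo(maxInstantTime) <= 0;
  }
}

For instance, an in-flight commit can carry a timestamp that sorts before the latest completed instant, so its files would pass the timestamp check alone; the pending-set lookup is what keeps them out of the initial listing.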