From c97664fe02c2d862286e79e921d983ee02063b45 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 16:37:27 +0530 Subject: [PATCH 01/12] [HUDI-8371] Fixing column stats index with MDT for few scenarios --- .../HoodieBackedTableMetadataWriter.java | 54 ++- .../testutils/LogFileColStatsTestUtil.java | 96 ++++ .../org/apache/hudi/common/fs/FSUtils.java | 8 + .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../metadata/HoodieTableMetadataUtil.java | 155 ++++-- .../apache/hudi/common/fs/TestFSUtils.java | 4 + .../metadata/TestHoodieMetadataPayload.java | 295 ++++++++++++ .../metadata/TestHoodieTableMetadataUtil.java | 315 ++++++++++++ ...ap-rollback1-column-stats-index-table.json | 2 + ...w-bootstrap1-column-stats-index-table.json | 4 + ...w-bootstrap2-column-stats-index-table.json | 5 + .../cow-clean1-column-stats-index-table.json | 2 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 1 + ...ap-rollback1-column-stats-index-table.json | 2 + ...r-bootstrap1-column-stats-index-table.json | 3 + ...r-bootstrap2-column-stats-index-table.json | 5 + .../mor-clean1-column-stats-index-table.json | 2 + ...elete-block1-column-stats-index-table.json | 3 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 10 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 5 + ...0484-e7e1-48b6-8289-1a7c483b530b-c000.json | 5 + .../functional/ColumnStatIndexTestBase.scala | 164 +++++-- .../functional/TestColumnStatsIndex.scala | 450 +++++++++++++++++- .../TestColumnStatsIndexWithSQL.scala | 26 +- 24 files changed, 1493 insertions(+), 125 deletions(-) create mode 100644 hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java create mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java create mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 
hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json create mode 100644 hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index d6e7a8f626ebe..68b02ad6d39ba 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -237,13 +237,13 @@ public List getEnabledPartitionTypes() { protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, Option inflightInstantTimestamp) throws IOException { HoodieTimer timer = HoodieTimer.start(); - List partitionsToInit = new ArrayList<>(MetadataPartitionType.values().length); + List metadataPartitionsToInit = new ArrayList<>(MetadataPartitionType.values().length); try { boolean exists = metadataTableExists(dataMetaClient); if (!exists) { // FILES partition is always required - partitionsToInit.add(MetadataPartitionType.FILES); + metadataPartitionsToInit.add(MetadataPartitionType.FILES); } // check if any of the enabled partition types needs to be initialized @@ -253,10 +253,10 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, LOG.info("Async metadata indexing disabled and following partitions already initialized: " + completedPartitions); this.enabledPartitionTypes.stream() .filter(p -> !completedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) - .forEach(partitionsToInit::add); + .forEach(metadataPartitionsToInit::add); } - if (partitionsToInit.isEmpty()) { + if (metadataPartitionsToInit.isEmpty()) { // No partitions left to initialize, since all the metadata enabled partitions are either initialized before // or current in the process of initialization. initMetadataReader(); @@ -266,13 +266,7 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. 
String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - - // Initialize partitions for the first time using data from the files on the file system - if (!initializeFromFilesystem(initializationTime, partitionsToInit, inflightInstantTimestamp)) { - LOG.error("Failed to initialize MDT from filesystem"); - return false; - } - + initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); return true; } catch (IOException e) { @@ -345,11 +339,12 @@ private boolean isBootstrapNeeded(Option latestMetadataInstant) { * @param partitionsToInit - List of MDT partitions to initialize * @param inflightInstantTimestamp - Current action instant responsible for this initialization */ - private boolean initializeFromFilesystem(String initializationTime, List partitionsToInit, + private void initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { - return false; + return; } + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -374,7 +369,7 @@ private boolean initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = listAllPartitionsFromMDT(initializationTime); + partitionInfoList = listAllPartitionsFromMDT(initializationTime, pendingDataInstants); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { @@ -424,8 +419,7 @@ private boolean initializeFromFilesystem(String initializationTime, List> initializeColumnStatsPartition(Map> partitionToFilesMap) { + // during initialization, we need stats for base and log files. 
HoodieData records = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords( - engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams()); + engineContext, Collections.emptyMap(), partitionToFilesMap, dataMetaClient, dataWriteConfig.isMetadataColumnStatsIndexEnabled(), + dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), + dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); final int fileGroupCount = dataWriteConfig.getMetadataConfig().getColumnStatsIndexFileGroupCount(); return Pair.of(fileGroupCount, records); @@ -571,6 +566,16 @@ private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Opti return false; } + private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { + // Initialize excluding the pending operations on the dataset + return dataMetaClient.getActiveTimeline() + .getInstantsAsStream().filter(i -> !i.isCompleted()) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); + } + private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) @@ -646,13 +651,14 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. */ - private List listAllPartitionsFromMDT(String initializationTime) throws IOException { - List dirinfoList = new LinkedList<>(); - List allPartitionPaths = metadata.getAllPartitionPaths().stream() + private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { + List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); - Map partitionFileMap = metadata.getAllFilesInPartitions(allPartitionPaths); + Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); + List dirinfoList = new ArrayList<>(partitionFileMap.size()); for (Map.Entry entry : partitionFileMap.entrySet()) { - dirinfoList.add(new DirectoryInfo(entry.getKey(), entry.getValue(), initializationTime)); + String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); + dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); } return dirinfoList; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java new file mode 100644 index 0000000000000..464ad5ddca1e4 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; + +import org.apache.avro.Schema; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.expressions.GenericRow; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.collectColumnRangeMetadata; + +/** + * Util methods used in tests to fetch col stats records for a log file. + */ +public class LogFileColStatsTestUtil { + + public static Option getLogFileColumnRangeMetadata(String filePath, HoodieTableMetaClient datasetMetaClient, String latestCommitTime, + List columnsToIndex, Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + List records = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withStorage(datasetMetaClient.getStorage()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(latestCommitTime) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(records::add) + .build(); + scanner.scan(); + if (records.isEmpty()) { + return Option.empty(); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(records, fieldsToIndex, filePath, writerSchemaOpt.get()); + List> columnRangeMetadataList = new ArrayList<>(columnRangeMetadataMap.values()); + return Option.of(getColStatsEntry(filePath, columnRangeMetadataList)); + } else { + throw new HoodieException("Writer schema needs to be set"); + } + } + + private static Row getColStatsEntry(String logFilePath, List> columnRangeMetadataList) { + Collections.sort(columnRangeMetadataList, (o1, o2) -> o1.getColumnName().compareTo(o2.getColumnName())); + Object[] values = new Object[(columnRangeMetadataList.size() * 3) + 2]; + values[0] = logFilePath.substring(logFilePath.lastIndexOf("/") + 1); + values[1] = columnRangeMetadataList.get(0).getValueCount(); + int counter = 2; + for (HoodieColumnRangeMetadata columnRangeMetadata: columnRangeMetadataList) { + values[counter++] = columnRangeMetadata.getValueCount(); + values[counter++] = columnRangeMetadata.getMinValue(); + values[counter++] = columnRangeMetadata.getMaxValue(); + } + return new GenericRow(values); + } + + public static Option getSchemaForTable(HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver schemaResolver = 
new TableSchemaResolver(metaClient);
+    return Option.of(schemaResolver.getTableAvroSchema());
+  }
+}
+
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
index 91c966d00a2bd..1e834e5a06dba 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
@@ -211,6 +211,14 @@ public static String getFileId(String fullFileName) {
     return fullFileName.split("_", 2)[0];
   }
 
+  /**
+   * @param filePath file path, which can be either an absolute path or just the partition path plus the file name
+   * @return the file name from the given path
+   */
+  public static String getFileNameFromPath(String filePath) {
+    return filePath.substring(filePath.lastIndexOf("/") + 1);
+  }
+
   /**
    * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
    */
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
index f62ec0febd578..99fe6c1ff54f2 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
@@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception
 
   @Override
   protected void processNextDeletedRecord(DeleteRecord deleteRecord) {
-    throw new IllegalStateException("Not expected to see delete records in this log-scan mode. Check Job Config");
+    // no-op
   }
 
   /**
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index 62b0232583293..f4ba94136b9c9 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -44,12 +44,14 @@
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
+import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
+import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -72,6 +74,7 @@
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.util.Lazy;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.avro.AvroTypeException;
 import org.apache.avro.LogicalTypes;
 import org.apache.avro.Schema;
@@ -119,6 +122,7 @@
 import static org.apache.hudi.common.config.HoodieCommonConfig.MAX_MEMORY_FOR_COMPACTION;
 import static org.apache.hudi.common.config.HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE;
 import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.MILLIS_INSTANT_ID_LENGTH;
+import static 
org.apache.hudi.common.fs.FSUtils.getFileNameFromPath; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; import static org.apache.hudi.common.util.ValidationUtils.checkState; import static org.apache.hudi.metadata.HoodieMetadataPayload.RECORD_INDEX_MISSING_FILEINDEX_FALLBACK; @@ -645,12 +649,8 @@ public static HoodieData convertMetadataToColumnStatsRecords(Hoodi return engineContext.parallelize(deleteFileList, parallelism) .flatMap(deleteFileInfoPair -> { String partitionPath = deleteFileInfoPair.getLeft(); - String filePath = deleteFileInfoPair.getRight(); - - if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - return getColumnStatsRecords(partitionPath, filePath, dataTableMetaClient, columnsToIndex, true).iterator(); - } - return Collections.emptyListIterator(); + String fileName = deleteFileInfoPair.getRight(); + return getColumnStatsRecords(partitionPath, fileName, dataTableMetaClient, columnsToIndex, true).iterator(); }); } @@ -858,12 +858,18 @@ public static HoodieData convertFilesToBloomFilterRecords(HoodieEn public static HoodieData convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, - MetadataRecordsGenerationParams recordsGenerationParams) { + HoodieTableMetaClient dataMetaClient, + boolean isColumnStatsIndexEnabled, + int columnStatsIndexParallelism, + List targetColumnsForColumnStatsIndex, + int maxReaderBufferSize) { + if (!isColumnStatsIndexEnabled) { + return engineContext.emptyHoodieData(); + } // Find the columns to index - HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); final List columnsToIndex = - getColumnsToIndex(recordsGenerationParams, - Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); + getColumnsToIndex(true, targetColumnsForColumnStatsIndex, + Lazy.lazily(() -> tryResolveSchemaForTable(dataMetaClient))); if (columnsToIndex.isEmpty()) { // In case there are no columns to index, bail return engineContext.emptyHoodieData(); @@ -875,18 +881,12 @@ public static HoodieData convertFilesToColumnStatsRecords(HoodieEn final List> partitionFileFlagTupleList = fetchPartitionFileInfoTriplets(partitionToDeletedFiles, partitionToAppendedFiles); // Create records MDT - int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), columnStatsIndexParallelism), 1); return engineContext.parallelize(partitionFileFlagTupleList, parallelism).flatMap(partitionFileFlagTuple -> { - final String partitionName = partitionFileFlagTuple.f0; + final String partitionPath = partitionFileFlagTuple.f0; final String filename = partitionFileFlagTuple.f1; final boolean isDeleted = partitionFileFlagTuple.f2; - if (!FSUtils.isBaseFile(new Path(filename)) || !filename.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - LOG.warn(String.format("Ignoring file %s as it is not a PARQUET file", filename)); - return Stream.empty().iterator(); - } - - final String filePathWithPartition = partitionName + "/" + filename; - return getColumnStatsRecords(partitionName, filePathWithPartition, dataTableMetaClient, columnsToIndex, isDeleted).iterator(); + return getColumnStatsRecords(partitionPath, filename, dataMetaClient, columnsToIndex, isDeleted, maxReaderBufferSize).iterator(); }); } @@ -1095,6 +1095,27 @@ private static List getColumnsToIndex(MetadataRecordsGenerationParams re 
.orElse(Collections.emptyList()); } + /** + * Get the list of columns for the table for column stats indexing + */ + private static List getColumnsToIndex(boolean isColumnStatsIndexEnabled, + List targetColumnsForColumnStatsIndex, + Lazy> lazyWriterSchemaOpt) { + checkState(isColumnStatsIndexEnabled); + + if (!targetColumnsForColumnStatsIndex.isEmpty()) { + return targetColumnsForColumnStatsIndex; + } + + Option writerSchemaOpt = lazyWriterSchemaOpt.get(); + return writerSchemaOpt + .map(writerSchema -> + writerSchema.getFields().stream() + .map(Schema.Field::name) + .collect(Collectors.toList())) + .orElse(Collections.emptyList()); + } + private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, HoodieTableMetaClient datasetMetaClient, List columnsToIndex) { @@ -1104,54 +1125,114 @@ private static Stream translateWriteStatToColumnStats(HoodieWriteS return HoodieMetadataPayload.createColumnStatsRecords(writeStat.getPartitionPath(), columnRangeMetadataList, false); } - return getColumnStatsRecords(writeStat.getPartitionPath(), writeStat.getPath(), datasetMetaClient, columnsToIndex, false); + String filePath = writeStat.getPath(); + return getColumnStatsRecords(writeStat.getPartitionPath(), getFileNameFromPath(filePath), datasetMetaClient, columnsToIndex, false); } private static Stream getColumnStatsRecords(String partitionPath, - String filePath, + String fileName, HoodieTableMetaClient datasetMetaClient, List columnsToIndex, boolean isDeleted) { - String filePartitionPath = filePath.startsWith("/") ? filePath.substring(1) : filePath; - String fileName = FSUtils.getFileName(filePath, partitionPath); + return getColumnStatsRecords(partitionPath, fileName, datasetMetaClient, columnsToIndex, isDeleted, -1); + } + + private static Stream getColumnStatsRecords(String partitionPath, + String fileName, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + boolean isDeleted, + int maxBufferSize) { if (isDeleted) { - // TODO we should delete records instead of stubbing them List> columnRangeMetadataList = columnsToIndex.stream() .map(entry -> HoodieColumnRangeMetadata.stub(fileName, entry)) .collect(Collectors.toList()); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, true); } - List> columnRangeMetadata = - readColumnRangeMetadataFrom(filePartitionPath, datasetMetaClient, columnsToIndex); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } - private static List> readColumnRangeMetadataFrom(String filePath, + private static List> readColumnRangeMetadataFrom(String partitionPath, + String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex) { + List columnsToIndex, + int maxBufferSize) { + String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? 
fileName + : partitionPath + "/" + fileName; try { - if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePath); - List> columnRangeMetadataList = - new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); - - return columnRangeMetadataList; + Path fullFilePath = new Path(datasetMetaClient.getBasePath(), partitionPathFileName); + if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); + } else if (FSUtils.isLogFile(fileName)) { + Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); + LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); + return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } - - LOG.warn("Column range index not supported for: " + filePath); + LOG.warn("Column range index not supported for: {}", partitionPathFileName); return Collections.emptyList(); } catch (Exception e) { // NOTE: In case reading column range metadata from individual file failed, // we simply fall back, in lieu of failing the whole task - LOG.error("Failed to fetch column range metadata for: " + filePath); + LOG.error("Failed to fetch column range metadata for: {}", partitionPathFileName); return Collections.emptyList(); } } + /** + * Read column range metadata from log file. + */ + @VisibleForTesting + protected static List> getLogFileColumnRangeMetadata(String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + // read log file records without merging + List hoodieRecords = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withFileSystem(datasetMetaClient.getFs()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(hoodieRecords::add) + .build(); + scanner.scan(); + if (hoodieRecords.isEmpty()) { + return Collections.emptyList(); + } + // Extract IndexedRecord from HoodieRecord to use with existing collectColumnRangeMetadata + List records = new ArrayList<>(); + for (HoodieRecord hoodieRecord : hoodieRecords) { + try { + Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); + if (insertValue.isPresent()) { + records.add(insertValue.get()); + } + } catch (IOException e) { + LOG.warn("Failed to get insert value for record: {}", e.getMessage()); + } + } + if (records.isEmpty()) { + return Collections.emptyList(); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(records, fieldsToIndex, getFileNameFromPath(filePath)); + return new ArrayList<>(columnRangeMetadataMap.values()); + } + return Collections.emptyList(); + } + /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision 
expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index 250304c7fd0ed..7e47274ab2cbe 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -251,6 +251,10 @@ public void tesLogFileName() { assertEquals(1, FSUtils.getTaskPartitionIdFromLogPath(rlPath)); assertEquals(0, FSUtils.getStageIdFromLogPath(rlPath)); assertEquals(1, FSUtils.getTaskAttemptIdFromLogPath(rlPath)); + + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/path/" + logFile)); + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/abc/def/path/" + logFile)); + assertEquals(logFile, FSUtils.getFileNameFromPath("/tmp/" + logFile)); } @Test diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java new file mode 100644 index 0000000000000..ce2cae78342c8 --- /dev/null +++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests {@link HoodieMetadataPayload}. 
+ */ +public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + public static final String PARTITION_NAME = "2022/10/01"; + public static final String PARTITION_NAME2 = "2023/10/01"; + public static final String PARTITION_NAME3 = "2024/10/01"; + + @Test + public void testFileSystemMetadataPayloadMerging() { + Map firstCommitAddedFiles = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L) + ); + + HoodieRecord firstPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); + + Map secondCommitAddedFiles = createImmutableMap( + // NOTE: This is an append + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ); + + List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); + + HoodieRecord secondPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); + + HoodieMetadataPayload combinedPartitionFilesRecordPayload = + secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); + + HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ), + Collections.emptyList() + ).getData(); + + assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); + } + + @Test + public void testFileSystemMetadataPayloadMergingWithDeletions() { + Map addedFileMap = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L), + Pair.of("file4.parquet", 4000L) + ); + HoodieRecord additionRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, addedFileMap, Collections.emptyList()); + + List deletedFileList1 = new ArrayList<>(); + deletedFileList1.add("file1.parquet"); + deletedFileList1.add("file3.parquet"); + HoodieRecord deletionRecord1 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList1); + + List deletedFileList2 = new ArrayList<>(); + deletedFileList2.add("file1.parquet"); + deletedFileList2.add("file4.parquet"); + HoodieRecord deletionRecord2 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList2); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file4.parquet", 4000L) + ), + Collections.emptyList() + ).getData(), + deletionRecord1.getData().preCombine(additionRecord.getData()) + ); + + List expectedDeleteFileList = new ArrayList<>(); + expectedDeleteFileList.add("file1.parquet"); + expectedDeleteFileList.add("file3.parquet"); + expectedDeleteFileList.add("file4.parquet"); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + Collections.emptyMap(), + expectedDeleteFileList + ).getData(), + deletionRecord2.getData().preCombine(deletionRecord1.getData()) + ); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L) + ), + Collections.emptyList() + ).getData(), + 
deletionRecord2.getData().preCombine(deletionRecord1.getData()).preCombine(additionRecord.getData())
+    );
+
+    assertEquals(
+        HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME,
+            createImmutableMap(
+                Pair.of("file2.parquet", 2000L)
+            ),
+            Collections.singletonList("file1.parquet")
+        ).getData(),
+        deletionRecord2.getData().preCombine(deletionRecord1.getData().preCombine(additionRecord.getData()))
+    );
+
+    // let's delete all files
+    List<String> allDeletedFileList = new ArrayList<>();
+    allDeletedFileList.add("file1.parquet");
+    allDeletedFileList.add("file2.parquet");
+    allDeletedFileList.add("file3.parquet");
+    allDeletedFileList.add("file4.parquet");
+    HoodieRecord<HoodieMetadataPayload> allDeletionRecord =
+        HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), allDeletedFileList);
+
+    HoodieMetadataPayload combinedPayload = allDeletionRecord.getData().preCombine(additionRecord.getData());
+    assertEquals(HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), Collections.emptyList()).getData(), combinedPayload);
+    assertTrue(combinedPayload.filesystemMetadata.isEmpty());
+
+    // test the all-partitions record
+    HoodieRecord<HoodieMetadataPayload> allPartitionsRecord = HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME, PARTITION_NAME2, PARTITION_NAME3), false);
+    HoodieRecord<HoodieMetadataPayload> partitionDeletedRecord = HoodieMetadataPayload.createPartitionListRecord(Collections.singletonList(PARTITION_NAME), true);
+    // combine to ensure the deleted partition is not seen
+    HoodieMetadataPayload payload = partitionDeletedRecord.getData().preCombine(allPartitionsRecord.getData());
+    assertEquals(HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME2, PARTITION_NAME3), false).getData(),
+        payload);
+  }
+
+  @Test
+  public void testColumnStatsPayloadMerging() throws IOException {
+    String fileName = "file.parquet";
+    String targetColName = "c1";
+
+    HoodieColumnRangeMetadata<Comparable> c1Metadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456);
+
+    HoodieRecord<HoodieMetadataPayload> columnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false)
+            .findFirst().get();
+
+    ////////////////////////////////////////////////////////////////////////
+    // Case 1: Combining proper (non-deleted) records
+    ////////////////////////////////////////////////////////////////////////
+
+    // NOTE: A Column Stats record will only be merged in case an existing file is modified,
+    // which can only happen on storage schemes supporting appends
+    HoodieColumnRangeMetadata<Comparable> c1AppendedBlockMetadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345);
+
+    HoodieRecord<HoodieMetadataPayload> updatedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false)
+            .findFirst().get();
+
+    HoodieMetadataPayload combinedMetadataPayload =
+        columnStatsRecord.getData().preCombine(updatedColumnStatsRecord.getData());
+
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRangeMetadata =
+        HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801);
+
+    HoodieRecord<HoodieMetadataPayload> expectedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false)
+            .findFirst().get();
+
+    // Assert combined payload
+    assertEquals(combinedMetadataPayload, expectedColumnStatsRecord.getData());
+
+    Option<IndexedRecord> 
alternativelyCombinedMetadataPayloadAvro =
+        columnStatsRecord.getData().combineAndGetUpdateValue(updatedColumnStatsRecord.getData().getInsertValue(null).get(), null);
+
+    // Assert that using the legacy API yields the same value
+    assertEquals(combinedMetadataPayload.getInsertValue(null), alternativelyCombinedMetadataPayloadAvro);
+
+    ////////////////////////////////////////////////////////////////////////
+    // Case 2: Combining w/ deleted records
+    ////////////////////////////////////////////////////////////////////////
+
+    HoodieColumnRangeMetadata<Comparable> c1StubbedMetadata =
+        HoodieColumnRangeMetadata.stub(fileName, targetColName);
+
+    HoodieRecord<HoodieMetadataPayload> deletedColumnStatsRecord =
+        HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true)
+            .findFirst().get();
+
+    // NOTE: In this case, the deleted (tombstone) record will therefore delete
+    // the previous state of the record
+    HoodieMetadataPayload deletedCombinedMetadataPayload =
+        deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData());
+
+    assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload);
+    assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent());
+    assertTrue(deletedCombinedMetadataPayload.isDeleted());
+
+    // NOTE: In this case, the proper incoming record will overwrite the previously
+    // deleted record
+    HoodieMetadataPayload overwrittenCombinedMetadataPayload =
+        columnStatsRecord.getData().preCombine(deletedColumnStatsRecord.getData());
+
+    assertEquals(columnStatsRecord.getData(), overwrittenCombinedMetadataPayload);
+  }
+
+  @Test
+  public void testPartitionStatsPayloadMerging() {
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange1 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 5, 0, 10, 100, 200);
+    HoodieRecord<HoodieMetadataPayload> firstPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get();
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange2 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    HoodieRecord<HoodieMetadataPayload> updatedPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), false, false).findFirst().get();
+    HoodieMetadataPayload combinedPartitionStatsRecordPayload =
+        updatedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData());
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRange = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 8, 1, 25, 220, 450);
+    HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords(
+        PARTITION_NAME, Collections.singletonList(expectedColumnRange), false, false).findFirst().get().getData();
+    assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload);
+  }
+
+  @Test
+  public void testPartitionStatsPayloadMergingWithDelete() {
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange1 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 1, 5, 0, 10, 100, 200);
+    HoodieRecord<HoodieMetadataPayload> firstPartitionStatsRecord =
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get();
+    HoodieColumnRangeMetadata<Comparable> fileColumnRange2 = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    // create the delete payload
+    HoodieRecord<HoodieMetadataPayload> deletedPartitionStatsRecord 
=
+        HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), true, false).findFirst().get();
+    // the deleted (tombstone) record will therefore delete the previous state of the record
+    HoodieMetadataPayload combinedPartitionStatsRecordPayload =
+        deletedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData());
+    HoodieColumnRangeMetadata<Comparable> expectedColumnRange = HoodieColumnRangeMetadata.create(
+        "path/to/file", "columnName", 3, 8, 1, 15, 120, 250);
+    HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords(
+        PARTITION_NAME, Collections.singletonList(expectedColumnRange), true, false).findFirst().get().getData();
+    assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload);
+
+    // another update for the same key should overwrite the delete record
+    HoodieMetadataPayload overwrittenCombinedPartitionStatsRecordPayload =
+        firstPartitionStatsRecord.getData().preCombine(deletedPartitionStatsRecord.getData());
+    assertEquals(firstPartitionStatsRecord.getData(), overwrittenCombinedPartitionStatsRecordPayload);
+  }
+}
diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
new file mode 100644
index 0000000000000..9586171d97aa5
--- /dev/null
+++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.io.storage.HoodieFileWriter; +import org.apache.hudi.io.storage.HoodieFileWriterFactory; +import org.apache.hudi.storage.StoragePath; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieTableMetadataUtil extends HoodieCommonTestHarness { + + private static HoodieTestTable hoodieTestTable; + private static final List DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01"); + + @BeforeEach + public void setUp() throws IOException { + initMetaClient(); + initTestDataGenerator(DATE_PARTITIONS.toArray(new String[0])); + hoodieTestTable = HoodieTestTable.of(metaClient); + } + + @AfterEach + public void tearDown() throws IOException { + metaClient.getStorage().deleteDirectory(metaClient.getBasePath()); + cleanupTestDataGenerator(); + cleanMetaClient(); + } + + @Test + public void testReadRecordKeysFromBaseFilesWithEmptyPartitionBaseFilePairs() { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + List> partitionFileSlicePairs = Collections.emptyList(); + HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( + engineContext, + partitionFileSlicePairs, + false, + 1, + "activeModule", + metaClient, + EngineType.SPARK + ); + assertTrue(result.isEmpty()); + } + + @Test + public void testConvertFilesToPartitionStatsRecords() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant1 = "20230918120000000"; + hoodieTestTable = hoodieTestTable.addCommit(instant1); + String instant2 = "20230918121110000"; + hoodieTestTable = hoodieTestTable.addCommit(instant2); + List partitionInfoList = new ArrayList<>(); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. 
+ DATE_PARTITIONS.forEach(p -> { + try { + URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); + StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); + String fileId1 = UUID.randomUUID().toString(); + FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); + StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); + writeParquetFile( + instant1, + storagePath1, + dataGen.generateInsertsForPartition(instant1, 10, p), + metaClient, + engineContext); + HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); + fileSlice1.setBaseFile(baseFile1); + String fileId2 = UUID.randomUUID().toString(); + FileSlice fileSlice2 = new FileSlice(p, instant2, fileId2); + StoragePath storagePath2 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId2).toUri()); + writeParquetFile( + instant2, + storagePath2, + dataGen.generateInsertsForPartition(instant2, 10, p), + metaClient, + engineContext); + HoodieBaseFile baseFile2 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId2).toString()); + fileSlice2.setBaseFile(baseFile2); + partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( + p, + metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), + instant2, + Collections.emptySet())); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + List columnsToIndex = Arrays.asList("rider", "driver"); + HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( + engineContext, + partitionInfoList, + HoodieMetadataConfig.newBuilder().enable(true) + .withMetadataIndexColumnStats(true) + .withMetadataIndexPartitionStats(true) + .withColumnStatsIndexForColumns("rider,driver") + .withPartitionStatsIndexParallelism(1) + .build(), + metaClient, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); + // Validate the result. + validatePartitionStats(result, instant1, instant2); + } + + @Test + public void testReadRecordKeysFromBaseFilesWithValidRecords() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant = "20230918120000000"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + Set recordKeys = new HashSet<>(); + final List> partitionFileSlicePairs = new ArrayList<>(); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. + DATE_PARTITIONS.forEach(p -> { + try { + List hoodieRecords = dataGen.generateInsertsForPartition(instant, 10, p); + String fileId = UUID.randomUUID().toString(); + FileSlice fileSlice = new FileSlice(p, instant, fileId); + writeParquetFile( + instant, + new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId).toUri()), + hoodieRecords, + metaClient, + engineContext); + HoodieBaseFile baseFile = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId).toString(), fileId, instant, null); + fileSlice.setBaseFile(baseFile); + partitionFileSlicePairs.add(Pair.of(p, fileSlice)); + recordKeys.addAll(hoodieRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toSet())); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + // Call the method readRecordKeysFromBaseFiles with the created partitionBaseFilePairs. 
+ HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( + engineContext, + partitionFileSlicePairs, + false, + 1, + "activeModule", + metaClient, + EngineType.SPARK + ); + // Validate the result. + List records = result.collectAsList(); + assertEquals(30, records.size()); + assertEquals(MetadataPartitionType.RECORD_INDEX.getPartitionPath(), records.get(0).getPartitionPath()); + for (HoodieRecord record : records) { + assertTrue(recordKeys.contains(record.getRecordKey())); + } + } + + @Test + public void testGetLogFileColumnRangeMetadata() throws Exception { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); + String instant1 = "20230918120000000"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.INSERT); + commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS.toString()); + hoodieTestTable = hoodieTestTable.addCommit(instant1, Option.of(commitMetadata)); + String instant2 = "20230918121110000"; + hoodieTestTable = hoodieTestTable.addCommit(instant2); + List partitionInfoList = new ArrayList<>(); + List columnsToIndex = Arrays.asList("rider", "driver"); + // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. + DATE_PARTITIONS.forEach(p -> { + try { + URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); + StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); + String fileId1 = UUID.randomUUID().toString(); + // add only one parquet file in first file slice + FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); + StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); + writeParquetFile(instant1, storagePath1, dataGen.generateInsertsForPartition(instant1, 10, p), metaClient, engineContext); + HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); + fileSlice1.setBaseFile(baseFile1); + // add log file in second file slice with higher rider and driver values (which are concatenated with instant) + FileSlice fileSlice2 = new FileSlice(p, instant2, fileId1); + fileSlice2.setBaseFile(baseFile1); + StoragePath storagePath2 = new StoragePath(partitionMetadataPath.getParent(), hoodieTestTable.getLogFileNameById(fileId1, 1)); + writeLogFiles(new StoragePath(metaClient.getBasePath(), p), HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, dataGen.generateInsertsForPartition(instant2, 10, p), 1, + metaClient.getStorage(), new Properties(), fileId1, instant2); + fileSlice2.addLogFile(new HoodieLogFile(storagePath2.toUri().toString())); + partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( + p, + metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), + instant2, + Collections.emptySet())); + // NOTE: we need to set table config as we are not using write client explicitly and these configs are needed for log record reader + metaClient.getTableConfig().setValue(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); + metaClient.getTableConfig().setValue(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); + metaClient.getTableConfig().setValue(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); + List> columnRangeMetadataLogFile = HoodieTableMetadataUtil.getLogFileColumnRangeMetadata( + 
storagePath2.toString(), + metaClient, + columnsToIndex, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS), + HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue()); + // there must be two ranges for rider and driver + assertEquals(2, columnRangeMetadataLogFile.size()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + // collect partition stats, this will collect stats for log files as well + HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( + engineContext, + partitionInfoList, + HoodieMetadataConfig.newBuilder().enable(true) + .withMetadataIndexColumnStats(true) + .withMetadataIndexPartitionStats(true) + .withColumnStatsIndexForColumns("rider,driver") + .withPartitionStatsIndexParallelism(1) + .build(), + metaClient, + Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); + // Validate the result. + validatePartitionStats(result, instant1, instant2); + } + + private static void validatePartitionStats(HoodieData result, String instant1, String instant2) { + List records = result.collectAsList(); + // 3 partitions * 2 columns = 6 partition stats records + assertEquals(6, records.size()); + assertEquals(MetadataPartitionType.PARTITION_STATS.getPartitionPath(), records.get(0).getPartitionPath()); + ((HoodieMetadataPayload) result.collectAsList().get(0).getData()).getColumnStatMetadata().get().getColumnName(); + records.forEach(r -> { + HoodieMetadataPayload payload = (HoodieMetadataPayload) r.getData(); + assertTrue(payload.getColumnStatMetadata().isPresent()); + // instant1 < instant2 so instant1 should be in the min value and instant2 should be in the max value. + if (payload.getColumnStatMetadata().get().getColumnName().equals("rider")) { + assertEquals(String.format("{\"value\": \"rider-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); + assertEquals(String.format("{\"value\": \"rider-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); + } else if (payload.getColumnStatMetadata().get().getColumnName().equals("driver")) { + assertEquals(String.format("{\"value\": \"driver-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); + assertEquals(String.format("{\"value\": \"driver-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); + } + }); + } + + private static void writeParquetFile(String instant, + StoragePath path, + List records, + HoodieTableMetaClient metaClient, + HoodieLocalEngineContext engineContext) throws IOException { + HoodieFileWriter writer = HoodieFileWriterFactory.getFileWriter( + instant, + path, + metaClient.getStorage(), + metaClient.getTableConfig(), + HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, + engineContext.getTaskContextSupplier(), + HoodieRecord.HoodieRecordType.AVRO); + for (HoodieRecord record : records) { + writer.writeWithMetadata(record.getKey(), record, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS); + } + writer.close(); + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json new file mode 100644 index 0000000000000..83790766db25b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json @@ -0,0 +1,2 @@ 
+{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json new file mode 100644 index 0000000000000..75aa7ada3ad3e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap1-column-stats-index-table.json @@ -0,0 +1,4 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json new file mode 100644 index 0000000000000..9c52707a27d05 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-bootstrap2-column-stats-index-table.json @@ -0,0 +1,5 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json new file mode 100644 index 0000000000000..a08dea39c0501 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-clean1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..17e8f877c50bb --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/delete-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1 @@ +{"c1":633,"c2":" 987sdk","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":0,"c6":"2020-01-01","c7":"NA==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json new file mode 100644 index 0000000000000..dcbf49b141f91 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 
980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json new file mode 100644 index 0000000000000..146097347e036 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap1-column-stats-index-table.json @@ -0,0 +1,3 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json new file mode 100644 index 0000000000000..6256be16c1ddf --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-bootstrap2-column-stats-index-table.json @@ -0,0 +1,5 @@ 
+{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":200000.000,"c3_minValue":0.100,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":39} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json new file mode 100644 index 0000000000000..8c7b1125314a4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-clean1-column-stats-index-table.json @@ -0,0 +1,2 @@ +{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdh","c2_minValue":" 
980sdd","c2_nullCount":0,"c3_maxValue":10000.768,"c3_minValue":0.001,"c3_nullCount":0,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-19T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":80,"c5_minValue":-100,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":5} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json new file mode 100644 index 0000000000000..fc6c936c7871e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-delete-block1-column-stats-index-table.json @@ -0,0 +1,3 @@ +{"c1_nullCount":0,"c2_nullCount":0,"c3_nullCount":0,"c4_nullCount":0,"c5_nullCount":0,"c6_nullCount":0,"c7_nullCount":0,"c8_nullCount":0,"valueCount":0} +{"c1_maxValue":639,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 989sda","c2_minValue":" 980sdd","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":0.300,"c3_nullCount":1,"c4_maxValue":"2021-11-19T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":1000,"c5_minValue":-1000,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"aQ==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":40} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..35ae749ddc3fc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update2-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":323,"c2":" 980sdd","c3":null,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":326,"c2":" 981sde","c3":64.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":555,"c2":" 
982sdf","c3":153.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":556,"c2":" 983sdg","c3":246.427,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":562,"c2":" 984sdh","c3":977.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SA==","c8":9} +{"c1":619,"c2":" 985sdi","c3":230.320,"c4":"2021-11-19T23:34:44.180-08:00","c5":1000,"c6":"2020-02-13","c7":"QA==","c8":9} +{"c1":624,"c2":" 986sdj","c3":580.317,"c4":"2021-11-18T23:34:44.180-08:00","c5":-1,"c6":"2020-10-10","c7":"PQ==","c8":9} +{"c1":633,"c2":" 987sdk","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":-1000,"c6":"2020-01-01","c7":"NA==","c8":9} +{"c1":638,"c2":" 988sdl","c3":904.304,"c4":"2021-11-18T23:34:44.179-08:00","c5":20,"c6":"2020-08-25","c7":"MA==","c8":9} +{"c1":639,"c2":" 989sda","c3":0.300,"c4":"2021-11-18T23:34:44.179-08:00","c5":90,"c6":"2020-04-21","c7":"aa==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..5e04406cf2182 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update3-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,5 @@ +{"c1":323,"c2":" 980sdd","c3":10.00,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":326,"c2":" 981sde","c3":10000.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":555,"c2":" 982sdf","c3":2.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":556,"c2":" 983sdg","c3":0.001,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":562,"c2":" 984sdh","c3":5.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SA==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..a83a82d8b8bff --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update4-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,5 @@ +{"c1":323,"c2":" 980sdd","c3":200000.00,"c4":"2021-11-19T23:34:44.201-08:00","c5":70,"c6":"2020-01-15","c7":"Aj==","c8":9} +{"c1":326,"c2":" 981sde","c3":100.768,"c4":"2021-11-19T23:34:44.201-08:00","c5":80,"c6":"2020-10-13","c7":"AB==","c8":9} +{"c1":555,"c2":" 982sdf","c3":20.431,"c4":"2021-11-19T23:34:44.186-08:00","c5":10,"c6":"2020-03-12","c7":"rx==","c8":9} +{"c1":556,"c2":" 983sdg","c3":0.1,"c4":"2021-11-19T23:34:44.186-08:00","c5":45,"c6":"2020-10-08","c7":"qf==","c8":9} +{"c1":562,"c2":" 984sdh","c3":4.328,"c4":"2021-11-19T23:34:44.181-08:00","c5":-100,"c6":"2020-10-21","c7":"SL==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index 6a9efb3371d89..ba29a4c36bf15 
100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -18,14 +18,18 @@ package org.apache.hudi.functional -import org.apache.hadoop.fs.{LocatedFileStatus, Path} +import org.apache.avro.Schema +import org.apache.hadoop.fs.Path import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.HoodieConversionUtils.toProperties +import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} -import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.model.{HoodieBaseFile, HoodieFileGroup, HoodieLogFile, HoodieTableType} import org.apache.hudi.common.table.HoodieTableMetaClient -import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase -import org.apache.hudi.testutils.HoodieSparkClientTestBase +import org.apache.hudi.common.table.view.FileSystemViewManager +import org.apache.hudi.config.HoodieCompactionConfig +import org.apache.hudi.functional.ColumnStatIndexTestBase.{ColumnStatsTestCase, ColumnStatsTestParams} +import org.apache.hudi.testutils.{HoodieSparkClientTestBase, LogFileColStatsTestUtil} import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.spark.sql._ import org.apache.spark.sql.functions.typedLit @@ -36,6 +40,9 @@ import org.junit.jupiter.params.provider.Arguments import java.math.BigInteger import java.sql.{Date, Timestamp} +import java.util +import java.util.List +import java.util.stream.Collectors import scala.collection.JavaConverters._ import scala.util.Random @@ -73,42 +80,39 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { cleanupSparkContexts() } - protected def doWriteAndValidateColumnStats(testCase: ColumnStatsTestCase, - metadataOpts: Map[String, String], - hudiOpts: Map[String, String], - dataSourcePath: String, - expectedColStatsSourcePath: String, - operation: String, - saveMode: SaveMode, - shouldValidate: Boolean = true): Unit = { - val sourceJSONTablePath = getClass.getClassLoader.getResource(dataSourcePath).toString + protected def doWriteAndValidateColumnStats(params: ColumnStatsTestParams): Unit = { + + val sourceJSONTablePath = getClass.getClassLoader.getResource(params.dataSourcePath).toString // NOTE: Schema here is provided for validation that the input date is in the appropriate format val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) + val writeOptions: Map[String, String] = params.hudiOpts ++ params.metadataOpts + inputDF .sort("c1") - .repartition(4, new Column("c1")) + .repartition(params.numPartitions, new Column("c1")) .write .format("hudi") - .options(hudiOpts) - .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) - .option(DataSourceWriteOptions.OPERATION.key, operation) - .mode(saveMode) + .options(writeOptions) + .option(DataSourceWriteOptions.OPERATION.key, params.operation) + .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), String.valueOf(params.parquetMaxFileSize)) + .option(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), String.valueOf(params.smallFileLimit)) + .mode(params.saveMode) .save(basePath) dfList = dfList :+ inputDF metaClient = HoodieTableMetaClient.reload(metaClient) - if (shouldValidate) { + if (params.shouldValidate) { // Currently, routine manually validating the column stats (by actually reading 
every column of every file) // only supports parquet files. Therefore we skip such validation when delta-log files are present, and only // validate in following cases: (1) COW: all operations; (2) MOR: insert only. - val shouldValidateColumnStatsManually = testCase.tableType == HoodieTableType.COPY_ON_WRITE || - operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + val shouldValidateColumnStatsManually = params.testCase.tableType == HoodieTableType.COPY_ON_WRITE || + params.operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) validateColumnStatsIndex( - testCase, metadataOpts, expectedColStatsSourcePath, shouldValidateColumnStatsManually) + params.testCase, params.metadataOpts, params.expectedColStatsSourcePath, shouldValidateColumnStatsManually, params.latestCompletedCommit) } } @@ -116,20 +120,19 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { includedCols: Seq[String], indexedCols: Seq[String], indexSchema: StructType): DataFrame = { - val files = { - val it = fs.listFiles(new Path(tablePath), true) - var seq = Seq[LocatedFileStatus]() - while (it.hasNext) { - seq = seq :+ it.next() - } - seq.filter(fs => fs.getPath.getName.endsWith(".parquet")) - } - - spark.createDataFrame( - files.flatMap(file => { - val df = spark.read.schema(sourceTableSchema).parquet(file.getPath.toString) + val metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build() + val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) + fsv.loadAllPartitions() + val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) + val baseFilesList = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) + val baseFiles = baseFilesList.stream() + .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala + + val baseFilesDf = spark.createDataFrame( + baseFiles.flatMap(file => { + val df = spark.read.schema(sourceTableSchema).parquet(file.toString) val exprs: Seq[String] = - s"'${typedLit(file.getPath.getName)}' AS file" +: + s"'${typedLit(file.getName)}' AS file" +: s"sum(1) AS valueCount" +: df.columns .filter(col => includedCols.contains(col)) @@ -157,12 +160,61 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { }).asJava, indexSchema ) + + if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { + baseFilesDf // COW table + } else { + val allLogFiles = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllFileSlices) + .flatMap(fileSlice => fileSlice.getLogFiles) + .collect(Collectors.toList[HoodieLogFile]) + if (allLogFiles.isEmpty) { + baseFilesDf // MOR table, but no log files. 
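+ // MOR with log files: per-log-file stats are computed through LogFileColStatsTestUtil below and unioned with the base-file stats, mirroring how the metadata writer indexes every file in a file slice.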
+ } else { + val colsToGenerateStats = indexedCols // limit the manually computed stats to the indexed columns + val writerSchemaOpt = LogFileColStatsTestUtil.getSchemaForTable(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + baseFilesDf.union(getColStatsFromLogFiles(allLogFiles, latestCompletedCommit, + scala.collection.JavaConverters.seqAsJavaList(colsToGenerateStats), + metaClient, + writerSchemaOpt, + HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue(), + indexSchema)) + } + } + } + + protected def getColStatsFromLogFiles(logFiles: List[HoodieLogFile], latestCommit: String, columnsToIndex: util.List[String], + datasetMetaClient: HoodieTableMetaClient, + writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], + maxBufferSize: Integer, + indexSchema: StructType): DataFrame = { + val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]](logFile => + getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) + ).filter(rowOpt => rowOpt.isPresent).map[Row](rowOpt => rowOpt.get()).collect(Collectors.toList[Row]) + spark.createDataFrame(colStatsEntries, indexSchema) + } + + protected def getColStatsFromLogFile(logFilePath: String, + latestCommit: String, + columnsToIndex: util.List[String], + datasetMetaClient: HoodieTableMetaClient, + writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], + maxBufferSize: Integer + ): org.apache.hudi.common.util.Option[Row] = { + LogFileColStatsTestUtil.getLogFileColumnRangeMetadata(logFilePath, datasetMetaClient, latestCommit, + columnsToIndex, writerSchemaOpt, maxBufferSize) } protected def validateColumnStatsIndex(testCase: ColumnStatsTestCase, - metadataOpts: Map[String, String], - expectedColStatsSourcePath: String, - validateColumnStatsManually: Boolean): Unit = { + metadataOpts: Map[String, String], + expectedColStatsSourcePath: String, + validateColumnStatsManually: Boolean, + latestCompletedCommit: String): Unit = { val metadataConfig = HoodieMetadataConfig.newBuilder() .fromProperties(toProperties(metadataOpts)) .build() @@ -178,7 +230,8 @@ } } val (expectedColStatsSchema, _) = composeIndexSchema(sourceTableSchema.fieldNames, indexedColumns, sourceTableSchema) - val validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue") + val validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue", "c3_maxValue", + "c3_minValue", "c5_maxValue", "c5_minValue") columnStatsIndex.loadTransposed(sourceTableSchema.fieldNames, testCase.shouldReadInMemory) { transposedColStatsDF => // Match against expected column stats table @@ -270,14 +323,41 @@ def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of(HoodieTableType.values().toStream.flatMap(tableType => Seq(Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = true)), - Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = false))) + Arguments.arguments(ColumnStatsTestCase(tableType, shouldReadInMemory = false)) + ) ): _*) } def testMetadataColumnStatsIndexParamsForMOR: java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of(
Seq(Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = true)), - Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = false))) - : _*) + Arguments.arguments(ColumnStatsTestCase(HoodieTableType.MERGE_ON_READ, shouldReadInMemory = false)) + ) + : _*) } + + def testTableTypePartitionTypeParams: java.util.stream.Stream[Arguments] = { + java.util.stream.Stream.of( + Seq( + Arguments.arguments(HoodieTableType.COPY_ON_WRITE, "c8"), + // empty partition col represents non-partitioned table. + Arguments.arguments(HoodieTableType.COPY_ON_WRITE, ""), + Arguments.arguments(HoodieTableType.MERGE_ON_READ, "c8"), + Arguments.arguments(HoodieTableType.MERGE_ON_READ, "") + ) + : _*) + } + + case class ColumnStatsTestParams(testCase: ColumnStatsTestCase, + metadataOpts: Map[String, String], + hudiOpts: Map[String, String], + dataSourcePath: String, + expectedColStatsSourcePath: String, + operation: String, + saveMode: SaveMode, + shouldValidate: Boolean = true, + latestCompletedCommit: String = null, + numPartitions: Integer = 4, + parquetMaxFileSize: Integer = 10 * 1024, + smallFileLimit: Integer = 100 * 1024 * 1024) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index ac83cf81918bb..3702cc8f188f3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -19,7 +19,7 @@ package org.apache.hudi.functional import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties @@ -27,9 +27,15 @@ import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig, import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils -import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.config.{HoodieCleanConfig, HoodieCompactionConfig, HoodieWriteConfig} import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.view.FileSystemViewManager +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.DataSourceWriteOptions.PARTITIONPATH_FIELD +import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestParams import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, Literal, Or} @@ -63,17 +69,17 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" ) ++ metadataOpts - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = 
"index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Overwrite) + saveMode = SaveMode.Overwrite)) - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/another-input-table-json", expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding // deferred updates), diverging from COW @@ -83,13 +89,441 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { "index/colstats/mor-updated2-column-stats-index-table.json" } - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = expectedColStatsSourcePath, operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) } + @ParameterizedTest + @MethodSource(Array("testTableTypePartitionTypeParams")) + def testMetadataColumnStatsIndexInitializationWithUpserts(tableType: HoodieTableType, partitionCol : String): Unit = { + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "5" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // delete a subset of recs. this will add a delete log block for MOR table. 
+ doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/delete-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding + // deferred updates), diverging from COW + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap1-column-stats-index-table.json" + } + + metaClient = HoodieTableMetaClient.reload(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + + // lets validate that we have log files generated in case of MOR table + if (tableType == HoodieTableType.MERGE_ON_READ) { + val metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build() + val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) + fsv.loadAllPartitions() + val basePath2 = new Path(basePath) + val allPartitionPaths = fsv.getPartitionPaths + allPartitionPaths.forEach(partitionPath => { + val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) + assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.hasLogFiles).count() > 0) + }) + fsv.close() + } + + // updates a subset which are not deleted and enable col stats and validate bootstrap + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // trigger one more upsert and compaction (w/ MOR table) and validate. 
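+ // commonOpts sets INLINE_COMPACT_NUM_DELTA_COMMITS to 5, so this fifth commit (after the insert, two upserts and the delete) is expected to trigger inline compaction on the MOR table.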
+ val expectedColStatsSourcePath1 = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap2-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap2-column-stats-index-table.json" + } + + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update4-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath1, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @MethodSource(Array("testTableTypePartitionTypeParams")) + def testMetadataColumnStatsIndexInitializationWithRollbacks(tableType: HoodieTableType, partitionCol : String): Unit = { + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + "hoodie.write.markers.type" -> "DIRECT", + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + simulateFailureForLatestCommit(tableType, partitionCol) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding + // deferred updates), diverging from COW + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-bootstrap-rollback1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json" + } + + metaClient = HoodieTableMetaClient.reload(metaClient) + val latestCompletedCommit = metaClient.getActiveTimeline.filterCompletedInstants().lastInstant().get().getTimestamp + + // updates a subset which are not deleted and enable col stats and validate bootstrap + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + latestCompletedCommit, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + 
smallFileLimit = 0)) + + metaClient = HoodieTableMetaClient.reload(metaClient) + assertTrue(metaClient.getActiveTimeline.getRollbackTimeline.countInstants() > 0) + } + + def simulateFailureForLatestCommit(tableType: HoodieTableType, partitionCol: String) : Unit = { + // simulate failure for latest commit. + metaClient = HoodieTableMetaClient.reload(metaClient) + var baseFileName : String = null + var logFileName : String = null + val lastCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get() + if (tableType == HoodieTableType.MERGE_ON_READ) { + val dataFiles = if (StringUtils.isNullOrEmpty(partitionCol)) { + fs.listStatus(new Path(metaClient.getBasePath)).toSeq + } else { + fs.listStatus(new Path(metaClient.getBasePath + "/9")).toSeq + } + val logFileFileStatus = dataFiles.filter(fileStatus => fileStatus.getPath.getName.contains(".log")).head + logFileName = logFileFileStatus.getPath.getName + } else { + val dataFiles = if (StringUtils.isNullOrEmpty(partitionCol)) { + fs.listStatus(new Path(metaClient.getBasePath)).toSeq + } else { + fs.listStatus(new Path(metaClient.getBasePath + "/9")).toSeq + } + val baseFileFileStatus = dataFiles.filter(fileStatus => fileStatus.getPath.getName.contains(lastCompletedCommit.getTimestamp)).head + baseFileName = baseFileFileStatus.getPath.getName + } + + val latestCompletedFileName = lastCompletedCommit.getFileName + fs.delete(new Path(metaClient.getBasePath + "/.hoodie/" + latestCompletedFileName), false) + + // re-create marker for the deleted file. + if (tableType == HoodieTableType.MERGE_ON_READ) { + if (StringUtils.isNullOrEmpty(partitionCol)) { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/" + logFileName + ".marker.APPEND")).close() } + } else { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/9/" + logFileName + ".marker.APPEND")).close() } + } + } else { + if (StringUtils.isNullOrEmpty(partitionCol)) { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/" + baseFileName + ".marker.MERGE")).close() } + } else { + { fs.create(new Path(metaClient.getBasePath + "/.hoodie/.temp/" + lastCompletedCommit.getTimestamp + "/9/" + baseFileName + ".marker.MERGE")).close() } + } + } + } + + @Test + def testMORDeleteBlocks(): Unit = { + val tableType: HoodieTableType = HoodieTableType.MERGE_ON_READ + val partitionCol = "c8" + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "5" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 
1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + // updates + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = "index/colstats/mor-delete-block1-column-stats-index-table.json" + + // delete a subset of recs. this will add a delete log block for MOR table. + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/delete-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @ValueSource(strings = Array("", "c8")) + def testColStatsWithCleanCOW(partitionCol: String): Unit = { + val tableType: HoodieTableType = HoodieTableType.COPY_ON_WRITE + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key() -> "1" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // updates 1 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-clean1-column-stats-index-table.json" + } else { + "index/colstats/mor-bootstrap-rollback1-column-stats-index-table.json" + } + + // updates 2 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + } + + @ParameterizedTest + @ValueSource(strings = Array("", "c8")) + def 
testColStatsWithCleanMOR(partitionCol: String): Unit = { + val tableType: HoodieTableType = HoodieTableType.MERGE_ON_READ + val testCase = ColumnStatsTestCase(tableType, shouldReadInMemory = true) + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true" + ) + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + PARTITIONPATH_FIELD.key() -> partitionCol, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true", + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key() -> "1", + HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key() -> "2" + ) ++ metadataOpts + + // inserts + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val metadataOpts1 = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + + // updates 1 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update2-input-table-json/", + expectedColStatsSourcePath = null, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + false, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-clean1-column-stats-index-table.json" + } else { + "index/colstats/mor-clean1-column-stats-index-table.json" + } + + // updates 2 + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts1, commonOpts, + dataSourcePath = "index/colstats/update3-input-table-json/", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append, + true, + numPartitions = 1, + parquetMaxFileSize = 100 * 1024 * 1024, + smallFileLimit = 0)) + + metaClient = HoodieTableMetaClient.reload(metaClient) + assertTrue(metaClient.getActiveTimeline.getCleanerTimeline.countInstants() > 0) + } @ParameterizedTest @EnumSource(classOf[HoodieTableType]) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala index 9c4099035b12d..d2b76b66c909f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala @@ -28,7 +28,7 @@ import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieTableType, Writ import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.table.timeline.HoodieInstant import org.apache.hudi.config.{HoodieCompactionConfig, HoodieIndexConfig, HoodieWriteConfig} -import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase +import 
org.apache.hudi.functional.ColumnStatIndexTestBase.{ColumnStatsTestCase, ColumnStatsTestParams} import org.apache.hudi.index.HoodieIndex.IndexType.INMEMORY import org.apache.hudi.metadata.HoodieMetadataFileSystemView import org.apache.hudi.util.JavaConversions @@ -89,12 +89,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { HoodieIndexConfig.INDEX_TYPE.key() -> INMEMORY.name() ) ++ metadataOpts - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Overwrite, - shouldValidate = false) + shouldValidate = false)) assertEquals(4, getLatestDataFilesCount(commonOpts)) assertEquals(0, getLatestDataFilesCount(commonOpts, includeLogFiles = false)) @@ -134,12 +134,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { verifyFileIndexAndSQLQueries(commonOpts, isTableDataSameAsAfterSecondInstant = true) // Add the last df back and verify the queries - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = "", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate = false) + shouldValidate = false)) verifyFileIndexAndSQLQueries(commonOpts, verifyFileCount = false) } @@ -196,27 +196,27 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { writeClient.scheduleCompaction(org.apache.hudi.common.util.Option.empty()) writeClient.close() - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = "", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate = false) + shouldValidate = false)) verifyFileIndexAndSQLQueries(commonOpts) } private def setupTable(testCase: ColumnStatsTestCase, metadataOpts: Map[String, String], commonOpts: Map[String, String], shouldValidate: Boolean): Unit = { - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/input-table-json", expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Overwrite) + saveMode = SaveMode.Overwrite)) - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/another-input-table-json", expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, - saveMode = SaveMode.Append) + saveMode = SaveMode.Append)) // NOTE: MOR and COW have different fixtures since MOR is bearing delta-log files (holding // deferred updates), diverging from COW @@ -226,12 +226,12 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { 
"index/colstats/mor-updated2-column-stats-index-table.json" } - doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + doWriteAndValidateColumnStats(ColumnStatsTestParams(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/update-input-table-json", expectedColStatsSourcePath = expectedColStatsSourcePath, operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append, - shouldValidate) + shouldValidate)) } def verifyFileIndexAndSQLQueries(opts: Map[String, String], isTableDataSameAsAfterSecondInstant: Boolean = false, verifyFileCount: Boolean = true): Unit = { From 8d17a04be5dfef61533e17bce3bdaece3fcb62fb Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 16:54:01 +0530 Subject: [PATCH 02/12] Fix LogFileColStatsTestUtil to use IndexedRecord for collectColumnRangeMetadata Convert HoodieRecord list to IndexedRecord before calling collectColumnRangeMetadata, matching the 3-arg signature in 0.14.x (master's version accepted HoodieRecord + Schema). Co-Authored-By: Claude Sonnet 4.6 --- .../hudi/testutils/LogFileColStatsTestUtil.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java index 464ad5ddca1e4..2e0baaac74940 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/LogFileColStatsTestUtil.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; @@ -27,6 +28,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRow; @@ -53,7 +55,7 @@ public static Option getLogFileColumnRangeMetadata(String filePath, HoodieT .collect(Collectors.toList()); List records = new ArrayList<>(); HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() - .withStorage(datasetMetaClient.getStorage()) + .withFileSystem(datasetMetaClient.getFs()) .withBasePath(datasetMetaClient.getBasePath()) .withLogFilePaths(Collections.singletonList(filePath)) .withBufferSize(maxBufferSize) @@ -65,8 +67,15 @@ public static Option getLogFileColumnRangeMetadata(String filePath, HoodieT if (records.isEmpty()) { return Option.empty(); } + List indexedRecords = new ArrayList<>(); + for (HoodieRecord hoodieRecord : records) { + Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); + if (insertValue.isPresent()) { + indexedRecords.add(insertValue.get()); + } + } Map> columnRangeMetadataMap = - collectColumnRangeMetadata(records, fieldsToIndex, filePath, writerSchemaOpt.get()); + collectColumnRangeMetadata(indexedRecords, fieldsToIndex, filePath); List> columnRangeMetadataList = new ArrayList<>(columnRangeMetadataMap.values()); return Option.of(getColStatsEntry(filePath, columnRangeMetadataList)); } else { From a519ca16022929d61cf85f6748b6b52385df6fe1 Mon Sep 
17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 17:09:10 +0530 Subject: [PATCH 03/12] Fix Java 8 generics inference and FileSlice API compatibility - Replace Collector wildcard pattern with forEach+map in collectColumnRangeMetadata (HoodieTableMetadataUtil) and readRangeFromParquetMetadata (ParquetUtils) to fix Java 8 type inference failures - Replace FileSlice.hasLogFiles() with getLogFiles().findAny().isPresent() since hasLogFiles() doesn't exist in 0.14.x Co-Authored-By: Claude Sonnet 4.6 --- .../apache/hudi/common/util/ParquetUtils.java | 58 +++++++++---------- .../metadata/HoodieTableMetadataUtil.java | 44 +++++++------- .../functional/TestColumnStatsIndex.scala | 2 +- 3 files changed, 47 insertions(+), 57 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index de5572523c1eb..84a494fe29f50 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -61,7 +61,6 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; -import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -305,38 +304,33 @@ public List> readRangeFromParquetMetadata( ) { ParquetMetadata metadata = readMetadata(conf, parquetFilePath); - // NOTE: This collector has to have fully specialized generic type params since - // Java 1.8 struggles to infer them - Collector, ?, Map>>> groupingByCollector = - Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName); - // Collect stats from all individual Parquet blocks - Map>> columnToStatsListMap = - (Map>>) metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .collect(groupingByCollector); + Map>> columnToStatsListMap = new HashMap<>(); + metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. 
In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .forEach(crm -> columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm)); // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index f4ba94136b9c9..e3554bd977ff3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -64,6 +64,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -74,7 +75,6 @@ import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.util.Lazy; -import com.google.common.annotations.VisibleForTesting; import org.apache.avro.AvroTypeException; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; @@ -106,8 +106,6 @@ import java.util.Set; import java.util.UUID; import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -242,27 +240,25 @@ class ColumnStats { }); }); - Collector, ?, Map>> collector = - Collectors.toMap(colRangeMetadata -> colRangeMetadata.getColumnName(), Function.identity()); - - return (Map>) targetFields.stream() - .map(field -> { - ColumnStats colStats = allColumnStats.get(field.name()); - return HoodieColumnRangeMetadata.create( - filePath, - field.name(), - colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), - colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), - colStats == null ? 0 : colStats.nullCount, - colStats == null ? 0 : colStats.valueCount, - // NOTE: Size and compressed size statistics are set to 0 to make sure we're not - // mixing up those provided by Parquet with the ones from other encodings, - // since those are not directly comparable - 0, - 0 - ); - }) - .collect(collector); + Map> result = new HashMap<>(); + targetFields.forEach(field -> { + ColumnStats colStats = allColumnStats.get(field.name()); + HoodieColumnRangeMetadata rangeMetadata = HoodieColumnRangeMetadata.create( + filePath, + field.name(), + colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), + colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), + colStats == null ? 0 : colStats.nullCount, + colStats == null ? 
0 : colStats.valueCount, + // NOTE: Size and compressed size statistics are set to 0 to make sure we're not + // mixing up those provided by Parquet with the ones from other encodings, + // since those are not directly comparable + 0, + 0 + ); + result.put(rangeMetadata.getColumnName(), rangeMetadata); + }); + return result; } /** diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 3702cc8f188f3..7d0bacf03bf77 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -175,7 +175,7 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { val allPartitionPaths = fsv.getPartitionPaths allPartitionPaths.forEach(partitionPath => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.hasLogFiles).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.getLogFiles.findAny().isPresent).count() > 0) }) fsv.close() } From 03a878bec557bd75f999869db0841b2bc9d97cde Mon Sep 17 00:00:00 2001 From: Vamsi Date: Thu, 12 Mar 2026 17:14:48 +0530 Subject: [PATCH 04/12] Fix Java 8 type inference in ParquetUtils flatMap chain Collect flatMap result to List before grouping to avoid raw type inference issue where Java 8 loses generic type parameter through the flatMap. Co-Authored-By: Claude Sonnet 4.6 --- .../apache/hudi/common/util/ParquetUtils.java | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 84a494fe29f50..058e4f2f50130 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -305,32 +305,39 @@ public List> readRangeFromParquetMetadata( ParquetMetadata metadata = readMetadata(conf, parquetFilePath); // Collect stats from all individual Parquet blocks + // NOTE: Intermediate collect to List is required since Java 1.8 cannot infer + // the generic type parameter through the flatMap chain + @SuppressWarnings("unchecked") + List> allBlockStats = (List>) (List) + metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? 
columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .collect(Collectors.toList()); Map>> columnToStatsListMap = new HashMap<>(); - metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .forEach(crm -> columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm)); + for (HoodieColumnRangeMetadata crm : allBlockStats) { + columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm); + } // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer From a7d9668e87ac3850d0e96687fce0fb6a8596e468 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 20:59:42 +0530 Subject: [PATCH 05/12] Fix conflicts --- .../HoodieBackedTableMetadataWriter.java | 26 +- .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../apache/hudi/common/util/ParquetUtils.java | 65 ++-- .../metadata/HoodieTableMetadataUtil.java | 147 ++------ .../MetadataRecordsGenerationParams.java | 11 +- .../metadata/TestHoodieMetadataPayload.java | 22 +- .../metadata/TestHoodieMetadataPayload.java | 295 ---------------- .../metadata/TestHoodieTableMetadataUtil.java | 315 ------------------ 8 files changed, 95 insertions(+), 788 deletions(-) delete mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java delete mode 100644 hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 68b02ad6d39ba..2f1ab37bf52b6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -344,7 +344,6 @@ private void initializeFromFilesystem(String initializationTime, List pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -369,7 +368,7 @@ private void initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = 
listAllPartitionsFromMDT(initializationTime, pendingDataInstants); + partitionInfoList = listAllPartitionsFromMDT(initializationTime); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { @@ -419,7 +418,7 @@ private void initializeFromFilesystem(String initializationTime, List> initializeColumnStatsPartition(Map> partitionToFilesMap) { // during initialization, we need stats for base and log files. HoodieData records = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords( - engineContext, Collections.emptyMap(), partitionToFilesMap, dataMetaClient, dataWriteConfig.isMetadataColumnStatsIndexEnabled(), - dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), - dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); + engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams()); final int fileGroupCount = dataWriteConfig.getMetadataConfig().getColumnStatsIndexFileGroupCount(); return Pair.of(fileGroupCount, records); @@ -566,16 +563,6 @@ private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Opti return false; } - private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { - // Initialize excluding the pending operations on the dataset - return dataMetaClient.getActiveTimeline() - .getInstantsAsStream().filter(i -> !i.isCompleted()) - // regular writers should not be blocked due to pending indexing action - .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toSet()); - } - private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) @@ -651,11 +638,11 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. 
*/ - private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { + private List listAllPartitionsFromMDT(String initializationTime) throws IOException { + List dirinfoList = new LinkedList<>(); List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); - List dirinfoList = new ArrayList<>(partitionFileMap.size()); for (Map.Entry entry : partitionFileMap.entrySet()) { String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); @@ -789,7 +776,8 @@ private MetadataRecordsGenerationParams getRecordsGenerationParams() { dataWriteConfig.isMetadataColumnStatsIndexEnabled(), dataWriteConfig.getColumnStatsIndexParallelism(), dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), - dataWriteConfig.getColumnsEnabledForBloomFilterIndex()); + dataWriteConfig.getColumnsEnabledForBloomFilterIndex(), + dataWriteConfig.getMetadataConfig().getMaxReaderBufferSize()); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 99fe6c1ff54f2..f62ec0febd578 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception @Override protected void processNextDeletedRecord(DeleteRecord deleteRecord) { - // no - op + throw new IllegalStateException("Not expected to see delete records in this log-scan mode. 
Check Job Config"); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 058e4f2f50130..de5572523c1eb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -61,6 +61,7 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; +import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -304,40 +305,38 @@ public List> readRangeFromParquetMetadata( ) { ParquetMetadata metadata = readMetadata(conf, parquetFilePath); + // NOTE: This collector has to have fully specialized generic type params since + // Java 1.8 struggles to infer them + Collector, ?, Map>>> groupingByCollector = + Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName); + // Collect stats from all individual Parquet blocks - // NOTE: Intermediate collect to List is required since Java 1.8 cannot infer - // the generic type parameter through the flatMap chain - @SuppressWarnings("unchecked") - List> allBlockStats = (List>) (List) - metadata.getBlocks().stream().sequential() - .flatMap(blockMetaData -> - blockMetaData.getColumns().stream() - .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> { - Statistics stats = columnChunkMetaData.getStatistics(); - return HoodieColumnRangeMetadata.create( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - stats.genericGetMax()), - // NOTE: In case when column contains only nulls Parquet won't be creating - // stats for it instead returning stubbed (empty) object. In that case - // we have to equate number of nulls to the value count ourselves - stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(), - columnChunkMetaData.getValueCount(), - columnChunkMetaData.getTotalSize(), - columnChunkMetaData.getTotalUncompressedSize()); - }) - ) - .collect(Collectors.toList()); - Map>> columnToStatsListMap = new HashMap<>(); - for (HoodieColumnRangeMetadata crm : allBlockStats) { - columnToStatsListMap.computeIfAbsent(crm.getColumnName(), k -> new ArrayList<>()).add(crm); - } + Map>> columnToStatsListMap = + (Map>>) metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> { + Statistics stats = columnChunkMetaData.getStatistics(); + return HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + stats.genericGetMax()), + // NOTE: In case when column contains only nulls Parquet won't be creating + // stats for it instead returning stubbed (empty) object. In that case + // we have to equate number of nulls to the value count ourselves + stats.isEmpty() ? 
columnChunkMetaData.getValueCount() : stats.getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize()); + }) + ) + .collect(groupingByCollector); // Combine those into file-level statistics // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index e3554bd977ff3..6c1de68e043cc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -44,14 +44,12 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; -import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -64,7 +62,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -106,6 +103,8 @@ import java.util.Set; import java.util.UUID; import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -240,25 +239,27 @@ class ColumnStats { }); }); - Map> result = new HashMap<>(); - targetFields.forEach(field -> { - ColumnStats colStats = allColumnStats.get(field.name()); - HoodieColumnRangeMetadata rangeMetadata = HoodieColumnRangeMetadata.create( - filePath, - field.name(), - colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), - colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), - colStats == null ? 0 : colStats.nullCount, - colStats == null ? 0 : colStats.valueCount, - // NOTE: Size and compressed size statistics are set to 0 to make sure we're not - // mixing up those provided by Parquet with the ones from other encodings, - // since those are not directly comparable - 0, - 0 - ); - result.put(rangeMetadata.getColumnName(), rangeMetadata); - }); - return result; + Collector, ?, Map>> collector = + Collectors.toMap(colRangeMetadata -> colRangeMetadata.getColumnName(), Function.identity()); + + return (Map>) targetFields.stream() + .map(field -> { + ColumnStats colStats = allColumnStats.get(field.name()); + return HoodieColumnRangeMetadata.create( + filePath, + field.name(), + colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), + colStats == null ? 
null : coerceToComparable(field.schema(), colStats.maxValue), + colStats == null ? 0 : colStats.nullCount, + colStats == null ? 0 : colStats.valueCount, + // NOTE: Size and compressed size statistics are set to 0 to make sure we're not + // mixing up those provided by Parquet with the ones from other encodings, + // since those are not directly comparable + 0, + 0 + ); + }) + .collect(collector); } /** @@ -854,18 +855,12 @@ public static HoodieData convertFilesToBloomFilterRecords(HoodieEn public static HoodieData convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, - HoodieTableMetaClient dataMetaClient, - boolean isColumnStatsIndexEnabled, - int columnStatsIndexParallelism, - List targetColumnsForColumnStatsIndex, - int maxReaderBufferSize) { - if (!isColumnStatsIndexEnabled) { - return engineContext.emptyHoodieData(); - } + MetadataRecordsGenerationParams recordsGenerationParams) { // Find the columns to index + HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); final List columnsToIndex = - getColumnsToIndex(true, targetColumnsForColumnStatsIndex, - Lazy.lazily(() -> tryResolveSchemaForTable(dataMetaClient))); + getColumnsToIndex(recordsGenerationParams, + Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); if (columnsToIndex.isEmpty()) { // In case there are no columns to index, bail return engineContext.emptyHoodieData(); @@ -877,12 +872,12 @@ public static HoodieData convertFilesToColumnStatsRecords(HoodieEn final List> partitionFileFlagTupleList = fetchPartitionFileInfoTriplets(partitionToDeletedFiles, partitionToAppendedFiles); // Create records MDT - int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), columnStatsIndexParallelism), 1); + int parallelism = Math.max(Math.min(partitionFileFlagTupleList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); return engineContext.parallelize(partitionFileFlagTupleList, parallelism).flatMap(partitionFileFlagTuple -> { final String partitionPath = partitionFileFlagTuple.f0; final String filename = partitionFileFlagTuple.f1; final boolean isDeleted = partitionFileFlagTuple.f2; - return getColumnStatsRecords(partitionPath, filename, dataMetaClient, columnsToIndex, isDeleted, maxReaderBufferSize).iterator(); + return getColumnStatsRecords(partitionPath, filename, dataTableMetaClient, columnsToIndex, isDeleted, recordsGenerationParams.getMaxReaderBufferSize()).iterator(); }); } @@ -1091,27 +1086,6 @@ private static List getColumnsToIndex(MetadataRecordsGenerationParams re .orElse(Collections.emptyList()); } - /** - * Get the list of columns for the table for column stats indexing - */ - private static List getColumnsToIndex(boolean isColumnStatsIndexEnabled, - List targetColumnsForColumnStatsIndex, - Lazy> lazyWriterSchemaOpt) { - checkState(isColumnStatsIndexEnabled); - - if (!targetColumnsForColumnStatsIndex.isEmpty()) { - return targetColumnsForColumnStatsIndex; - } - - Option writerSchemaOpt = lazyWriterSchemaOpt.get(); - return writerSchemaOpt - .map(writerSchema -> - writerSchema.getFields().stream() - .map(Schema.Field::name) - .collect(Collectors.toList())) - .orElse(Collections.emptyList()); - } - private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, HoodieTableMetaClient datasetMetaClient, List columnsToIndex) { @@ -1147,8 +1121,9 @@ private static Stream getColumnStatsRecords(String partitionPath, return 
HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, true); } + List> columnRangeMetadata = - readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } @@ -1156,19 +1131,15 @@ private static Stream getColumnStatsRecords(String partitionPath, private static List> readColumnRangeMetadataFrom(String partitionPath, String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex, - int maxBufferSize) { + List columnsToIndex) { String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? fileName : partitionPath + "/" + fileName; try { - Path fullFilePath = new Path(datasetMetaClient.getBasePath(), partitionPathFileName); + Path fullFilePath = new Path(datasetMetaClient.getBasePathV2(), partitionPathFileName); if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); - } else if (FSUtils.isLogFile(fileName)) { - Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); - LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); - return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } + LOG.warn("Column range index not supported for: {}", partitionPathFileName); return Collections.emptyList(); } catch (Exception e) { @@ -1179,56 +1150,6 @@ private static List> readColumnRangeMetada } } - /** - * Read column range metadata from log file. 
- */ - @VisibleForTesting - protected static List> getLogFileColumnRangeMetadata(String filePath, - HoodieTableMetaClient datasetMetaClient, - List columnsToIndex, - Option writerSchemaOpt, - int maxBufferSize) throws IOException { - if (writerSchemaOpt.isPresent()) { - List fieldsToIndex = writerSchemaOpt.get().getFields().stream() - .filter(field -> columnsToIndex.contains(field.name())) - .collect(Collectors.toList()); - // read log file records without merging - List hoodieRecords = new ArrayList<>(); - HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() - .withFileSystem(datasetMetaClient.getFs()) - .withBasePath(datasetMetaClient.getBasePath()) - .withLogFilePaths(Collections.singletonList(filePath)) - .withBufferSize(maxBufferSize) - .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) - .withReaderSchema(writerSchemaOpt.get()) - .withLogRecordScannerCallback(hoodieRecords::add) - .build(); - scanner.scan(); - if (hoodieRecords.isEmpty()) { - return Collections.emptyList(); - } - // Extract IndexedRecord from HoodieRecord to use with existing collectColumnRangeMetadata - List records = new ArrayList<>(); - for (HoodieRecord hoodieRecord : hoodieRecords) { - try { - Option insertValue = ((HoodieRecordPayload) hoodieRecord.getData()).getInsertValue(writerSchemaOpt.get()); - if (insertValue.isPresent()) { - records.add(insertValue.get()); - } - } catch (IOException e) { - LOG.warn("Failed to get insert value for record: {}", e.getMessage()); - } - } - if (records.isEmpty()) { - return Collections.emptyList(); - } - Map> columnRangeMetadataMap = - collectColumnRangeMetadata(records, fieldsToIndex, getFileNameFromPath(filePath)); - return new ArrayList<>(columnRangeMetadataMap.values()); - } - return Collections.emptyList(); - } - /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java index 72a8bf4cd26f8..00ffb1baa397c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java @@ -42,9 +42,11 @@ public class MetadataRecordsGenerationParams implements Serializable { private final int columnStatsIndexParallelism; private final List targetColumnsForColumnStatsIndex; private final List targetColumnsForBloomFilterIndex; + private final int maxReaderBufferSize; - MetadataRecordsGenerationParams(HoodieTableMetaClient dataMetaClient, List enabledPartitionTypes, String bloomFilterType, int bloomIndexParallelism, - boolean isColumnStatsIndexEnabled, int columnStatsIndexParallelism, List targetColumnsForColumnStatsIndex, List targetColumnsForBloomFilterIndex) { + MetadataRecordsGenerationParams(HoodieTableMetaClient dataMetaClient, List enabledPartitionTypes, String bloomFilterType, + int bloomIndexParallelism, boolean isColumnStatsIndexEnabled, int columnStatsIndexParallelism, List targetColumnsForColumnStatsIndex, + List targetColumnsForBloomFilterIndex, int maxReaderBufferSize) { this.dataMetaClient = dataMetaClient; this.enabledPartitionTypes = enabledPartitionTypes; this.bloomFilterType = bloomFilterType; @@ -53,6 +55,7 @@ public class MetadataRecordsGenerationParams 
implements Serializable { this.columnStatsIndexParallelism = columnStatsIndexParallelism; this.targetColumnsForColumnStatsIndex = targetColumnsForColumnStatsIndex; this.targetColumnsForBloomFilterIndex = targetColumnsForBloomFilterIndex; + this.maxReaderBufferSize = maxReaderBufferSize; } public HoodieTableMetaClient getDataMetaClient() { @@ -86,4 +89,8 @@ public List getTargetColumnsForColumnStatsIndex() { public List getSecondaryKeysForBloomFilterIndex() { return targetColumnsForBloomFilterIndex; } + + public int getMaxReaderBufferSize() { + return maxReaderBufferSize; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index cde9341f5cdf1..e2c989c92f582 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -34,16 +34,17 @@ import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Tests {@link HoodieMetadataPayload}. */ public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + public static final String PARTITION_NAME = "2022/10/01"; @Test public void testFileSystemMetadataPayloadMerging() { - String partitionName = "2022/10/01"; - Map firstCommitAddedFiles = createImmutableMap( Pair.of("file1.parquet", 1000L), Pair.of("file2.parquet", 2000L), @@ -51,7 +52,7 @@ public void testFileSystemMetadataPayloadMerging() { ); HoodieRecord firstPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, firstCommitAddedFiles, Collections.emptyList()); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); Map secondCommitAddedFiles = createImmutableMap( // NOTE: This is an append @@ -63,13 +64,13 @@ public void testFileSystemMetadataPayloadMerging() { List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); HoodieRecord secondPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, secondCommitAddedFiles, secondCommitDeletedFiles); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); HoodieMetadataPayload combinedPartitionFilesRecordPayload = secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, createImmutableMap( Pair.of("file2.parquet", 2000L), Pair.of("file3.parquet", 3333L), @@ -84,7 +85,6 @@ public void testFileSystemMetadataPayloadMerging() { @Test public void testColumnStatsPayloadMerging() throws IOException { - String partitionPath = "2022/10/01"; String fileName = "file.parquet"; String targetColName = "c1"; @@ -92,7 +92,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); HoodieRecord columnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1Metadata), false) + 
HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false) .findFirst().get(); //////////////////////////////////////////////////////////////////////// @@ -105,7 +105,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); HoodieRecord updatedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1AppendedBlockMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false) .findFirst().get(); HoodieMetadataPayload combinedMetadataPayload = @@ -115,7 +115,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); HoodieRecord expectedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(expectedColumnRangeMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false) .findFirst().get(); // Assert combined payload @@ -135,7 +135,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.stub(fileName, targetColName); HoodieRecord deletedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1StubbedMetadata), true) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true) .findFirst().get(); // NOTE: In this case, deleted (or tombstone) record will be therefore deleting @@ -144,6 +144,8 @@ public void testColumnStatsPayloadMerging() throws IOException { deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); + assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); + assertTrue(deletedCombinedMetadataPayload.isDeleted()); // NOTE: In this case, proper incoming record will be overwriting previously deleted // record diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java deleted file mode 100644 index ce2cae78342c8..0000000000000 --- a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.metadata; - -import org.apache.hudi.common.model.HoodieColumnRangeMetadata; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; - -import org.apache.avro.generic.IndexedRecord; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -/** - * Tests {@link HoodieMetadataPayload}. - */ -public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { - public static final String PARTITION_NAME = "2022/10/01"; - public static final String PARTITION_NAME2 = "2023/10/01"; - public static final String PARTITION_NAME3 = "2024/10/01"; - - @Test - public void testFileSystemMetadataPayloadMerging() { - Map firstCommitAddedFiles = createImmutableMap( - Pair.of("file1.parquet", 1000L), - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3000L) - ); - - HoodieRecord firstPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); - - Map secondCommitAddedFiles = createImmutableMap( - // NOTE: This is an append - Pair.of("file3.parquet", 3333L), - Pair.of("file4.parquet", 4000L), - Pair.of("file5.parquet", 5000L) - ); - - List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); - - HoodieRecord secondPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); - - HoodieMetadataPayload combinedPartitionFilesRecordPayload = - secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); - - HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3333L), - Pair.of("file4.parquet", 4000L), - Pair.of("file5.parquet", 5000L) - ), - Collections.emptyList() - ).getData(); - - assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); - } - - @Test - public void testFileSystemMetadataPayloadMergingWithDeletions() { - Map addedFileMap = createImmutableMap( - Pair.of("file1.parquet", 1000L), - Pair.of("file2.parquet", 2000L), - Pair.of("file3.parquet", 3000L), - Pair.of("file4.parquet", 4000L) - ); - HoodieRecord additionRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, addedFileMap, Collections.emptyList()); - - List deletedFileList1 = new ArrayList<>(); - deletedFileList1.add("file1.parquet"); - deletedFileList1.add("file3.parquet"); - HoodieRecord deletionRecord1 = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList1); - - List deletedFileList2 = new ArrayList<>(); - deletedFileList2.add("file1.parquet"); - deletedFileList2.add("file4.parquet"); - HoodieRecord deletionRecord2 = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList2); - - assertEquals( - 
HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L), - Pair.of("file4.parquet", 4000L) - ), - Collections.emptyList() - ).getData(), - deletionRecord1.getData().preCombine(additionRecord.getData()) - ); - - List expectedDeleteFileList = new ArrayList<>(); - expectedDeleteFileList.add("file1.parquet"); - expectedDeleteFileList.add("file3.parquet"); - expectedDeleteFileList.add("file4.parquet"); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - Collections.emptyMap(), - expectedDeleteFileList - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData()) - ); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L) - ), - Collections.emptyList() - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData()).preCombine(additionRecord.getData()) - ); - - assertEquals( - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, - createImmutableMap( - Pair.of("file2.parquet", 2000L) - ), - Collections.singletonList("file1.parquet") - ).getData(), - deletionRecord2.getData().preCombine(deletionRecord1.getData().preCombine(additionRecord.getData())) - ); - - // lets delete all files - List allDeletedFileList = new ArrayList<>(); - allDeletedFileList.add("file1.parquet"); - allDeletedFileList.add("file2.parquet"); - allDeletedFileList.add("file3.parquet"); - allDeletedFileList.add("file4.parquet"); - HoodieRecord allDeletionRecord = - HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), allDeletedFileList); - - HoodieMetadataPayload combinedPayload = allDeletionRecord.getData().preCombine(additionRecord.getData()); - assertEquals(HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), Collections.emptyList()).getData(), combinedPayload); - assertTrue(combinedPayload.filesystemMetadata.isEmpty()); - - // test all partition record - HoodieRecord allPartitionsRecord = HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME, PARTITION_NAME2, PARTITION_NAME3), false); - HoodieRecord partitionDeletedRecord = HoodieMetadataPayload.createPartitionListRecord(Collections.singletonList(PARTITION_NAME), true); - // combine to ensure the deleted partitions is not seen - HoodieMetadataPayload payload = partitionDeletedRecord.getData().preCombine(allPartitionsRecord.getData()); - assertEquals(HoodieMetadataPayload.createPartitionListRecord(Arrays.asList(PARTITION_NAME2, PARTITION_NAME3), false).getData(), - payload); - } - - @Test - public void testColumnStatsPayloadMerging() throws IOException { - String fileName = "file.parquet"; - String targetColName = "c1"; - - HoodieColumnRangeMetadata c1Metadata = - HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); - - HoodieRecord columnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false) - .findFirst().get(); - - //////////////////////////////////////////////////////////////////////// - // Case 1: Combining proper (non-deleted) records - //////////////////////////////////////////////////////////////////////// - - // NOTE: Column Stats record will only be merged in case existing file will be modified, - // which could only happen on storages schemes supporting appends - HoodieColumnRangeMetadata c1AppendedBlockMetadata = - 
HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); - - HoodieRecord updatedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false) - .findFirst().get(); - - HoodieMetadataPayload combinedMetadataPayload = - columnStatsRecord.getData().preCombine(updatedColumnStatsRecord.getData()); - - HoodieColumnRangeMetadata expectedColumnRangeMetadata = - HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); - - HoodieRecord expectedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false) - .findFirst().get(); - - // Assert combined payload - assertEquals(combinedMetadataPayload, expectedColumnStatsRecord.getData()); - - Option alternativelyCombinedMetadataPayloadAvro = - columnStatsRecord.getData().combineAndGetUpdateValue(updatedColumnStatsRecord.getData().getInsertValue(null).get(), null); - - // Assert that using legacy API yields the same value - assertEquals(combinedMetadataPayload.getInsertValue(null), alternativelyCombinedMetadataPayloadAvro); - - //////////////////////////////////////////////////////////////////////// - // Case 2: Combining w/ deleted records - //////////////////////////////////////////////////////////////////////// - - HoodieColumnRangeMetadata c1StubbedMetadata = - HoodieColumnRangeMetadata.stub(fileName, targetColName); - - HoodieRecord deletedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true) - .findFirst().get(); - - // NOTE: In this case, deleted (or tombstone) record will be therefore deleting - // previous state of the record - HoodieMetadataPayload deletedCombinedMetadataPayload = - deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); - - assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); - assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); - assertTrue(deletedCombinedMetadataPayload.isDeleted()); - - // NOTE: In this case, proper incoming record will be overwriting previously deleted - // record - HoodieMetadataPayload overwrittenCombinedMetadataPayload = - columnStatsRecord.getData().preCombine(deletedColumnStatsRecord.getData()); - - assertEquals(columnStatsRecord.getData(), overwrittenCombinedMetadataPayload); - } - - @Test - public void testPartitionStatsPayloadMerging() { - HoodieColumnRangeMetadata fileColumnRange1 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 5, 0, 10, 100, 200); - HoodieRecord firstPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get(); - HoodieColumnRangeMetadata fileColumnRange2 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - HoodieRecord updatedPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), false, false).findFirst().get(); - HoodieMetadataPayload combinedPartitionStatsRecordPayload = - updatedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData()); - HoodieColumnRangeMetadata expectedColumnRange = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 8, 1, 25, 220, 450); - HoodieMetadataPayload 
expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords( - PARTITION_NAME, Collections.singletonList(expectedColumnRange), false, false).findFirst().get().getData(); - assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload); - } - - @Test - public void testPartitionStatsPayloadMergingWithDelete() { - HoodieColumnRangeMetadata fileColumnRange1 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 1, 5, 0, 10, 100, 200); - HoodieRecord firstPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange1), false, false).findFirst().get(); - HoodieColumnRangeMetadata fileColumnRange2 = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - // create delete payload - HoodieRecord deletedPartitionStatsRecord = - HoodieMetadataPayload.createPartitionStatsRecords(PARTITION_NAME, Collections.singletonList(fileColumnRange2), true, false).findFirst().get(); - // deleted (or tombstone) record will be therefore deleting previous state of the record - HoodieMetadataPayload combinedPartitionStatsRecordPayload = - deletedPartitionStatsRecord.getData().preCombine(firstPartitionStatsRecord.getData()); - HoodieColumnRangeMetadata expectedColumnRange = HoodieColumnRangeMetadata.create( - "path/to/file", "columnName", 3, 8, 1, 15, 120, 250); - HoodieMetadataPayload expectedColumnRangeMetadata = (HoodieMetadataPayload) HoodieMetadataPayload.createPartitionStatsRecords( - PARTITION_NAME, Collections.singletonList(expectedColumnRange), true, false).findFirst().get().getData(); - assertEquals(expectedColumnRangeMetadata, combinedPartitionStatsRecordPayload); - - // another update for the same key should overwrite the delete record - HoodieMetadataPayload overwrittenCombinedPartitionStatsRecordPayload = - firstPartitionStatsRecord.getData().preCombine(deletedPartitionStatsRecord.getData()); - assertEquals(firstPartitionStatsRecord.getData(), overwrittenCombinedPartitionStatsRecordPayload); - } -} diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java deleted file mode 100644 index 9586171d97aa5..0000000000000 --- a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.hudi.metadata; - -import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.common.data.HoodieData; -import org.apache.hudi.common.engine.EngineType; -import org.apache.hudi.common.engine.HoodieLocalEngineContext; -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieColumnRangeMetadata; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.testutils.FileCreateUtils; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.io.storage.HoodieFileWriter; -import org.apache.hudi.io.storage.HoodieFileWriterFactory; -import org.apache.hudi.storage.StoragePath; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.net.URI; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestHoodieTableMetadataUtil extends HoodieCommonTestHarness { - - private static HoodieTestTable hoodieTestTable; - private static final List DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01"); - - @BeforeEach - public void setUp() throws IOException { - initMetaClient(); - initTestDataGenerator(DATE_PARTITIONS.toArray(new String[0])); - hoodieTestTable = HoodieTestTable.of(metaClient); - } - - @AfterEach - public void tearDown() throws IOException { - metaClient.getStorage().deleteDirectory(metaClient.getBasePath()); - cleanupTestDataGenerator(); - cleanMetaClient(); - } - - @Test - public void testReadRecordKeysFromBaseFilesWithEmptyPartitionBaseFilePairs() { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - List> partitionFileSlicePairs = Collections.emptyList(); - HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( - engineContext, - partitionFileSlicePairs, - false, - 1, - "activeModule", - metaClient, - EngineType.SPARK - ); - assertTrue(result.isEmpty()); - } - - @Test - public void testConvertFilesToPartitionStatsRecords() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant1 = "20230918120000000"; - hoodieTestTable = hoodieTestTable.addCommit(instant1); - String instant2 = "20230918121110000"; - hoodieTestTable = hoodieTestTable.addCommit(instant2); - List partitionInfoList = new ArrayList<>(); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. 
- DATE_PARTITIONS.forEach(p -> { - try { - URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); - StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); - String fileId1 = UUID.randomUUID().toString(); - FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); - StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); - writeParquetFile( - instant1, - storagePath1, - dataGen.generateInsertsForPartition(instant1, 10, p), - metaClient, - engineContext); - HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); - fileSlice1.setBaseFile(baseFile1); - String fileId2 = UUID.randomUUID().toString(); - FileSlice fileSlice2 = new FileSlice(p, instant2, fileId2); - StoragePath storagePath2 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId2).toUri()); - writeParquetFile( - instant2, - storagePath2, - dataGen.generateInsertsForPartition(instant2, 10, p), - metaClient, - engineContext); - HoodieBaseFile baseFile2 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId2).toString()); - fileSlice2.setBaseFile(baseFile2); - partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( - p, - metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), - instant2, - Collections.emptySet())); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - - List columnsToIndex = Arrays.asList("rider", "driver"); - HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( - engineContext, - partitionInfoList, - HoodieMetadataConfig.newBuilder().enable(true) - .withMetadataIndexColumnStats(true) - .withMetadataIndexPartitionStats(true) - .withColumnStatsIndexForColumns("rider,driver") - .withPartitionStatsIndexParallelism(1) - .build(), - metaClient, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); - // Validate the result. - validatePartitionStats(result, instant1, instant2); - } - - @Test - public void testReadRecordKeysFromBaseFilesWithValidRecords() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant = "20230918120000000"; - hoodieTestTable = hoodieTestTable.addCommit(instant); - Set recordKeys = new HashSet<>(); - final List> partitionFileSlicePairs = new ArrayList<>(); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. - DATE_PARTITIONS.forEach(p -> { - try { - List hoodieRecords = dataGen.generateInsertsForPartition(instant, 10, p); - String fileId = UUID.randomUUID().toString(); - FileSlice fileSlice = new FileSlice(p, instant, fileId); - writeParquetFile( - instant, - new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId).toUri()), - hoodieRecords, - metaClient, - engineContext); - HoodieBaseFile baseFile = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId).toString(), fileId, instant, null); - fileSlice.setBaseFile(baseFile); - partitionFileSlicePairs.add(Pair.of(p, fileSlice)); - recordKeys.addAll(hoodieRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toSet())); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - - // Call the method readRecordKeysFromBaseFiles with the created partitionBaseFilePairs. 
- HoodieData result = HoodieTableMetadataUtil.readRecordKeysFromFileSlices( - engineContext, - partitionFileSlicePairs, - false, - 1, - "activeModule", - metaClient, - EngineType.SPARK - ); - // Validate the result. - List records = result.collectAsList(); - assertEquals(30, records.size()); - assertEquals(MetadataPartitionType.RECORD_INDEX.getPartitionPath(), records.get(0).getPartitionPath()); - for (HoodieRecord record : records) { - assertTrue(recordKeys.contains(record.getRecordKey())); - } - } - - @Test - public void testGetLogFileColumnRangeMetadata() throws Exception { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); - String instant1 = "20230918120000000"; - - HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); - commitMetadata.addMetadata("test", "test"); - commitMetadata.setOperationType(WriteOperationType.INSERT); - commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS.toString()); - hoodieTestTable = hoodieTestTable.addCommit(instant1, Option.of(commitMetadata)); - String instant2 = "20230918121110000"; - hoodieTestTable = hoodieTestTable.addCommit(instant2); - List partitionInfoList = new ArrayList<>(); - List columnsToIndex = Arrays.asList("rider", "driver"); - // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys. - DATE_PARTITIONS.forEach(p -> { - try { - URI partitionMetaFile = FileCreateUtils.createPartitionMetaFile(basePath, p); - StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile); - String fileId1 = UUID.randomUUID().toString(); - // add only one parquet file in first file slice - FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1); - StoragePath storagePath1 = new StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri()); - writeParquetFile(instant1, storagePath1, dataGen.generateInsertsForPartition(instant1, 10, p), metaClient, engineContext); - HoodieBaseFile baseFile1 = new HoodieBaseFile(hoodieTestTable.getBaseFilePath(p, fileId1).toString()); - fileSlice1.setBaseFile(baseFile1); - // add log file in second file slice with higher rider and driver values (which are concatenated with instant) - FileSlice fileSlice2 = new FileSlice(p, instant2, fileId1); - fileSlice2.setBaseFile(baseFile1); - StoragePath storagePath2 = new StoragePath(partitionMetadataPath.getParent(), hoodieTestTable.getLogFileNameById(fileId1, 1)); - writeLogFiles(new StoragePath(metaClient.getBasePath(), p), HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, dataGen.generateInsertsForPartition(instant2, 10, p), 1, - metaClient.getStorage(), new Properties(), fileId1, instant2); - fileSlice2.addLogFile(new HoodieLogFile(storagePath2.toUri().toString())); - partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo( - p, - metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, storagePath1, storagePath2)), - instant2, - Collections.emptySet())); - // NOTE: we need to set table config as we are not using write client explicitly and these configs are needed for log record reader - metaClient.getTableConfig().setValue(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); - metaClient.getTableConfig().setValue(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); - metaClient.getTableConfig().setValue(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); - List> columnRangeMetadataLogFile = HoodieTableMetadataUtil.getLogFileColumnRangeMetadata( - 
storagePath2.toString(), - metaClient, - columnsToIndex, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS), - HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue()); - // there must be two ranges for rider and driver - assertEquals(2, columnRangeMetadataLogFile.size()); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - // collect partition stats, this will collect stats for log files as well - HoodieData result = HoodieTableMetadataUtil.convertFilesToPartitionStatsRecords( - engineContext, - partitionInfoList, - HoodieMetadataConfig.newBuilder().enable(true) - .withMetadataIndexColumnStats(true) - .withMetadataIndexPartitionStats(true) - .withColumnStatsIndexForColumns("rider,driver") - .withPartitionStatsIndexParallelism(1) - .build(), - metaClient, - Option.of(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)); - // Validate the result. - validatePartitionStats(result, instant1, instant2); - } - - private static void validatePartitionStats(HoodieData result, String instant1, String instant2) { - List records = result.collectAsList(); - // 3 partitions * 2 columns = 6 partition stats records - assertEquals(6, records.size()); - assertEquals(MetadataPartitionType.PARTITION_STATS.getPartitionPath(), records.get(0).getPartitionPath()); - ((HoodieMetadataPayload) result.collectAsList().get(0).getData()).getColumnStatMetadata().get().getColumnName(); - records.forEach(r -> { - HoodieMetadataPayload payload = (HoodieMetadataPayload) r.getData(); - assertTrue(payload.getColumnStatMetadata().isPresent()); - // instant1 < instant2 so instant1 should be in the min value and instant2 should be in the max value. - if (payload.getColumnStatMetadata().get().getColumnName().equals("rider")) { - assertEquals(String.format("{\"value\": \"rider-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); - assertEquals(String.format("{\"value\": \"rider-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); - } else if (payload.getColumnStatMetadata().get().getColumnName().equals("driver")) { - assertEquals(String.format("{\"value\": \"driver-%s\"}", instant1), String.valueOf(payload.getColumnStatMetadata().get().getMinValue())); - assertEquals(String.format("{\"value\": \"driver-%s\"}", instant2), String.valueOf(payload.getColumnStatMetadata().get().getMaxValue())); - } - }); - } - - private static void writeParquetFile(String instant, - StoragePath path, - List records, - HoodieTableMetaClient metaClient, - HoodieLocalEngineContext engineContext) throws IOException { - HoodieFileWriter writer = HoodieFileWriterFactory.getFileWriter( - instant, - path, - metaClient.getStorage(), - metaClient.getTableConfig(), - HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, - engineContext.getTaskContextSupplier(), - HoodieRecord.HoodieRecordType.AVRO); - for (HoodieRecord record : records) { - writer.writeWithMetadata(record.getKey(), record, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS); - } - writer.close(); - } -} From 5419e5d296645a50c33113a8acc4ca8f10dfe315 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:42:01 +0530 Subject: [PATCH 06/12] Fix TestHoodieMetadataPayload tests --- .../org/apache/hudi/metadata/TestHoodieMetadataPayload.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index 
e2c989c92f582..715fb25eb3c4f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -144,8 +144,6 @@ public void testColumnStatsPayloadMerging() throws IOException { deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); - assertFalse(deletedCombinedMetadataPayload.getInsertValue(null).isPresent()); - assertTrue(deletedCombinedMetadataPayload.isDeleted()); // NOTE: In this case, proper incoming record will be overwriting previously deleted // record From 099f0237e5e6cc92cb5301fce54ec684a3e8f442 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:48:24 +0530 Subject: [PATCH 07/12] fix scala issues --- .../apache/hudi/functional/ColumnStatIndexTestBase.scala | 6 +++--- .../org/apache/hudi/functional/TestColumnStatsIndex.scala | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index ba29a4c36bf15..1b81141516731 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -124,7 +124,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) fsv.loadAllPartitions() val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) - val baseFilesList = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) + val baseFilesList = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) val baseFiles = baseFilesList.stream() .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala @@ -164,7 +164,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { baseFilesDf // COW table } else { - val allLogFiles = filegroupList.stream().flatMap(fileGroup => fileGroup.getAllFileSlices) + val allLogFiles = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllFileSlices) .flatMap(fileSlice => fileSlice.getLogFiles) .collect(Collectors.toList[HoodieLogFile]) if (allLogFiles.isEmpty) { @@ -188,7 +188,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], maxBufferSize: Integer, indexSchema: StructType): DataFrame = { - val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]](logFile => { + val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]]((logFile: HoodieLogFile) => { try { getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) } catch { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 7d0bacf03bf77..d5dec40e0ad38 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -24,7 +24,7 @@ import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig, HoodieStorageConfig} -import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.model.{FileSlice, HoodieTableType} import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.{HoodieCleanConfig, HoodieCompactionConfig, HoodieWriteConfig} @@ -173,9 +173,9 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { fsv.loadAllPartitions() val basePath2 = new Path(basePath) val allPartitionPaths = fsv.getPartitionPaths - allPartitionPaths.forEach(partitionPath => { + allPartitionPaths.forEach((partitionPath: Path) => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter(fileSlice => fileSlice.getLogFiles.findAny().isPresent).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).filter((fileSlice: FileSlice) => fileSlice.getLogFiles.findAny().isPresent).count() > 0) }) fsv.close() } From 74b57ec58ecfcc91914ddd172c834de5f823ef5f Mon Sep 17 00:00:00 2001 From: Vamsi Date: Fri, 13 Mar 2026 23:50:58 +0530 Subject: [PATCH 08/12] Fix checkstyle --- .../org/apache/hudi/metadata/TestHoodieMetadataPayload.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index 715fb25eb3c4f..4f022d7e0dafb 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -34,8 +34,6 @@ import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; /** * Tests {@link HoodieMetadataPayload}. 
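The payload tests touched above pin down a tombstone-aware merge rule: in preCombine the newer record wins outright whenever either side is a delete (a newer tombstone erases the older stats, and a newer valid record overwrites an older tombstone), while two valid records have their column ranges widened and their counts summed. The following is a minimal sketch of that rule, not Hudi's actual HoodieMetadataPayload API; StatsPayload and its fields are hypothetical stand-ins kept to the handful of stats the tests assert on.

// Sketch of the tombstone-aware preCombine rule exercised by the tests above.
// StatsPayload is an illustrative stand-in, not the real HoodieMetadataPayload.
final class StatsPayload {
  final long minValue;
  final long maxValue;
  final long valueCount;
  final boolean isDeleted;

  StatsPayload(long minValue, long maxValue, long valueCount, boolean isDeleted) {
    this.minValue = minValue;
    this.maxValue = maxValue;
    this.valueCount = valueCount;
    this.isDeleted = isDeleted;
  }

  // 'this' is the newer record for a key; 'older' is its previous state.
  StatsPayload preCombine(StatsPayload older) {
    if (this.isDeleted || older.isDeleted) {
      // The newer side wins: a tombstone erases older stats, and a fresh
      // valid record overwrites a previously deleted one.
      return this;
    }
    // Both sides valid: widen the value range and accumulate the counts.
    return new StatsPayload(
        Math.min(this.minValue, older.minValue),
        Math.max(this.maxValue, older.maxValue),
        this.valueCount + older.valueCount,
        false);
  }
}

Applied to testColumnStatsPayloadMerging, merging the appended block's range [0, 500] with value count 100 into the original range [0, 1000] with value count 1000 yields [0, 1000] with value count 1100, which is exactly the expected record the test builds.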
From d80a4b6c8a47f3f1aa8040e803ac29d0f76c00a7 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Sun, 15 Mar 2026 17:49:41 +0530 Subject: [PATCH 09/12] fix CI --- .../functional/ColumnStatIndexTestBase.scala | 19 ++++++++++--------- .../functional/TestColumnStatsIndex.scala | 4 ++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index 1b81141516731..cc9a2b8551a70 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -124,9 +124,9 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val fsv = FileSystemViewManager.createInMemoryFileSystemView(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()) fsv.loadAllPartitions() val filegroupList = fsv.getAllFileGroups.collect(Collectors.toList[HoodieFileGroup]) - val baseFilesList = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllBaseFiles).collect(Collectors.toList[HoodieBaseFile]) - val baseFiles = baseFilesList.stream() - .map[Path](baseFile => new Path(baseFile.getPath)).collect(Collectors.toList[Path]).asScala + val baseFiles = filegroupList.asScala + .flatMap(fileGroup => fileGroup.getAllBaseFiles.iterator().asScala) + .map(baseFile => new Path(baseFile.getPath)) val baseFilesDf = spark.createDataFrame( baseFiles.flatMap(file => { @@ -164,9 +164,10 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { if (metaClient.getTableConfig.getTableType == HoodieTableType.COPY_ON_WRITE) { baseFilesDf // COW table } else { - val allLogFiles = filegroupList.stream().flatMap((fileGroup: HoodieFileGroup) => fileGroup.getAllFileSlices) - .flatMap(fileSlice => fileSlice.getLogFiles) - .collect(Collectors.toList[HoodieLogFile]) + val allLogFiles = filegroupList.asScala + .flatMap(fileGroup => fileGroup.getAllFileSlices.iterator().asScala) + .flatMap(fileSlice => fileSlice.getLogFiles.iterator().asScala) + .toList.asJava if (allLogFiles.isEmpty) { baseFilesDf // MOR table, but no log files. 
} else { @@ -174,7 +175,7 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { val writerSchemaOpt = LogFileColStatsTestUtil.getSchemaForTable(metaClient) val latestCompletedCommit = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant().get().getTimestamp baseFilesDf.union(getColStatsFromLogFiles(allLogFiles, latestCompletedCommit, - scala.collection.JavaConverters.seqAsJavaList(colsToGenerateStats), + colsToGenerateStats.asJava, metaClient, writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], HoodieMetadataConfig.MAX_READER_BUFFER_SIZE_PROP.defaultValue(), @@ -188,14 +189,14 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { writerSchemaOpt: org.apache.hudi.common.util.Option[Schema], maxBufferSize: Integer, indexSchema: StructType): DataFrame = { - val colStatsEntries = logFiles.stream().map[org.apache.hudi.common.util.Option[Row]]((logFile: HoodieLogFile) => { + val colStatsEntries = logFiles.asScala.map(logFile => { try { getColStatsFromLogFile(logFile.getPath.toString, latestCommit, columnsToIndex, datasetMetaClient, writerSchemaOpt, maxBufferSize) } catch { case e: Exception => throw e } - }).filter(rowOpt => rowOpt.isPresent).map[Row](rowOpt => rowOpt.get()).collect(Collectors.toList[Row]) + }).filter(rowOpt => rowOpt.isPresent).map(rowOpt => rowOpt.get()).toList.asJava spark.createDataFrame(colStatsEntries, indexSchema) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index d5dec40e0ad38..bb8bf8856eed3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -173,9 +173,9 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { fsv.loadAllPartitions() val basePath2 = new Path(basePath) val allPartitionPaths = fsv.getPartitionPaths - allPartitionPaths.forEach((partitionPath: Path) => { + allPartitionPaths.asScala.foreach(partitionPath => { val pPath = FSUtils.getRelativePartitionPath(basePath2, partitionPath) - assertTrue (fsv.getLatestFileSlices(pPath).filter((fileSlice: FileSlice) => fileSlice.getLogFiles.findAny().isPresent).count() > 0) + assertTrue (fsv.getLatestFileSlices(pPath).iterator().asScala.count(fileSlice => fileSlice.getLogFiles.findAny().isPresent) > 0) }) fsv.close() } From 82913a25f4b2180142e47799e5d60a5c34145248 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Tue, 17 Mar 2026 15:08:59 +0530 Subject: [PATCH 10/12] Fix tests in TestColumnStatsIndex --- .../HoodieBackedTableMetadataWriter.java | 62 +++++++++---------- .../log/HoodieUnMergedLogRecordScanner.java | 2 +- .../hudi/metadata/HoodieMetadataPayload.java | 18 +++--- .../metadata/HoodieTableMetadataUtil.java | 52 +++++++++++++++- 4 files changed, 92 insertions(+), 42 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 2f1ab37bf52b6..5d53fdb0f7650 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -341,9 +341,7 @@ private 
boolean isBootstrapNeeded(Option latestMetadataInstant) { */ private void initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { - if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { - return; - } + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); @@ -368,11 +366,11 @@ private void initializeFromFilesystem(String initializationTime, List partitionInfoList; if (filesPartitionAvailable) { - partitionInfoList = listAllPartitionsFromMDT(initializationTime); + partitionInfoList = listAllPartitionsFromMDT(initializationTime, pendingDataInstants); } else { // if auto initialization is enabled, then we need to list all partitions from the file system if (dataWriteConfig.getMetadataConfig().shouldAutoInitialize()) { - partitionInfoList = listAllPartitionsFromFilesystem(initializationTime); + partitionInfoList = listAllPartitionsFromFilesystem(initializationTime, pendingDataInstants); } else { // if auto initialization is disabled, we can return an empty list partitionInfoList = Collections.emptyList(); @@ -545,22 +543,14 @@ private Pair> initializeFilesPartition(List inflightInstantTimestamp) { - // We can only initialize if there are no pending operations on the dataset - List pendingDataInstant = dataMetaClient.getActiveTimeline() + private Set getPendingDataInstants(HoodieTableMetaClient dataMetaClient) { + // Initialize excluding the pending operations on the dataset + return dataMetaClient.getActiveTimeline() .getInstantsAsStream().filter(i -> !i.isCompleted()) - .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) // regular writers should not be blocked due to pending indexing action .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) - .collect(Collectors.toList()); - - if (!pendingDataInstant.isEmpty()) { - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); - LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " - + Arrays.toString(pendingDataInstant.toArray())); - return true; - } - return false; + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); } private HoodieTableMetaClient initializeMetaClient() throws IOException { @@ -582,7 +572,7 @@ private HoodieTableMetaClient initializeMetaClient() throws IOException { * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. 
*/ - private List listAllPartitionsFromFilesystem(String initializationTime) { + private List listAllPartitionsFromFilesystem(String initializationTime, Set pendingDataInstants) { List pathsToList = new LinkedList<>(); pathsToList.add(new SerializablePath(new CachingPath(dataWriteConfig.getBasePath()))); @@ -601,7 +591,7 @@ private List listAllPartitionsFromFilesystem(String initializatio List processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> { FileSystem fs = path.get().getFileSystem(conf.get()); String relativeDirPath = FSUtils.getRelativePartitionPath(serializableBasePath.get(), path.get()); - return new DirectoryInfo(relativeDirPath, fs.listStatus(path.get()), initializationTime); + return new DirectoryInfo(relativeDirPath, fs.listStatus(path.get()), initializationTime, pendingDataInstants); }, numDirsToList); pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size())); @@ -638,14 +628,14 @@ private List listAllPartitionsFromFilesystem(String initializatio * @param initializationTime Files which have a timestamp after this are neglected * @return List consisting of {@code DirectoryInfo} for each partition found. */ - private List listAllPartitionsFromMDT(String initializationTime) throws IOException { + private List listAllPartitionsFromMDT(String initializationTime, Set pendingDataInstants) throws IOException { List dirinfoList = new LinkedList<>(); List allAbsolutePartitionPaths = metadata.getAllPartitionPaths().stream() .map(partitionPath -> dataWriteConfig.getBasePath() + "/" + partitionPath).collect(Collectors.toList()); Map partitionFileMap = metadata.getAllFilesInPartitions(allAbsolutePartitionPaths); for (Map.Entry entry : partitionFileMap.entrySet()) { String relativeDirPath = FSUtils.getRelativePartitionPath(new Path(dataWriteConfig.getBasePath()), new Path(entry.getKey())); - dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime)); + dirinfoList.add(new DirectoryInfo(relativeDirPath, entry.getValue(), initializationTime, pendingDataInstants, false)); } return dirinfoList; } @@ -930,7 +920,7 @@ public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { // Restore requires the existing pipelines to be shutdown. So we can safely scan the dataset to find the current // list of files in the filesystem. - List dirInfoList = listAllPartitionsFromFilesystem(instantTime); + List dirInfoList = listAllPartitionsFromFilesystem(instantTime, Collections.emptySet()); Map dirInfoMap = dirInfoList.stream().collect(Collectors.toMap(DirectoryInfo::getRelativePath, Function.identity())); dirInfoList.clear(); @@ -1489,29 +1479,39 @@ static class DirectoryInfo implements Serializable { // Is this a hoodie partition private boolean isHoodiePartition = false; - public DirectoryInfo(String relativePath, FileStatus[] fileStatus, String maxInstantTime) { + public DirectoryInfo(String relativePath, FileStatus[] fileStatuses, String maxInstantTime, Set pendingDataInstants) { + this(relativePath, fileStatuses, maxInstantTime, pendingDataInstants, true); + } + + /** + * When files are directly fetched from Metadata table we do not need to validate HoodiePartitions. 
+ */ + public DirectoryInfo(String relativePath, FileStatus[] fileStatus, String maxInstantTime, Set pendingDataInstants, + boolean validateHoodiePartitions) { this.relativePath = relativePath; // Pre-allocate with the maximum length possible filenameToSizeMap = new HashMap<>(fileStatus.length); + // Presence of partition meta file implies this is a HUDI partition + // if input files are directly fetched from MDT, it may not contain the HoodiePartitionMetadata file. So, we can ignore the validation for isHoodiePartition. + isHoodiePartition = !validateHoodiePartitions || Arrays.stream(fileStatus).anyMatch(status -> status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)); for (FileStatus status : fileStatus) { - if (status.isDirectory()) { + // Do not attempt to search for more subdirectories inside directories that are partitions + if (!isHoodiePartition && status.isDirectory()) { // Ignore .hoodie directory as there cannot be any partitions inside it if (!status.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) { this.subDirectories.add(status.getPath()); } - } else if (status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) { - // Presence of partition meta file implies this is a HUDI partition - this.isHoodiePartition = true; - } else if (FSUtils.isDataFile(status.getPath())) { + } else if (isHoodiePartition && FSUtils.isDataFile(status.getPath())) { // Regular HUDI data file (base file or log file) String dataFileCommitTime = FSUtils.getCommitTime(status.getPath().getName()); - // Limit the file listings to files which were created before the maxInstant time. - if (HoodieTimeline.compareTimestamps(dataFileCommitTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, maxInstantTime)) { + // Limit the file listings to files which were created by successful commits before the maxInstant time. + if (!pendingDataInstants.contains(dataFileCommitTime) && HoodieTimeline.compareTimestamps(dataFileCommitTime, LESSER_THAN_OR_EQUALS, maxInstantTime)) { filenameToSizeMap.put(status.getPath().getName(), status.getLen()); } } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index f62ec0febd578..032aac8574eb1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -79,7 +79,7 @@ public void processNextRecord(HoodieRecord hoodieRecord) throws Exception @Override protected void processNextDeletedRecord(DeleteRecord deleteRecord) { - throw new IllegalStateException("Not expected to see delete records in this log-scan mode. 
Check Job Config"); + // no op } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 8b637be447f0c..8de6e4e2f4a49 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -265,32 +265,34 @@ public HoodieMetadataPayload(Option recordOpt) { } private HoodieMetadataPayload(String key, int type, Map filesystemMetadata) { - this(key, type, filesystemMetadata, null, null, null); + this(key, type, filesystemMetadata, null, null, null, false); } private HoodieMetadataPayload(String key, HoodieMetadataBloomFilter metadataBloomFilter) { - this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null, null); + this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null, null, metadataBloomFilter.getIsDeleted()); } private HoodieMetadataPayload(String key, HoodieMetadataColumnStats columnStats) { - this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats, null); + this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats, null, columnStats.getIsDeleted()); } private HoodieMetadataPayload(String key, HoodieRecordIndexInfo recordIndexMetadata) { - this(key, METADATA_TYPE_RECORD_INDEX, null, null, null, recordIndexMetadata); + this(key, METADATA_TYPE_RECORD_INDEX, null, null, null, recordIndexMetadata, false); } protected HoodieMetadataPayload(String key, int type, - Map filesystemMetadata, - HoodieMetadataBloomFilter metadataBloomFilter, - HoodieMetadataColumnStats columnStats, - HoodieRecordIndexInfo recordIndexMetadata) { + Map filesystemMetadata, + HoodieMetadataBloomFilter metadataBloomFilter, + HoodieMetadataColumnStats columnStats, + HoodieRecordIndexInfo recordIndexMetadata, + boolean isDeletedRecord) { this.key = key; this.type = type; this.filesystemMetadata = filesystemMetadata; this.bloomFilterMetadata = metadataBloomFilter; this.columnStatMetadata = columnStats; this.recordIndexMetadata = recordIndexMetadata; + this.isDeletedRecord = isDeletedRecord; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 6c1de68e043cc..5f87b8e582435 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -50,6 +50,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -62,6 +63,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Tuple3; @@ -100,6 +102,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; 
import java.util.UUID; import java.util.function.BiFunction; @@ -1123,7 +1126,7 @@ private static Stream getColumnStatsRecords(String partitionPath, } List> columnRangeMetadata = - readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex); + readColumnRangeMetadataFrom(partitionPath, fileName, datasetMetaClient, columnsToIndex, maxBufferSize); return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); } @@ -1131,13 +1134,18 @@ private static Stream getColumnStatsRecords(String partitionPath, private static List> readColumnRangeMetadataFrom(String partitionPath, String fileName, HoodieTableMetaClient datasetMetaClient, - List columnsToIndex) { + List columnsToIndex, + int maxBufferSize) { String partitionPathFileName = (partitionPath.equals(EMPTY_PARTITION_NAME) || partitionPath.equals(NON_PARTITIONED_NAME)) ? fileName : partitionPath + "/" + fileName; try { Path fullFilePath = new Path(datasetMetaClient.getBasePathV2(), partitionPathFileName); if (partitionPathFileName.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { return new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); + } else if (FSUtils.isLogFile(fileName)) { + Option writerSchemaOpt = tryResolveSchemaForTable(datasetMetaClient); + LOG.warn("Reading log file: {}, to build column range metadata.", partitionPathFileName); + return getLogFileColumnRangeMetadata(fullFilePath.toString(), datasetMetaClient, columnsToIndex, writerSchemaOpt, maxBufferSize); } LOG.warn("Column range index not supported for: {}", partitionPathFileName); @@ -1150,6 +1158,46 @@ private static List> readColumnRangeMetada } } + /** + * Read column range metadata from log file. + */ + @VisibleForTesting + protected static List> getLogFileColumnRangeMetadata(String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + Option writerSchemaOpt, + int maxBufferSize) throws IOException { + if (writerSchemaOpt.isPresent()) { + List fieldsToIndex = writerSchemaOpt.get().getFields().stream() + .filter(field -> columnsToIndex.contains(field.name())) + .collect(Collectors.toList()); + // read log file records without merging + List records = new ArrayList<>(); + HoodieUnMergedLogRecordScanner scanner = HoodieUnMergedLogRecordScanner.newBuilder() + .withFileSystem(datasetMetaClient.getFs()) + .withBasePath(datasetMetaClient.getBasePath()) + .withLogFilePaths(Collections.singletonList(filePath)) + .withBufferSize(maxBufferSize) + .withLatestInstantTime(datasetMetaClient.getActiveTimeline().getCommitsTimeline().lastInstant().get().getTimestamp()) + .withReaderSchema(writerSchemaOpt.get()) + .withLogRecordScannerCallback(records::add) + .build(); + scanner.scan(); + if (records.isEmpty()) { + return Collections.emptyList(); + } + + List indexedRecords = new ArrayList<>(); + for (HoodieRecord hoodieRecord : records) { + indexedRecords.add(hoodieRecord.toIndexedRecord(writerSchemaOpt.get(), new Properties()).get().getData()); + } + Map> columnRangeMetadataMap = + collectColumnRangeMetadata(indexedRecords, fieldsToIndex, getFileNameFromPath(filePath)); + return new ArrayList<>(columnRangeMetadataMap.values()); + } + return Collections.emptyList(); + } + /** * Does an upcast for {@link BigDecimal} instance to align it with scale/precision expected by * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type From b27240d78ef509c437ce9a087b58ae88be898fbd Mon Sep 17 00:00:00 2001 From: Vamsi Date: Wed, 18 Mar 
2026 15:22:24 +0530 Subject: [PATCH 11/12] Fix MDT Bootstrap tests --- .../HoodieBackedTableMetadataWriter.java | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 5d53fdb0f7650..bbd7427171b55 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -266,7 +266,11 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp); + if(!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { + LOG.error("Failed to initialize MDT from filesystem"); + return false; + } + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); return true; } catch (IOException e) { @@ -339,10 +343,13 @@ private boolean isBootstrapNeeded(Option latestMetadataInstant) { * @param partitionsToInit - List of MDT partitions to initialize * @param inflightInstantTimestamp - Current action instant responsible for this initialization */ - private void initializeFromFilesystem(String initializationTime, List partitionsToInit, + private boolean initializeFromFilesystem(String initializationTime, List partitionsToInit, Option inflightInstantTimestamp) throws IOException { - Set pendingDataInstants = getPendingDataInstants(dataMetaClient); + if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { + return false; + } + Set pendingDataInstants = inflightInstantTimestamp.map(Collections::singleton).orElse(Collections.emptySet()); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); if (!filesPartitionAvailable) { @@ -434,6 +441,8 @@ private void initializeFromFilesystem(String initializationTime, List getPendingDataInstants(HoodieTableMetaClient dataMetaClient) .collect(Collectors.toSet()); } + private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Option inflightInstantTimestamp) { + // We can only initialize if there are no pending operations on the dataset + List pendingDataInstant = dataMetaClient.getActiveTimeline() + .getInstantsAsStream().filter(i -> !i.isCompleted()) + .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) + .collect(Collectors.toList()); + + if (!pendingDataInstant.isEmpty()) { + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); + LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " + + 
Arrays.toString(pendingDataInstant.toArray())); + return true; + } + return false; + } + private HoodieTableMetaClient initializeMetaClient() throws IOException { return HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) From 6abac329f76d21a5b8b7afaa6e7888d26fb8dbc6 Mon Sep 17 00:00:00 2001 From: Vamsi Date: Tue, 24 Mar 2026 21:17:35 +0530 Subject: [PATCH 12/12] Fix checkstyle and tests --- .../apache/hudi/metadata/HoodieBackedTableMetadataWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 0a0473c8052a6..139d661b7e2e9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -266,7 +266,7 @@ protected boolean initializeIfNeeded(HoodieTableMetaClient dataMetaClient, // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the timestamp of the latest completed action. String initializationTime = dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - if(!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { + if (!initializeFromFilesystem(initializationTime, metadataPartitionsToInit, inflightInstantTimestamp)) { LOG.error("Failed to initialize MDT from filesystem"); return false; } @@ -349,7 +349,7 @@ private boolean initializeFromFilesystem(String initializationTime, List pendingDataInstants = inflightInstantTimestamp.map(Collections::singleton).orElse(Collections.emptySet()); + Set pendingDataInstants = getPendingDataInstants(dataMetaClient); // FILES partition is always required and is initialized first boolean filesPartitionAvailable = dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES); if (!filesPartitionAvailable) {
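Taken together, the last three patches keep the pending-instant guard for MDT initialization but also thread the set of pending data instant timestamps down to DirectoryInfo, so files written by in-flight commits are skipped file-by-file during the initial listing. Below is a compact sketch of that predicate under illustrative names; the real check lives in HoodieBackedTableMetadataWriter.DirectoryInfo and uses FSUtils.getCommitTime and HoodieTimeline.compareTimestamps.

import java.util.Set;

// Sketch of the per-file filter applied while listing partitions for MDT
// initialization. Names are illustrative stand-ins for the patched logic.
final class InitialListingFilter {

  // fileCommitTime stands in for FSUtils.getCommitTime(fileName). Hudi
  // instant times are fixed-width timestamps, so plain string comparison
  // mirrors compareTimestamps with LESSER_THAN_OR_EQUALS.
  static boolean includeFile(String fileCommitTime,
                             Set<String> pendingDataInstants,
                             String maxInstantTime) {
    return !pendingDataInstants.contains(fileCommitTime)
        && fileCommitTime.compareTo(maxInstantTime) <= 0;
  }
}

For instance, an in-flight commit can carry a timestamp that sorts before the latest completed instant, so its files would pass the timestamp check alone; the pending-set lookup is what keeps them out of the initial listing.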