[MINOR] Add optimisation for repair with logical timestamp #18478
Changes from all commits: 551a7a2, 94da78a, 5401731, 38e7a0d, 21671e2, c8b9d02, ce6e12b, c21bd5c, fcb55bc, f0fa682, b36d013
HoodieMergedReadHandle.java

```diff
@@ -56,26 +56,23 @@ public class HoodieMergedReadHandle<T, I, K, O> extends HoodieReadHandle<T, I, K, O>
   protected final Schema baseFileReaderSchema;
   private final Option<FileSlice> fileSliceOpt;

-  public HoodieMergedReadHandle(HoodieWriteConfig config,
-                                Option<String> instantTime,
-                                HoodieTable<T, I, K, O> hoodieTable,
-                                Pair<String, String> partitionPathFileIDPair) {
-    this(config, instantTime, hoodieTable, partitionPathFileIDPair, Option.empty());
+  public HoodieMergedReadHandle(HoodieWriteConfig config, Option<String> instantTime,
+      HoodieTable<T, I, K, O> hoodieTable, Pair<String, String> partitionPathFileIDPair,
+      boolean hasTimestampFields) {
+    this(config, instantTime, hoodieTable, partitionPathFileIDPair, hasTimestampFields, Option.empty());
   }

-  public HoodieMergedReadHandle(HoodieWriteConfig config,
-                                Option<String> instantTime,
-                                HoodieTable<T, I, K, O> hoodieTable,
-                                Pair<String, String> partitionPathFileIDPair,
-                                Option<FileSlice> fileSliceOption) {
+  public HoodieMergedReadHandle(HoodieWriteConfig config, Option<String> instantTime,
+      HoodieTable<T, I, K, O> hoodieTable, Pair<String, String> partitionPathFileIDPair,
+      boolean hasTimestampFields, Option<FileSlice> fileSliceOption) {
     super(config, instantTime, hoodieTable, partitionPathFileIDPair);
     Schema orignalReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
     // config.getSchema is not canonicalized, while config.getWriteSchema is canonicalized. So, we have to use the canonicalized schema to read the existing data.
-    baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
+    this.baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
     fileSliceOpt = fileSliceOption.isPresent() ? fileSliceOption : getLatestFileSlice();
     // Repair reader schema.
     // Assume writer schema should be correct. If not, no repair happens.
-    readerSchema = AvroSchemaUtils.getRepairedSchema(orignalReaderSchema, baseFileReaderSchema);
+    readerSchema = hasTimestampFields ? AvroSchemaUtils.getRepairedSchema(orignalReaderSchema, this.baseFileReaderSchema) : orignalReaderSchema;
   }

   public List<HoodieRecord<T>> getMergedRecords() {
```
Contributor:
🤖 nit: the comment above the `readerSchema` assignment should be updated to clarify that the repair is now conditional: it only happens when `hasTimestampFields` is true.

Contributor:
🤖 The guard …
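A possible wording for the comment flagged in the first nit above (an illustration, not the author's text):

```java
// Repair the reader schema only when the schema contains logical timestamp fields.
// Assume the writer schema is correct; if not, no repair happens.
```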
HoodieBackedTableMetadataWriter.java

```diff
@@ -18,6 +18,8 @@

 package org.apache.hudi.metadata;

+import org.apache.hudi.avro.AvroSchemaUtils;
+import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
 import org.apache.hudi.avro.model.HoodieIndexPartitionInfo;
 import org.apache.hudi.avro.model.HoodieIndexPlan;
@@ -75,6 +77,7 @@
 import org.apache.hudi.table.BulkInsertPartitioner;
 import org.apache.hudi.table.HoodieTable;

+import org.apache.avro.Schema;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -584,8 +587,10 @@ private static HoodieData<HoodieRecord> readRecordKeysFromFileSliceSnapshot(Hood
       final String partition = partitionAndFileSlice.getKey();
       final FileSlice fileSlice = partitionAndFileSlice.getValue();
       final String fileId = fileSlice.getFileId();
-      return new HoodieMergedReadHandle(dataWriteConfig, instantTime, hoodieTable, Pair.of(partition, fileSlice.getFileId()),
-          Option.of(fileSlice)).getMergedRecords().stream().map(record -> {
+      Schema baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(dataWriteConfig.getWriteSchema()), dataWriteConfig.allowOperationMetadataField());
+      boolean hasTimestampFields = baseFileReaderSchema != null && AvroSchemaUtils.hasTimestampMillisField(baseFileReaderSchema);
+      return new HoodieMergedReadHandle(dataWriteConfig, instantTime, hoodieTable, Pair.of(partition, fileSlice.getFileId()), hasTimestampFields, Option.of(fileSlice))
+          .getMergedRecords().stream().map(record -> {
             HoodieRecord record1 = (HoodieRecord) record;
             return HoodieMetadataPayload.createRecordIndexUpdate(record1.getRecordKey(), partition, fileId,
                 record1.getCurrentLocation().getInstantTime(), 0);
```

Contributor:
🤖 The schema parsing and …
Contributor:
🤖 Line 136: This …
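As background on the check introduced above: in Avro, a timestamp-millis field is a `long` annotated with a logical type, which is presumably what `hasTimestampMillisField` detects. A minimal, self-contained illustration; the `Event` schema and field names are invented for this example:

```java
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class TimestampMillisExample {
  public static void main(String[] args) {
    // Build a long schema carrying the timestamp-millis logical type.
    Schema tsMillis = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG));
    // A record with one plain field and one logical-timestamp field.
    Schema record = SchemaBuilder.record("Event").fields()
        .name("id").type().stringType().noDefault()
        .name("eventTime").type(tsMillis).noDefault()
        .endRecord();
    // Prints "timestamp-millis": the marker the hasTimestampFields flag keys on.
    System.out.println(record.getField("eventTime").schema().getLogicalType().getName());
  }
}
```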
```diff
@@ -18,6 +18,7 @@

 package org.apache.hudi.table.action.compact;

+import org.apache.hudi.avro.AvroSchemaUtils;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
 import org.apache.hudi.client.WriteStatus;
@@ -130,6 +131,12 @@ public HoodieData<WriteStatus> compact(
     TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
     // if this is a MDT, set up the instant range of log reader just like regular MDT snapshot reader.
     Option<InstantRange> instantRange = CompactHelpers.getInstance().getInstantRange(metaClient);
+    // Since we are using merge handle here, we can directly query the write schema from conf
+    // Write handle provides an option to use overridden write schema as well which is not used by merge handle
+    Schema writerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
+    if (!table.isMetadataTable()) {
+      AvroSchemaUtils.setLogicalTimestampRepairIfNotSet(table.getStorageConf(), () -> AvroSchemaUtils.hasTimestampMillisField(writerSchema));
+    }
     return context.parallelize(operations).map(operation -> compact(
         compactionHandler, metaClient, config, operation, compactionInstantTime, maxInstantTime, instantRange, taskContextSupplier, executionHelper))
         .flatMap(List::iterator);
```

Contributor:
🤖 Same gap as before: …

Contributor:
🤖 This still has the …

Contributor:
🤖 The metadata table guard is a nice optimization, but …
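The diff does not show the implementation of `setLogicalTimestampRepairIfNotSet`, but the call sites imply compute-once, set-if-absent semantics keyed on the storage configuration. A rough sketch of that pattern, using a plain map as a stand-in for Hudi's storage configuration and an invented key name; the real implementation may differ:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Supplier;

public class LogicalTsRepairFlagSketch {
  // Invented key name; stand-in for whatever config key Hudi actually uses.
  private static final String KEY = "hoodie.logical.ts.repair.needed";

  // Set the flag only if absent, so the (possibly expensive) schema inspection
  // inside the supplier runs at most once per configuration.
  static void setIfNotSet(Map<String, String> conf, Supplier<Boolean> needsRepair) {
    conf.computeIfAbsent(KEY, k -> Boolean.toString(needsRepair.get()));
  }

  // Read the flag, falling back to the supplied default when it was never set.
  static boolean isRepairNeeded(Map<String, String> conf, Supplier<Boolean> defaultValue) {
    String value = conf.get(KEY);
    return value == null ? defaultValue.get() : Boolean.parseBoolean(value);
  }

  public static void main(String[] args) {
    Map<String, String> conf = new ConcurrentHashMap<>();
    setIfNotSet(conf, () -> false); // first caller decides
    setIfNotSet(conf, () -> true);  // no-op: flag already present
    System.out.println(isRepairNeeded(conf, () -> true)); // prints false
  }
}
```

The supplier indirection matters because the schema inspection only needs to run when the flag has not already been set.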
```diff
@@ -18,6 +18,7 @@

 package org.apache.hudi.table.action.commit;

+import org.apache.hudi.avro.AvroSchemaUtils;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.clustering.update.strategy.SparkAllowUpdateStrategy;
 import org.apache.hudi.client.utils.SparkValidatorUtils;
@@ -31,6 +32,7 @@
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.model.WriteOperationType;
+import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -43,6 +45,7 @@
 import org.apache.hudi.data.HoodieJavaPairRDD;
 import org.apache.hudi.data.HoodieJavaRDD;
 import org.apache.hudi.exception.HoodieCommitException;
+import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.execution.SparkLazyInsertIterable;
 import org.apache.hudi.index.HoodieIndex;
@@ -256,6 +259,22 @@ private HoodieData<WriteStatus> mapPartitionsAsRDD(HoodieData<HoodieRecord<T>> d
       // Partition only
       partitionedRDD = mappedRDD.partitionBy(partitioner);
     }
+
+    if (!table.isMetadataTable() && table.getMetaClient().getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() > 0) {
+      TableSchemaResolver schemaResolver = new TableSchemaResolver(table.getMetaClient());
+      try {
+        AvroSchemaUtils.setLogicalTimestampRepairIfNotSet(table.getStorageConf(), () -> {
+          try {
+            return AvroSchemaUtils.hasTimestampMillisField(schemaResolver.getTableAvroSchema());
+          } catch (Exception e) {
+            return true;
+          }
+        });
+      } catch (Exception e) {
+        throw new HoodieException("Failed to set logical ts related configs", e);
+      }
+    }
+
     return HoodieJavaRDD.of(partitionedRDD.map(Tuple2::_2).mapPartitionsWithIndex((partition, recordItr) -> {
       if (WriteOperationType.isChangingRecords(operationType)) {
         return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
```

Contributor:
🤖 If …
```diff
@@ -19,6 +19,7 @@

 package org.apache.hudi.io.hadoop;

+import org.apache.hudi.avro.AvroSchemaUtils;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.model.HoodieAvroIndexedRecord;
@@ -172,7 +173,12 @@ private ClosableIterator<IndexedRecord> getIndexedRecordIteratorInternal(Schema
     // sure that in case the file-schema is not equal to read-schema we'd still
     // be able to read that file (in case projection is a proper one)
     Configuration hadoopConf = storage.getConf().unwrapCopyAs(Configuration.class);
-    Schema repairedFileSchema = getRepairedSchema(getSchema(), schema);
+    Schema repairedFileSchema;
+    if (AvroSchemaUtils.isLogicalTimestampRepairNeeded(storage.getConf(), () -> true)) {
+      repairedFileSchema = getRepairedSchema(getSchema(), schema);
+    } else {
+      repairedFileSchema = schema;
+    }
     Option<Schema> promotedSchema = Option.empty();
     if (!renamedColumns.isEmpty() || HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(repairedFileSchema, schema)) {
       AvroReadSupport.setAvroReadSchema(hadoopConf, repairedFileSchema);
@@ -183,7 +189,8 @@ private ClosableIterator<IndexedRecord> getIndexedRecordIteratorInternal(Schema
       AvroReadSupport.setRequestedProjection(hadoopConf, schema);
     }
     ParquetReader<IndexedRecord> reader =
-        new HoodieAvroParquetReaderBuilder<IndexedRecord>(path)
+        new HoodieAvroParquetReaderBuilder<IndexedRecord>(path,
+            AvroSchemaUtils.isLogicalTimestampRepairNeeded(storage.getConf(), () -> schema == null || AvroSchemaUtils.hasTimestampMillisField(schema)))
             .withTableSchema(schema)
             .withConf(hadoopConf)
             .set(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS, hadoopConf.get(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS))
```

Contributor:
🤖 This uses …
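If the truncated comment above is pointing at the unconditional `() -> true` default passed to `isLogicalTimestampRepairNeeded`, one alternative would be the schema-based supplier already used for the reader builder a few lines below in the same diff (an assumption about the reviewer's intent, not a confirmed suggestion):

```java
// Assumption: reuse the schema-based supplier from the reader builder below,
// instead of the unconditional () -> true default.
if (AvroSchemaUtils.isLogicalTimestampRepairNeeded(storage.getConf(),
    () -> schema == null || AvroSchemaUtils.hasTimestampMillisField(schema))) {
  repairedFileSchema = getRepairedSchema(getSchema(), schema);
} else {
  repairedFileSchema = schema;
}
```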
🤖 nit: the schema setup pattern on these two lines is duplicated identically in HoodieBackedTableMetadataWriter.java (lines 590–591) and TestHoodieMergedReadHandle.java (lines 204–205); could you extract this into a static helper method to reduce duplication?
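A minimal sketch of such a helper, built only from calls that already appear in this diff; the class and method names are invented here, and where it should live is an open question:

```java
import org.apache.avro.Schema;
import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.config.HoodieWriteConfig;

public final class SchemaRepairHints {
  private SchemaRepairHints() {}

  // Consolidates the schema setup duplicated across the call sites named above.
  public static boolean writeSchemaHasTimestampFields(HoodieWriteConfig config) {
    Schema schema = HoodieAvroUtils.addMetadataFields(
        new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
    return schema != null && AvroSchemaUtils.hasTimestampMillisField(schema);
  }
}
```

Call sites could then pass `SchemaRepairHints.writeSchemaHasTimestampFields(dataWriteConfig)` as the `hasTimestampFields` argument to the `HoodieMergedReadHandle` constructor.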