diff --git a/docker/demo/compaction.commands b/docker/demo/compaction.commands index 9853a355176f3..bdb0c6c3b632b 100644 --- a/docker/demo/compaction.commands +++ b/docker/demo/compaction.commands @@ -19,4 +19,7 @@ connect --path /user/hive/warehouse/stock_ticks_mor compactions show all compaction schedule compaction run --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 - +connect --path /user/hive/warehouse/stock_ticks_mor_bs +compactions show all +compaction schedule +compaction run --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 diff --git a/docker/demo/hive-batch1.commands b/docker/demo/hive-batch1.commands index 93bf3b67930aa..021c6d55b800d 100644 --- a/docker/demo/hive-batch1.commands +++ b/docker/demo/hive-batch1.commands @@ -25,4 +25,12 @@ select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GO select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'; + +select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'; + !quit diff --git a/docker/demo/hive-batch2-after-compaction.commands b/docker/demo/hive-batch2-after-compaction.commands index 6b087019d5cca..06582a309ae00 100644 --- a/docker/demo/hive-batch2-after-compaction.commands +++ b/docker/demo/hive-batch2-after-compaction.commands @@ -23,4 +23,10 @@ select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING 
symbol = ' select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'; + +select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'; + !quit diff --git a/docker/demo/hive-incremental-cow.commands b/docker/demo/hive-incremental-cow.commands index 7f43548071863..702b2afa52733 100644 --- a/docker/demo/hive-incremental-cow.commands +++ b/docker/demo/hive-incremental-cow.commands @@ -23,5 +23,11 @@ set hoodie.stock_ticks_cow.consume.start.timestamp='${min.commit.time}'; select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}'; +set hoodie.stock_ticks_cow_bs.consume.mode=INCREMENTAL; +set hoodie.stock_ticks_cow_bs.consume.max.commits=3; +set hoodie.stock_ticks_cow_bs.consume.start.timestamp='00000000000001'; + +select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001'; + !quit diff --git a/docker/demo/hive-incremental-mor-ro.commands b/docker/demo/hive-incremental-mor-ro.commands index 8b97c0aac9b5e..51683c010a496 100644 --- a/docker/demo/hive-incremental-mor-ro.commands +++ b/docker/demo/hive-incremental-mor-ro.commands @@ -23,5 +23,11 @@ set hoodie.stock_ticks_mor.consume.start.timestamp='${min.commit.time}'; select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}'; +set hoodie.stock_ticks_mor_bs.consume.mode=INCREMENTAL; +set hoodie.stock_ticks_mor_bs.consume.max.commits=3; +set 
hoodie.stock_ticks_mor_bs.consume.start.timestamp='00000000000001'; + +select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001'; + !quit diff --git a/docker/demo/hive-incremental-mor-rt.commands b/docker/demo/hive-incremental-mor-rt.commands index a81fb77e077d8..c29fc7ce55730 100644 --- a/docker/demo/hive-incremental-mor-rt.commands +++ b/docker/demo/hive-incremental-mor-rt.commands @@ -23,5 +23,11 @@ set hoodie.stock_ticks_mor.consume.start.timestamp='${min.commit.time}'; select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}'; +set hoodie.stock_ticks_mor_bs.consume.mode=INCREMENTAL; +set hoodie.stock_ticks_mor_bs.consume.max.commits=3; +set hoodie.stock_ticks_mor_bs.consume.start.timestamp='00000000000001'; + +select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001'; + !quit diff --git a/docker/demo/sparksql-batch1.commands b/docker/demo/sparksql-batch1.commands index 727aa1633154d..4de2486c6ce58 100644 --- a/docker/demo/sparksql-batch1.commands +++ b/docker/demo/sparksql-batch1.commands @@ -27,4 +27,14 @@ spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false) spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) +// Bootstrapped Copy-On-Write table +spark.sql("select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'").show(100, false) + +// Bootstrapped Merge-On-Read table +spark.sql("select symbol, max(ts) from 
stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'").show(100, false) + System.exit(0) diff --git a/docker/demo/sparksql-batch2.commands b/docker/demo/sparksql-batch2.commands index 391e11b971a27..739d991dbbc1d 100644 --- a/docker/demo/sparksql-batch2.commands +++ b/docker/demo/sparksql-batch2.commands @@ -26,4 +26,14 @@ spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from s spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) + // Copy-On-Write Bootstrapped table +spark.sql("select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'").show(100, false) + +// Merge-On-Read table Bootstrapped Table +spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'").show(100, false) +spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'").show(100, false) + System.exit(0) diff --git 
a/docker/demo/sparksql-bootstrap-prep-source.commands b/docker/demo/sparksql-bootstrap-prep-source.commands new file mode 100644 index 0000000000000..23db3e4d38c4b --- /dev/null +++ b/docker/demo/sparksql-bootstrap-prep-source.commands @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.spark.sql.functions.col + +val df = spark.read.format("org.apache.hudi").load("/user/hive/warehouse/stock_ticks_cow/*/*/*").drop("_hoodie_commit_time", "_hoodie_record_key", "_hoodie_file_name", "_hoodie_commit_seqno", "_hoodie_partition_path") +df.write.format("parquet").save("/user/hive/warehouse/stock_ticks_cow_bs_src/2018/08/31/") +System.exit(0) diff --git a/docker/demo/sparksql-incremental.commands b/docker/demo/sparksql-incremental.commands index 8e3e153e27e7e..febfcd28a1116 100644 --- a/docker/demo/sparksql-incremental.commands +++ b/docker/demo/sparksql-incremental.commands @@ -52,8 +52,38 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl mode(SaveMode.Overwrite). 
save("/user/hive/warehouse/stock_ticks_derived_mor"); -spark.sql("show tables").show(20, false) spark.sql("select count(*) from stock_ticks_derived_mor_ro").show(20, false) spark.sql("select count(*) from stock_ticks_derived_mor_rt").show(20, false) -System.exit(0); \ No newline at end of file +val hoodieIncQueryBsDF = spark.read.format("org.apache.hudi"). + option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL). + option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "00000000000001"). + load("/user/hive/warehouse/stock_ticks_cow_bs"); +hoodieIncQueryBsDF.registerTempTable("stock_ticks_cow_bs_incr") +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs_incr where symbol = 'GOOG'").show(100, false); + +spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, close from stock_ticks_cow_bs_incr"). + write.format("org.apache.hudi"). + option("hoodie.insert.shuffle.parallelism", "2"). + option("hoodie.upsert.shuffle.parallelism","2"). + option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL). + option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(HoodieWriteConfig.TABLE_NAME, "stock_ticks_derived_mor_bs"). + option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "stock_ticks_derived_mor_bs"). + option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default"). + option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000"). + option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive"). + option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive"). + option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true"). 
+ option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "datestr"). + mode(SaveMode.Overwrite). + save("/user/hive/warehouse/stock_ticks_derived_mor_bs"); + +spark.sql("show tables").show(20, false) +spark.sql("select count(*) from stock_ticks_derived_mor_bs_ro").show(20, false) +spark.sql("select count(*) from stock_ticks_derived_mor_bs_rt").show(20, false) + +System.exit(0); diff --git a/hudi-cli/hudi-cli.sh b/hudi-cli/hudi-cli.sh index b6e708c14436d..78d8f4d6056f6 100755 --- a/hudi-cli/hudi-cli.sh +++ b/hudi-cli/hudi-cli.sh @@ -25,4 +25,6 @@ if [ -z "$CLIENT_JAR" ]; then echo "Client jar location not set, please set it in conf/hudi-env.sh" fi -java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:$DIR/target/lib/*:$HOODIE_JAR:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@ +OTHER_JARS=`ls ${DIR}/target/lib/* | grep -v 'hudi-[^/]*jar' | tr '\n' ':'` +echo "Running : java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@" +java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@ diff --git a/hudi-cli/pom.xml b/hudi-cli/pom.xml index dbb44639f3b83..cd2c975bd9bdb 100644 --- a/hudi-cli/pom.xml +++ b/hudi-cli/pom.xml @@ -147,6 +147,41 @@ + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + package + + shade + + + true + ${project.build.directory}/dependency-reduced-pom.xml + + + + + + true + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + + + org.apache.hudi:hudi-utilities-bundle_${scala.binary.version} + + + ${project.artifactId}-${project.version} + + + + @@ -202,6 +237,12 @@ test-jar + + org.apache.hudi + hudi-utilities-bundle_${scala.binary.version} + 
${project.version} + + log4j diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java new file mode 100644 index 0000000000000..e0e4742a9f269 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.avro.model.BootstrapIndexInfo; +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.cli.commands.SparkMain.SparkCommand; +import org.apache.hudi.cli.utils.InputStreamConsumer; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.bootstrap.index.BootstrapIndex; +import org.apache.hudi.common.model.BootstrapSourceFileMapping; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.utilities.UtilHelpers; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.launcher.SparkLauncher; +import org.apache.spark.util.Utils; +import org.springframework.shell.core.CommandMarker; +import org.springframework.shell.core.annotation.CliCommand; +import org.springframework.shell.core.annotation.CliOption; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.stream.Collectors; + +import scala.collection.JavaConverters; + +/** + * CLI command to perform bootstrap action & display bootstrap index. 
+ */ +@Component +public class BootstrapCommand implements CommandMarker { + + private static final Logger LOG = LogManager.getLogger(BootstrapCommand.class); + + @CliCommand(value = "bootstrap", help = "Run a bootstrap action for current Hudi table") + public String bootstrap( + @CliOption(key = {"sourcePath"}, mandatory = true, help = "Source data path of the table") final String sourcePath, + @CliOption(key = {"recordKeyColumns"}, mandatory = true, help = "Record key columns for bootstrap data") final String recordKeyCols, + @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "1500", help = "Bootstrap writer parallelism") final int parallelism, + @CliOption(key = {"selectorClass"}, unspecifiedDefaultValue = "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector", + help = "Selector class for bootstrap") final String selectorClass, + @CliOption(key = {"schema"}, unspecifiedDefaultValue = "", help = "Schema of the source data file") final String schema) + throws IOException, InterruptedException, URISyntaxException { + + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + + String cmd = SparkCommand.BOOTSTRAP.toString(); + + sparkLauncher.addAppArgs(cmd, metaClient.getTableConfig().getTableName(), metaClient.getBasePath(), sourcePath, recordKeyCols, String.valueOf(parallelism), selectorClass, schema); + UtilHelpers.validateAndAddProperties(new String[] {}, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to import source data to hudi dataset"; + } + return "Imported source data as hudi dataset"; + } 
+ + @CliCommand(value = "show bootstrap index mapping", help = "Show bootstrap index mapping") + public String showBootstrapIndexMapping( + @CliOption(key = {"partitionPath"}, unspecifiedDefaultValue = "", help = "A valid paritition path") String partition, + @CliOption(key = {"fileIds"}, unspecifiedDefaultValue = "", help = "Valid fileIds split by comma") String fileIds, + @CliOption(key = {"limit"}, unspecifiedDefaultValue = "-1", help = "Limit rows to be displayed") Integer limit, + @CliOption(key = {"sortBy"}, unspecifiedDefaultValue = "", help = "Sorting Field") final String sortByField, + @CliOption(key = {"desc"}, unspecifiedDefaultValue = "false", help = "Ordering") final boolean descending, + @CliOption(key = {"headeronly"}, unspecifiedDefaultValue = "false", help = "Print Header Only") + final boolean headerOnly) { + + if (partition.isEmpty() && !fileIds.isEmpty()) { + throw new IllegalStateException("Both paritionPath and fileIds are required"); + } + BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader(); + + // TODO tmp solution because the indexedPartition name is not clean + // List indexedPartitions = indexReader.getIndexedPartitions(); + List indexedPartitions = indexReader.getIndexedPartitions().stream() + .map(p -> p.split("//")[0].substring(5)).collect(Collectors.toList()); + + if (!partition.isEmpty() && !indexedPartitions.contains(partition)) { + return partition + " is not an valid indexed partition"; + } + + List mappingList = new ArrayList<>(); + if (!fileIds.isEmpty()) { + List fileGroupIds = Arrays.stream(fileIds.split(",")) + .map(fileId -> new HoodieFileGroupId(partition, fileId)).collect(Collectors.toList()); + mappingList.addAll(indexReader.getSourceFileMappingForFileIds(fileGroupIds).values()); + } else if (!partition.isEmpty()) { + mappingList.addAll(indexReader.getSourceFileMappingForPartition(partition)); + } else { + for (String part : indexedPartitions) { + 
mappingList.addAll(indexReader.getSourceFileMappingForPartition(part)); + } + } + + final List rows = convertBootstrapSourceFileMapping(mappingList); + final TableHeader header = new TableHeader() + .addTableHeaderField("Hudi Partition") + .addTableHeaderField("FileId") + .addTableHeaderField("Source File Base Path") + .addTableHeaderField("Source File Parition") + .addTableHeaderField("Source File Path"); + + return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, + limit, headerOnly, rows); + } + + @CliCommand(value = "show indexed partitions", help = "Show bootstrap indexed partitions") + public String showIndexedPartitions() { + + BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader(); + List indexedPartitions = indexReader.getIndexedPartitions(); + + String[] header = new String[] {"Indexed partitions"}; + String[][] rows = new String[indexedPartitions.size()][1]; + for (int i = 0; i < indexedPartitions.size(); i++) { + rows[i][0] = indexedPartitions.get(i); + } + return HoodiePrintHelper.print(header, rows); + } + + @CliCommand(value = "show bootstrap info", help = "Show bootstrap index info") + public String showBootstrapIndexInfo() { + + BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader(); + BootstrapIndexInfo indexInfo = indexReader.getIndexInfo(); + + String[] header = new String[] {"Version", "Source Base Path", "Created Timestamp", "Number of keys"}; + String[][] rows = {{String.valueOf(indexInfo.getVersion()), indexInfo.getSourceBasePath(), + String.valueOf(indexInfo.getCreatedTimestamp()), String.valueOf(indexInfo.getNumKeys())}}; + + return HoodiePrintHelper.print(header, rows); + } + + private BootstrapIndex.IndexReader createBootstrapIndexReader() { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient); + if (!index.checkIndex()) { + throw new IllegalStateException("This is not a bootstraped Hudi table. 
Don't have any index info"); + } + return index.createReader(); + } + + private List convertBootstrapSourceFileMapping(List mappingList) { + final List rows = new ArrayList<>(); + for (BootstrapSourceFileMapping mapping : mappingList) { + rows.add(new Comparable[] {mapping.getHudiPartitionPath(), mapping.getHudiFileId(), + mapping.getSourceBasePath(), mapping.getSourcePartitionPath(), mapping.getSourceFileStatus().getPath().getUri()}); + } + return rows; + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java index 65dcde895208f..ff4bc7dbea4f4 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java @@ -55,7 +55,7 @@ public class SparkMain { * Commands. */ enum SparkCommand { - ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, + BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLEAN, DELETE_SAVEPOINT } @@ -70,6 +70,10 @@ public static void main(String[] args) throws Exception { : SparkUtil.initJavaSparkConf("hoodie-cli-" + command); int returnCode = 0; switch (cmd) { + case BOOTSTRAP: + assert (args.length == 8); + returnCode = doBootstrap(jsc, args[1], args[2], args[3], args[4], Integer.parseInt(args[5]), args[6], args[7]); + break; case ROLLBACK: assert (args.length == 3); returnCode = rollback(jsc, args[1], args[2]); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java index da3d57a4f6a80..0d28bae4ced49 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java @@ -88,7 
+88,9 @@ public String createTable( @CliOption(key = {"archiveLogFolder"}, help = "Folder Name for storing archived timeline") String archiveFolder, @CliOption(key = {"layoutVersion"}, help = "Specific Layout Version to use") Integer layoutVersion, @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload", - help = "Payload Class") final String payloadClass) + help = "Payload Class") final String payloadClass, + @CliOption(key = {"bootstrapIndexClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex", + help = "Bootstrap Index class") final String bootstrapIndexClass) throws IOException { boolean initialized = HoodieCLI.initConf(); @@ -109,7 +111,7 @@ public String createTable( final HoodieTableType tableType = HoodieTableType.valueOf(tableTypeStr); HoodieTableMetaClient.initTableType(HoodieCLI.conf, path, tableType, name, archiveFolder, - payloadClass, layoutVersion); + payloadClass, layoutVersion, bootstrapIndexClass); // Now connect to ensure loading works return connect(path, layoutVersion, false, 0, 0, 0); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java index 88fcb39e45862..2c658044b892d 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java @@ -64,7 +64,7 @@ public void init() throws IOException { tablePath = basePath + File.separator + tableName; new TableCommand().createTable( tablePath, tableName, - "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload", null); metaClient = HoodieCLI.getTableMetaClient(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java 
b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java index 6722c98915f2c..6edcf28dbb6c5 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java @@ -72,7 +72,7 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", null); Configuration conf = HoodieCLI.conf; diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java index 83ad7fc23324c..cd3b727b7bb42 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java @@ -69,7 +69,7 @@ public void init() throws IOException { String tablePath = Paths.get(basePath, tableName).toString(); new TableCommand().createTable( tablePath, tableName, - "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload", null); metaClient = HoodieCLI.getTableMetaClient(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index b0d2504193a5f..79147c23ea281 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -87,7 +87,7 @@ public void init() throws IOException, InterruptedException, URISyntaxException partitionPath = tablePath + File.separator 
+ HoodieTestCommitMetadataGenerator.DEFAULT_FIRST_PARTITION_PATH; new TableCommand().createTable( tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", null); Files.createDirectories(Paths.get(partitionPath)); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java index 9fc49181ddfce..8740a9e8b2028 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java @@ -26,6 +26,7 @@ import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.common.HoodieTestCommitMetadataGenerator; import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -66,7 +67,8 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java index 5a82d778f86e8..60d13301aa399 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java @@ -26,6 +26,7 @@ 
import org.apache.hudi.cli.TableHeader; import org.apache.hudi.client.HoodieWriteClient; import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTestUtils; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -64,7 +65,8 @@ public void init() throws IOException { String tablePath = basePath + File.separator + tableName; new TableCommand().createTable( tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); //Create some commits files and parquet files String commitTime1 = "100"; diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java index 2c6a3f2939bc6..da246a6ac0ac5 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java @@ -23,6 +23,7 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -53,7 +54,8 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", 
TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java index 85fbc0ab031a5..0064fa1320b1b 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java @@ -28,6 +28,7 @@ import org.apache.hudi.cli.TableHeader; import org.apache.hudi.cli.common.HoodieTestCommitMetadataGenerator; import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTestUtils; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -65,7 +66,8 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCleansCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCleansCommand.java index f76c79b5339dd..34b7b73528137 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCleansCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCleansCommand.java @@ -59,7 +59,7 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", 
TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", null); Configuration conf = HoodieCLI.conf; diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java index 4f48bc34fa295..cfc2767c7ceef 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java @@ -28,6 +28,7 @@ import org.apache.hudi.cli.commands.TableCommand; import org.apache.hudi.common.HoodieClientTestUtils; import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieLogFile; @@ -80,7 +81,8 @@ public void init() throws IOException, URISyntaxException { // Create table and connect new TableCommand().createTable( tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); // generate 200 records Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java index ee9a18e4b29e2..ddbb8281587d9 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java @@ -22,6 +22,7 @@ import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.TableCommand; import org.apache.hudi.common.HoodieTestDataGenerator; +import 
org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -59,7 +60,8 @@ public void init() throws IOException { // Create table and connect new TableCommand().createTable( tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload", + HFileBasedBootstrapIndex.class.getName()); } /** diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java index 13986c4bf06b8..a92d6de05e24e 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -149,12 +149,12 @@ public JavaRDD> filterExists(JavaRDD> hoodieReco /** * Main API to run bootstrap to hudi. 
*/ - public void bootstrap() { + public void bootstrap(Option> extraMetadata) { if (rollbackPending) { rollBackPendingBootstrap(); } HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT); - table.bootstrap(jsc); + table.bootstrap(jsc, extraMetadata); } /** diff --git a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapSourceSchemaProvider.java b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapSourceSchemaProvider.java index 6569525354fc3..555f6f53349ef 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapSourceSchemaProvider.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapSourceSchemaProvider.java @@ -18,9 +18,11 @@ package org.apache.hudi.client.bootstrap; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.spark.api.java.JavaSparkContext; @@ -48,9 +50,12 @@ public BootstrapSourceSchemaProvider(HoodieWriteConfig bootstrapConfig) { * @return Avro Schema */ public final Schema getBootstrapSchema(JavaSparkContext jsc, List>> partitions) { - if (bootstrapConfig.getSchema() != null) { + if (!StringUtils.isNullOrEmpty(bootstrapConfig.getSchema())) { // Use schema specified by user if set - return Schema.parse(bootstrapConfig.getSchema()); + Schema userSchema = Schema.parse(bootstrapConfig.getSchema()); + if (!HoodieAvroUtils.getNullSchema().equals(userSchema)) { + return userSchema; + } } return getBootstrapSourceSchema(jsc, partitions); } diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 8b6d99887b41e..fd5c647c05633 100644 --- 
a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -125,7 +125,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { private ConsistencyGuardConfig consistencyGuardConfig; - private static final String SOURCE_BASE_PATH_PROP = "hoodie.bootstrap.source.base.path"; + public static final String SOURCE_BASE_PATH_PROP = "hoodie.bootstrap.source.base.path"; private static final String BOOTSTRAP_MODE_SELECTOR = "hoodie.bootstrap.mode.selector"; private static final String FULL_BOOTRAP_INPUT_PROVIDER = "hoodie.bootstrap.full.input.provider"; private static final String BOOTSTRAP_KEYGEN_CLASS = "hoodie.bootstrap.keygen.class"; diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index e385ab0e725a7..ec8541b65804b 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -42,7 +42,7 @@ import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.table.action.bootstrap.BootstrapActionExecutor; +import org.apache.hudi.table.action.bootstrap.BootstrapCommitActionExecutor; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.action.clean.CleanActionExecutor; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -139,8 +139,8 @@ public HoodieWriteMetadata compact(JavaSparkContext jsc, String compactionInstan } @Override - public HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc) { - return new BootstrapActionExecutor(jsc, config, this).execute(); + public HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, Option> extraMetadata) { + return new 
BootstrapCommitActionExecutor(jsc, config, this, extraMetadata).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index d60bbfc807851..07cd200f0ddc0 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -34,6 +34,8 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.bootstrap.BootstrapDeltaCommitActionExecutor; +import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; import org.apache.hudi.table.action.deltacommit.BulkInsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.BulkInsertPreppedDeltaCommitActionExecutor; @@ -131,6 +133,11 @@ public HoodieWriteMetadata compact(JavaSparkContext jsc, String compactionInstan return compactionExecutor.execute(); } + @Override + public HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, Option> extraMetadata) { + return new BootstrapDeltaCommitActionExecutor(jsc, config, this, extraMetadata).execute(); + } + @Override public void rollbackBootstrap(JavaSparkContext jsc, String instantTime) { new MergeOnReadRestoreActionExecutor(jsc, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 4a2fcc17e1bfb..b414e91bf4fd9 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -333,9 +333,11 @@ public abstract HoodieWriteMetadata compact(JavaSparkContext 
jsc, /** * Perform metadata/full bootstrap of a Hudi table. * @param jsc JavaSparkContext + * @param extraMetadata Additional Metadata for storing in commit file. * @return HoodieBootstrapWriteMetadata */ - public abstract HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc); + public abstract HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, + Option> extraMetadata); /** * Perform rollback of bootstrap of a Hudi table. diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapCommitActionExecutor.java similarity index 97% rename from hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapActionExecutor.java rename to hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapCommitActionExecutor.java index 1daf12311e9b6..65fbaf59f752c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapCommitActionExecutor.java @@ -75,6 +75,7 @@ import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; +import org.apache.hudi.table.action.commit.CommitActionExecutor; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroParquetReader; @@ -96,17 +97,19 @@ import java.util.Map; import java.util.stream.Collectors; -public class BootstrapActionExecutor> +public class BootstrapCommitActionExecutor> extends BaseCommitActionExecutor { - private static final Logger LOG = LogManager.getLogger(BootstrapActionExecutor.class); - private String bootstrapSchema = null; + private static final Logger LOG = LogManager.getLogger(BootstrapCommitActionExecutor.class); + protected String bootstrapSchema = null; - public BootstrapActionExecutor(JavaSparkContext jsc, HoodieWriteConfig 
config, HoodieTable table) { + public BootstrapCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, + Option> extraMetadata) { super(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps()) .withAutoCommit(true).withWriteStatusClass(BootstrapWriteStatus.class) .withBulkInsertParallelism(config.getBootstrapParallelism()) - .build(), table, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, WriteOperationType.BOOTSTRAP); + .build(), table, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, WriteOperationType.BOOTSTRAP, + extraMetadata); } private void checkArguments() { @@ -143,6 +146,7 @@ public HoodieBootstrapWriteMetadata execute() { } } + @Override protected String getSchemaToStoreInCommit() { return bootstrapSchema; } @@ -220,9 +224,13 @@ protected HoodieWriteMetadata fullBootstrap(List getBulkInsertActionExecutor(JavaRDD inputRecordsRDD) { return new BulkInsertCommitActionExecutor(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps()) .withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, - inputRecordsRDD, Option.empty()).execute(); + inputRecordsRDD, extraMetadata); } private BootstrapWriteStatus handleMetadataBootstrap(String srcPartitionPath, String partitionPath, diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..08760cc3d272f --- /dev/null +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapDeltaCommitActionExecutor.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.bootstrap; + +import java.util.Map; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.CommitActionExecutor; +import org.apache.hudi.table.action.deltacommit.BulkInsertDeltaCommitActionExecutor; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +public class BootstrapDeltaCommitActionExecutor> + extends BootstrapCommitActionExecutor { + + public BootstrapDeltaCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + Option> extraMetadata) { + super(jsc, config, table, extraMetadata); + } + + protected CommitActionExecutor getBulkInsertActionExecutor(JavaRDD inputRecordsRDD) { + return new BulkInsertDeltaCommitActionExecutor(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps()) + .withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, + inputRecordsRDD, extraMetadata); + } +} diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java 
b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java index f52317fac5bcb..e1dbd48f62506 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java @@ -60,19 +60,21 @@ public abstract class BaseCommitActionExecutor, private static final Logger LOG = LogManager.getLogger(BaseCommitActionExecutor.class); + protected final Option> extraMetadata; private final WriteOperationType operationType; protected final SparkTaskContextSupplier sparkTaskContextSupplier = new SparkTaskContextSupplier(); public BaseCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - this(jsc, config, table, instantTime, operationType, null); + this(jsc, config, table, instantTime, operationType, Option.empty()); } public BaseCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType, - JavaRDD> inputRecordsRDD) { + Option> extraMetadata) { super(jsc, config, table, instantTime); this.operationType = operationType; + this.extraMetadata = extraMetadata; } public HoodieWriteMetadata execute(JavaRDD> inputRecordsRDD) { @@ -171,7 +173,7 @@ protected void updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, protected void commitOnAutoCommit(HoodieWriteMetadata result) { if (config.shouldAutoCommit()) { LOG.info("Auto commit enabled: Committing " + instantTime); - commit(Option.empty(), result); + commit(extraMetadata, result); } else { LOG.info("Auto commit disabled for " + instantTime); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertCommitActionExecutor.java index 9f5468e5c721d..4929865fbf57c 100644 --- 
a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertCommitActionExecutor.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.commit; +import java.util.Map; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -41,7 +42,14 @@ public BulkInsertCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, Option bulkInsertPartitioner) { - super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT); + this(jsc, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty()); + } + + public BulkInsertCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + Option bulkInsertPartitioner, Option> extraMetadata) { + super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; this.bulkInsertPartitioner = bulkInsertPartitioner; } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/CommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/CommitActionExecutor.java index 196600dc1b15e..a0d68e8b87073 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/CommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/CommitActionExecutor.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; import 
org.apache.hudi.execution.LazyInsertIterable; @@ -50,7 +51,13 @@ public abstract class CommitActionExecutor> public CommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - super(jsc, config, table, instantTime, operationType); + this(jsc, config, table, instantTime, operationType, Option.empty()); + } + + public CommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType, Option> extraMetadata) { + super(jsc, config, table, instantTime, operationType, extraMetadata); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/BulkInsertDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/BulkInsertDeltaCommitActionExecutor.java index 5e4b915cb2cf3..88bf4f5739f89 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/BulkInsertDeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/BulkInsertDeltaCommitActionExecutor.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.deltacommit; +import java.util.Map; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -42,7 +43,14 @@ public BulkInsertDeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, Option bulkInsertPartitioner) { - super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT); + this(jsc, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty()); + } + + public BulkInsertDeltaCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + Option bulkInsertPartitioner, Option> 
extraMetadata) { + super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; this.bulkInsertPartitioner = bulkInsertPartitioner; } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/DeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/DeltaCommitActionExecutor.java index be3806e46c236..b50581fad7206 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/DeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/DeltaCommitActionExecutor.java @@ -18,10 +18,12 @@ package org.apache.hudi.table.action.deltacommit; +import java.util.Map; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.execution.LazyInsertIterable; @@ -51,7 +53,13 @@ public abstract class DeltaCommitActionExecutor public DeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - super(jsc, config, table, instantTime, operationType); + this(jsc, config, table, instantTime, operationType, Option.empty()); + } + + public DeltaCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType, Option> extraMetadata) { + super(jsc, config, table, instantTime, operationType, extraMetadata); } @Override diff --git a/hudi-client/src/test/java/org/apache/hudi/common/HoodieMergeOnReadTestUtils.java b/hudi-client/src/test/java/org/apache/hudi/common/HoodieMergeOnReadTestUtils.java index 
22dc0f4db5bd8..53c56bf8cf5cd 100644 --- a/hudi-client/src/test/java/org/apache/hudi/common/HoodieMergeOnReadTestUtils.java +++ b/hudi-client/src/test/java/org/apache/hudi/common/HoodieMergeOnReadTestUtils.java @@ -18,8 +18,11 @@ package org.apache.hudi.common; +import org.apache.avro.Schema.Field; +import org.apache.hadoop.hive.ql.io.IOConstants; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieTestUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; @@ -45,6 +48,7 @@ * Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR). */ public class HoodieMergeOnReadTestUtils { + public static List getRecordsUsingInputFormat(List inputPaths, String basePath) { return getRecordsUsingInputFormat(inputPaths, basePath, new Configuration()); } @@ -56,19 +60,36 @@ public static List getRecordsUsingInputFormat(List inputP } public static List getRecordsUsingInputFormat(List inputPaths, - String basePath, - JobConf jobConf, - HoodieParquetInputFormat inputFormat) { + String basePath, + JobConf jobConf, + HoodieParquetInputFormat inputFormat) { Schema schema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); return getRecordsUsingInputFormat(inputPaths, basePath, jobConf, inputFormat, schema, HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES); } public static List getRecordsUsingInputFormat(List inputPaths, String basePath, - JobConf jobConf, HoodieParquetInputFormat inputFormat, Schema rawSchema, String rawHiveColumnTypes) { + JobConf jobConf, HoodieParquetInputFormat inputFormat, Schema rawSchema, String rawHiveColumnTypes) { + return getRecordsUsingInputFormat(inputPaths, basePath, jobConf, inputFormat, rawSchema, rawHiveColumnTypes, + false, new ArrayList<>()); + } + + public static List getRecordsUsingInputFormat(List inputPaths, String basePath, + 
JobConf jobConf, HoodieParquetInputFormat inputFormat, Schema rawSchema, String rawHiveColumnTypes, + boolean projectCols, List projectedColumns) { Schema schema = HoodieAvroUtils.addMetadataFields(rawSchema); String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes); - setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes); + setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns); + final List fields; + if (projectCols) { + fields = schema.getFields().stream().filter(f -> projectedColumns.contains(f.name())) + .collect(Collectors.toList()); + } else { + fields = schema.getFields(); + } + final Schema projectedSchema = Schema.createRecord(fields.stream() + .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal())) + .collect(Collectors.toList())); return inputPaths.stream().map(path -> { setInputPath(jobConf, path); List records = new ArrayList<>(); @@ -76,17 +97,19 @@ public static List getRecordsUsingInputFormat(List inputP List splits = Arrays.asList(inputFormat.getSplits(jobConf, 1)); for (InputSplit split : splits) { RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null); - Void key = (Void) recordReader.createKey(); + Object key = recordReader.createKey(); ArrayWritable writable = (ArrayWritable) recordReader.createValue(); while (recordReader.next(key, writable)) { - GenericRecordBuilder newRecord = new GenericRecordBuilder(schema); + GenericRecordBuilder newRecord = new GenericRecordBuilder(projectedSchema); // writable returns an array with [field1, field2, _hoodie_commit_time, // _hoodie_commit_seqno] Writable[] values = writable.get(); - assert schema.getFields().size() <= values.length; - schema.getFields().forEach(field -> { - newRecord.set(field, values[field.pos()]); - }); + assert projectedSchema.getFields().size() <= values.length; + schema.getFields().stream() + .filter(f -> !projectCols || 
projectedColumns.contains(f.name())) + .map(f -> Pair.of(projectedSchema.getFields().stream() + .filter(p -> f.name().equals(p.name())).findFirst().get(), f)) + .forEach(fieldsPair -> newRecord.set(fieldsPair.getKey(), values[fieldsPair.getValue().pos()])); records.add(newRecord.build()); } } @@ -101,16 +124,27 @@ public static List getRecordsUsingInputFormat(List inputP } private static void setPropsForInputFormat(HoodieParquetInputFormat inputFormat, JobConf jobConf, - Schema schema, String hiveColumnTyps) { + Schema schema, String hiveColumnTyps, boolean projectCols, List projectedCols) { List fields = schema.getFields(); - String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); - String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); - Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); + final List projectedColNames; + if (!projectCols) { + projectedColNames = fields.stream().map(f -> f.name().toString()).collect(Collectors.toList()); + } else { + projectedColNames = projectedCols; + } - String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr")) + String names = fields.stream() + .filter(f -> projectedColNames.contains(f.name().toString())) + .map(f -> f.name().toString()).collect(Collectors.joining(",")); + String postions = fields.stream() + .filter(f -> projectedColNames.contains(f.name().toString())) + .map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); + String hiveColumnNames = fields.stream() + .filter(field -> !field.name().equalsIgnoreCase("datestr")) .map(Schema.Field::name).collect(Collectors.joining(",")); hiveColumnNames = hiveColumnNames + ",datestr"; + Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); String hiveColumnTypes = hiveColumnTyps; hiveColumnTypes = hiveColumnTypes + ",string"; jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames); @@ -123,6 +157,8 @@ 
private static void setPropsForInputFormat(HoodieParquetInputFormat inputFormat, conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr"); conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes); + conf.set(IOConstants.COLUMNS, hiveColumnNames); + conf.get(IOConstants.COLUMNS_TYPES, hiveColumnTypes); inputFormat.setConf(conf); jobConf.addResource(conf); } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 99b8bab8763d8..86b5b5806816d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -445,4 +445,8 @@ private static boolean isLogicalTypeDate(Schema fieldSchema) { } return fieldSchema.getLogicalType() == LogicalTypes.date(); } + + public static Schema getNullSchema() { + return Schema.create(Schema.Type.NULL); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java index f3e47f179bbc1..73f0c36aa0706 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java @@ -34,6 +34,8 @@ */ public abstract class BootstrapIndex implements Serializable { + protected static final long serialVersionUID = 1L; + protected final HoodieTableMetaClient metaClient; public BootstrapIndex(HoodieTableMetaClient metaClient) { @@ -71,7 +73,7 @@ public final boolean isIndexAvailable() { /** * Check if bootstrap Index is present and ensures readable. */ - protected abstract boolean checkIndex(); + public abstract boolean checkIndex(); /** * Bootstrap Index Reader Interface. 
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBasedBootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBasedBootstrapIndex.java index f93a5642f6a4e..ece86b878e99a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBasedBootstrapIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBasedBootstrapIndex.java @@ -132,8 +132,8 @@ private static Path getIndexByFileIdPath(HoodieTableMetaClient metaClient) { private static HFile.Reader createReader(String hFilePath, Configuration conf, FileSystem fileSystem) { try { LOG.info("Opening HFile for reading :" + hFilePath); - HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath), new CacheConfig(conf), - conf); + HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath), + new CacheConfig(conf), conf); return reader; } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); @@ -166,7 +166,7 @@ public void dropIndex() { } @Override - protected boolean checkIndex() { + public boolean checkIndex() { return hasIndex; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java index 1d5f238a1dc0d..d1e2cc6adb67b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.model; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.fs.FSUtils; import com.fasterxml.jackson.annotation.JsonAutoDetect; @@ -126,6 +127,18 @@ public HashMap getFileIdAndFullPaths(String basePath) { return fullPaths; } + public Map getFileGroupIdAndFullPaths(String basePath) { + Map fileGroupIdToFullPaths = new HashMap<>(); + for (Map.Entry> 
entry : getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat : entry.getValue()) { + HoodieFileGroupId fileGroupId = new HoodieFileGroupId(stat.getPartitionPath(), stat.getFileId()); + Path fullPath = new Path(basePath, stat.getPath()); + fileGroupIdToFullPaths.put(fileGroupId, fullPath.toString()); + } + } + return fileGroupIdToFullPaths; + } + public String toJsonString() throws IOException { if (partitionToWriteStats.containsKey(null)) { LOG.info("partition path is null for " + partitionToWriteStats.get(null)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 94298be96a08b..650a341a17892 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -338,6 +338,13 @@ public static HoodieTableMetaClient initTableType(Configuration hadoopConf, Stri public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType, String tableName, String archiveLogFolder, String payloadClassName, Integer timelineLayoutVersion) throws IOException { + return initTableType(hadoopConf, basePath, tableType, tableName, archiveLogFolder, payloadClassName, timelineLayoutVersion, null); + } + + public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, + HoodieTableType tableType, String tableName, String archiveLogFolder, String payloadClassName, + Integer timelineLayoutVersion, String bootstrapIndexClassName) throws IOException { + Properties properties = new Properties(); properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); @@ -352,6 +359,10 @@ public static HoodieTableMetaClient initTableType(Configuration 
hadoopConf, Stri if (null != timelineLayoutVersion) { properties.put(HoodieTableConfig.HOODIE_TIMELINE_LAYOUT_VERSION, String.valueOf(timelineLayoutVersion)); } + + if (null != bootstrapIndexClassName) { + properties.put(HoodieTableConfig.HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, bootstrapIndexClassName); + } return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 3c8d9e8614b27..eb5e2b5fa068f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -190,7 +190,7 @@ private static Map readParquetFooter(Configuration configuration } public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) { - return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath)); + return new AvroSchemaConverter(configuration).convert(readSchema(configuration, parquetFilePath)); } /** diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java index 55891f9bc2e4d..b7141a8ee762f 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi.hadoop; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -39,8 +40,8 @@ /** * Utility funcitons copied from Hive ColumnProjectionUtils.java. - * Needed to copy as we see NoSuchMethod errors when directly using these APIs with/without Spark. 
Some of these - * methods are not available across hive versions. + * Needed to copy as we see NoSuchMethod errors when directly using these APIs with/without Spark. + * Some of these methods are not available across hive versions. */ public class HoodieColumnProjectionUtils { public static final Logger LOG = LoggerFactory.getLogger(ColumnProjectionUtils.class); @@ -63,6 +64,15 @@ public class HoodieColumnProjectionUtils { private static final String COMMA = ","; + /** Special Column Names added during Parquet Projection. **/ + public static final String PARQUET_BLOCK_OFFSET_COL_NAME = "BLOCK__OFFSET__INSIDE__FILE"; + public static final String PARQUET_INPUT_FILE_NAME = "INPUT__FILE__NAME"; + public static final String PARQUET_ROW_ID = "ROW__ID"; + + public static final List PARQUET_SPECIAL_COLUMN_NAMES = CollectionUtils + .createImmutableList(PARQUET_BLOCK_OFFSET_COL_NAME, PARQUET_INPUT_FILE_NAME, + PARQUET_ROW_ID); + /** * Sets the READ_ALL_COLUMNS flag and removes any previously * set column ids. 
@@ -88,6 +98,7 @@ public static void setReadColumns(Configuration conf, List ids, List getIOColumns(Configuration conf) { public static List getIOColumnTypes(Configuration conf) { String colTypes = conf.get(IOConstants.COLUMNS_TYPES, ""); - TypeInfoUtils.getTypeInfosFromTypeString(colTypes); if (colTypes != null && !colTypes.isEmpty()) { return TypeInfoUtils.getTypeInfosFromTypeString(colTypes).stream() .map(t -> t.getTypeName()).collect(Collectors.toList()); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnStichingRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnStichingRecordReader.java index 0ae65e584b400..f47ce6f8aa6a9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnStichingRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnStichingRecordReader.java @@ -39,6 +39,7 @@ public class HoodieColumnStichingRecordReader implements RecordReader left, int numLeftColumns, RecordReader right, int numRightColumns, boolean validate) { @@ -78,7 +79,7 @@ public boolean next(NullWritable key, ArrayWritable value) throws IOException { for (int j = numLeftColumns; j < right.get().length; j++) { value.get()[j] = right.get()[j]; } - return hasMoreOnLeft; + return hasMoreOnLeft && hasMoreOnRight; } @Override diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 3ed8853b83f78..5c0bbf85d9089 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -18,10 +18,10 @@ package org.apache.hudi.hadoop; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import 
org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -40,6 +40,7 @@ import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileSplit; @@ -360,13 +361,21 @@ public RecordReader getRecordReader(final InputSpli ExternalBaseFileSplit eSplit = (ExternalBaseFileSplit)split; String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job); List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job); - List> colsWithIndex = + List> projectedColsWithIndex = IntStream.range(0, rawColIds.size()).mapToObj(idx -> Pair.of(rawColIds.get(idx), rawColNames[idx])) .collect(Collectors.toList()); - List> hoodieColsProjected = colsWithIndex.stream() - .filter(idxWithName -> idxWithName.getKey() < HoodieAvroUtils.NUM_HUDI_METADATA_COLS) + List> hoodieColsProjected = projectedColsWithIndex.stream() + .filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) .collect(Collectors.toList()); + List> externalColsProjected = projectedColsWithIndex.stream() + .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) + && !VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) + .collect(Collectors.toList()); + List> virtualParquetColsProjected = projectedColsWithIndex.stream() + .filter(idxWithName -> VirtualColumn.VIRTUAL_COLUMN_NAMES.contains( + idxWithName.getValue())).collect(Collectors.toList()); + // This always matches hive table description List> colNameWithTypes = 
HoodieColumnProjectionUtils.getIOColumnNameAndTypes(job); List> hoodieColNamesOnlyWithTypes = colNameWithTypes.stream() @@ -389,34 +398,40 @@ public RecordReader getRecordReader(final InputSpli } else if (externalColsProjected.isEmpty()) { return super.getRecordReader(split, job, reporter); } else { + JobConf jobConf1 = new JobConf(job); + JobConf jobConf2 = new JobConf(job); + HoodieColumnProjectionUtils.setIOColumnNameAndTypes(jobConf1, hoodieColNamesOnlyWithTypes); + HoodieColumnProjectionUtils.setIOColumnNameAndTypes(jobConf2, colNamesWithTypesForExternal); + + // Adjust Projection Settings HoodieColumnProjectionUtils.setReadColumns(jobConf1, new ArrayList<>(), new ArrayList<>()); HoodieColumnProjectionUtils.setReadColumns(jobConf2, new ArrayList<>(), new ArrayList<>()); - List hoodieColNames = colsWithIndex.stream() - .filter(idxWithName -> idxWithName.getKey() < HoodieAvroUtils.NUM_HUDI_METADATA_COLS) + List hoodieColNames = projectedColsWithIndex.stream() + .filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) .map(idxWithName -> idxWithName.getValue()).collect(Collectors.toList()); - List hoodieColIds = colsWithIndex.stream() - .filter(idxWithName -> idxWithName.getKey() < HoodieAvroUtils.NUM_HUDI_METADATA_COLS) + List hoodieColIds = projectedColsWithIndex.stream() + .filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) .map(idxWithName -> idxWithName.getKey()).collect(Collectors.toList()); - List nonHoodieColNames = colsWithIndex.stream() - .filter(idxWithName -> idxWithName.getKey() >= HoodieAvroUtils.NUM_HUDI_METADATA_COLS) + + List externalColNamesWithVirtualCols = projectedColsWithIndex.stream() + .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) .map(idxWithName -> idxWithName.getValue()).collect(Collectors.toList()); - List nonHoodieColIdsAdjusted = colsWithIndex.stream() - .filter(idxWithName -> idxWithName.getKey() >= 
HoodieAvroUtils.NUM_HUDI_METADATA_COLS) - .map(idxWithName -> idxWithName.getKey() - HoodieAvroUtils.NUM_HUDI_METADATA_COLS) + List externalColIds = projectedColsWithIndex.stream() + .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) + && !VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) + .map(idxWithName -> idxWithName.getKey() - HoodieRecord.HOODIE_META_COLUMNS.size()) .collect(Collectors.toList()); + List externalColIdsWithVirtualCols = new ArrayList<>(externalColIds); + IntStream.range(0, virtualParquetColsProjected.size()) + .forEach(idx -> externalColIdsWithVirtualCols.add(idx + externalColIds.size())); + HoodieColumnProjectionUtils.setReadColumns(jobConf1, new ArrayList<>(), new ArrayList<>()); + HoodieColumnProjectionUtils.setReadColumns(jobConf2, new ArrayList<>(), new ArrayList<>()); List groupCols = Arrays.asList(job.get(READ_NESTED_COLUMN_PATH_CONF_STR, "").split(",")); - HoodieColumnProjectionUtils.appendReadColumns(jobConf1, hoodieColIds, hoodieColNames, new ArrayList<>()); - HoodieColumnProjectionUtils.appendReadColumns(jobConf2, nonHoodieColIdsAdjusted, nonHoodieColNames, groupCols); - if (LOG.isDebugEnabled()) { - LOG.debug("hoodieColNames=" + hoodieColNames + ", hoodieColIds=" + hoodieColIds - + ", SIZES : hoodieColNames=" + hoodieColNames.size() + ", hoodieColIds=" + hoodieColIds.size() - + ", nonHoodieColNames=" + nonHoodieColNames + ", nonHoodieColIdsAdjusted=" + nonHoodieColIdsAdjusted - + ", nonHoodieColNames=" + nonHoodieColNames.size() + ", nonHoodieColIdsAdjusted=" - + nonHoodieColIdsAdjusted.size()); - } - FileSystem fs = FileSystem.get(job); - //FileSplit rightSplit = - // makeSplit(externalFile, 0, externalFileStatus.getLen(), new String[0], new String[0]); + HoodieColumnProjectionUtils.appendReadColumns(jobConf1, hoodieColIds, + hoodieColNames, new ArrayList<>()); + HoodieColumnProjectionUtils.appendReadColumns(jobConf2, externalColIdsWithVirtualCols, + 
externalColNamesWithVirtualCols, groupCols); + FileSplit rightSplit = eSplit.getExternalFileSplit(); LOG.info("Generating column stitching reader for " + eSplit.getPath() + " and " + rightSplit.getPath()); return new HoodieColumnStichingRecordReader(super.getRecordReader(eSplit, job, reporter), diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/NullSkeletonRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/NullSkeletonRecordReader.java new file mode 100644 index 0000000000000..c21dc92738cae --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/NullSkeletonRecordReader.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop; + +import java.io.IOException; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hudi.common.model.HoodieRecord; + +public class NullSkeletonRecordReader implements RecordReader { + + private final ArrayWritable rec = new ArrayWritable(NullWritable.class, + new Writable[HoodieRecord.HOODIE_META_COLUMNS.size()]); + + @Override + public boolean next(NullWritable key, ArrayWritable value) throws IOException { + return true; + } + + @Override + public NullWritable createKey() { + return NullWritable.get(); + } + + @Override + public ArrayWritable createValue() { + return rec; + } + + @Override + public long getPos() throws IOException { + return 0; + } + + @Override + public void close() throws IOException { + + } + + @Override + public float getProgress() throws IOException { + return 0; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 5e346224adc72..e5f7f36bd079c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -34,6 +34,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.ExternalBaseFileSplit; +import org.apache.hudi.hadoop.HoodieColumnProjectionUtils; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; @@ -212,10 +213,15 @@ private static Configuration addProjectionField(Configuration conf, String field } private static void addRequiredProjectionFields(Configuration configuration) { - // Need this 
to do merge records in HoodieRealtimeRecordReader - addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, HOODIE_RECORD_KEY_COL_POS); - addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HOODIE_COMMIT_TIME_COL_POS); - addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD, HOODIE_PARTITION_PATH_COL_POS); + List projectedIds = new ArrayList<>(HoodieColumnProjectionUtils.getReadColumnIDs(configuration)); + List projectedNames = new ArrayList<>( + Arrays.asList(HoodieColumnProjectionUtils.getReadColumnNames(configuration))); + projectedIds.addAll(Arrays.asList(HOODIE_RECORD_KEY_COL_POS, HOODIE_COMMIT_TIME_COL_POS, + HOODIE_PARTITION_PATH_COL_POS)); + projectedNames.addAll(Arrays.asList(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, + HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + + HoodieColumnProjectionUtils.setReadColumns(configuration, projectedIds, projectedNames); } /** diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeExternalBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeExternalBaseFileSplit.java index 71e84ac18aa06..31ddfb8b81d30 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeExternalBaseFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeExternalBaseFileSplit.java @@ -38,6 +38,10 @@ public class RealtimeExternalBaseFileSplit extends ExternalBaseFileSplit impleme private String basePath; + public RealtimeExternalBaseFileSplit() { + super(); + } + public RealtimeExternalBaseFileSplit(FileSplit baseSplit, String basePath, List deltaLogPaths, String maxInstantTime, FileSplit externalFileSplit) throws IOException { super(baseSplit, externalFileSplit); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java index 
cf8727394788c..5f36d72dd3553 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java @@ -268,7 +268,8 @@ void assertStdOutContains(Pair stdOutErr, String expectedOutput, saveUpLogs(); } - assertEquals(times, count, "Did not find output the expected number of times"); + assertEquals(times, count, "Did not find output the expected number of times. stdOutSingleSpaced=" + + stdOutSingleSpaced); } public class TestExecStartResultCallback extends ExecStartResultCallback { diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java index fa061dccaa4c4..e56067195f483 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java @@ -53,11 +53,18 @@ public class ITTestHoodieDemo extends ITTestBase { private static final String COW_TABLE_NAME = "stock_ticks_cow"; private static final String MOR_TABLE_NAME = "stock_ticks_mor"; + private static final String BOOTSTRAPPED_SRC_PATH = "/user/hive/warehouse/stock_ticks_cow_bs_src"; + private static final String COW_BOOTSTRAPPED_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow_bs"; + private static final String MOR_BOOTSTRAPPED_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor_bs"; + private static final String COW_BOOTSTRAPPED_TABLE_NAME = "stock_ticks_cow_bs"; + private static final String MOR_BOOTSTRAPPED_TABLE_NAME = "stock_ticks_mor_bs"; + private static final String DEMO_CONTAINER_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/setup_demo_container.sh"; private static final String MIN_COMMIT_TIME_COW_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/get_min_commit_time_cow.sh"; private static final String MIN_COMMIT_TIME_MOR_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/get_min_commit_time_mor.sh"; private static final String HUDI_CLI_TOOL = 
HOODIE_WS_ROOT + "/hudi-cli/hudi-cli.sh"; private static final String COMPACTION_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/compaction.commands"; + private static final String SPARKSQL_BS_PREP_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-bootstrap-prep-source.commands"; private static final String SPARKSQL_BATCH1_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch1.commands"; private static final String SPARKSQL_BATCH2_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch2.commands"; private static final String SPARKSQL_INCREMENTAL_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-incremental.commands"; @@ -96,6 +103,7 @@ public void testDemo() throws Exception { // compaction scheduleAndRunCompaction(); + testHiveAfterSecondBatchAfterCompaction(); testPrestoAfterSecondBatchAfterCompaction(); testIncrementalHiveQueryAfterCompaction(); @@ -182,35 +190,42 @@ private void ingestFirstBatchAndHiveSync() throws Exception { private void testHiveAfterFirstBatch() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_TBLCHECK_COMMANDS); assertStdOutContains(stdOutErrPair, "| stock_ticks_cow |"); + assertStdOutContains(stdOutErrPair, "| stock_ticks_cow_bs |"); assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_ro |"); assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_rt |"); - + assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_bs_ro |"); + assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_bs_rt |"); assertStdOutContains(stdOutErrPair, "| partition |\n+----------------+\n| dt=2018-08-31 |\n+----------------+\n", 3); stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" - + "| GOOG | 2018-08-31 10:29:00 |\n", 3); + + "| GOOG | 2018-08-31 10:29:00 |\n", 6); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" + "+---------+----------------------+---------+------------+-----------+\n" + "| GOOG | 
2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", - 3); + 6); } private void testSparkSQLAfterFirstBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH1_COMMANDS, true); assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_cow_bs |false |\n" + + "|default |stock_ticks_mor_bs_ro |false |\n" + + "|default |stock_ticks_mor_bs_rt |false |" + "|default |stock_ticks_mor_ro |false |\n" - + "|default |stock_ticks_mor_rt |false |"); + + "|default |stock_ticks_mor_rt |false |"); assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 3); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 3); + "+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 6); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 6); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 6); } private void ingestSecondBatchAndHiveSync() throws Exception { + // Note : Unlike normal tables, bootstrapped tables do not have checkpoint. So, they + // begin with null checkpoint and read all states. 
List cmds = CollectionUtils.createImmutableList( ("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH2 + " " + HDFS_BATCH_PATH2), ("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE @@ -226,14 +241,28 @@ private void ingestSecondBatchAndHiveSync() throws Exception { + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME + " --props /var/demo/config/dfs-source.properties" + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME))); + + " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)), + ("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --table-type COPY_ON_WRITE " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BOOTSTRAPPED_BASE_PATH + " --target-table " + COW_BOOTSTRAPPED_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties" + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_BOOTSTRAPPED_TABLE_NAME)), + ("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --table-type MERGE_ON_READ " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BOOTSTRAPPED_BASE_PATH + " --target-table " + MOR_BOOTSTRAPPED_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties" + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_BOOTSTRAPPED_TABLE_NAME))); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); } private void testPrestoAfterFirstBatch() throws Exception { Pair 
stdOutErrPair = executePrestoCommandFile(HDFS_PRESTO_INPUT_TABLE_CHECK_PATH); - assertStdOutContains(stdOutErrPair, "stock_ticks_cow"); - assertStdOutContains(stdOutErrPair, "stock_ticks_mor",2); + assertStdOutContains(stdOutErrPair, "stock_ticks_cow", 2); + assertStdOutContains(stdOutErrPair, "stock_ticks_mor",4); stdOutErrPair = executePrestoCommandFile(HDFS_PRESTO_INPUT_BATCH1_PATH); assertStdOutContains(stdOutErrPair, @@ -247,20 +276,20 @@ private void testPrestoAfterFirstBatch() throws Exception { private void testHiveAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" - + "| GOOG | 2018-08-31 10:29:00 |\n"); + + "| GOOG | 2018-08-31 10:29:00 |\n", 2); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" - + "| GOOG | 2018-08-31 10:59:00 |\n", 2); + + "| GOOG | 2018-08-31 10:59:00 |\n", 4); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" + "+---------+----------------------+---------+------------+-----------+\n" + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" - + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n"); + + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", 2); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" + "+---------+----------------------+---------+------------+-----------+\n" + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |\n", - 2); + 4); } private void testPrestoAfterSecondBatch() throws Exception { @@ -280,13 +309,13 @@ private void testPrestoAfterSecondBatch() throws Exception { private void testHiveAfterSecondBatchAfterCompaction() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH2_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 
|\n+---------+----------------------+\n" - + "| GOOG | 2018-08-31 10:59:00 |", 2); + + "| GOOG | 2018-08-31 10:59:00 |", 4); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" + "+---------+----------------------+---------+------------+-----------+\n" + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |", - 2); + 4); } private void testPrestoAfterSecondBatchAfterCompaction() throws Exception { @@ -302,13 +331,13 @@ private void testPrestoAfterSecondBatchAfterCompaction() throws Exception { private void testSparkSQLAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true); assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n|GOOG |2018-08-31 10:59:00|\n+------+-------------------+", 2); + "+------+-------------------+\n|GOOG |2018-08-31 10:59:00|\n+------+-------------------+", 4); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 6); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 4); assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+"); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|"); + "+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 2); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 2); } private void testIncrementalHiveQuery(String minCommitTimeScript, String incrementalCommandsFile, @@ -324,36 +353,40 @@ private void testIncrementalHiveQueryBeforeCompaction() throws Exception { String expectedOutput = "| 
GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"; // verify that 10:59 is present in COW table because there is no compaction process for COW - testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 1); + testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 2); // verify that 10:59 is NOT present in RO table because of pending compaction testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 0); // verify that 10:59 is present in RT table even with pending compaction - testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 1); + testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 2); } private void testIncrementalHiveQueryAfterCompaction() throws Exception { String expectedOutput = "| symbol | ts | volume | open | close |\n" - + "+---------+----------------------+---------+------------+-----------+\n" - + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"; + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"; // verify that 10:59 is present for all views because compaction is complete - testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 1); - testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 1); - testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 1); + testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 2); + testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 2); + testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 
2); } private void testIncrementalSparkSQLQuery() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_INCREMENTAL_COMMANDS, true); - assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|"); - assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n" - + "|default |stock_ticks_derived_mor_ro|false |\n" - + "|default |stock_ticks_derived_mor_rt|false |\n" - + "|default |stock_ticks_mor_ro |false |\n" - + "|default |stock_ticks_mor_rt |false |\n" - + "| |stock_ticks_cow_incr |true |"); - assertStdOutContains(stdOutErrPair, "|count(1)|\n+--------+\n|99 |", 2); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2); + assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_cow_bs |false |\n" + + "|default |stock_ticks_derived_mor_bs_ro|false |\n" + + "|default |stock_ticks_derived_mor_bs_rt|false |\n" + + "|default |stock_ticks_derived_mor_ro |false |\n" + + "|default |stock_ticks_derived_mor_rt |false |\n" + + "|default |stock_ticks_mor_bs_ro |false |\n" + + "|default |stock_ticks_mor_bs_rt |false |\n" + + "|default |stock_ticks_mor_ro |false |\n" + + "|default |stock_ticks_mor_rt |false |"); + assertStdOutContains(stdOutErrPair, "|count(1)|\n+--------+\n|99 |", 4); } private void scheduleAndRunCompaction() throws Exception { diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index b51805f91b0e7..e80c66e9522a3 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -18,13 +18,18 @@ package org.apache.hudi; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.HoodieWriteClient; import 
org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; @@ -34,6 +39,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.hadoop.HoodieHiveUtil; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.index.HoodieIndex; @@ -41,6 +47,8 @@ import org.apache.hudi.table.UserDefinedBulkInsertPartitioner; import org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -55,6 +63,8 @@ */ public class DataSourceUtils { + private static final Logger LOG = LogManager.getLogger(DataSourceUtils.class); + /** * Create a key generator class via reflection, passing in any configs needed. *

@@ -212,4 +222,58 @@ public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String b DataSourceWriteOptions.DEFAULT_HIVE_USE_JDBC_OPT_VAL())); return hiveSyncConfig; } + + public static String getTablePath(FileSystem fs, Path[] paths) throws IOException { + LOG.info("Getting table path.."); + for (Path path: paths) { + FileStatus fileStatus = fs.getFileStatus(path); + Option tablePath; + + if (fileStatus.isFile()) { + tablePath = getTablePathFromFile(fs, fileStatus); + } else { + tablePath = getTablePathFromDir(fs, fileStatus); + } + + if (tablePath.isPresent()) { + return tablePath.get().toString(); + } + } + + throw new TableNotFoundException("Cannot find Hudi table for the path provided"); + } + + private static Option getTablePathFromFile(FileSystem fs, FileStatus fileStatus) throws IOException { + LOG.info("Getting table path from file path : " + fileStatus.getPath()); + Path filePath = fileStatus.getPath(); + String filePathStr = filePath.toString(); + + if (filePathStr.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/")) { + // Handle file inside metadata folder + Path tablePath = new Path(filePathStr); + while (!tablePath.toString().endsWith(HoodieTableMetaClient.METAFOLDER_NAME)) { + tablePath = tablePath.getParent(); + } + return Option.of(tablePath.getParent()); + } else if (HoodiePartitionMetadata.hasPartitionMetadata(fs, filePath.getParent())) { + // Handle partition path + Path partitionPath = filePath.getParent(); + HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath); + metadata.readFromFS(); + return Option.of(HoodieHiveUtil.getNthParent(partitionPath, metadata.getPartitionDepth())); + } + + return Option.empty(); + } + + private static Option getTablePathFromDir(FileSystem fs, FileStatus fileStatus) throws IOException { + LOG.info("Getting table path from directory path : " + fileStatus.getPath().toString()); + Path tablePath = new Path(fileStatus.getPath().toString()); + + while 
(tablePath != null && !fs.exists(new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME))) { + tablePath = tablePath.getParent(); + } + + return tablePath == null ? Option.empty() : Option.of(tablePath); + } } diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java index e810ff1779dcb..af4504573ffb4 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java @@ -38,6 +38,8 @@ public class ComplexKeyGenerator extends KeyGenerator { protected final boolean hiveStylePartitioning; + protected final boolean encodePartitionPath; + public ComplexKeyGenerator(TypedProperties props) { super(props); this.recordKeyFields = Arrays.asList(props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()).split(",")) @@ -47,6 +49,8 @@ public ComplexKeyGenerator(TypedProperties props) { .stream().map(String::trim).collect(Collectors.toList()); this.hiveStylePartitioning = props.getBoolean(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY(), Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL())); + this.encodePartitionPath = props.getBoolean(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY(), + Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL())); } @Override diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java index a9df3ee751db6..8568d4eaccecf 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java @@ -37,12 +37,16 @@ public class SimpleKeyGenerator extends KeyGenerator { protected final boolean hiveStylePartitioning; + protected final boolean encodePartitionPath; + public 
SimpleKeyGenerator(TypedProperties props) { super(props); this.recordKeyField = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()); this.partitionPathField = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()); this.hiveStylePartitioning = props.getBoolean(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY(), Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL())); + this.encodePartitionPath = props.getBoolean(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY(), + Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL())); } @Override diff --git a/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 3d1172f0f6fa8..d7431473f9581 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -50,6 +50,8 @@ object DataSourceReadOptions { val QUERY_TYPE_INCREMENTAL_OPT_VAL = "incremental" val DEFAULT_QUERY_TYPE_OPT_VAL: String = QUERY_TYPE_SNAPSHOT_OPT_VAL + val READ_PATHS_OPT_KEY = "hoodie.datasource.read.paths" + @Deprecated val VIEW_TYPE_OPT_KEY = "hoodie.datasource.view.type" @Deprecated @@ -129,6 +131,7 @@ object DataSourceWriteOptions { val INSERT_OPERATION_OPT_VAL = "insert" val UPSERT_OPERATION_OPT_VAL = "upsert" val DELETE_OPERATION_OPT_VAL = "delete" + val BOOTSTRAP_OPERATION_OPT_VAL = "bootstrap" val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL /** @@ -207,7 +210,8 @@ object DataSourceWriteOptions { */ val HIVE_STYLE_PARTITIONING_OPT_KEY = "hoodie.datasource.write.hive_style_partitioning" val DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = "false" - + val URL_ENCODE_PARTITIONING_OPT_KEY = "hoodie.datasource.write.partitionpath.urlencode" + val DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = "false" /** * Key generator class, that implements will extract the key out of incoming record * 
diff --git a/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala index fbdd4ea9cfb1b..e003232c838a3 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -17,10 +17,16 @@ package org.apache.hudi +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY} +import org.apache.hudi.common.bootstrap.index.BootstrapIndex +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.exception.HoodieException import org.apache.hudi.hadoop.HoodieROTablePathFilter import org.apache.log4j.LogManager +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand import org.apache.spark.sql.execution.streaming.Sink @@ -54,29 +60,54 @@ class DefaultSource extends RelationProvider val parameters = Map(QUERY_TYPE_OPT_KEY -> DEFAULT_QUERY_TYPE_OPT_VAL) ++ translateViewTypesToQueryTypes(optParams) val path = parameters.get("path") - if (path.isEmpty) { - throw new HoodieException("'path' must be specified.") - } if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_SNAPSHOT_OPT_VAL)) { - // this is just effectively RO view only, where `path` can contain a mix of - // non-hoodie/hoodie path files. set the path filter up - sqlContext.sparkContext.hadoopConfiguration.setClass( - "mapreduce.input.pathFilter.class", - classOf[HoodieROTablePathFilter], - classOf[org.apache.hadoop.fs.PathFilter]) - - log.info("Constructing hoodie (as parquet) data source with options :" + parameters) - log.warn("Snapshot view not supported yet via data source, for MERGE_ON_READ tables. 
" + - "Please query the Hive table registered using Spark SQL.") - // simply return as a regular parquet relation - DataSource.apply( - sparkSession = sqlContext.sparkSession, - userSpecifiedSchema = Option(schema), - className = "parquet", - options = parameters) - .resolveRelation() + val readPathsStr = parameters.get(DataSourceReadOptions.READ_PATHS_OPT_KEY) + if (path.isEmpty && readPathsStr.isEmpty) { + throw new HoodieException(s"'path' or '$READ_PATHS_OPT_KEY' or both must be specified.") + } + + val readPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq()) + val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths + + val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration) + val globPaths = checkAndGlobPathIfNecessary(allPaths, fs) + + val tablePath = DataSourceUtils.getTablePath(fs, globPaths.toArray) + log.info("Obtained hudi table path: " + tablePath) + + val metaClient = new HoodieTableMetaClient(fs.getConf, tablePath) + val bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient) + log.info("Bootstrap Index Available: " + bootstrapIndex.isIndexAvailable) + + if (bootstrapIndex.isIndexAvailable) { + // For bootstrapped tables, use our custom Spark relation for querying + new HudiBootstrapRelation(sqlContext, schema, globPaths, metaClient, optParams) + } else { + // this is just effectively RO view only, where `path` can contain a mix of + // non-hoodie/hoodie path files. set the path filter up + sqlContext.sparkContext.hadoopConfiguration.setClass( + "mapreduce.input.pathFilter.class", + classOf[HoodieROTablePathFilter], + classOf[org.apache.hadoop.fs.PathFilter]) + + log.info("Constructing hoodie (as parquet) data source with options :" + parameters) + log.warn("Snapshot view not supported yet via data source, for MERGE_ON_READ tables. 
" + + "Please query the Hive table registered using Spark SQL.") + // simply return as a regular parquet relation + DataSource.apply( + sparkSession = sqlContext.sparkSession, + paths = readPaths, + userSpecifiedSchema = Option(schema), + className = "parquet", + options = parameters) + .resolveRelation() + } } else if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_INCREMENTAL_OPT_VAL)) { + if (path.isEmpty) { + throw new HoodieException("'path' must be specified for incremental query.") + } + new IncrementalRelation(sqlContext, path.get, optParams, schema) } else { throw new HoodieException("Invalid query type :" + parameters(QUERY_TYPE_OPT_KEY)) @@ -105,7 +136,12 @@ class DefaultSource extends RelationProvider df: DataFrame): BaseRelation = { val parameters = HoodieSparkSqlWriter.parametersWithWriteDefaults(optParams) - HoodieSparkSqlWriter.write(sqlContext, mode, parameters, df) + + if (parameters(OPERATION_OPT_KEY).equals(BOOTSTRAP_OPERATION_OPT_VAL)) { + HoodieSparkSqlWriter.bootstrap(sqlContext, mode, parameters, df) + } else { + HoodieSparkSqlWriter.write(sqlContext, mode, parameters, df) + } new HudiEmptyRelation(sqlContext, df.schema) } @@ -122,5 +158,13 @@ class DefaultSource extends RelationProvider outputMode) } + private def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = { + paths.flatMap(path => { + val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory) + val globPaths = SparkHadoopUtil.get.globPathIfNecessary(fs, qualified) + globPaths + }) + } + override def shortName(): String = "hudi" } diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index e1bfe877559c5..b7e7bf063afee 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -24,6 +24,7 @@ import org.apache.avro.generic.GenericRecord 
import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.client.{HoodieWriteClient, WriteStatus} import org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.fs.FSUtils @@ -34,6 +35,7 @@ import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.exception.HoodieException import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.log4j.LogManager +import org.apache.spark.SparkContext import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} @@ -60,7 +62,6 @@ private[hudi] object HoodieSparkSqlWriter { case Some(ser) if ser.equals("org.apache.spark.serializer.KryoSerializer") => case _ => throw new HoodieException("hoodie only support org.apache.spark.serializer.KryoSerializer as spark.serializer") } - val tableType = parameters(TABLE_TYPE_OPT_KEY) val operation = // It does not make sense to allow upsert() operation if INSERT_DROP_DUPS_OPT_KEY is true // Auto-correct the operation to "insert" if OPERATION_OPT_KEY is set to "upsert" wrongly @@ -112,25 +113,7 @@ private[hudi] object HoodieSparkSqlWriter { orderingVal, keyGenerator.getKey(gr), parameters(PAYLOAD_CLASS_OPT_KEY)) }).toJavaRDD() - // Handle various save modes - if (mode == SaveMode.ErrorIfExists && exists) { - throw new HoodieException(s"hoodie table at $basePath already exists.") - } - if (mode == SaveMode.Ignore && exists) { - log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") - (true, common.util.Option.empty()) - } - if (mode == SaveMode.Overwrite && exists) { - log.warn(s"hoodie table at $basePath already exists. 
Deleting existing data & overwriting with new data.") - fs.delete(basePath, true) - exists = false - } - - // Create the table if not present - if (!exists) { - HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, path.get, tableType, - tblName.get, "archived", parameters(PAYLOAD_CLASS_OPT_KEY)) - } + initTable(mode, basePath, fs, exists, sparkContext, parameters) // Create a HoodieWriteClient & issue the write. val client = DataSourceUtils.createHoodieClient(jsc, schema.toString, path.get, tblName.get, @@ -190,6 +173,37 @@ private[hudi] object HoodieSparkSqlWriter { (writeSuccessful, common.util.Option.ofNullable(instantTime)) } + def bootstrap(sqlContext: SQLContext, + mode: SaveMode, + parameters: Map[String, String], + df: DataFrame): Unit = { + + val sparkContext = sqlContext.sparkContext + val path = parameters.get("path") + val tableName = parameters.get(HoodieWriteConfig.TABLE_NAME) + + var schema: String = null + if (df.schema.nonEmpty) { + val structName = s"${tableName.get}_record" + val nameSpace = s"hoodie.${tableName.get}" + schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace).toString + } else { + schema = HoodieAvroUtils.getNullSchema.toString + } + + val basePath = new Path(parameters("path")) + val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) + val exists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) + + initTable(mode, basePath, fs, exists, sparkContext, parameters) + + val jsc = new JavaSparkContext(sqlContext.sparkContext) + val writeClient = DataSourceUtils.createHoodieClient(jsc, schema, path.get, tableName.get, + mapAsJavaMap(parameters)) + writeClient.bootstrap(org.apache.hudi.common.util.Option.empty()) + syncHiveIfEnabled(basePath, jsc, parameters) + } + /** * Add default options for unspecified write options keys. 
* @@ -228,6 +242,42 @@ private[hudi] object HoodieSparkSqlWriter { props } + private def initTable(mode: SaveMode, basePath: Path, fs: FileSystem, tableExists: Boolean, + sparkContext: SparkContext, parameters: Map[String, String]): Unit = { + val tableName = parameters.get(HoodieWriteConfig.TABLE_NAME) + val tableType = parameters(TABLE_TYPE_OPT_KEY) + + // Handle various save modes + if (mode == SaveMode.ErrorIfExists && tableExists) { + throw new HoodieException(s"hoodie table at $basePath already exists.") + } + if (mode == SaveMode.Ignore && tableExists) { + log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") + (true, common.util.Option.empty()) + } + if (mode == SaveMode.Overwrite && tableExists) { + log.warn(s"hoodie table at $basePath already exists. Deleting existing data & overwriting with new data.") + fs.delete(basePath, true) + } + + // Create the table if not present, or re-create it after Overwrite deleted the base path above + if (!tableExists || mode == SaveMode.Overwrite) { + HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, basePath.toString, tableType, + tableName.get, "archived", parameters(PAYLOAD_CLASS_OPT_KEY)) + } + } + + private def syncHiveIfEnabled(basePath: Path, jsc: JavaSparkContext, parameters: Map[String, String]): Boolean = { + val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean) + if (hiveSyncEnabled) { + log.info("Syncing to Hive Metastore (URL: " + parameters(HIVE_URL_OPT_KEY) + ")") + val fs = FSUtils.getFs(basePath.toString, jsc.hadoopConfiguration) + syncHive(basePath, fs, parameters) + } else { + true + } + } + private def syncHive(basePath: Path, fs: FileSystem, parameters: Map[String, String]): Boolean = { val hiveSyncConfig: HiveSyncConfig = buildSyncConfig(basePath, parameters) val hiveConf: HiveConf = new HiveConf() @@ -279,16 +329,9 @@ private[hudi] object HoodieSparkSqlWriter { log.info("Commit " + instantTime + " failed!") } - val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => 
r.toBoolean) - val syncHiveSucess = if (hiveSyncEnabled) { - log.info("Syncing to Hive Metastore (URL: " + parameters(HIVE_URL_OPT_KEY) + ")") - val fs = FSUtils.getFs(basePath.toString, jsc.hadoopConfiguration) - syncHive(basePath, fs, parameters) - } else { - true - } + val syncHiveSuccess = syncHiveIfEnabled(basePath, jsc, parameters) client.close() - commitSuccess && syncHiveSucess + commitSuccess && syncHiveSuccess } else { log.error(s"$operation failed with $errorCount errors :") if (log.isTraceEnabled) { diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRDD.scala b/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRDD.scala new file mode 100644 index 0000000000000..05c66821859e8 --- /dev/null +++ b/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRDD.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +class HudiBootstrapRDD(@transient spark: SparkSession, + dataReadFunction: PartitionedFile => Iterator[Any], + skeletonReadFunction: PartitionedFile => Iterator[Any], + regularReadFunction: PartitionedFile => Iterator[Any], + dataSchema: StructType, + skeletonSchema: StructType, + requiredColumns: Array[String], + tableState: HudiBootstrapTableState) + extends RDD[InternalRow](spark.sparkContext, Nil) { + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val bootstrapPartition = split.asInstanceOf[HudiBootstrapPartition] + + if (bootstrapPartition.split.skeletonFile.isDefined) { + logInfo("Got Split => Index: " + bootstrapPartition.index + ", Data File: " + + bootstrapPartition.split.dataFile.filePath + ", Skeleton File: " + + bootstrapPartition.split.skeletonFile.get.filePath) + } else { + logInfo("Got Split => Index: " + bootstrapPartition.index + ", Data File: " + + bootstrapPartition.split.dataFile.filePath) + } + + var partitionedFileIterator: Iterator[InternalRow] = null + + if (bootstrapPartition.split.skeletonFile.isDefined) { + val dataFileIterator = read(bootstrapPartition.split.dataFile, dataReadFunction) + val skeletonFileIterator = read(bootstrapPartition.split.skeletonFile.get, skeletonReadFunction) + partitionedFileIterator = merge(skeletonFileIterator, dataFileIterator) + } else { + partitionedFileIterator = read(bootstrapPartition.split.dataFile, regularReadFunction) + } + + partitionedFileIterator + } + + def merge(skeletonFileIterator: Iterator[InternalRow], dataFileIterator: Iterator[InternalRow]) + : Iterator[InternalRow] = { + new 
Iterator[InternalRow] { + override def hasNext: Boolean = dataFileIterator.hasNext && skeletonFileIterator.hasNext + override def next(): InternalRow = { + mergeInternalRow(skeletonFileIterator.next(), dataFileIterator.next()) + } + } + } + + def mergeInternalRow(skeletonRow: InternalRow, dataRow: InternalRow): InternalRow = { + val skeletonArr = skeletonRow.copy().toSeq(skeletonSchema) + val dataArr = dataRow.copy().toSeq(dataSchema) + // We need to return it in the order requested + val mergedArr = requiredColumns.map(col => { + if (skeletonSchema.fieldNames.contains(col)) { + val idx = skeletonSchema.fieldIndex(col) + skeletonArr(idx) + } else { + val idx = dataSchema.fieldIndex(col) + dataArr(idx) + } + }) + + logDebug("Merged data and skeleton values => " + mergedArr.mkString(",")) + val mergedRow = InternalRow.fromSeq(mergedArr) + mergedRow + } + + def read(partitionedFile: PartitionedFile, readFileFunction: PartitionedFile => Iterator[Any]) + : Iterator[InternalRow] = { + val fileIterator = readFileFunction(partitionedFile) + + import scala.collection.JavaConverters._ + + val rows = fileIterator.flatMap(_ match { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + }) + rows + } + + override protected def getPartitions: Array[Partition] = { + logInfo("Getting partitions..") + + tableState.files.zipWithIndex.map(file => { + if (file._1.skeletonFile.isDefined) { + logInfo("Forming partition with => " + file._2 + "," + file._1.dataFile.filePath + + "," + file._1.skeletonFile.get.filePath) + HudiBootstrapPartition(file._2, file._1) + } else { + logInfo("Forming partition with => " + file._2 + "," + file._1.dataFile.filePath) + HudiBootstrapPartition(file._2, file._1) + } + }).toArray + } +} + +case class HudiBootstrapPartition(index: Int, split: HudiBootstrapSplit) extends Partition diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRelation.scala 
b/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRelation.scala new file mode 100644 index 0000000000000..b7aa438129e66 --- /dev/null +++ b/hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRelation.scala @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.model.{HoodieBaseFile, HoodieRecord} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex, PartitionedFile} +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +import scala.collection.JavaConverters._ + +class HudiBootstrapRelation(@transient val _sqlContext: SQLContext, + val userSchema: StructType, + val globPaths: Seq[Path], + val metaClient: HoodieTableMetaClient, + val optParams: Map[String, String]) extends BaseRelation + with PrunedFilteredScan with Logging { + + val fileIndex: HudiBootstrapFileIndex = buildFileIndex() + + val skeletonSchema: StructType = StructType(Seq( + StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.FILENAME_METADATA_FIELD, StringType, nullable = true) + )) + + var dataSchema: StructType = _ + + var completeSchema: StructType = _ + + override def sqlContext: SQLContext = _sqlContext + + override val needConversion: Boolean = false + + override def schema: StructType = inferFullSchema() + + /** + * Implementing PrunedScan to support column pruning, by reading only the required columns from the parquet 
files + * instead by passing them down to the ParquetFileFormat. + * + * TODO: To get better performance with Filters we should implement PrunedFilteredScan push filters down to the + * parquet files. But this is much more tricky to implement because then with filters being pushed down, unequal + * number of rows may be returned by external data reader, and skeleton file readers. Merging in this scenario + * will become much more complicated. + * + * @param requiredColumns This contains the columns user has passed in select() or filter() operations on the + * dataframe + * @return + */ + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + logInfo("Starting scan..") + filters.foreach(filter => logInfo("Obtained filter: " + filter.references.mkString(",") + " " + + filter.getClass)) + + // Compute splits + val bootstrapSplits = fileIndex.files.map(hoodieBaseFile => { + var skeletonFile: Option[PartitionedFile] = Option.empty + var dataFile: PartitionedFile = null + + if (hoodieBaseFile.getExternalBaseFile.isPresent) { + skeletonFile = Option(PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen)) + dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getExternalBaseFile.get().getPath, 0, + hoodieBaseFile.getExternalBaseFile.get().getFileLen) + } else { + dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen) + } + HudiBootstrapSplit(dataFile, skeletonFile) + }) + val tableState = HudiBootstrapTableState(bootstrapSplits) + + // Get required schemas for column pruning + val requiredDataSchema = StructType(dataSchema.filter(field => requiredColumns.contains(field.name))) + val requiredSkeletonSchema = StructType(skeletonSchema.filter(field => requiredColumns.contains(field.name))) + val requiredRegularSchema = StructType(requiredColumns.map(col => { + completeSchema.find(_.name == col).get + })) + + // Prepare readers for reading data file 
and skeleton files + val dataReadFunction = new ParquetFileFormat() + .buildReaderWithPartitionValues( + sparkSession = _sqlContext.sparkSession, + dataSchema = dataSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredDataSchema, + filters = Nil, + options = Map.empty, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val skeletonReadFunction = new ParquetFileFormat() + .buildReaderWithPartitionValues( + sparkSession = _sqlContext.sparkSession, + dataSchema = skeletonSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredSkeletonSchema, + filters = Nil, + options = Map.empty, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val regularReadFunction = new ParquetFileFormat() + .buildReaderWithPartitionValues( + sparkSession = _sqlContext.sparkSession, + dataSchema = completeSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredRegularSchema, + filters = filters, + options = Map.empty, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf()) + + val rdd = new HudiBootstrapRDD(_sqlContext.sparkSession, dataReadFunction, skeletonReadFunction, + regularReadFunction, requiredDataSchema, requiredSkeletonSchema, requiredColumns, tableState) + + logInfo("Number of partitions for HudiBootstrapRDD => " + rdd.partitions.length) + rdd.asInstanceOf[RDD[Row]] + } + + def inferFullSchema(): StructType = { + if (completeSchema == null) { + logInfo("Inferring schema..") + val schemaResolver = new TableSchemaResolver(metaClient) + val tableSchema = schemaResolver.getTableAvroSchemaWithoutMetadataFields + dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) + completeSchema = StructType(skeletonSchema.fields ++ dataSchema.fields) + } + completeSchema + } + + def buildFileIndex(): HudiBootstrapFileIndex = { + logInfo("Building file index..") + val inMemoryFileIndex = createInMemoryFileIndex(globPaths) + val fileStatuses = 
inMemoryFileIndex.allFiles() + + if (fileStatuses.isEmpty) { + throw new RuntimeException("No files found for reading.") + } + + val fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline.getCommitsTimeline + .filterCompletedInstants, fileStatuses.toArray) + val latestFiles: List[HoodieBaseFile] = fsView.getLatestBaseFiles.iterator().asScala.toList + latestFiles.foreach(file => logInfo("Skeleton file path: " + file.getPath)) + latestFiles.filter(_.getExternalBaseFile.isPresent).foreach(file => { + logInfo("External data file path: " + file.getExternalBaseFile.get().getPath) + }) + + HudiBootstrapFileIndex(latestFiles) + } + + private def createInMemoryFileIndex(globbedPaths: Seq[Path]): InMemoryFileIndex = { + val fileStatusCache = FileStatusCache.getOrCreate(_sqlContext.sparkSession) + new InMemoryFileIndex(_sqlContext.sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache) + } +} + +case class HudiBootstrapFileIndex(files: List[HoodieBaseFile]) + +case class HudiBootstrapTableState(files: List[HudiBootstrapSplit]) + +case class HudiBootstrapSplit(dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]) diff --git a/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala index 436895bda3499..e796900544e49 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala @@ -17,19 +17,24 @@ package org.apache.hudi +import com.google.common.collect.Lists +import org.apache.avro.Schema import org.apache.hadoop.fs.GlobPattern import org.apache.hadoop.fs.Path import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.common.bootstrap.index.BootstrapIndex import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieRecord, HoodieTableType} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import 
org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.exception.HoodieException import org.apache.hudi.table.HoodieTable import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SQLContext} import scala.collection.JavaConversions._ import scala.collection.mutable @@ -47,6 +52,14 @@ class IncrementalRelation(val sqlContext: SQLContext, private val log = LogManager.getLogger(classOf[IncrementalRelation]) + val skeletonSchema: StructType = StructType(Seq( + StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, StringType, nullable = true), + StructField(HoodieRecord.FILENAME_METADATA_FIELD, StringType, nullable = true) + )) + private val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, basePath, true) // MOR tables not supported yet if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) { @@ -71,13 +84,16 @@ class IncrementalRelation(val sqlContext: SQLContext, optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp)) .getInstants.iterator().toList - // use schema from latest metadata, if not present, read schema from the data file - private val latestSchema = { - val schemaUtil = new TableSchemaResolver(metaClient) - val tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaUtil.getTableAvroSchemaWithoutMetadataFields); - 
AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) + // use schema from a file produced in the latest instant + val latestSchema: StructType = { + log.info("Inferring schema..") + val schemaResolver = new TableSchemaResolver(metaClient) + val tableSchema = schemaResolver.getTableAvroSchemaWithoutMetadataFields + val dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) + StructType(skeletonSchema.fields ++ dataSchema.fields) } + private val filters = { if (optParams.contains(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY)) { val filterStr = optParams.getOrElse( @@ -92,36 +108,69 @@ class IncrementalRelation(val sqlContext: SQLContext, override def schema: StructType = latestSchema override def buildScan(): RDD[Row] = { - val fileIdToFullPath = mutable.HashMap[String, String]() + val regularFileIdToFullPath = mutable.HashMap[String, String]() + var metaBootstrapFileIdToFullPath = mutable.HashMap[String, String]() + for (commit <- commitsToReturn) { val metadata: HoodieCommitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit) .get, classOf[HoodieCommitMetadata]) - fileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap + + if (HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS == commit.getTimestamp) { + metaBootstrapFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap + } else { + regularFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap + } + } + + if (metaBootstrapFileIdToFullPath.nonEmpty) { + // filter out meta bootstrap files that have had more commits since metadata bootstrap + metaBootstrapFileIdToFullPath = metaBootstrapFileIdToFullPath + .filterNot(fileIdFullPath => regularFileIdToFullPath.contains(fileIdFullPath._1)) } + val pathGlobPattern = optParams.getOrElse( DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY, DataSourceReadOptions.DEFAULT_INCR_PATH_GLOB_OPT_VAL) - val filteredFullPath =
if(!pathGlobPattern.equals(DataSourceReadOptions.DEFAULT_INCR_PATH_GLOB_OPT_VAL)) { - val globMatcher = new GlobPattern("*" + pathGlobPattern) - fileIdToFullPath.filter(p => globMatcher.matches(p._2)) - } else { - fileIdToFullPath + val (filteredRegularFullPaths, filteredMetaBootstrapFullPaths) = { + if(!pathGlobPattern.equals(DataSourceReadOptions.DEFAULT_INCR_PATH_GLOB_OPT_VAL)) { + val globMatcher = new GlobPattern("*" + pathGlobPattern) + (regularFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values, + metaBootstrapFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values) + } else { + (regularFileIdToFullPath.values, metaBootstrapFileIdToFullPath.values) + } } // unset the path filter, otherwise if end_instant_time is not the latest instant, path filter set for RO view // will filter out all the files incorrectly. sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class") val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path")) - if (filteredFullPath.isEmpty) { + if (filteredRegularFullPaths.isEmpty && filteredMetaBootstrapFullPaths.isEmpty) { sqlContext.sparkContext.emptyRDD[Row] } else { log.info("Additional Filters to be applied to incremental source are :" + filters) - filters.foldLeft(sqlContext.read.options(sOpts) - .schema(latestSchema) - .parquet(filteredFullPath.values.toList: _*) - .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)) - .filter(String.format("%s <= '%s'", - HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)))((e, f) => e.filter(f)) - .toDF().rdd + + var df: DataFrame = sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], latestSchema) + + if (metaBootstrapFileIdToFullPath.nonEmpty) { + df = sqlContext.sparkSession.read + .format("hudi") + .schema(latestSchema) + .option(DataSourceReadOptions.READ_PATHS_OPT_KEY, filteredMetaBootstrapFullPaths.mkString(",")) + .load() + } + + if 
(regularFileIdToFullPath.nonEmpty) + { + df = df.union(sqlContext.read.options(sOpts) + .schema(latestSchema) + .parquet(filteredRegularFullPaths.toList: _*) + .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.head.getTimestamp)) + .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.last.getTimestamp))) + } + + filters.foldLeft(df)((e, f) => e.filter(f)).rdd } } } diff --git a/hudi-spark/src/test/java/org/apache/hudi/client/TestBootstrap.java b/hudi-spark/src/test/java/org/apache/hudi/client/TestBootstrap.java index 495e8b057897b..e8eefdcdd61ee 100644 --- a/hudi-spark/src/test/java/org/apache/hudi/client/TestBootstrap.java +++ b/hudi-spark/src/test/java/org/apache/hudi/client/TestBootstrap.java @@ -114,6 +114,11 @@ public void setUp() throws Exception { srcPath = tmpFolder.toAbsolutePath().toString() + "/data"; + // initialize parquet input format + reloadInputFormats(); + } + + private void reloadInputFormats() { // initialize parquet input format roInputFormat = new HoodieParquetInputFormat(); roJobConf = new JobConf(jsc.hadoopConfiguration()); @@ -165,7 +170,7 @@ public void testMetadataBootstrapUnpartitionedCOW() throws Exception { .withSchema(schema.toString()) .withBootstrapModeSelector(MetadataOnlyBootstrapModeSelector.class.getName()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, true, 1, timestamp, timestamp, false); @@ -183,7 +188,7 @@ public void testMetadataBootstrapUnpartitionedCOW() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -223,7 +228,7 @@ public void testMetadataBootstrapWithUpdatesCOW() throws 
Exception { .withBootstrapModeSelector(MetadataOnlyBootstrapModeSelector.class.getName()) .build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, true, 1, timestamp, timestamp, false); @@ -241,7 +246,7 @@ public void testMetadataBootstrapWithUpdatesCOW() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -282,7 +287,7 @@ public void testMetadataBootstrapWithUpdatesMOR() throws Exception { .withBootstrapModeSelector(MetadataOnlyBootstrapModeSelector.class.getName()).build(); System.out.println("Config Props :" + config.getProps().getProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY())); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, true, 1, timestamp, timestamp, false); // Rollback Bootstrap @@ -299,7 +304,7 @@ public void testMetadataBootstrapWithUpdatesMOR() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -341,7 +346,7 @@ public void testFullBoostrapOnlyCOW() throws Exception { .withFullBootstrapInputProvider(FullTestBootstrapInputProvider.class.getName()) .withBootstrapModeSelector(FullBootstrapModeSelector.class.getName()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, false, 1, timestamp, 
timestamp, false); // Rollback Bootstrap @@ -358,7 +363,7 @@ public void testFullBoostrapOnlyCOW() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -395,7 +400,7 @@ public void testFullBootstrapWithUpdatesMOR() throws Exception { .withFullBootstrapInputProvider(FullTestBootstrapInputProvider.class.getName()) .withBootstrapModeSelector(FullBootstrapModeSelector.class.getName()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, false, 1, timestamp, timestamp, false); // Rollback Bootstrap @@ -412,7 +417,7 @@ public void testFullBootstrapWithUpdatesMOR() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -454,7 +459,7 @@ public void testMetaAndFullBoostrapCOW() throws Exception { .withFullBootstrapInputProvider(FullTestBootstrapInputProvider.class.getName()) .withBootstrapModeSelector(TestRandomBootstapModeSelector.class.getName()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, false, 2, 2, timestamp, timestamp, false, Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)); @@ -472,7 +477,7 @@ public void testMetaAndFullBoostrapCOW() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = 
BootstrapIndex.getBootstrapIndex(metaClient); @@ -509,7 +514,7 @@ public void testMetadataAndFullBootstrapWithUpdatesMOR() throws Exception { .withFullBootstrapInputProvider(FullTestBootstrapInputProvider.class.getName()) .withBootstrapModeSelector(TestRandomBootstapModeSelector.class.getName()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); checkBootstrapResults(totalRecords, schema, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, false, 2, 2, timestamp, timestamp, false, Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)); @@ -527,7 +532,7 @@ public void testMetadataAndFullBootstrapWithUpdatesMOR() throws Exception { // Run bootstrap again client = new HoodieWriteClient(jsc, config); - client.bootstrap(); + client.bootstrap(Option.empty()); metaClient.reloadActiveTimeline(); index = BootstrapIndex.getBootstrapIndex(metaClient); @@ -593,6 +598,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta } // RO Input Format Read + reloadInputFormats(); List records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), @@ -609,6 +615,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta assertEquals(totalRecords, seenKeys.size()); //RT Input Format Read + reloadInputFormats(); seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, false).stream() @@ -752,10 +759,8 @@ public Map> select(List { final BootstrapMode mode; if (currIdx == 0) { - System.out.println("METADATA bootstrap selected"); mode = BootstrapMode.METADATA_ONLY_BOOTSTRAP; } else { - System.out.println("FULL bootstrap selected"); mode = BootstrapMode.FULL_BOOTSTRAP; } currIdx = (currIdx + 1) 
% 2; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckPointProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckPointProvider.java index 4cdc01ece6468..7fc8afb548034 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckPointProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckPointProvider.java @@ -18,12 +18,12 @@ package org.apache.hudi.utilities.checkpointing; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.exception.HoodieException; - +import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.exception.HoodieException; /** * Provide the initial checkpoint for delta streamer. @@ -51,7 +51,13 @@ public InitialCheckPointProvider(TypedProperties props) { * * @param config Hadoop configuration */ - public abstract void init(Configuration config) throws HoodieException; + public void init(Configuration config) throws HoodieException { + try { + this.fs = FileSystem.get(config); + } catch (IOException e) { + throw new HoodieException("CheckpointProvider initialization failed"); + } + } /** * Get checkpoint string recognizable for delta streamer. 
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckpointFromAnotherHoodieTimelineProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckpointFromAnotherHoodieTimelineProvider.java new file mode 100644 index 0000000000000..17058da7fddc6 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/InitialCheckpointFromAnotherHoodieTimelineProvider.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.checkpointing; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.exception.HoodieException; + +import java.io.IOException; +import java.util.Objects; + +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; + +/** + * This is used to set a checkpoint from latest commit of another (mirror) hudi dataset. + * Used by integration test. 
+ */ +public class InitialCheckpointFromAnotherHoodieTimelineProvider extends InitialCheckPointProvider { + + private HoodieTableMetaClient anotherDsHoodieMetaclient; + + public InitialCheckpointFromAnotherHoodieTimelineProvider(TypedProperties props) { + super(props); + } + + @Override + public void init(Configuration config) throws HoodieException { + super.init(config); + this.anotherDsHoodieMetaclient = new HoodieTableMetaClient(config, path.toString()); + } + + @Override + public String getCheckpoint() throws HoodieException { + return anotherDsHoodieMetaclient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants() + .map(instant -> { + try { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(anotherDsHoodieMetaclient.getActiveTimeline().getInstantDetails(instant).get(), + HoodieCommitMetadata.class); + return commitMetadata.getMetadata(CHECKPOINT_KEY); + } catch (IOException e) { + return null; + } + }).filter(Objects::nonNull).findFirst().get(); + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/KafkaConnectHdfsProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/KafkaConnectHdfsProvider.java index 8e8af55a3c563..654836c2a68e3 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/KafkaConnectHdfsProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/checkpointing/KafkaConnectHdfsProvider.java @@ -21,9 +21,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.exception.HoodieException; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; @@ -44,15 +42,6 @@ public KafkaConnectHdfsProvider(TypedProperties props) { super(props); } - @Override - public void init(Configuration config) throws HoodieException { - try { - this.fs = 
FileSystem.get(config); - } catch (IOException e) { - throw new HoodieException("KafkaConnectHdfsProvider initialization failed"); - } - } - /** * PathFilter for Kafka-Connect-HDFS. * Directory format: /partition1=xxx/partition2=xxx diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 8a8d6780f4246..c5f5e70b7c934 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -75,6 +75,8 @@ import scala.collection.JavaConversions; +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_RESET_KEY; import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME; @@ -85,8 +87,6 @@ public class DeltaSync implements Serializable { private static final long serialVersionUID = 1L; private static final Logger LOG = LogManager.getLogger(DeltaSync.class); - public static final String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; - public static final String CHECKPOINT_RESET_KEY = "deltastreamer.checkpoint.reset_key"; /** * Delta Sync Config. @@ -260,7 +260,8 @@ private Pair>> readFromSource resumeCheckpointStr = Option.of(cfg.checkpoint); } else if (commitMetadata.getMetadata(CHECKPOINT_KEY) != null) { resumeCheckpointStr = Option.of(commitMetadata.getMetadata(CHECKPOINT_KEY)); - } else { + } else if (HoodieTimeline.compareTimestamps(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, + HoodieTimeline.LESSER_THAN, lastCommit.get().getTimestamp())) { throw new HoodieDeltaStreamerException( "Unable to find previous checkpoint. 
Please double check if this table " + "was indeed built via delta streamer. Last Commit :" + lastCommit + ", Instants :" diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index 725edd5e1d8dc..6f78e29a611f5 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -18,7 +18,7 @@ package org.apache.hudi.utilities.deltastreamer; -import org.apache.hadoop.hive.conf.HiveConf; +import java.util.HashMap; import org.apache.hudi.DataSourceUtils; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.client.HoodieWriteClient; @@ -55,6 +55,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; @@ -89,13 +90,14 @@ public class HoodieDeltaStreamer implements Serializable { private static final long serialVersionUID = 1L; private static final Logger LOG = LogManager.getLogger(HoodieDeltaStreamer.class); - public static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; + public static final String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; + public static final String CHECKPOINT_RESET_KEY = "deltastreamer.checkpoint.reset_key"; private final transient Config cfg; private final TypedProperties properties; - private transient DeltaSyncService deltaSyncService; + private transient Option deltaSyncService; private final Option bootstrapExecutor; @@ -114,22 +116,27 @@ public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Con } public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, 
Configuration conf, - TypedProperties properties) throws IOException { + TypedProperties props) throws IOException { + // Resolving the properties first in a consistent way + this.properties = props != null ? props : UtilHelpers.readConfig( + FSUtils.getFs(cfg.propsFilePath, jssc.hadoopConfiguration()), + new Path(cfg.propsFilePath), cfg.configs).getConfig(); + if (cfg.initialCheckpointProvider != null && cfg.checkpoint == null) { InitialCheckPointProvider checkPointProvider = - UtilHelpers.createInitialCheckpointProvider(cfg.initialCheckpointProvider, properties); + UtilHelpers.createInitialCheckpointProvider(cfg.initialCheckpointProvider, this.properties); checkPointProvider.init(conf); cfg.checkpoint = checkPointProvider.getCheckpoint(); } this.cfg = cfg; - this.deltaSyncService = new DeltaSyncService(cfg, jssc, fs, conf, properties); - this.properties = properties; this.bootstrapExecutor = Option.ofNullable( - cfg.runBootstrap ? new BootstrapExecutor(cfg, jssc, fs, conf, properties) : null); + cfg.runBootstrap ? new BootstrapExecutor(cfg, jssc, fs, conf, this.properties) : null); + this.deltaSyncService = Option.ofNullable( + cfg.runBootstrap ? 
null : new DeltaSyncService(cfg, jssc, fs, conf, this.properties)); } public void shutdownGracefully() { - deltaSyncService.shutdown(false); + deltaSyncService.ifPresent(ds -> ds.shutdown(false)); } /** @@ -143,18 +150,30 @@ public void sync() throws Exception { bootstrapExecutor.get().execute(); } else { if (cfg.continuousMode) { - deltaSyncService.start(this::onDeltaSyncShutdown); - deltaSyncService.waitForShutdown(); + deltaSyncService.ifPresent(ds -> { + ds.start(this::onDeltaSyncShutdown); + try { + ds.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + }); LOG.info("Delta Sync shutting down"); } else { LOG.info("Delta Streamer running only single round"); try { - deltaSyncService.getDeltaSync().syncOnce(); + deltaSyncService.ifPresent(ds -> { + try { + ds.getDeltaSync().syncOnce(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + }); } catch (Exception ex) { LOG.error("Got error running delta sync once. Shutting down", ex); throw ex; } finally { - deltaSyncService.close(); + deltaSyncService.ifPresent(DeltaSyncService::close); LOG.info("Shut down delta streamer"); } } @@ -167,7 +186,7 @@ public Config getConfig() { private boolean onDeltaSyncShutdown(boolean error) { LOG.info("DeltaSync shutdown. Closing write client. Error?" + error); - deltaSyncService.close(); + deltaSyncService.ifPresent(DeltaSyncService::close); return true; } @@ -410,9 +429,7 @@ public DeltaSyncService(Config cfg, JavaSparkContext jssc, FileSystem fs, Config ValidationUtils.checkArgument(!cfg.filterDupes || cfg.operation != Operation.UPSERT, "'--filter-dupes' needs to be disabled when '--op' is 'UPSERT' to ensure updates are not missed."); - this.props = properties != null ? 
properties : UtilHelpers.readConfig( - FSUtils.getFs(cfg.propsFilePath, jssc.hadoopConfiguration()), - new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = properties; LOG.info("Creating delta streamer with configs : " + props.toString()); this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc); @@ -685,9 +702,7 @@ public BootstrapExecutor(Config cfg, JavaSparkContext jssc, FileSystem fs, Confi this.jssc = jssc; this.fs = fs; this.configuration = conf; - this.props = properties != null ? properties : UtilHelpers.readConfig( - FSUtils.getFs(cfg.propsFilePath, jssc.hadoopConfiguration()), - new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = properties; // Add more defaults if full bootstrap requested this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY(), DataSourceWriteOptions.DEFAULT_PAYLOAD_OPT_VAL()); @@ -713,8 +728,18 @@ public BootstrapExecutor(Config cfg, JavaSparkContext jssc, FileSystem fs, Confi public void execute() throws IOException { initializeTable(); HoodieWriteClient bootstrapClient = new HoodieWriteClient(jssc, bootstrapConfig, true); - bootstrapClient.bootstrap(); - syncHive(); + + try { + HashMap checkpointCommitMetadata = new HashMap<>(); + checkpointCommitMetadata.put(CHECKPOINT_KEY, cfg.checkpoint); + if (cfg.checkpoint != null) { + checkpointCommitMetadata.put(CHECKPOINT_RESET_KEY, cfg.checkpoint); + } + bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata)); + syncHive(); + } finally { + bootstrapClient.close(); + } } /** diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java index e8718558ccfe0..176661b0fe356 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java +++ 
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java @@ -18,6 +18,9 @@ package org.apache.hudi.utilities.keygen; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import org.apache.hudi.DataSourceUtils; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.avro.HoodieAvroUtils; @@ -62,6 +65,8 @@ enum TimestampType implements Serializable { // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html private final TimeZone timeZone; + protected final boolean encodePartitionPath; + /** * Supported configs. */ @@ -108,6 +113,9 @@ public TimestampBasedKeyGenerator(TypedProperties config) { default: timeUnit = null; } + + this.encodePartitionPath = config.getBoolean(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY(), + Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL())); } @Override diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index c759d0d0769a1..0da29359ea9a5 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -104,6 +104,12 @@ org.apache.hadoop.hbase. org.apache.hudi.org.apache.hadoop.hbase. + + + + + org.apache.hadoop.hbase.util.VersionInfo + org.apache.htrace.