diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java index 1ab5bb5224229..4d75ca0e6b420 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java @@ -193,11 +193,15 @@ public static void clear() throws IOException, HiveException, MetaException { .setPayloadClass(HoodieAvroPayload.class) .initTable(HadoopFSUtils.getStorageConfWithCopy(configuration), basePath); - for (String tableName : createdTablesSet) { - ddlExecutor.runSQL("drop table if exists " + tableName); + if (ddlExecutor != null) { + for (String tableName : createdTablesSet) { + ddlExecutor.runSQL("drop table if exists " + tableName); + } } createdTablesSet.clear(); - ddlExecutor.runSQL("drop database if exists " + DB_NAME + " cascade"); + if (ddlExecutor != null) { + ddlExecutor.runSQL("drop database if exists " + DB_NAME + " cascade"); + } } public static HiveConf getHiveConf() { @@ -226,6 +230,7 @@ public static void shutdown() { try { if (hiveServer != null) { hiveServer.stop(); + hiveServer = null; } } catch (Exception e) { e.printStackTrace(); @@ -235,6 +240,7 @@ public static void shutdown() { try { if (hiveTestService != null) { hiveTestService.stop(); + hiveTestService = null; } } catch (Exception e) { e.printStackTrace(); @@ -244,6 +250,7 @@ public static void shutdown() { try { if (zkServer != null) { zkServer.shutdown(true); + zkServer = null; } } catch (Exception e) { e.printStackTrace(); @@ -253,6 +260,7 @@ public static void shutdown() { try { if (zkService != null) { zkService.stop(); + zkService = null; } } catch (RuntimeException re) { re.printStackTrace(); @@ -262,6 +270,7 @@ public static void shutdown() { try { if (fileSystem != null) { fileSystem.close(); + fileSystem = null; } } catch (IOException ie) { 
ie.printStackTrace(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HudiHiveSyncJob.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HudiHiveSyncJob.java new file mode 100644 index 0000000000000..59d92658f4f81 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HudiHiveSyncJob.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hive.HiveSyncTool; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; + +/** + * Utility job for running Hive sync on-demand for Hudi tables. + *
+ * This tool allows you to synchronize Hudi table metadata with the Hive metastore + * independently of ingestion workflows, which is useful for backfills, manual data + * corrections, or quick metadata reconciliation. + *
+ * Example usage: + *
+ * spark-submit \ + * --class org.apache.hudi.utilities.HudiHiveSyncJob \ + * hudi-utilities.jar \ + * --base-path /path/to/hudi/table \ + * --base-file-format PARQUET \ + * --props-file-path /path/to/hive-sync.properties \ + * --hoodie-conf hoodie.datasource.hive_sync.database=my_db \ + * --hoodie-conf hoodie.datasource.hive_sync.table=my_table + *+ */ +public class HudiHiveSyncJob { + + private static final Logger LOG = LoggerFactory.getLogger(HudiHiveSyncJob.class); + + private final Config cfg; + private final Configuration hadoopConf; + private final TypedProperties props; + + public HudiHiveSyncJob(JavaSparkContext jsc, Config cfg) { + this.cfg = cfg; + this.hadoopConf = jsc.hadoopConfiguration(); + this.props = UtilHelpers.buildProperties(hadoopConf, cfg.propsFilePath, cfg.configs); + } + + public static void main(String[] args) throws IOException { + final Config cfg = new Config(); + new JCommander(cfg, null, args); + LOG.info("Cfg received: {}", cfg); + JavaSparkContext jsc; + if (StringUtils.isNullOrEmpty(cfg.sparkMaster)) { + jsc = UtilHelpers.buildSparkContext("HudiHiveSyncJob", "local[2]", true); + } else { + jsc = UtilHelpers.buildSparkContext("HudiHiveSyncJob", cfg.sparkMaster, true); + } + try { + new HudiHiveSyncJob(jsc, cfg).run(); + } finally { + jsc.stop(); + } + } + + public void run() throws IOException { + LOG.info("Starting hive sync for {}", cfg.basePath); + HoodieTimer timer = HoodieTimer.start(); + HiveSyncTool syncTool = null; + try { + props.put(META_SYNC_BASE_PATH.key(), cfg.basePath); + props.put(META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat); + + LOG.info("HiveSyncConfig props used to sync data {}", props); + syncTool = new HiveSyncTool(props, new HiveConf(hadoopConf, HiveConf.class)); + syncTool.syncHoodieTable(); + } catch (Exception e) { + LOG.error("Exception in running hive-sync", e); + throw new HoodieException("Hive sync failed", e); + } finally { + if (syncTool != null) { + syncTool.close(); + } + LOG.info("Hive-sync 
duration in ms {}", timer.endTimer()); + } + } + + public static class Config implements Serializable { + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true) + public String basePath = null; + + @Parameter(names = {"--base-file-format", "-bff"}, description = "Base file format of the dataset") + public String baseFileFormat = "PARQUET"; + + @Parameter(names = {"--props-file-path"}, description = "Path to properties file on localfs or dfs.") + public String propsFilePath = null; + + @Parameter(names = {"--spark-master"}, + description = "spark master to use, if not defined inherits from your environment taking into " + + "account Spark Configuration priority rules (e.g. not using spark-submit command).") + public String sparkMaster = ""; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated", + splitter = IdentitySplitter.class) + public List