diff --git a/CHANGELOG.md b/CHANGELOG.md index 527a82cd..ce2d3f5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## 8.1.16 - 2026-05-14 +### Added +- `apiary-gluesync-listener`: new `GLUE_SKIP_ARCHIVE` environment variable to set the default value of + `SkipArchive` on AWS Glue `UpdateTable` requests (triggered by Hive `ALTER TABLE` events). Accepts + `true` or `false` (case-insensitive). When unset, the existing built-in default (`true`) is used. + Any other value fails at startup. The per-table Hive property `apiary.gluesync.skipArchive` still + takes precedence. + ## 8.1.15 - 2025-11-28 ### Changed - When sending the events we hash based on db and table name. By default we are taking the newest table name. diff --git a/hive-event-listeners/apiary-gluesync-listener/README.md b/hive-event-listeners/apiary-gluesync-listener/README.md index 93adf370..eb100ac3 100644 --- a/hive-event-listeners/apiary-gluesync-listener/README.md +++ b/hive-event-listeners/apiary-gluesync-listener/README.md @@ -16,10 +16,17 @@ The GlueSync listener can be configured by setting the following System Environm |----|----|----| GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple metastores to the Glue catalog. ENABLE_HIVE_TO_GLUE_RENAME_OPERATION|No|Set to true in case you would like to enable Hive table renames when syncing into Glue. Default value is false. +GLUE_SKIP_ARCHIVE|No|Default value applied to `SkipArchive` on AWS Glue `UpdateTable` requests when a table does not set the `apiary.gluesync.skipArchive` property. Accepts `true` or `false` (case-insensitive). When unset, the built-in default (`true`) is used. Any other value causes the listener to fail on startup. 
## Table update SkipArchive -[AWS default](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html#Glue-UpdateTable-request-SkipArchive) is to archive the table on every update. With Iceberg tables this can lead to a lot of table versions. In Glue you can only have a certain limit of the number of versions and you'll get exceptions when trying to update a table once you hit that limit. Manual version removal through AWS api is then needed. To counter this we override this property and set skipArchive=true. So the listners does *not* make an archive of the table when updating. -If an archive is needed, this can be done per table by setting the Hive table property: 'apiary.gluesync.skipArchive=false'. +[AWS default](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html#Glue-UpdateTable-request-SkipArchive) is to archive the table on every update. With Iceberg tables this can lead to a lot of table versions. Glue limits the number of versions a table can have, and you'll get exceptions when trying to update a table once you hit that limit. Manual version removal through the AWS API is then needed. To counter this, the listener defaults to `skipArchive=true`, so it does *not* make an archive of the table when updating. + +The effective value is resolved using the following precedence (highest first): +1. The Hive table property `apiary.gluesync.skipArchive` (`true` or `false`), when set. +2. The environment variable `GLUE_SKIP_ARCHIVE` (`true` or `false`), when set. +3. The built-in default, `true`. + +This setting only affects `ALTER TABLE` events. AWS Glue's `UpdatePartition` and `BatchUpdatePartition` APIs do not expose a `SkipArchive` field, so partition updates are not impacted. 
# Legal diff --git a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java index 8183afa9..0031518d 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,10 +82,20 @@ public ApiaryGlueSync(Configuration config, boolean throwExceptions) { */ public ApiaryGlueSync(Configuration config, AWSGlue glueClient, String gluePrefix, MetricService metricService, boolean throwExceptions) { + this(config, glueClient, gluePrefix, metricService, throwExceptions, null); + } + + /** + * Just for testing. Allows injecting the default {@code SkipArchive} value + * that would otherwise be read from the + * {@code GLUE_SKIP_ARCHIVE} environment variable. 
+ */ + public ApiaryGlueSync(Configuration config, AWSGlue glueClient, String gluePrefix, MetricService metricService, + boolean throwExceptions, Boolean defaultSkipArchive) { super(config); this.glueClient = glueClient; this.glueDatabaseService = new GlueDatabaseService(glueClient, gluePrefix); - this.gluePartitionService = new GluePartitionService(glueClient, gluePrefix); + this.gluePartitionService = new GluePartitionService(glueClient, gluePrefix, defaultSkipArchive); this.glueTableService = new GlueTableService(glueClient, gluePartitionService, gluePrefix); this.isIcebergPredicate = new IsIcebergTablePredicate(); this.metricService = metricService; diff --git a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java index bf5909e8..ff5075e2 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -50,18 +50,53 @@ public class GluePartitionService { private final HiveToGlueTransformer transformer; private final GlueMetadataStringCleaner cleaner = new GlueMetadataStringCleaner(); private final HiveToGluePartitionComparator partitionComparator = new HiveToGluePartitionComparator(); + private final Boolean defaultSkipArchive; public static final String APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM = "apiary.gluesync.skipArchive"; + public static final String GLUE_SKIP_ARCHIVE_ENV = "GLUE_SKIP_ARCHIVE"; private static final int DEFAULT_MAX_RESULTS_SIZE = 1000; // Current max supported by Glue private static final int MAX_PARTITION_CREATE_BATCH_SIZE = 100; private static final int MAX_PARTITION_UPDATE_BATCH_SIZE = 100; private static final int MAX_PARTITION_DELETE_BATCH_SIZE = 25; public GluePartitionService(AWSGlue glueClient, String gluePrefix) { + this(glueClient, gluePrefix, parseSkipArchiveDefaultFromEnv()); + } + + public GluePartitionService(AWSGlue glueClient, String gluePrefix, Boolean defaultSkipArchive) { this.glueClient = glueClient; this.transformer = new HiveToGlueTransformer(gluePrefix); + this.defaultSkipArchive = defaultSkipArchive; log.debug("ApiaryGlueSync created"); } + /** + * Reads the {@value #GLUE_SKIP_ARCHIVE_ENV} environment variable + * and returns its boolean value. + *
+ * Only the literals {@code "true"} and {@code "false"} (case-insensitive) are
+ * accepted. When the variable is not set or is empty, {@code null} is returned
+ * so the built-in default applies. Any other value causes an
+ * {@link IllegalArgumentException} to be thrown at startup.
+ */
+ static Boolean parseSkipArchiveDefaultFromEnv() { // package-private entry point: reads the process environment only
+ return parseSkipArchiveDefault(System.getenv(GLUE_SKIP_ARCHIVE_ENV)); // getenv yields null when the variable is unset; validation is delegated to the parser
+ }
+
+ static Boolean parseSkipArchiveDefault(String value) {
+ if (value == null || value.isEmpty()) {
+ return null;
+ }
+ if ("true".equalsIgnoreCase(value)) {
+ return Boolean.TRUE;
+ }
+ if ("false".equalsIgnoreCase(value)) {
+ return Boolean.FALSE;
+ }
+ throw new IllegalArgumentException(
+ "Invalid value for environment variable " + GLUE_SKIP_ARCHIVE_ENV
+ + ": '" + value + "'. Expected 'true' or 'false'.");
+ }
+
public void create(Table table, org.apache.hadoop.hive.metastore.api.Partition partition) {
CreatePartitionRequest createPartitionRequest = new CreatePartitionRequest()
.withPartitionInput(transformer.transformPartition(partition))
@@ -155,16 +190,20 @@ public List