diff --git a/CHANGELOG.md b/CHANGELOG.md index 527a82cd..ce2d3f5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## 8.1.16 - 2026-05-14 +### Added +- `apiary-gluesync-listener`: new `GLUE_SKIP_ARCHIVE` environment variable to set the default value of + `SkipArchive` on AWS Glue `UpdateTable` requests (triggered by Hive `ALTER TABLE` events). Accepts + `true` or `false` (case-insensitive). When unset or empty, the existing built-in default (`true`) is used. + Any other value causes the listener to fail at startup. The per-table Hive property `apiary.gluesync.skipArchive` still + takes precedence. + ## 8.1.15 - 2025-11-28 ### Changed - When sending the events we hash based on db and table name. By default we are taking the newest table name. diff --git a/hive-event-listeners/apiary-gluesync-listener/README.md b/hive-event-listeners/apiary-gluesync-listener/README.md index 93adf370..eb100ac3 100644 --- a/hive-event-listeners/apiary-gluesync-listener/README.md +++ b/hive-event-listeners/apiary-gluesync-listener/README.md @@ -16,10 +16,17 @@ The GlueSync listener can be configured by setting the following System Environm |----|----|----| GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple metastores to the Glue catalog. ENABLE_HIVE_TO_GLUE_RENAME_OPERATION|No|Set to true in case you would like to enable Hive table renames when syncing into Glue. Default value is false. +GLUE_SKIP_ARCHIVE|No|Default value applied to `SkipArchive` on AWS Glue `UpdateTable` requests when a table does not set the `apiary.gluesync.skipArchive` property to `true` or `false`. Accepts `true` or `false` (case-insensitive). When unset or empty, the built-in default (`true`) is used. Any other value causes the listener to fail at startup. 
## Table update SkipArchive -[AWS default](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html#Glue-UpdateTable-request-SkipArchive) is to archive the table on every update. With Iceberg tables this can lead to a lot of table versions. In Glue you can only have a certain limit of the number of versions and you'll get exceptions when trying to update a table once you hit that limit. Manual version removal through AWS api is then needed. To counter this we override this property and set skipArchive=true. So the listners does *not* make an archive of the table when updating. -If an archive is needed, this can be done per table by setting the Hive table property: 'apiary.gluesync.skipArchive=false'. +The [AWS default](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html#Glue-UpdateTable-request-SkipArchive) is to archive the table on every update. With Iceberg tables this can lead to a large number of table versions. Glue limits the number of versions a table can have, and updating a table fails with an exception once that limit is reached; manual version removal through the AWS API is then needed. To counter this, the listener defaults to `skipArchive=true`, so it does *not* make an archive of the table when updating. + +The effective value is resolved using the following precedence (highest first): +1. The Hive table property `apiary.gluesync.skipArchive`, when set to `true` or `false` (case-insensitive); any other value is ignored. +2. The environment variable `GLUE_SKIP_ARCHIVE`, when set to `true` or `false` (case-insensitive); an empty value is treated as unset and any other value fails at startup. +3. The built-in default, `true`. + +This setting only affects `ALTER TABLE` events. AWS Glue's `UpdatePartition` and `BatchUpdatePartition` APIs do not expose a `SkipArchive` field, so partition updates are not impacted. 
# Legal diff --git a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java index 8183afa9..0031518d 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSync.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,10 +82,20 @@ public ApiaryGlueSync(Configuration config, boolean throwExceptions) { */ public ApiaryGlueSync(Configuration config, AWSGlue glueClient, String gluePrefix, MetricService metricService, boolean throwExceptions) { + this(config, glueClient, gluePrefix, metricService, throwExceptions, null); + } + + /** + * Just for testing. Allows injecting the default {@code SkipArchive} value + * instead of reading it from the + * {@code GLUE_SKIP_ARCHIVE} environment variable; {@code null} selects the built-in default ({@code true}). 
+ */ + public ApiaryGlueSync(Configuration config, AWSGlue glueClient, String gluePrefix, MetricService metricService, + boolean throwExceptions, Boolean defaultSkipArchive) { super(config); this.glueClient = glueClient; this.glueDatabaseService = new GlueDatabaseService(glueClient, gluePrefix); - this.gluePartitionService = new GluePartitionService(glueClient, gluePrefix); + this.gluePartitionService = new GluePartitionService(glueClient, gluePrefix, defaultSkipArchive); this.glueTableService = new GlueTableService(glueClient, gluePartitionService, gluePrefix); this.isIcebergPredicate = new IsIcebergTablePredicate(); this.metricService = metricService; diff --git a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java index bf5909e8..ff5075e2 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/main/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionService.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -50,18 +50,53 @@ public class GluePartitionService { private final HiveToGlueTransformer transformer; private final GlueMetadataStringCleaner cleaner = new GlueMetadataStringCleaner(); private final HiveToGluePartitionComparator partitionComparator = new HiveToGluePartitionComparator(); + private final Boolean defaultSkipArchive; public static final String APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM = "apiary.gluesync.skipArchive"; + public static final String GLUE_SKIP_ARCHIVE_ENV = "GLUE_SKIP_ARCHIVE"; private static final int DEFAULT_MAX_RESULTS_SIZE = 1000; // Current max supported by Glue private static final int MAX_PARTITION_CREATE_BATCH_SIZE = 100; private static final int MAX_PARTITION_UPDATE_BATCH_SIZE = 100; private static final int MAX_PARTITION_DELETE_BATCH_SIZE = 25; public GluePartitionService(AWSGlue glueClient, String gluePrefix) { + this(glueClient, gluePrefix, parseSkipArchiveDefaultFromEnv()); + } + + public GluePartitionService(AWSGlue glueClient, String gluePrefix, Boolean defaultSkipArchive) { this.glueClient = glueClient; this.transformer = new HiveToGlueTransformer(gluePrefix); + this.defaultSkipArchive = defaultSkipArchive; log.debug("ApiaryGlueSync created"); } + /** + * Reads the {@value #GLUE_SKIP_ARCHIVE_ENV} environment variable + * and returns its boolean value. + *

+ * Only the literals {@code "true"} and {@code "false"} (case-insensitive) are + * accepted. When the variable is not set or is empty, {@code null} is returned + * so the built-in default applies. Any other value causes an + * {@link IllegalArgumentException} to be thrown at startup. + */ + static Boolean parseSkipArchiveDefaultFromEnv() { + return parseSkipArchiveDefault(System.getenv(GLUE_SKIP_ARCHIVE_ENV)); + } + + static Boolean parseSkipArchiveDefault(String value) { + if (value == null || value.isEmpty()) { + return null; + } + if ("true".equalsIgnoreCase(value)) { + return Boolean.TRUE; + } + if ("false".equalsIgnoreCase(value)) { + return Boolean.FALSE; + } + throw new IllegalArgumentException( + "Invalid value for environment variable " + GLUE_SKIP_ARCHIVE_ENV + + ": '" + value + "'. Expected 'true' or 'false'."); + } + public void create(Table table, org.apache.hadoop.hive.metastore.api.Partition partition) { CreatePartitionRequest createPartitionRequest = new CreatePartitionRequest() .withPartitionInput(transformer.transformPartition(partition)) @@ -155,16 +190,20 @@ public List getPartitions(Table tab } public boolean shouldSkipArchive(Table table) { - boolean skipArchive = true; - if (table.getParameters() != null) { - // Only if explicitly overridden to false do enable table archive. Normally we - // want to skip archiving. + if (table != null && table.getParameters() != null) { + // Explicit per-table override always wins. 
String skipArchiveParam = table.getParameters().get(APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM); - if ("false".equals(skipArchiveParam)) { - skipArchive = false; + if ("false".equalsIgnoreCase(skipArchiveParam)) { + return false; + } + if ("true".equalsIgnoreCase(skipArchiveParam)) { + return true; } } - return skipArchive; + if (defaultSkipArchive != null) { + return defaultSkipArchive; + } + return true; } public PartitionInput convertToPartitionInput(com.amazonaws.services.glue.model.Partition partition) { diff --git a/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSyncTest.java b/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSyncTest.java index 6fcfa669..6d81757c 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSyncTest.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/ApiaryGlueSyncTest.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -367,6 +367,68 @@ public void onAlterHiveTableSkipArchiveOverride() throws MetaException { assertThat(updateTableRequest.getSkipArchive(), is(false)); } + @Test + public void onAlterHiveTableSkipArchiveDefaultEnvFalse() throws MetaException { + ApiaryGlueSync glueSyncWithDefault = new ApiaryGlueSync( + configuration, glueClient, gluePrefix, metricService, false, Boolean.FALSE); + + AlterTableEvent event = mock(AlterTableEvent.class); + when(event.getStatus()).thenReturn(true); + + Table newTable = simpleHiveTable(simpleSchema(), simplePartitioning()); + newTable.setTableName("table2"); + when(event.getOldTable()).thenReturn(newTable); + when(event.getNewTable()).thenReturn(newTable); + + glueSyncWithDefault.onAlterTable(event); + + verify(glueClient).updateTable(updateTableRequestCaptor.capture()); + UpdateTableRequest updateTableRequest = updateTableRequestCaptor.getValue(); + assertThat(updateTableRequest.getSkipArchive(), is(false)); + } + + @Test + public void onAlterHiveTableSkipArchiveDefaultEnvTrue() throws MetaException { + ApiaryGlueSync glueSyncWithDefault = new ApiaryGlueSync( + configuration, glueClient, gluePrefix, metricService, false, Boolean.TRUE); + + AlterTableEvent event = mock(AlterTableEvent.class); + when(event.getStatus()).thenReturn(true); + + Table newTable = simpleHiveTable(simpleSchema(), simplePartitioning()); + newTable.setTableName("table2"); + when(event.getOldTable()).thenReturn(newTable); + when(event.getNewTable()).thenReturn(newTable); + + glueSyncWithDefault.onAlterTable(event); + + verify(glueClient).updateTable(updateTableRequestCaptor.capture()); + UpdateTableRequest updateTableRequest = updateTableRequestCaptor.getValue(); + assertThat(updateTableRequest.getSkipArchive(), is(true)); + } + + @Test + public void onAlterHiveTableTableParamOverridesEnvDefault() throws MetaException { + // Env default says do not skip archiving, but the table explicitly asks to skip. 
+ ApiaryGlueSync glueSyncWithDefault = new ApiaryGlueSync( + configuration, glueClient, gluePrefix, metricService, false, Boolean.FALSE); + + AlterTableEvent event = mock(AlterTableEvent.class); + when(event.getStatus()).thenReturn(true); + + Table newTable = simpleHiveTable(simpleSchema(), simplePartitioning()); + newTable.setTableName("table2"); + newTable.putToParameters(APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM, "true"); + when(event.getOldTable()).thenReturn(newTable); + when(event.getNewTable()).thenReturn(newTable); + + glueSyncWithDefault.onAlterTable(event); + + verify(glueClient).updateTable(updateTableRequestCaptor.capture()); + UpdateTableRequest updateTableRequest = updateTableRequestCaptor.getValue(); + assertThat(updateTableRequest.getSkipArchive(), is(true)); + } + @Test public void onAlterHiveTable_RenameTable() throws MetaException { AlterTableEvent event = mock(AlterTableEvent.class); diff --git a/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionServiceTest.java b/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionServiceTest.java index cea9d4c6..dfd736c5 100644 --- a/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionServiceTest.java +++ b/hive-event-listeners/apiary-gluesync-listener/src/test/java/com/expediagroup/apiary/extensions/gluesync/listener/service/GluePartitionServiceTest.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2018-2025 Expedia, Inc. + * Copyright (C) 2018-2026 Expedia, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -353,4 +353,61 @@ private com.amazonaws.services.glue.model.Partition buildGluePartition(List()); + assertThat(svc.shouldSkipArchive(table), is(true)); + } + + @Test + public void shouldSkipArchive_usesEnvDefaultWhenTableParamAbsent() { + GluePartitionService svc = new GluePartitionService(mockGlueClient, "test-prefix-", Boolean.FALSE); + Table table = new Table(); + table.setParameters(new HashMap<>()); + assertThat(svc.shouldSkipArchive(table), is(false)); + } + + @Test + public void shouldSkipArchive_tableParamOverridesEnvDefault() { + GluePartitionService svc = new GluePartitionService(mockGlueClient, "test-prefix-", Boolean.FALSE); + Table table = new Table(); + HashMap params = new HashMap<>(); + params.put(GluePartitionService.APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM, "true"); + table.setParameters(params); + assertThat(svc.shouldSkipArchive(table), is(true)); + } + + @Test + public void shouldSkipArchive_tableParamFalseOverridesEnvDefaultTrue() { + GluePartitionService svc = new GluePartitionService(mockGlueClient, "test-prefix-", Boolean.TRUE); + Table table = new Table(); + HashMap params = new HashMap<>(); + params.put(GluePartitionService.APIARY_GLUESYNC_SKIP_ARCHIVE_TABLE_PARAM, "false"); + table.setParameters(params); + assertThat(svc.shouldSkipArchive(table), is(false)); + } }