apache · voonhous · Apr 17, 2026 · Apr 17, 2026 · Apr 18, 2026 · yihua
diff --git a/docker/README.md b/docker/README.md
@@ -27,6 +27,19 @@ docker demo environment.
 The `/hoodie` folder contains all the configs for assembling necessary docker images. The name and repository of each
 docker image, e.g., `apachehudi/hudi-hadoop_2.8.4-trinobase_368`, is defined in the maven configuration file `pom.xml`.
 
+#### Base image by Java version
+
+`build_docker_images.sh` auto-selects one of the two supported base images from the major version of `--spark-version`:
+Spark 4.0+ picks `base_java17`; any lower major version (in practice Spark 3.x) picks `base_java11`.
+
+| Base module   | JDK     | Default Hadoop | Used for   |
+|---------------|---------|----------------|------------|
+| `base_java11` | Java 11 | 2.8.4          | Spark 3.x  |
+| `base_java17` | Java 17 | 3.4.0          | Spark 4.0+ |
+
+The legacy Java 8 `base` module under `/hoodie/hadoop/base` is retained for historical reference only; Spark 2.x is no
+longer supported and `build_docker_images.sh` never selects it.
+
 ### Docker compose config for the Demo - `/compose`
 
 The `/compose` folder contains the yaml file to compose the Docker environment for running Hudi Demo.

diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh
@@ -54,6 +54,7 @@ done
 if [ "$MULTI_ARCH" = true ]; then
   DOCKER_PLATFORM='linux/amd64,linux/arm64'
   echo "Building multi-arch images (amd64 + arm64)"
+  export BUILDX_EXPERIMENTAL=1
 else
   ARCHITECTURE=$(uname -m)
   case "$ARCHITECTURE" in
@@ -70,9 +71,8 @@ else
   esac
   export DOCKER_DEFAULT_PLATFORM="$DOCKER_PLATFORM"
 fi
-export BUILDX_EXPERIMENTAL=1
 # Get the directory of this script for relative paths
-SCRIPT_DIR=$(cd $(dirname "$0") && pwd)
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 
 # Determine VERSION_TAG (command line arg or Maven project version)
 if [ -n "$VERSION_TAG_ARG" ]; then
@@ -93,16 +93,37 @@ DOCKER_CONTEXT_DIR="hoodie/hadoop"
 
 # Select Java base image based on Spark version (Spark 4.0+ requires Java 17)
 SPARK_MAJOR=$(echo "$SPARK_VERSION" | cut -d. -f1)
-if [ "$SPARK_MAJOR" -ge 4 ] 2>/dev/null; then
+if ! [[ "$SPARK_MAJOR" =~ ^[0-9]+$ ]]; then
+  echo "Error: invalid SPARK_VERSION='$SPARK_VERSION'" >&2
+  exit 1
+fi
+if [ "$SPARK_MAJOR" -ge 4 ]; then
   BASE_IMAGE_DIR="base_java17"
   echo "Using Java 17 base image for Spark ${SPARK_VERSION}"
 else
   BASE_IMAGE_DIR="base_java11"
   echo "Using Java 11 base image for Spark ${SPARK_VERSION}"
 fi
 
+# Select hadoop-aws/aws-sdk versions based on Hadoop major.minor.
+# hadoop-aws must track the Hadoop version; mismatches break the S3A FS classpath.
+HADOOP_MAJOR_MINOR=$(echo "$HADOOP_VERSION" | cut -d. -f1,2)  # keeps the first two dot-separated fields, e.g. 3.4.0 -> 3.4
+case "$HADOOP_MAJOR_MINOR" in
+  3.4)
+    HADOOP_AWS_VERSION="3.4.0"
+    AWS_SDK_VERSION="1.12.734"  # NOTE(review): Hadoop 3.4 S3A targets AWS SDK v2; confirm a 1.12.x (v1) pin is intended
+    ;;
+  3.3)
+    HADOOP_AWS_VERSION="3.3.4"
+    AWS_SDK_VERSION="1.12.734"
+    ;;
+  *)  # any other value reuses the 3.3 pins; NOTE(review): confirm this suits non-3.x Hadoop given the tracking rule above
+    HADOOP_AWS_VERSION="3.3.4"
+    AWS_SDK_VERSION="1.12.734"
+    ;;
+esac
+
 # List of images to build: "subdir|image_base_name"
-# Each entry: <subdir>|<image_base_name>
 DOCKER_IMAGES=(
   "${BASE_IMAGE_DIR}|apachehudi/hudi-hadoop_${HADOOP_VERSION}-base"
   "datanode|apachehudi/hudi-hadoop_${HADOOP_VERSION}-datanode"
@@ -114,6 +135,13 @@ DOCKER_IMAGES=(
   "sparkmaster|apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkmaster_${SPARK_VERSION}"
   "sparkworker|apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkworker_${SPARK_VERSION}"
 )
+# Select docker build command once (MULTI_ARCH doesn't change per image).
+if [ "$MULTI_ARCH" = true ]; then
+  DOCKER_BUILD_CMD=(docker buildx build --platform "$DOCKER_PLATFORM" --push)
+else
+  DOCKER_BUILD_CMD=(docker build)
+fi
+
 # Build each Docker image in the list
 for IMAGE_CONFIG in "${DOCKER_IMAGES[@]}"; do
   # Split config into subdir and image base name
@@ -123,24 +151,15 @@ for IMAGE_CONFIG in "${DOCKER_IMAGES[@]}"; do
   TAG_VERSIONED="$IMAGE_BASE:$VERSION_TAG"
   echo "Building $IMAGE_CONTEXT as $TAG_LATEST and $TAG_VERSIONED"
   # Build the Docker image with both latest and versioned tags
-  if [ "$MULTI_ARCH" = true ]; then
-    if ! docker buildx build --platform "$DOCKER_PLATFORM" --push \
-      --build-arg HADOOP_VERSION=${HADOOP_VERSION} \
-      --build-arg SPARK_VERSION=${SPARK_VERSION} \
-      --build-arg HIVE_VERSION=${HIVE_VERSION} \
-      "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
-      echo "Error: Failed to build docker image for $IMAGE_CONTEXT"
-      exit 1
-    fi
-  else
-    if ! docker build \
-      --build-arg HADOOP_VERSION=${HADOOP_VERSION} \
-      --build-arg SPARK_VERSION=${SPARK_VERSION} \
-      --build-arg HIVE_VERSION=${HIVE_VERSION} \
-      "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
-      echo "Error: Failed to build docker image for $IMAGE_CONTEXT"
-      exit 1
-    fi
+  if ! "${DOCKER_BUILD_CMD[@]}" \
+    --build-arg HADOOP_VERSION="${HADOOP_VERSION}" \
+    --build-arg SPARK_VERSION="${SPARK_VERSION}" \
+    --build-arg HIVE_VERSION="${HIVE_VERSION}" \
+    --build-arg HADOOP_AWS_VERSION="${HADOOP_AWS_VERSION}" \
+    --build-arg AWS_SDK_VERSION="${AWS_SDK_VERSION}" \
+    "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
+    echo "Error: Failed to build docker image for $IMAGE_CONTEXT" >&2  # diagnostics go to stderr, like the SPARK_VERSION check
+    exit 1
   fi
 done
 
diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml
@@ -17,6 +17,7 @@ services:
 
   namenode:
     image: apachehudi/hudi-hadoop_3.4.0-namenode:latest
+    platform: linux/amd64
     hostname: namenode
     container_name: namenode
     environment:
@@ -35,6 +36,7 @@ services:
 
   datanode1:
     image: apachehudi/hudi-hadoop_3.4.0-datanode:latest
+    platform: linux/amd64
     container_name: datanode1
     hostname: datanode1
     environment:
@@ -58,6 +60,7 @@ services:
 
   historyserver:
     image: apachehudi/hudi-hadoop_3.4.0-history:latest
+    platform: linux/amd64
     hostname: historyserver
     container_name: historyserver
     environment:
@@ -68,6 +71,7 @@ services:
       - "namenode"
     ports:
       - "8188:8188"
+      - "19888:19888"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://historyserver:8188"]
       interval: 30s
@@ -80,13 +84,15 @@ services:
 
   hive-metastore-postgresql:
     image: bde2020/hive-metastore-postgresql:3.1.0
+    platform: linux/amd64
     volumes:
       - hive-metastore-postgresql:/var/lib/postgresql
     hostname: hive-metastore-postgresql
     container_name: hive-metastore-postgresql
 
   hivemetastore:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest
+    platform: linux/amd64
     hostname: hivemetastore
     container_name: hivemetastore
     links:
@@ -110,6 +116,7 @@ services:
 
   hiveserver:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest
+    platform: linux/amd64
     hostname: hiveserver
     container_name: hiveserver
     env_file:
@@ -130,6 +137,7 @@ services:
 
   zookeeper:
     image: 'bitnamilegacy/zookeeper:3.6.4'
+    platform: linux/amd64
     hostname: zookeeper
     container_name: zookeeper
     ports:
@@ -138,17 +146,30 @@ services:
       - ALLOW_ANONYMOUS_LOGIN=yes
 
   kafka:
-    image: 'bitnamilegacy/kafka:3.4.1'
+    image: 'apache/kafka:3.7.2'
+    platform: linux/amd64
     hostname: kafkabroker
     container_name: kafkabroker
     ports:
       - "9092:9092"
     environment:
-      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
-      - ALLOW_PLAINTEXT_LISTENER=yes
+      - KAFKA_NODE_ID=1
+      - KAFKA_PROCESS_ROLES=broker,controller
+      - KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:29092,CONTROLLER://0.0.0.0:9093,PLAINTEXT_HOST://0.0.0.0:9092
+      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafkabroker:29092,PLAINTEXT_HOST://localhost:9092
+      - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER
+      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
+      - KAFKA_INTER_BROKER_LISTENER_NAME=PLAINTEXT
+      - KAFKA_CONTROLLER_QUORUM_VOTERS=1@kafkabroker:9093
+      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
+      - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
+      - KAFKA_NUM_PARTITIONS=3
 
   sparkmaster:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkmaster_4.0.1:latest
+    platform: linux/amd64
     hostname: sparkmaster
     container_name: sparkmaster
     env_file:
@@ -170,6 +191,7 @@ services:
 
   spark-worker-1:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkworker_4.0.1:latest
+    platform: linux/amd64
     hostname: spark-worker-1
     container_name: spark-worker-1
     env_file:
@@ -188,6 +210,7 @@ services:
 
   adhoc-1:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest
+    platform: linux/amd64
     hostname: adhoc-1
     container_name: adhoc-1
     env_file:
@@ -208,6 +231,7 @@ services:
 
   adhoc-2:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest
+    platform: linux/amd64
     hostname: adhoc-2
     container_name: adhoc-2
     env_file:
@@ -226,6 +250,7 @@ services:
 
   minio:
     image: 'minio/minio:latest'
+    platform: linux/amd64
     hostname: minio
     container_name: minio
     ports:
@@ -241,6 +266,7 @@ services:
 
   mc:
     image: minio/mc
+    platform: linux/amd64
     container_name: mc
     entrypoint: >
       /bin/sh -c "

diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
@@ -68,6 +68,7 @@ services:
       - "namenode"
     ports:
       - "8188:8188"
+      - "19888:19888"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://historyserver:8188"]
       interval: 30s

diff --git a/docker/hoodie/hadoop/base_java17/Dockerfile b/docker/hoodie/hadoop/base_java17/Dockerfile
@@ -36,6 +36,7 @@ RUN set -x \
     && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
     && rm /tmp/hadoop.tar.gz* \
     && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
+    && if [ -f /etc/hadoop/mapred-site.xml.template ]; then cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml; fi \
     && mkdir /hadoop-data
 
 ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION