diff --git a/docker/README.md b/docker/README.md
index 718d1943ef7e0..19d3d846f5ee2 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -27,6 +27,19 @@ docker demo environment.
 The `/hoodie` folder contains all the configs for assembling necessary docker images. The name and repository of each
 docker image, e.g., `apachehudi/hudi-hadoop_2.8.4-trinobase_368`, is defined in the maven configuration file `pom.xml`.
 
+#### Base image by Java version
+
+`build_docker_images.sh` auto-selects one of the two supported base images from `--spark-version`: Spark 3.x picks
+`base_java11`; Spark 4.0+ picks `base_java17`.
+
+| Base module   | JDK     | Default Hadoop | Used for   |
+|---------------|---------|----------------|------------|
+| `base_java11` | Java 11 | 2.8.4          | Spark 3.x  |
+| `base_java17` | Java 17 | 3.4.0          | Spark 4.0+ |
+
+The legacy Java 8 `base` module under `/hoodie/hadoop/base` is retained for historical reference only; Spark 2.x is no
+longer supported and `build_docker_images.sh` never selects it.
+
 ### Docker compose config for the Demo - `/compose`
 
 The `/compose` folder contains the yaml file to compose the Docker environment for running Hudi Demo.
diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh
index 5756f87d7a1a5..c67e0d79f9009 100755
--- a/docker/build_docker_images.sh
+++ b/docker/build_docker_images.sh
@@ -54,6 +54,7 @@ done
 if [ "$MULTI_ARCH" = true ]; then
   DOCKER_PLATFORM='linux/amd64,linux/arm64'
   echo "Building multi-arch images (amd64 + arm64)"
+  export BUILDX_EXPERIMENTAL=1
 else
   ARCHITECTURE=$(uname -m)
   case "$ARCHITECTURE" in
@@ -70,9 +71,8 @@ else
   esac
   export DOCKER_DEFAULT_PLATFORM="$DOCKER_PLATFORM"
 fi
-export BUILDX_EXPERIMENTAL=1
 # Get the directory of this script for relative paths
-SCRIPT_DIR=$(cd $(dirname "$0") && pwd)
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 
 # Determine VERSION_TAG (command line arg or Maven project version)
 if [ -n "$VERSION_TAG_ARG" ]; then
@@ -93,7 +93,11 @@ DOCKER_CONTEXT_DIR="hoodie/hadoop"
 
 # Select Java base image based on Spark version (Spark 4.0+ requires Java 17)
 SPARK_MAJOR=$(echo "$SPARK_VERSION" | cut -d. -f1)
-if [ "$SPARK_MAJOR" -ge 4 ] 2>/dev/null; then
+if ! [[ "$SPARK_MAJOR" =~ ^[0-9]+$ ]]; then
+  echo "Error: invalid SPARK_VERSION='$SPARK_VERSION'" >&2
+  exit 1
+fi
+if [ "$SPARK_MAJOR" -ge 4 ]; then
   BASE_IMAGE_DIR="base_java17"
   echo "Using Java 17 base image for Spark ${SPARK_VERSION}"
 else
@@ -101,8 +105,25 @@ else
   echo "Using Java 11 base image for Spark ${SPARK_VERSION}"
 fi
 
+# Select hadoop-aws/aws-sdk versions based on Hadoop major.minor.
+# hadoop-aws must track the Hadoop version (mismatches break the S3A FS classpath); unmapped versions, e.g. 2.x, fall back to the 3.3 pins.
+HADOOP_MAJOR_MINOR=$(echo "$HADOOP_VERSION" | cut -d. -f1,2)
+case "$HADOOP_MAJOR_MINOR" in
+  3.4)
+    HADOOP_AWS_VERSION="3.4.0"
+    AWS_SDK_VERSION="1.12.734"
+    ;;
+  3.3)
+    HADOOP_AWS_VERSION="3.3.4"
+    AWS_SDK_VERSION="1.12.734"
+    ;;
+  *)
+    HADOOP_AWS_VERSION="3.3.4"
+    AWS_SDK_VERSION="1.12.734"
+    ;;
+esac
+
 # List of images to build: "subdir|image_base_name"
-# Each entry: |
 DOCKER_IMAGES=(
   "${BASE_IMAGE_DIR}|apachehudi/hudi-hadoop_${HADOOP_VERSION}-base"
   "datanode|apachehudi/hudi-hadoop_${HADOOP_VERSION}-datanode"
@@ -114,6 +135,13 @@ DOCKER_IMAGES=(
   "sparkmaster|apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkmaster_${SPARK_VERSION}"
   "sparkworker|apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkworker_${SPARK_VERSION}"
 )
+# Select docker build command once (MULTI_ARCH doesn't change per image).
+if [ "$MULTI_ARCH" = true ]; then
+  DOCKER_BUILD_CMD=(docker buildx build --platform "$DOCKER_PLATFORM" --push)
+else
+  DOCKER_BUILD_CMD=(docker build)
+fi
+
 # Build each Docker image in the list
 for IMAGE_CONFIG in "${DOCKER_IMAGES[@]}"; do
   # Split config into subdir and image base name
@@ -123,24 +151,15 @@ for IMAGE_CONFIG in "${DOCKER_IMAGES[@]}"; do
   TAG_VERSIONED="$IMAGE_BASE:$VERSION_TAG"
   echo "Building $IMAGE_CONTEXT as $TAG_LATEST and $TAG_VERSIONED"
   # Build the Docker image with both latest and versioned tags
-  if [ "$MULTI_ARCH" = true ]; then
-    if ! docker buildx build --platform "$DOCKER_PLATFORM" --push \
-      --build-arg HADOOP_VERSION=${HADOOP_VERSION} \
-      --build-arg SPARK_VERSION=${SPARK_VERSION} \
-      --build-arg HIVE_VERSION=${HIVE_VERSION} \
-      "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
-      echo "Error: Failed to build docker image for $IMAGE_CONTEXT"
-      exit 1
-    fi
-  else
-    if ! docker build \
-      --build-arg HADOOP_VERSION=${HADOOP_VERSION} \
-      --build-arg SPARK_VERSION=${SPARK_VERSION} \
-      --build-arg HIVE_VERSION=${HIVE_VERSION} \
-      "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
-      echo "Error: Failed to build docker image for $IMAGE_CONTEXT"
-      exit 1
-    fi
+  if ! "${DOCKER_BUILD_CMD[@]}" \
+    --build-arg HADOOP_VERSION="${HADOOP_VERSION}" \
+    --build-arg SPARK_VERSION="${SPARK_VERSION}" \
+    --build-arg HIVE_VERSION="${HIVE_VERSION}" \
+    --build-arg HADOOP_AWS_VERSION="${HADOOP_AWS_VERSION}" \
+    --build-arg AWS_SDK_VERSION="${AWS_SDK_VERSION}" \
+    "$IMAGE_CONTEXT" -t "$TAG_LATEST" -t "$TAG_VERSIONED"; then
+    echo "Error: Failed to build docker image for $IMAGE_CONTEXT" >&2
+    exit 1
   fi
 done
 
diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml
index 483ce92e0c034..83bee992cecec 100644
--- a/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml
+++ b/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml
@@ -17,6 +17,7 @@ services:
 
   namenode:
     image: apachehudi/hudi-hadoop_3.4.0-namenode:latest
+    platform: linux/amd64
     hostname: namenode
     container_name: namenode
     environment:
@@ -35,6 +36,7 @@ services:
 
   datanode1:
     image: apachehudi/hudi-hadoop_3.4.0-datanode:latest
+    platform: linux/amd64
     container_name: datanode1
     hostname: datanode1
     environment:
@@ -58,6 +60,7 @@ services:
 
   historyserver:
     image: apachehudi/hudi-hadoop_3.4.0-history:latest
+    platform: linux/amd64
     hostname: historyserver
     container_name: historyserver
     environment:
@@ -68,6 +71,7 @@ services:
       - "namenode"
     ports:
       - "8188:8188"
+      - "19888:19888"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://historyserver:8188"]
       interval: 30s
@@ -80,6 +84,7 @@ services:
 
   hive-metastore-postgresql:
     image: bde2020/hive-metastore-postgresql:3.1.0
+    platform: linux/amd64
     volumes:
       - hive-metastore-postgresql:/var/lib/postgresql
     hostname: hive-metastore-postgresql
@@ -87,6 +92,7 @@ services:
 
   hivemetastore:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest
+    platform: linux/amd64
     hostname: hivemetastore
     container_name: hivemetastore
     links:
@@ -110,6 +116,7 @@ services:
 
   hiveserver:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest
+    platform: linux/amd64
     hostname: hiveserver
     container_name: hiveserver
     env_file:
@@ -130,6 +137,7 @@ services:
 
   zookeeper:
     image: 'bitnamilegacy/zookeeper:3.6.4'
+    platform: linux/amd64
     hostname: zookeeper
     container_name: zookeeper
     ports:
@@ -138,17 +146,30 @@ services:
       - ALLOW_ANONYMOUS_LOGIN=yes
 
   kafka:
-    image: 'bitnamilegacy/kafka:3.4.1'
+    image: 'apache/kafka:3.7.2'
+    platform: linux/amd64
     hostname: kafkabroker
     container_name: kafkabroker
     ports:
       - "9092:9092"
     environment:
-      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
-      - ALLOW_PLAINTEXT_LISTENER=yes
+      - KAFKA_NODE_ID=1
+      - KAFKA_PROCESS_ROLES=broker,controller
+      - KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:29092,CONTROLLER://0.0.0.0:9093,PLAINTEXT_HOST://0.0.0.0:9092
+      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafkabroker:29092,PLAINTEXT_HOST://localhost:9092
+      - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER
+      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
+      - KAFKA_INTER_BROKER_LISTENER_NAME=PLAINTEXT
+      - KAFKA_CONTROLLER_QUORUM_VOTERS=1@kafkabroker:9093
+      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
+      - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
+      - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
+      - KAFKA_NUM_PARTITIONS=3
 
   sparkmaster:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkmaster_4.0.1:latest
+    platform: linux/amd64
     hostname: sparkmaster
     container_name: sparkmaster
     env_file:
@@ -170,6 +191,7 @@ services:
 
   spark-worker-1:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkworker_4.0.1:latest
+    platform: linux/amd64
     hostname: spark-worker-1
     container_name: spark-worker-1
     env_file:
@@ -188,6 +210,7 @@ services:
 
   adhoc-1:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest
+    platform: linux/amd64
     hostname: adhoc-1
     container_name: adhoc-1
     env_file:
@@ -208,6 +231,7 @@ services:
 
   adhoc-2:
     image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest
+    platform: linux/amd64
     hostname: adhoc-2
     container_name: adhoc-2
     env_file:
@@ -226,6 +250,7 @@ services:
 
   minio:
     image: 'minio/minio:latest'
+    platform: linux/amd64
     hostname: minio
     container_name: minio
     ports:
@@ -241,6 +266,7 @@ services:
 
   mc:
     image: minio/mc
+    platform: linux/amd64
     container_name: mc
     entrypoint: >
       /bin/sh -c "
diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
index 483ce92e0c034..a352fe74c1414 100644
--- a/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
+++ b/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
@@ -68,6 +68,7 @@ services:
       - "namenode"
     ports:
       - "8188:8188"
+      - "19888:19888"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://historyserver:8188"]
       interval: 30s
diff --git a/docker/hoodie/hadoop/base_java17/Dockerfile b/docker/hoodie/hadoop/base_java17/Dockerfile
index 45108610b19e2..2b78aef2ac9d0 100644
--- a/docker/hoodie/hadoop/base_java17/Dockerfile
+++ b/docker/hoodie/hadoop/base_java17/Dockerfile
@@ -36,6 +36,7 @@ RUN set -x \
     && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
     && rm /tmp/hadoop.tar.gz* \
     && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
+    && if [ -f /etc/hadoop/mapred-site.xml.template ]; then cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml; fi \
     && mkdir /hadoop-data
 
 ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION