From 1252d5dbd95649d30a039d6534e771802f473485 Mon Sep 17 00:00:00 2001 From: voon Date: Sat, 18 Apr 2026 01:28:21 +0800 Subject: [PATCH] chore: Add Java 17 Hadoop base image and Spark 4.0.1 docker compose setup - Introduce base_java17 Hadoop base image to support Spark 4.x (which requires Java 17) - build_docker_images.sh auto-selects base_java11 or base_java17 based on SPARK_VERSION - Add docker-compose_hadoop340_hive313_spark401 files for amd64 and arm64 - Parameterize hadoop-aws and aws-java-sdk-bundle versions in spark_base Dockerfile --- docker/build_docker_images.sh | 13 +- ...mpose_hadoop340_hive313_spark401_amd64.yml | 264 ++++++++++++++++++ ...mpose_hadoop340_hive313_spark401_arm64.yml | 264 ++++++++++++++++++ docker/hoodie/hadoop/base_java11/Dockerfile | 2 +- docker/hoodie/hadoop/base_java17/Dockerfile | 58 ++++ .../hoodie/hadoop/base_java17/entrypoint.sh | 107 +++++++ .../hadoop/base_java17/export_container_ip.sh | 30 ++ docker/hoodie/hadoop/base_java17/pom.xml | 92 ++++++ docker/hoodie/hadoop/spark_base/Dockerfile | 6 +- 9 files changed, 832 insertions(+), 4 deletions(-) create mode 100644 docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml create mode 100644 docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml create mode 100644 docker/hoodie/hadoop/base_java17/Dockerfile create mode 100644 docker/hoodie/hadoop/base_java17/entrypoint.sh create mode 100755 docker/hoodie/hadoop/base_java17/export_container_ip.sh create mode 100644 docker/hoodie/hadoop/base_java17/pom.xml diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh index 1e328dea389b7..e2478c5c3e0d3 100755 --- a/docker/build_docker_images.sh +++ b/docker/build_docker_images.sh @@ -83,10 +83,21 @@ fi # Docker image tags LATEST_TAG="latest" DOCKER_CONTEXT_DIR="hoodie/hadoop" + +# Select Java base image based on Spark version (Spark 4.0+ requires Java 17) +SPARK_MAJOR=$(echo "$SPARK_VERSION" | cut -d. 
-f1) +if [ "$SPARK_MAJOR" -ge 4 ] 2>/dev/null; then + BASE_IMAGE_DIR="base_java17" + echo "Using Java 17 base image for Spark ${SPARK_VERSION}" +else + BASE_IMAGE_DIR="base_java11" + echo "Using Java 11 base image for Spark ${SPARK_VERSION}" +fi + # List of images to build: "subdir|image_base_name" # Each entry: | DOCKER_IMAGES=( - "base_java11|apachehudi/hudi-hadoop_${HADOOP_VERSION}-base" + "${BASE_IMAGE_DIR}|apachehudi/hudi-hadoop_${HADOOP_VERSION}-base" "datanode|apachehudi/hudi-hadoop_${HADOOP_VERSION}-datanode" "historyserver|apachehudi/hudi-hadoop_${HADOOP_VERSION}-history" "hive_base|apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}" diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml new file mode 100644 index 0000000000000..483ce92e0c034 --- /dev/null +++ b/docker/compose/docker-compose_hadoop340_hive313_spark401_amd64.yml @@ -0,0 +1,264 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +services: + + namenode: + image: apachehudi/hudi-hadoop_3.4.0-namenode:latest + hostname: namenode + container_name: namenode + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + ports: + - "50070:50070" + - "8020:8020" + - "9870:9870" + env_file: + - ./hadoop.env + healthcheck: + test: ["CMD", "curl", "-f", "http://namenode:9870"] + interval: 30s + timeout: 10s + retries: 3 + + datanode1: + image: apachehudi/hudi-hadoop_3.4.0-datanode:latest + container_name: datanode1 + hostname: datanode1 + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + env_file: + - ./hadoop.env + ports: + - "50075:50075" + - "9864:9864" + - "50010:50010" + links: + - "namenode" + - "historyserver" + healthcheck: + test: ["CMD", "curl", "-f", "http://datanode1:9864"] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - namenode + + historyserver: + image: apachehudi/hudi-hadoop_3.4.0-history:latest + hostname: historyserver + container_name: historyserver + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + depends_on: + - "namenode" + links: + - "namenode" + ports: + - "8188:8188" + healthcheck: + test: ["CMD", "curl", "-f", "http://historyserver:8188"] + interval: 30s + timeout: 10s + retries: 3 + env_file: + - ./hadoop.env + volumes: + - historyserver:/hadoop/yarn/timeline + + hive-metastore-postgresql: + image: bde2020/hive-metastore-postgresql:3.1.0 + volumes: + - hive-metastore-postgresql:/var/lib/postgresql + hostname: hive-metastore-postgresql + container_name: hive-metastore-postgresql + + hivemetastore: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest + hostname: hivemetastore + container_name: hivemetastore + links: + - "hive-metastore-postgresql" + - "namenode" + env_file: + - ./hadoop.env + command: /opt/hive/bin/hive --service metastore + environment: + - "SERVICE_PRECONDITION=namenode:9870 hive-metastore-postgresql:5432" + ports: + - "9083:9083" + healthcheck: + test: ["CMD", "nc", "-z", "hivemetastore", "9083"] + 
interval: 30s + timeout: 10s + retries: 3 + depends_on: + - "hive-metastore-postgresql" + - "namenode" + + hiveserver: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest + hostname: hiveserver + container_name: hiveserver + env_file: + - ./hadoop.env + environment: + - SERVICE_PRECONDITION=hivemetastore:9083 + ports: + - "10000:10000" + - "10002:10002" + depends_on: + - "hivemetastore" + links: + - "hivemetastore" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + zookeeper: + image: 'bitnamilegacy/zookeeper:3.6.4' + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + - ALLOW_ANONYMOUS_LOGIN=yes + + kafka: + image: 'bitnamilegacy/kafka:3.4.1' + hostname: kafkabroker + container_name: kafkabroker + ports: + - "9092:9092" + environment: + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - ALLOW_PLAINTEXT_LISTENER=yes + + sparkmaster: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkmaster_4.0.1:latest + hostname: sparkmaster + container_name: sparkmaster + env_file: + - ./hadoop.env + ports: + - "8080:8080" + - "7077:7077" + - "8888:8888" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + - ./notebooks:/opt/workspace/notebooks + environment: + - INIT_DAEMON_STEP=setup_spark + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + spark-worker-1: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkworker_4.0.1:latest + hostname: spark-worker-1 + container_name: spark-worker-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - "8081:8081" + environment: + - SPARK_MASTER=spark://sparkmaster:7077 + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + adhoc-1: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest + hostname: adhoc-1 + container_name: adhoc-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - '4040:4040' + environment: + - 
SPARK_MASTER=spark://sparkmaster:7077
+    links:
+      - "hivemetastore"
+      - "hiveserver"
+      - "hive-metastore-postgresql"
+      - "namenode"
+    volumes:
+      - ${HUDI_WS}:/var/hoodie/ws
+
+  adhoc-2:
+    image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest
+    hostname: adhoc-2
+    container_name: adhoc-2
+    env_file:
+      - ./hadoop.env
+    depends_on:
+      - sparkmaster
+    environment:
+      - SPARK_MASTER=spark://sparkmaster:7077
+    links:
+      - "hivemetastore"
+      - "hiveserver"
+      - "hive-metastore-postgresql"
+      - "namenode"
+    volumes:
+      - ${HUDI_WS}:/var/hoodie/ws
+
+  minio:
+    image: 'minio/minio:latest'
+    hostname: minio
+    container_name: minio
+    ports:
+      - 9090:9090 # server address
+      - 9091:9091 # console address
+    volumes:
+      - minio-data:/data
+    environment:
+      - MINIO_ACCESS_KEY=minio
+      - MINIO_SECRET_KEY=minio123
+      - MINIO_DOMAIN=minio
+    command: server --address ":9090" --console-address ":9091" /data
+  # MinIO client bootstrap: waits for the server, recreates the warehouse bucket and opens it for anonymous access via `mc anonymous` (successor of `mc policy`).
+  mc:
+    image: minio/mc
+    container_name: mc
+    entrypoint: >
+      /bin/sh -c "
+      until (/usr/bin/mc alias set minio http://minio:9090 minio minio123 --api S3v4) do echo '...waiting...' && sleep 1; done;
+      /usr/bin/mc rm -r --force minio/warehouse;
+      /usr/bin/mc mb minio/warehouse;
+      /usr/bin/mc anonymous set public minio/warehouse;
+      tail -f /dev/null
+      "
+    depends_on:
+      - minio
+
+volumes:
+  namenode:
+  historyserver:
+  hive-metastore-postgresql:
+  minio-data:
+
+networks:
+  default:
+    name: hudi
diff --git a/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml b/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
new file mode 100644
index 0000000000000..483ce92e0c034
--- /dev/null
+++ b/docker/compose/docker-compose_hadoop340_hive313_spark401_arm64.yml
@@ -0,0 +1,264 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +services: + + namenode: + image: apachehudi/hudi-hadoop_3.4.0-namenode:latest + hostname: namenode + container_name: namenode + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + ports: + - "50070:50070" + - "8020:8020" + - "9870:9870" + env_file: + - ./hadoop.env + healthcheck: + test: ["CMD", "curl", "-f", "http://namenode:9870"] + interval: 30s + timeout: 10s + retries: 3 + + datanode1: + image: apachehudi/hudi-hadoop_3.4.0-datanode:latest + container_name: datanode1 + hostname: datanode1 + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + env_file: + - ./hadoop.env + ports: + - "50075:50075" + - "9864:9864" + - "50010:50010" + links: + - "namenode" + - "historyserver" + healthcheck: + test: ["CMD", "curl", "-f", "http://datanode1:9864"] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - namenode + + historyserver: + image: apachehudi/hudi-hadoop_3.4.0-history:latest + hostname: historyserver + container_name: historyserver + environment: + - CLUSTER_NAME=hudi_hadoop340_hive313_spark401 + depends_on: + - "namenode" + links: + - "namenode" + ports: + - "8188:8188" + healthcheck: + test: ["CMD", "curl", "-f", "http://historyserver:8188"] + interval: 30s + timeout: 10s + retries: 3 + env_file: + - ./hadoop.env + volumes: + - historyserver:/hadoop/yarn/timeline + + hive-metastore-postgresql: + image: bde2020/hive-metastore-postgresql:3.1.0 + volumes: + - 
hive-metastore-postgresql:/var/lib/postgresql + hostname: hive-metastore-postgresql + container_name: hive-metastore-postgresql + + hivemetastore: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest + hostname: hivemetastore + container_name: hivemetastore + links: + - "hive-metastore-postgresql" + - "namenode" + env_file: + - ./hadoop.env + command: /opt/hive/bin/hive --service metastore + environment: + - "SERVICE_PRECONDITION=namenode:9870 hive-metastore-postgresql:5432" + ports: + - "9083:9083" + healthcheck: + test: ["CMD", "nc", "-z", "hivemetastore", "9083"] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - "hive-metastore-postgresql" + - "namenode" + + hiveserver: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3:latest + hostname: hiveserver + container_name: hiveserver + env_file: + - ./hadoop.env + environment: + - SERVICE_PRECONDITION=hivemetastore:9083 + ports: + - "10000:10000" + - "10002:10002" + depends_on: + - "hivemetastore" + links: + - "hivemetastore" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + zookeeper: + image: 'bitnamilegacy/zookeeper:3.6.4' + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + - ALLOW_ANONYMOUS_LOGIN=yes + + kafka: + image: 'bitnamilegacy/kafka:3.4.1' + hostname: kafkabroker + container_name: kafkabroker + ports: + - "9092:9092" + environment: + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - ALLOW_PLAINTEXT_LISTENER=yes + + sparkmaster: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkmaster_4.0.1:latest + hostname: sparkmaster + container_name: sparkmaster + env_file: + - ./hadoop.env + ports: + - "8080:8080" + - "7077:7077" + - "8888:8888" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + - ./notebooks:/opt/workspace/notebooks + environment: + - INIT_DAEMON_STEP=setup_spark + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + spark-worker-1: + image: 
apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkworker_4.0.1:latest + hostname: spark-worker-1 + container_name: spark-worker-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - "8081:8081" + environment: + - SPARK_MASTER=spark://sparkmaster:7077 + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + adhoc-1: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest + hostname: adhoc-1 + container_name: adhoc-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - '4040:4040' + environment: + - SPARK_MASTER=spark://sparkmaster:7077 + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + adhoc-2: + image: apachehudi/hudi-hadoop_3.4.0-hive_3.1.3-sparkadhoc_4.0.1:latest + hostname: adhoc-2 + container_name: adhoc-2 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + environment: + - SPARK_MASTER=spark://sparkmaster:7077 + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + minio: + image: 'minio/minio:latest' + hostname: minio + container_name: minio + ports: + - 9090:9090 # server address + - 9091:9091 # console address + volumes: + - minio-data:/data + environment: + - MINIO_ACCESS_KEY=minio + - MINIO_SECRET_KEY=minio123 + - MINIO_DOMAIN=minio + command: server --address ":9090" --console-address ":9091" /data + + mc: + image: minio/mc + container_name: mc + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc alias set minio http://minio:9090 minio minio123 --api S3v4) do echo '...waiting...' 
&& sleep 1; done;
+      /usr/bin/mc rm -r --force minio/warehouse;
+      /usr/bin/mc mb minio/warehouse;
+      /usr/bin/mc anonymous set public minio/warehouse;
+      tail -f /dev/null
+      "
+    depends_on:
+      - minio
+# The mc bootstrap uses `mc anonymous set`, the successor of the `mc policy set` subcommand, to make the warehouse bucket public.
+volumes:
+  namenode:
+  historyserver:
+  hive-metastore-postgresql:
+  minio-data:
+
+networks:
+  default:
+    name: hudi
diff --git a/docker/hoodie/hadoop/base_java11/Dockerfile b/docker/hoodie/hadoop/base_java11/Dockerfile
index 42333067b5698..9c03716f28605 100644
--- a/docker/hoodie/hadoop/base_java11/Dockerfile
+++ b/docker/hoodie/hadoop/base_java11/Dockerfile
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM eclipse-temurin:11-jdk-jammy
+FROM openjdk:11-jdk-slim-bullseye
 LABEL maintainer="Hoodie"
 USER root
 
diff --git a/docker/hoodie/hadoop/base_java17/Dockerfile b/docker/hoodie/hadoop/base_java17/Dockerfile
new file mode 100644
index 0000000000000..45108610b19e2
--- /dev/null
+++ b/docker/hoodie/hadoop/base_java17/Dockerfile
@@ -0,0 +1,58 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM eclipse-temurin:17-jdk
+LABEL maintainer="Hoodie"
+USER root
+
+# Default to UTF-8 file.encoding
+ENV LANG=C.UTF-8
+
+ARG HADOOP_VERSION=3.4.0
+ARG HADOOP_URL=https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
+ENV HADOOP_VERSION=${HADOOP_VERSION}
+ENV HADOOP_URL=${HADOOP_URL}
+# Install tooling (net-tools supplies ifconfig for export_container_ip.sh), unpack Hadoop, then drop the tarball and apt lists from the layer.
+RUN set -x \
+    && export DEBIAN_FRONTEND=noninteractive && apt-get -yq update && apt-get -yq install curl wget netcat-openbsd procps net-tools \
+    && echo "Fetch URL2 is : ${HADOOP_URL}" \
+    && curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \
+    && curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \
+    && mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \
+    && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
+    && rm -rf /tmp/hadoop.tar.gz* /var/lib/apt/lists/* \
+    && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
+    && mkdir /hadoop-data
+
+ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
+ENV HADOOP_CONF_DIR=/etc/hadoop
+ENV MULTIHOMED_NETWORK=1
+ENV HADOOP_HOME=${HADOOP_PREFIX}
+ENV HADOOP_INSTALL=${HADOOP_HOME}
+ENV USER=root
+ENV PATH=/usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH
+
+# Exposing a union of ports across hadoop versions
+# Well known ports including ssh
+EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042
+
+COPY entrypoint.sh /entrypoint.sh
+COPY export_container_ip.sh /usr/bin/
+RUN chmod a+x /usr/bin/export_container_ip.sh \
+    && chmod a+x /entrypoint.sh
+
+ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
diff --git a/docker/hoodie/hadoop/base_java17/entrypoint.sh b/docker/hoodie/hadoop/base_java17/entrypoint.sh
new file mode 100644
index 0000000000000..7c26f29f66886
--- /dev/null
+++ b/docker/hoodie/hadoop/base_java17/entrypoint.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#######################################################################################
+## COPIED FROM ##
+## https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh ##
+# ##
+#######################################################################################
+
+# Set some sensible defaults
+export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
+# addProperty PATH NAME VALUE: insert a <property> element for NAME/VALUE just before </configuration> in PATH.
+function addProperty() {
+  local path=$1
+  local name=$2
+  local value=$3
+
+  local entry="<property><name>$name</name><value>${value}</value></property>"
+  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
+  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
+}
+# configure PATH MODULE ENV_PREFIX: turn each ENV_PREFIX_<key> variable into a property (___ -> -, __ -> _, _ -> .) in /etc/hadoop/MODULE-site.xml.
+function configure() {
+    local path=$1
+    local module=$2
+    local envPrefix=$3
+
+    local var
+    local value
+
+    echo "Configuring $module"
+    for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
+        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
+        var="${envPrefix}_${c}"
+        value=${!var}
+        echo " - Setting $name=$value"
+        addProperty /etc/hadoop/$module-site.xml $name "$value"
+    done
+}
+
+configure /etc/hadoop/core-site.xml core CORE_CONF
+configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
+configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
+configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
+configure /etc/hadoop/kms-site.xml kms KMS_CONF
+
+if [ "$MULTIHOMED_NETWORK" = "1" ]; then
+    echo "Configuring for multihomed network"
+
+    # HDFS
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
+    addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
+
+    # YARN
+    addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
+
+    # MAPRED
+    addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
+fi
+
+if [ -n "$GANGLIA_HOST" ]; then
+    mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
+    mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
+
+    for module in mapred jvm rpc ugi; do
+        echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
+        echo "$module.period=10"
+        echo "$module.servers=$GANGLIA_HOST:8649"
+    done > /etc/hadoop/hadoop-metrics.properties
+
+    for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
+        echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
+        echo "$module.sink.ganglia.period=10"
+        echo "$module.sink.ganglia.supportsparse=true"
+        echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
+        echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
+        echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
+    done > /etc/hadoop/hadoop-metrics2.properties
+fi
+
+# Save Container IP in ENV variable; sourced (not executed) so the export reaches this shell and the command exec'd below
+. /usr/bin/export_container_ip.sh
+
+exec "$@"
diff --git a/docker/hoodie/hadoop/base_java17/export_container_ip.sh b/docker/hoodie/hadoop/base_java17/export_container_ip.sh
new file mode 100755
index 0000000000000..b427f92ccf7c3
--- /dev/null
+++ b/docker/hoodie/hadoop/base_java17/export_container_ip.sh
@@ -0,0 +1,30 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +interfaces=( "en0" "eth0" ) + +ipAddr="" +for interface in "${interfaces[@]}" +do + ipAddr=`ifconfig $interface | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head` + if [ -n "$ipAddr" ]; then + break + fi +done + +echo "Container IP is set to : $ipAddr" +export MY_CONTAINER_IP=$ipAddr diff --git a/docker/hoodie/hadoop/base_java17/pom.xml b/docker/hoodie/hadoop/base_java17/pom.xml new file mode 100644 index 0000000000000..e147ad418162c --- /dev/null +++ b/docker/hoodie/hadoop/base_java17/pom.xml @@ -0,0 +1,92 @@ + + + + + hudi-hadoop-docker + org.apache.hudi + 1.2.0-SNAPSHOT + + 4.0.0 + pom + hudi-hadoop-base-java17-docker + + Base Docker Image with Java 17 for Spark 4.0+ + + + UTF-8 + true + ${project.parent.parent.basedir} + + + + + + org.apache.hudi + hudi-hadoop-docker + ${project.version} + pom + import + + + + + + hudi + + + + com.spotify + dockerfile-maven-plugin + ${dockerfile.maven.version} + + + tag-latest + pre-integration-test + + build + tag + + + ${docker.build.skip} + false + apachehudi/hudi-hadoop_${docker.hadoop.version}-base + true + latest + + + + tag-version + pre-integration-test + + build + tag + + + ${docker.build.skip} + false + apachehudi/hudi-hadoop_${docker.hadoop.version}-base + true + ${project.version} + + + + + + + diff --git a/docker/hoodie/hadoop/spark_base/Dockerfile b/docker/hoodie/hadoop/spark_base/Dockerfile index d92c38e68e363..68bfbaae76d83 100644 --- a/docker/hoodie/hadoop/spark_base/Dockerfile +++ b/docker/hoodie/hadoop/spark_base/Dockerfile @@ -79,7 +79,9 @@ ENV SPARK_BLOCKMGR_PORT 5003 EXPOSE $SPARK_DRIVER_PORT $SPARK_UI_PORT $SPARK_BLOCKMGR_PORT # Without this spark-shell fails - Download if it is not already there in $SPARK_INSTALL +ARG HADOOP_AWS_VERSION=3.3.4 +ARG AWS_SDK_VERSION=1.12.734 RUN wget -nc -q -O "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" "https://repo1.maven.org/maven2/com/sun/jersey/jersey-bundle/1.19.4/jersey-bundle-1.19.4.jar" && \ - wget 
-O "${SPARK_INSTALL}/jars/hadoop-aws-3.3.4.jar" https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \ - wget -O "${SPARK_INSTALL}/jars/aws-java-sdk-bundle-1.12.734.jar" https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.734/aws-java-sdk-bundle-1.12.734.jar + wget -O "${SPARK_INSTALL}/jars/hadoop-aws-${HADOOP_AWS_VERSION}.jar" https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar && \ + wget -O "${SPARK_INSTALL}/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar