diff --git a/SETUP.md b/SETUP.md index f45215d38..27d923c29 100644 --- a/SETUP.md +++ b/SETUP.md @@ -234,7 +234,8 @@ docker exec -it local.spark-master /bin/bash Start `spark-shell` with the following command: Available users are `openhouse` and `u_tableowner`. ``` -bin/spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2.0 \ +bin/spark-shell --master spark://spark-master:7077 \ + --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2.0 \ --jars openhouse-spark-runtime_2.12-*-all.jar \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,com.linkedin.openhouse.spark.extensions.OpenhouseSparkSessionExtensions \ --conf spark.sql.catalog.openhouse=org.apache.iceberg.spark.SparkCatalog \ @@ -245,6 +246,10 @@ bin/spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2 --conf spark.sql.catalog.openhouse.cluster=LocalHadoopCluster ``` +> **Note:** `--master spark://spark-master:7077` connects to the Spark standalone cluster +> instead of using the default `local[*]` mode. Without this, Spark actions that scan +> HDFS (e.g. orphan file deletion) may hang. 
+ If you are integrating with ADLS, use this `spark-shell` command instead: ``` diff --git a/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile b/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile index 2cfe9bf2a..c4dad4bf7 100644 --- a/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile +++ b/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile @@ -1,9 +1,11 @@ -FROM openjdk:11.0.11-jdk-slim-buster as builder +FROM eclipse-temurin:11-jdk-jammy as builder -RUN apt-get update && apt-get install -y \ - git curl vim zip software-properties-common ssh net-tools ca-certificates \ +# Install required packages without recommended extras, then clean apt caches to keep the image small +RUN apt-get update && apt-get install -y --no-install-recommends \ + git curl vim zip unzip software-properties-common ssh net-tools ca-certificates \ # Add Dependencies for PySpark \ - python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy + python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy && \ + apt-get clean && rm -rf /var/lib/apt/lists/* RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1 @@ -25,7 +27,7 @@ RUN curl --no-verbose -o apache-spark.tgz \ && rm apache-spark.tgz # install maven to build apache livy -RUN curl --no-verbose -o maven.tgz https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz \ +RUN curl --no-verbose -o maven.tgz https://archive.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz \ && mkdir -p /opt/maven \ && tar -xf maven.tgz -C /opt/maven --strip-components=1 \ && rm maven.tgz