From 31cd56ac14bf3eb107c83701da4a12f37056b812 Mon Sep 17 00:00:00 2001 From: Dushyant Kumar Date: Wed, 18 Mar 2026 13:24:30 +0530 Subject: [PATCH 1/2] [Fix] Fix broken spark-3.5-base-hadoop3.2.dockerfile - Replace deprecated openjdk:11.0.11-jdk-slim-buster (Debian Buster EOL, apt repos return 404) with eclipse-temurin:11-jdk-jammy (Ubuntu Jammy LTS) - Add missing unzip package required for Livy assembly extraction - Add --no-install-recommends and apt cleanup to reduce image size - Fix Maven download URL from dlcdn.apache.org to archive.apache.org (dlcdn returns 404 for older Maven versions like 3.9.4) Co-Authored-By: Claude Opus 4.6 --- .../common/spark/spark-3.5-base-hadoop3.2.dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile b/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile index 2cfe9bf2a..c4dad4bf7 100644 --- a/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile +++ b/infra/recipes/docker-compose/common/spark/spark-3.5-base-hadoop3.2.dockerfile @@ -1,9 +1,11 @@ -FROM openjdk:11.0.11-jdk-slim-buster as builder +FROM eclipse-temurin:11-jdk-jammy as builder -RUN apt-get update && apt-get install -y \ - git curl vim zip software-properties-common ssh net-tools ca-certificates \ +# Refresh the apt index and install tooling; --no-install-recommends plus same-layer apt cleanup keep the image small +RUN apt-get update && apt-get install -y --no-install-recommends \ + git curl vim zip unzip software-properties-common ssh net-tools ca-certificates \ # Add Dependencies for PySpark \ - python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy + python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy && \ + apt-get clean && rm -rf /var/lib/apt/lists/* RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1 @@ -25,7 +27,7 @@ RUN curl --no-verbose -o 
apache-spark.tgz \ && rm apache-spark.tgz # install maven to build apache livy -RUN curl --no-verbose -o maven.tgz https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz \ +RUN curl --no-verbose -o maven.tgz https://archive.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz \ && mkdir -p /opt/maven \ && tar -xf maven.tgz -C /opt/maven --strip-components=1 \ && rm maven.tgz From a492387d1295c2fc51f77bbfcefc3b61e693f63d Mon Sep 17 00:00:00 2001 From: Dushyant Kumar Date: Wed, 18 Mar 2026 14:01:06 +0530 Subject: [PATCH 2/2] [Docs] Add --master flag to spark-shell command in SETUP.md Without --master spark://spark-master:7077, spark-shell defaults to local[*] which may cause Spark actions that scan HDFS to hang. Co-Authored-By: Claude Opus 4.6 --- SETUP.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/SETUP.md b/SETUP.md index f45215d38..27d923c29 100644 --- a/SETUP.md +++ b/SETUP.md @@ -234,7 +234,8 @@ docker exec -it local.spark-master /bin/bash Start `spark-shell` with the following command: Available users are `openhouse` and `u_tableowner`. ``` -bin/spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2.0 \ +bin/spark-shell --master spark://spark-master:7077 \ + --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2.0 \ --jars openhouse-spark-runtime_2.12-*-all.jar \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,com.linkedin.openhouse.spark.extensions.OpenhouseSparkSessionExtensions \ --conf spark.sql.catalog.openhouse=org.apache.iceberg.spark.SparkCatalog \ @@ -245,6 +246,10 @@ bin/spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2 --conf spark.sql.catalog.openhouse.cluster=LocalHadoopCluster ``` +> **Note:** `--master spark://spark-master:7077` connects to the Spark standalone cluster +> instead of using the default `local[*]` mode. 
Without this, Spark actions that scan +> HDFS (e.g. orphan file deletion) may hang. + If you are integrating with ADLS, use this `spark-shell` command instead: ```