diff --git a/docker/.dockerignore b/docker/.dockerignore
new file mode 100644
index 0000000..979d0c4
--- /dev/null
+++ b/docker/.dockerignore
@@ -0,0 +1,7 @@
+# Ignore most files
+*.md
+*.sh
+docker-compose.yml
+
+# Allow JARs directory for local builds
+!jars/
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..16c8369
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,110 @@
+# syntax=docker/dockerfile:1
+# Spark History Server with DataFlint Plugin
+#
+# Build arguments:
+#   SPARK_VERSION     - Spark version to use (default: 3.5.1)
+#   SCALA_VERSION     - Scala binary version: 2.12 or 2.13 (default: 2.12)
+#   DATAFLINT_VERSION - DataFlint plugin version (default: 0.8.3)
+#   USE_LOCAL_JAR     - Use locally built JAR instead of Maven (default: false)
+#
+# Runtime:
+#   Mount your event log directory to /spark-history
+#   Example: docker run -v /path/to/logs:/spark-history -p 18080:18080 dataflint-history-server
+#
+# Build examples:
+#   # From Maven (default):
+#   docker build -t dataflint-history-server .
+#   docker build -t dataflint-history-server --build-arg SPARK_VERSION=3.5.3 .
+#
+#   # From local JAR (run ./build-jars.sh first):
+#   docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true .
+#   docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true --build-arg SPARK_VERSION=4.0.0 .
+#
+# Requires BuildKit (needed for the RUN --mount step below).
+
+ARG SPARK_VERSION=3.5.1
+
+FROM apache/spark:${SPARK_VERSION}
+
+# Re-declare ARGs after FROM to make them available
+ARG SPARK_VERSION=3.5.1
+ARG SCALA_VERSION=2.12
+ARG DATAFLINT_VERSION=0.8.3
+ARG USE_LOCAL_JAR=false
+
+# Labels
+LABEL maintainer="DataFlint"
+LABEL description="Spark History Server with DataFlint plugin and UI"
+LABEL spark.version="${SPARK_VERSION}"
+LABEL dataflint.version="${DATAFLINT_VERSION}"
+
+USER root
+
+# Install curl: it downloads the plugin JAR from Maven and backs the HEALTHCHECK below
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install DataFlint plugin - either from local JAR or Maven.
+# jars/ from the build context is bind-mounted for this step only, so locally
+# built JARs are never stored in an image layer (a COPY followed by a later
+# rm would still keep them in the COPY layer).
+RUN --mount=type=bind,source=jars,target=/tmp/dataflint-jars \
+    SPARK_MAJOR="${SPARK_VERSION%%.*}" && \
+    if [ "$SPARK_MAJOR" = "4" ]; then \
+        ARTIFACT_NAME="dataflint-spark4"; \
+        SCALA_VER="2.13"; \
+    else \
+        ARTIFACT_NAME="spark"; \
+        SCALA_VER="${SCALA_VERSION}"; \
+    fi && \
+    JAR_NAME="${ARTIFACT_NAME}_${SCALA_VER}-${DATAFLINT_VERSION}-SNAPSHOT.jar" && \
+    if [ "${USE_LOCAL_JAR}" = "true" ]; then \
+        echo "Using local JAR: ${JAR_NAME}" && \
+        if [ -f "/tmp/dataflint-jars/${JAR_NAME}" ]; then \
+            cp "/tmp/dataflint-jars/${JAR_NAME}" /opt/spark/jars/dataflint-plugin.jar; \
+        else \
+            echo "ERROR: Local JAR not found: /tmp/dataflint-jars/${JAR_NAME}" && \
+            echo "Available JARs:" && ls -la /tmp/dataflint-jars/ && \
+            exit 1; \
+        fi; \
+    else \
+        echo "Downloading from Maven: io.dataflint:${ARTIFACT_NAME}_${SCALA_VER}:${DATAFLINT_VERSION}" && \
+        curl -fSL -o /opt/spark/jars/dataflint-plugin.jar \
+            "https://repo1.maven.org/maven2/io/dataflint/${ARTIFACT_NAME}_${SCALA_VER}/${DATAFLINT_VERSION}/${ARTIFACT_NAME}_${SCALA_VER}-${DATAFLINT_VERSION}.jar"; \
+    fi
+
+# Create directory for spark event logs
+RUN mkdir -p /spark-history && \
+    chown -R spark:spark /spark-history
+
+# Configure Spark defaults for History Server with DataFlint
+RUN mkdir -p /opt/spark/conf && \
+    touch /opt/spark/conf/spark-defaults.conf && \
+    echo "spark.history.fs.logDirectory=/spark-history" >> /opt/spark/conf/spark-defaults.conf && \
+    echo "spark.history.ui.port=18080" >> /opt/spark/conf/spark-defaults.conf
+
+# Create history server startup script
+RUN echo '#!/bin/bash' > /opt/spark/bin/start-history-server.sh && \
+    echo 'exec /opt/spark/bin/spark-class org.apache.spark.deploy.history.HistoryServer "$@"' >> /opt/spark/bin/start-history-server.sh && \
+    chmod +x /opt/spark/bin/start-history-server.sh
+
+# Switch back to spark user
+USER spark
+
+# Expose History Server port
+EXPOSE 18080
+
+# Volume for event logs
+VOLUME ["/spark-history"]
+
+# Set working directory
+WORKDIR /opt/spark
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:18080 || exit 1
+
+# Start History Server
+CMD ["/opt/spark/bin/start-history-server.sh"]
\ No newline at end of file
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..6982dbe
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,166 @@
+# Spark History Server with DataFlint
+
+Docker image for Apache Spark History Server with the DataFlint plugin and UI pre-installed.
+
+## Quick Start
+
+### Using Docker (from Maven)
+
+```bash
+# Build the image (downloads JAR from Maven Central)
+docker build -t dataflint-history-server .
+
+# Run with your event logs directory
+docker run -d \
+  -p 18080:18080 \
+  -v /path/to/spark-events:/spark-history:ro \
+  --name dataflint-history-server \
+  dataflint-history-server
+```
+
+Access the History Server at http://localhost:18080. Click on any application to see the DataFlint tab.
+
+### Using Local JARs (for development)
+
+```bash
+# Step 1: Build the JARs locally
+./build-jars.sh
+
+# Step 2: Build Docker image with local JAR
+docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true .
+
+# Step 3: Run
+docker run -d \
+  -p 18080:18080 \
+  -v /path/to/spark-events:/spark-history:ro \
+  dataflint-history-server
+```
+
+### Using Docker Compose
+
+```bash
+# Set your event logs directory
+export SPARK_HISTORY_DIR=/path/to/spark-events
+
+# Start the service
+docker-compose up -d
+```
+
+## Build Arguments
+
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `SPARK_VERSION` | `3.5.1` | Apache Spark version |
+| `SCALA_VERSION` | `2.12` | Scala binary version of the plugin JAR (2.12 or 2.13); ignored for Spark 4.x, which always uses 2.13 |
+| `DATAFLINT_VERSION` | `0.8.3` | DataFlint plugin version |
+| `USE_LOCAL_JAR` | `false` | Use locally built JAR instead of Maven |
+
+### Examples
+
+```bash
+# Spark 3.5.3 from Maven
+docker build -t dataflint-hs:3.5.3 --build-arg SPARK_VERSION=3.5.3 .
+
+# Spark 3.4.1 with Scala 2.13 from Maven
+docker build -t dataflint-hs:3.4.1 \
+  --build-arg SPARK_VERSION=3.4.1 \
+  --build-arg SCALA_VERSION=2.13 .
+
+# Spark 4.0.0 from Maven (automatically uses Scala 2.13)
+docker build -t dataflint-hs:4.0.0 --build-arg SPARK_VERSION=4.0.0 .
+
+# Local JAR for Spark 3.x
+./build-jars.sh
+docker build -t dataflint-hs:local --build-arg USE_LOCAL_JAR=true .
+
+# Local JAR for Spark 4.x
+./build-jars.sh
+docker build -t dataflint-hs:4.0.0-local \
+  --build-arg USE_LOCAL_JAR=true \
+  --build-arg SPARK_VERSION=4.0.0 .
+```
+
+## Build Script
+
+The `build-jars.sh` script automates the local build process:
+
+1. Builds the React UI (`spark-ui`)
+2. Builds plugin JARs for all Scala versions (`spark-plugin`)
+3. Copies JARs to `docker/jars/` directory
+
+**Prerequisites:**
+- Node.js 20+
+- Java 8+
+- sbt
+
+## Runtime Configuration
+
+### Environment Variables (docker-compose)
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SPARK_HISTORY_DIR` | `./spark-events` | Host path to Spark event logs |
+| `HISTORY_SERVER_PORT` | `18080` | Host port for History Server |
+
+### Volume Mount
+
+The container expects event logs at `/spark-history`. Mount your Spark event logs directory:
+
+```bash
+docker run -v /your/spark/events:/spark-history:ro ...
+```
+
+## Spark Version Compatibility
+
+| Spark Version | Scala Version | Notes |
+|---------------|---------------|-------|
+| 3.2.x - 3.5.x | 2.12, 2.13 | Default: 2.12 |
+| 4.0.x | 2.13 | Automatically selected |
+
+## Generating Event Logs
+
+To enable event logging in your Spark applications:
+
+```python
+spark = SparkSession.builder \
+    .config("spark.eventLog.enabled", "true") \
+    .config("spark.eventLog.dir", "/path/to/spark-events") \
+    .getOrCreate()
+```
+
+Or via spark-submit:
+```bash
+spark-submit \
+  --conf spark.eventLog.enabled=true \
+  --conf spark.eventLog.dir=/path/to/spark-events \
+  your_app.py
+```
+
+## Cloud Storage
+
+For S3, GCS, or other cloud storage, add the connector JARs and configuration. Spark only loads JARs located directly in `/opt/spark/jars`, so JARs mounted into a subdirectory must be put on the classpath with `SPARK_DAEMON_CLASSPATH`:
+
+```bash
+docker run -d -p 18080:18080 \
+  -e SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=s3a://bucket/spark-events" \
+  -e SPARK_DAEMON_CLASSPATH="/opt/spark/jars/cloud/*" \
+  -v /path/to/aws-hadoop-jars:/opt/spark/jars/cloud:ro \
+  dataflint-history-server
+```
+
+## Troubleshooting
+
+### No applications showing
+- Verify event logs exist in the mounted directory
+- Check logs: `docker logs dataflint-history-server`
+- Ensure event log files are complete (not still being written)
+
+### DataFlint tab not appearing
+- The DataFlint tab appears when you click on a specific application
+- Verify the plugin JAR was downloaded: `docker exec dataflint-history-server ls /opt/spark/jars/dataflint*`
+
+### Local JAR
build fails
+- Ensure Node.js 20+ is installed: `node --version`
+- Ensure Java 8+ is installed: `java -version`
+- Ensure sbt is installed: `sbt --version`
+- Check that you're running from the `docker/` directory
\ No newline at end of file
diff --git a/docker/build-jars.sh b/docker/build-jars.sh
new file mode 100755
index 0000000..c384955
--- /dev/null
+++ b/docker/build-jars.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+set -euo pipefail
+
+# Build DataFlint JARs for Docker
+# This script builds the UI and plugin JARs locally
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+OUTPUT_DIR="$SCRIPT_DIR/jars"
+
+echo "=== Building DataFlint JARs ==="
+echo "Project root: $PROJECT_ROOT"
+echo "Output directory: $OUTPUT_DIR"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+# Step 1: Build the UI
+echo ""
+echo "=== Step 1: Building UI ==="
+cd "$PROJECT_ROOT/spark-ui"
+
+if [ ! -d "node_modules" ]; then
+    echo "Installing npm dependencies..."
+    npm ci
+fi
+
+echo "Building and deploying UI..."
+npm run deploy
+
+# Step 2: Build the plugin JARs
+echo ""
+echo "=== Step 2: Building Plugin JARs ==="
+cd "$PROJECT_ROOT/spark-plugin"
+
+# Set JVM memory for sbt
+export SBT_OPTS="-Xmx4G -Xss2M -XX:+UseG1GC"
+
+echo "Building Spark 3.x plugin JARs..."
+sbt "+pluginspark3/assembly"
+
+echo "Building Spark 4.x plugin JAR..."
+sbt "pluginspark4/assembly"
+
+# Step 3: Copy JARs to output directory
+echo ""
+echo "=== Step 3: Copying JARs ==="
+
+# Number of JAR variants copied so far; the build is useless if this stays 0
+COPIED=0
+
+# copy_jars LABEL FILE...
+# Copies every existing FILE into OUTPUT_DIR. Each variant is optional, so a
+# missing JAR is only reported; a cp failure on an existing JAR aborts the
+# script (set -e) instead of being hidden.
+copy_jars() {
+    local label="$1"
+    shift
+    local jar found=0
+    for jar in "$@"; do
+        # An unmatched glob is passed through literally, so test for a real file
+        if [ -f "$jar" ]; then
+            cp "$jar" "$OUTPUT_DIR/"
+            found=1
+        fi
+    done
+    if [ "$found" -eq 1 ]; then
+        COPIED=$((COPIED + 1))
+    else
+        echo "No $label JAR found (optional)"
+    fi
+}
+
+# Spark 3.x JARs
+copy_jars "Scala 2.12" pluginspark3/target/scala-2.12/spark_2.12-*.jar
+copy_jars "Scala 2.13" pluginspark3/target/scala-2.13/spark_2.13-*.jar
+
+# Spark 4.x JAR
+copy_jars "Spark 4.x" pluginspark4/target/scala-2.13/dataflint-spark4_2.13-*.jar
+
+if [ "$COPIED" -eq 0 ]; then
+    echo "ERROR: no plugin JARs found to copy into $OUTPUT_DIR" >&2
+    exit 1
+fi
+
+echo ""
+echo "=== Build Complete ==="
+echo "JARs available in: $OUTPUT_DIR"
+ls -la "$OUTPUT_DIR"
+
+echo ""
+echo "To build Docker image with local JARs:"
+echo " docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true ."
\ No newline at end of file
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
new file mode 100644
index 0000000..4be5313
--- /dev/null
+++ b/docker/docker-compose.yml
@@ -0,0 +1,25 @@
+services:
+  spark-history-server:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        SPARK_VERSION: ${SPARK_VERSION:-3.5.1}
+        SCALA_VERSION: ${SCALA_VERSION:-2.12}
+        DATAFLINT_VERSION: ${DATAFLINT_VERSION:-0.8.3}
+        USE_LOCAL_JAR: ${USE_LOCAL_JAR:-false}  # same default as the Dockerfile; set USE_LOCAL_JAR=true after running ./build-jars.sh
+    image: dataflint-history-server:${SPARK_VERSION:-3.5.1}
+    container_name: dataflint-history-server
+    ports:
+      - "${HISTORY_SERVER_PORT:-18080}:18080"
+    volumes:
+      - ${SPARK_HISTORY_DIR:-./spark-events}:/spark-history:ro
+    environment:
+      - SPARK_NO_DAEMONIZE=true
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:18080"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
\ No newline at end of file
diff --git a/docker/jars/.gitkeep b/docker/jars/.gitkeep
new file mode 100644
index 0000000..0429082
--- /dev/null
+++ b/docker/jars/.gitkeep
@@ -0,0 +1,3 @@
+# This directory holds locally built JARs for Docker builds
+# Run ./build-jars.sh to populate this directory
+# Then build with: docker build --build-arg USE_LOCAL_JAR=true .
\ No newline at end of file