Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docker/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Keep docs, helper scripts, and compose config out of the build context
*.md
*.sh
docker-compose.yml

# Re-include jars/ so locally built plugin JARs can be COPY'd (USE_LOCAL_JAR=true)
!jars/
107 changes: 107 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Spark History Server with DataFlint Plugin
#
# Build arguments:
# SPARK_VERSION - Spark version to use (default: 3.5.1)
# SCALA_VERSION - Scala binary version: 2.12 or 2.13 (default: 2.12)
# DATAFLINT_VERSION - DataFlint plugin version (default: 0.8.3)
# USE_LOCAL_JAR - Use locally built JAR instead of Maven (default: false)
#
# Runtime:
# Mount your event log directory to /spark-history
# Example: docker run -v /path/to/logs:/spark-history -p 18080:18080 dataflint-history-server
#
# Build examples:
# # From Maven (default):
# docker build -t dataflint-history-server .
# docker build -t dataflint-history-server --build-arg SPARK_VERSION=3.5.3 .
#
# # From local JAR (run ./build-jars.sh first):
# docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true .
# docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true --build-arg SPARK_VERSION=4.0.0 .

# Parameterized base image; this pre-FROM ARG is only visible to the FROM line.
ARG SPARK_VERSION=3.5.1

FROM apache/spark:${SPARK_VERSION}

# Re-declare ARGs after FROM to make them available
# (ARGs declared before FROM go out of scope inside the build stage).
ARG SPARK_VERSION=3.5.1
ARG SCALA_VERSION=2.12
ARG DATAFLINT_VERSION=0.8.3
ARG USE_LOCAL_JAR=false

# Image metadata; the version labels capture the build-time ARG values.
LABEL maintainer="DataFlint"
LABEL description="Spark History Server with DataFlint plugin and UI"
LABEL spark.version="${SPARK_VERSION}"
LABEL dataflint.version="${DATAFLINT_VERSION}"

# Root is required for apt-get and for writing under /opt/spark; a later
# USER directive drops back to the unprivileged spark user before CMD.
USER root

# Install curl: used to download the plugin JAR from Maven Central during
# the build, and by the HEALTHCHECK probe at the bottom of this file.
RUN apt-get update && \
apt-get install -y --no-install-recommends curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Copy local JARs directory (will be empty if not using local JARs).
# The staging copy under /tmp is removed at the end of the RUN below.
COPY jars/ /tmp/dataflint-jars/

# Install the DataFlint plugin as /opt/spark/jars/dataflint-plugin.jar,
# either from the copied local JAR or from Maven Central.
# Spark 4.x is published under a different artifact name and is Scala 2.13
# only; Spark 3.x honors the SCALA_VERSION build arg.
# NOTE(review): local JAR filenames are expected to carry a -SNAPSHOT
# suffix while the Maven URL uses the release (non-SNAPSHOT) name —
# confirm this matches the filenames produced by build-jars.sh.
RUN SPARK_MAJOR=$(echo ${SPARK_VERSION} | cut -d. -f1) && \
if [ "$SPARK_MAJOR" = "4" ]; then \
ARTIFACT_NAME="dataflint-spark4"; \
SCALA_VER="2.13"; \
else \
ARTIFACT_NAME="spark"; \
SCALA_VER="${SCALA_VERSION}"; \
fi && \
JAR_NAME="${ARTIFACT_NAME}_${SCALA_VER}-${DATAFLINT_VERSION}-SNAPSHOT.jar" && \
if [ "${USE_LOCAL_JAR}" = "true" ]; then \
echo "Using local JAR: ${JAR_NAME}" && \
if [ -f "/tmp/dataflint-jars/${JAR_NAME}" ]; then \
cp "/tmp/dataflint-jars/${JAR_NAME}" /opt/spark/jars/dataflint-plugin.jar; \
else \
echo "ERROR: Local JAR not found: /tmp/dataflint-jars/${JAR_NAME}" && \
echo "Available JARs:" && ls -la /tmp/dataflint-jars/ && \
exit 1; \
fi; \
else \
echo "Downloading from Maven: io.dataflint:${ARTIFACT_NAME}_${SCALA_VER}:${DATAFLINT_VERSION}" && \
curl -fSL -o /opt/spark/jars/dataflint-plugin.jar \
"https://repo1.maven.org/maven2/io/dataflint/${ARTIFACT_NAME}_${SCALA_VER}/${DATAFLINT_VERSION}/${ARTIFACT_NAME}_${SCALA_VER}-${DATAFLINT_VERSION}.jar"; \
fi && \
rm -rf /tmp/dataflint-jars

# Event-log directory read by the History Server; owned by the runtime user.
RUN mkdir -p /spark-history && chown -R spark:spark /spark-history

# Append the History Server settings to spark-defaults.conf
# (the redirection creates the file if it does not exist yet).
RUN mkdir -p /opt/spark/conf && \
    printf '%s\n' \
      'spark.history.fs.logDirectory=/spark-history' \
      'spark.history.ui.port=18080' \
      >> /opt/spark/conf/spark-defaults.conf

# Generate a small launcher script; `exec` makes the HistoryServer JVM
# replace the shell so it runs as PID 1 and receives SIGTERM directly.
RUN printf '%s\n' \
      '#!/bin/bash' \
      'exec /opt/spark/bin/spark-class org.apache.spark.deploy.history.HistoryServer "$@"' \
      > /opt/spark/bin/start-history-server.sh && \
    chmod +x /opt/spark/bin/start-history-server.sh

# Drop privileges for runtime; all root-only steps are done above.
USER spark

# Expose History Server port (documentation only; publish with -p 18080:18080)
EXPOSE 18080

# Volume for event logs; mount the host's Spark event-log directory here
VOLUME ["/spark-history"]

# Set working directory
WORKDIR /opt/spark

# Health check: probe the History Server UI (relies on curl installed above)
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:18080 || exit 1

# Start History Server (exec-form CMD; the wrapper script execs spark-class)
CMD ["/opt/spark/bin/start-history-server.sh"]
166 changes: 166 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Spark History Server with DataFlint

Docker image for Apache Spark History Server with the DataFlint plugin and UI pre-installed.

## Quick Start

### Using Docker (from Maven)

```bash
# Build the image (downloads JAR from Maven Central)
docker build -t dataflint-history-server .

# Run with your event logs directory
docker run -d \
-p 18080:18080 \
-v /path/to/spark-events:/spark-history:ro \
--name dataflint-history-server \
dataflint-history-server
```

Access the History Server at http://localhost:18080. Click on any application to see the DataFlint tab.

### Using Local JARs (for development)

```bash
# Step 1: Build the JARs locally
./build-jars.sh

# Step 2: Build Docker image with local JAR
docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true .

# Step 3: Run
docker run -d \
-p 18080:18080 \
-v /path/to/spark-events:/spark-history:ro \
dataflint-history-server
```

### Using Docker Compose

```bash
# Set your event logs directory
export SPARK_HISTORY_DIR=/path/to/spark-events

# Start the service
docker-compose up -d
```

## Build Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `SPARK_VERSION` | `3.5.1` | Apache Spark version |
| `SCALA_VERSION` | `2.12` | Scala binary version (2.12 or 2.13) |
| `DATAFLINT_VERSION` | `0.8.3` | DataFlint plugin version |
| `USE_LOCAL_JAR` | `false` | Use locally built JAR instead of Maven (note: `docker-compose.yml` defaults this to `true`) |

### Examples

```bash
# Spark 3.5.3 from Maven
docker build -t dataflint-hs:3.5.3 --build-arg SPARK_VERSION=3.5.3 .

# Spark 3.4.1 with Scala 2.13 from Maven
docker build -t dataflint-hs:3.4.1 \
--build-arg SPARK_VERSION=3.4.1 \
--build-arg SCALA_VERSION=2.13 .

# Spark 4.0.0 from Maven (automatically uses Scala 2.13)
docker build -t dataflint-hs:4.0.0 --build-arg SPARK_VERSION=4.0.0 .

# Local JAR for Spark 3.x
./build-jars.sh
docker build -t dataflint-hs:local --build-arg USE_LOCAL_JAR=true .

# Local JAR for Spark 4.x
./build-jars.sh
docker build -t dataflint-hs:4.0.0-local \
--build-arg USE_LOCAL_JAR=true \
--build-arg SPARK_VERSION=4.0.0 .
```

## Build Script

The `build-jars.sh` script automates the local build process:

1. Builds the React UI (`spark-ui`)
2. Builds plugin JARs for all Scala versions (`spark-plugin`)
3. Copies JARs to `docker/jars/` directory

**Prerequisites:**
- Node.js 20+
- Java 8+
- sbt

## Runtime Configuration

### Environment Variables (docker-compose)

| Variable | Default | Description |
|----------|---------|-------------|
| `SPARK_HISTORY_DIR` | `./spark-events` | Host path to Spark event logs |
| `HISTORY_SERVER_PORT` | `18080` | Host port for History Server |

### Volume Mount

The container expects event logs at `/spark-history`. Mount your Spark event logs directory:

```bash
docker run -v /your/spark/events:/spark-history:ro ...
```

## Spark Version Compatibility

| Spark Version | Scala Version | Notes |
|---------------|---------------|-------|
| 3.2.x - 3.5.x | 2.12, 2.13 | Default: 2.12 |
| 4.0.x | 2.13 | Automatically selected |

## Generating Event Logs

To enable event logging in your Spark applications:

```python
spark = SparkSession.builder \
.config("spark.eventLog.enabled", "true") \
.config("spark.eventLog.dir", "/path/to/spark-events") \
.getOrCreate()
```

Or via spark-submit:
```bash
spark-submit \
--conf spark.eventLog.enabled=true \
--conf spark.eventLog.dir=/path/to/spark-events \
your_app.py
```

## Cloud Storage

For S3, GCS, or other cloud storage, you may need to add additional JARs and configuration:

```bash
docker run -d \
-p 18080:18080 \
-e SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=s3a://bucket/spark-events" \
-v /path/to/aws-hadoop-jars:/opt/spark/jars/cloud:ro \
dataflint-history-server
```

## Troubleshooting

### No applications showing
- Verify event logs exist in the mounted directory
- Check logs: `docker logs dataflint-history-server`
- Ensure event log files are complete (not still being written)

### DataFlint tab not appearing
- The DataFlint tab appears when you click on a specific application
- Verify the plugin JAR was downloaded: `docker exec dataflint-history-server ls /opt/spark/jars/dataflint*`

### Local JAR build fails
- Ensure Node.js 20+ is installed: `node --version`
- Ensure Java 8+ is installed: `java -version`
- Ensure sbt is installed: `sbt --version`
- Verify the built JARs were copied into `docker/jars/` (the script resolves its own location, so it can be run from any directory)
63 changes: 63 additions & 0 deletions docker/build-jars.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash
# Build DataFlint JARs for the Docker image.
#
# Builds the React UI, assembles the Spark plugin JARs for the supported
# Scala/Spark combinations, and copies the results into docker/jars/ so the
# Dockerfile can consume them with --build-arg USE_LOCAL_JAR=true.
#
# Prerequisites: Node.js 20+, Java 8+, sbt.
#
# Hardening over the previous version: -u catches unset variables,
# -o pipefail catches failures hidden inside pipelines, and the script now
# exits non-zero when no JAR at all was produced (previously it printed
# "Build Complete" even with an empty output directory).
set -euo pipefail

# Resolve paths relative to this script so it can be run from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
OUTPUT_DIR="$SCRIPT_DIR/jars"

echo "=== Building DataFlint JARs ==="
echo "Project root: $PROJECT_ROOT"
echo "Output directory: $OUTPUT_DIR"

# Create output directory
mkdir -p "$OUTPUT_DIR"

# Step 1: Build the UI (npm ci only when dependencies are not installed yet)
echo ""
echo "=== Step 1: Building UI ==="
cd "$PROJECT_ROOT/spark-ui"

if [ ! -d "node_modules" ]; then
    echo "Installing npm dependencies..."
    npm ci
fi

echo "Building and deploying UI..."
npm run deploy

# Step 2: Build the plugin JARs
echo ""
echo "=== Step 2: Building Plugin JARs ==="
cd "$PROJECT_ROOT/spark-plugin"

# Set JVM memory for sbt (assembly needs extra heap and stack)
export SBT_OPTS="-Xmx4G -Xss2M -XX:+UseG1GC"

echo "Building Spark 3.x plugin JARs..."
sbt "+pluginspark3/assembly"

echo "Building Spark 4.x plugin JAR..."
sbt "pluginspark4/assembly"

# Step 3: Copy JARs to the output directory. Each individual artifact is
# optional, but at least one must exist for the local-JAR Docker build to
# work, so we track whether anything was copied.
echo ""
echo "=== Step 3: Copying JARs ==="

copied=0

# copy_jars <glob> <label>: copy matching JARs into OUTPUT_DIR, warn when
# the glob matches nothing. Real cp failures are no longer swallowed.
copy_jars() {
    local matched=0 jar
    for jar in $1; do
        if [ -f "$jar" ]; then
            cp "$jar" "$OUTPUT_DIR/"
            matched=1
            copied=1
        fi
    done
    if [ "$matched" -eq 0 ]; then
        echo "No $2 JAR found (optional)"
    fi
}

# Spark 3.x JARs
copy_jars "pluginspark3/target/scala-2.12/spark_2.12-*.jar" "Scala 2.12"
copy_jars "pluginspark3/target/scala-2.13/spark_2.13-*.jar" "Scala 2.13"

# Spark 4.x JAR
copy_jars "pluginspark4/target/scala-2.13/dataflint-spark4_2.13-*.jar" "Spark 4.x"

# Fail loudly when the build produced nothing at all.
if [ "$copied" -eq 0 ]; then
    echo "ERROR: no plugin JARs were produced; check the sbt output above." >&2
    exit 1
fi

echo ""
echo "=== Build Complete ==="
echo "JARs available in: $OUTPUT_DIR"
ls -la "$OUTPUT_DIR"

echo ""
echo "To build Docker image with local JARs:"
echo "  docker build -t dataflint-history-server --build-arg USE_LOCAL_JAR=true ."
25 changes: 25 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Spark History Server with the DataFlint plugin.
# Override defaults via environment variables, e.g.:
#   SPARK_HISTORY_DIR=/path/to/events docker-compose up -d
services:
  spark-history-server:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        SPARK_VERSION: ${SPARK_VERSION:-3.5.1}
        SCALA_VERSION: ${SCALA_VERSION:-2.12}
        DATAFLINT_VERSION: ${DATAFLINT_VERSION:-0.8.3}
        # NOTE(review): defaults to true here while the Dockerfile/README
        # default is false — confirm compose is intended for local-JAR
        # development builds.
        USE_LOCAL_JAR: ${USE_LOCAL_JAR:-true}
    image: dataflint-history-server:${SPARK_VERSION:-3.5.1}
    container_name: dataflint-history-server
    ports:
      - "${HISTORY_SERVER_PORT:-18080}:18080"
    volumes:
      # Read-only mount of the host's Spark event logs.
      - ${SPARK_HISTORY_DIR:-./spark-events}:/spark-history:ro
    environment:
      # Keep the History Server in the foreground (container PID 1).
      - SPARK_NO_DAEMONIZE=true
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:18080"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
3 changes: 3 additions & 0 deletions docker/jars/.gitkeep
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This directory holds locally built JARs for Docker builds
# Run ./build-jars.sh to populate this directory
# Then build with: docker build --build-arg USE_LOCAL_JAR=true .