diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index e96fc415..24b60d9d 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -95,35 +95,6 @@ jobs: SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} - - name: Verify artifact published - run: | - VERSION=$(grep 'lazy val versionNum' build.sbt | grep -oE '"[0-9.]+"' | tr -d '"') - if [[ "$IS_RELEASE" == "true" ]]; then - URL="https://repo1.maven.org/maven2/io/dataflint/spark_2.12/${VERSION}/spark_2.12-${VERSION}.pom" - AUTH_ARGS="" - EXPECTED_VERSION="${VERSION}" - else - URL="https://central.sonatype.com/repository/maven-snapshots/io/dataflint/spark_2.12/maven-metadata.xml" - AUTH_ARGS="-u ${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" - EXPECTED_VERSION="${VERSION}-SNAPSHOT" - fi - echo "Verifying $EXPECTED_VERSION at: $URL" - sleep 15 - for i in $(seq 1 10); do - if curl -sf $AUTH_ARGS "$URL" | grep -q "${EXPECTED_VERSION}"; then - echo "Artifact published successfully" - exit 0 - fi - echo "Attempt $i/10 — waiting 30s..." 
- sleep 30 - done - echo "Artifact not found after 5 minutes" - exit 1 - working-directory: ./spark-plugin - env: - SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} - SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} - - name: Create Release if: github.event.inputs.release_type == 'official' || startsWith(github.ref, 'refs/tags/v') run: | diff --git a/README.md b/README.md index e30a3f53..0777a52d 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ spark-submit * There is also support for scala 2.13, if your spark cluster is using scala 2.13 change package name to io.dataflint:spark_**2.13**:0.9.6 * For more installation options, including for **python** and **k8s spark-operator**, see [Install on Spark docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark) * For installing DataFlint OSS in **spark history server** for observability on completed runs see [install on spark history server docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark-history-server) -* For installing DataFlint OSS on **DataBricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks) +* For installing DataFlint OSS on **Databricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks). Databricks Runtime 17.3+ ships `javax.servlet` instead of `jakarta.servlet`, so use the dedicated shaded artifact `io.dataflint:dataflint-spark4-databricks_2.13` (same plugin class — only the jar coordinate differs).
## How it Works diff --git a/spark-plugin/build.sbt b/spark-plugin/build.sbt index a99e5e1f..27a2f66b 100644 --- a/spark-plugin/build.sbt +++ b/spark-plugin/build.sbt @@ -12,6 +12,7 @@ lazy val dataflint = project plugin, pluginspark3, pluginspark4, + pluginspark4databricks, example_3_1_3, example_3_2_4, example_3_3_3, @@ -162,11 +163,66 @@ lazy val pluginspark4 = (project in file("pluginspark4")) // Include source from plugin directory for self-contained build Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala", - + // Include resources from plugin directory for static UI files Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value ) +lazy val pluginspark4databricks = (project in file("pluginspark4databricks")) + .enablePlugins(AssemblyPlugin) + .settings( + name := "dataflint-spark4-databricks", + organization := "io.dataflint", + scalaVersion := scala213, + crossScalaVersions := List(scala213), // Only Scala 2.13 for Spark 4.x + version := (if (git.gitCurrentTags.value.exists(_.startsWith("v"))) { + versionNum + } else { + versionNum + "-SNAPSHOT" + }), + libraryDependencies += "org.apache.spark" %% "spark-core" % "4.0.1" % "provided", + libraryDependencies += "org.apache.spark" %% "spark-sql" % "4.0.1" % "provided", + libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.12.470" % "provided", + libraryDependencies += "org.apache.iceberg" %% "iceberg-spark-runtime-3.5" % "1.5.0" % "provided", + libraryDependencies += "io.delta" %% "delta-spark" % "3.2.0" % "provided", + + // Source-share with pluginspark4 + plugin so we don't duplicate code. 
+ Compile / unmanagedSourceDirectories += (pluginspark4 / Compile / sourceDirectory).value / "scala", + Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala", + Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value, + + // Drop the upstream DataflintSparkUILoader so our local copy (which uses + // Spark4DatabricksPageFactory) is the one compiled. Hidden files stay excluded, as in sbt's default filter. + Compile / unmanagedSources / excludeFilter := { + val upstreamLoader = (pluginspark4 / Compile / sourceDirectory).value / + "scala" / "org" / "apache" / "spark" / "dataflint" / "DataflintSparkUILoader.scala" + sbt.io.HiddenFileFilter || new sbt.io.SimpleFileFilter(_.getCanonicalPath == upstreamLoader.getCanonicalPath) + }, + + assembly / assemblyJarName := s"${name.value}_${scalaBinaryVersion.value}-${version.value}.jar", + assembly / assemblyOption := (assembly / assemblyOption).value.withIncludeScala(false), + // Rewrite jakarta.servlet → javax.servlet in our bytecode so the artifact + // loads on Databricks Runtime 17.3, which ships javax instead of jakarta.
+ assembly / assemblyShadeRules := Seq( + ShadeRule.rename("jakarta.servlet.**" -> "javax.servlet.@1").inAll + ), + assembly / assemblyMergeStrategy := { + case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat + case PathList("META-INF", xs @ _*) => MergeStrategy.discard + case "application.conf" => MergeStrategy.concat + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + }, + + Compile / packageBin := assembly.value, + publishTo := { + if (isSnapshot.value) + Some("snapshots" at "https://central.sonatype.com/repository/maven-snapshots/") + else + sonatypePublishToBundle.value + } + ) + lazy val example_3_1_3 = (project in file("example_3_1_3")) .settings( name := "DataflintSparkExample313", diff --git a/spark-plugin/clean-and-setup.sh b/spark-plugin/clean-and-setup.sh index fcd97e13..d9b3bf51 100755 --- a/spark-plugin/clean-and-setup.sh +++ b/spark-plugin/clean-and-setup.sh @@ -31,6 +31,10 @@ sbt pluginspark3/assembly echo "Building Spark 4 fat JAR..." sbt ++2.13.16 pluginspark4/assembly +# Build Spark 4 Databricks fat JAR (jakarta.servlet → javax.servlet shaded) +echo "Building Spark 4 Databricks fat JAR..." +sbt ++2.13.16 pluginspark4databricks/assembly + echo "✅ Setup complete!" 
echo "" echo "📋 Next steps:" @@ -40,4 +44,5 @@ echo "" echo "📦 Fat JARs created:" echo "- Spark 3.x: pluginspark3/target/scala-2.12/dataflint-spark3_2.12-0.9.7-SNAPSHOT.jar" echo "- Spark 4.x: pluginspark4/target/scala-2.13/dataflint-spark4_2.13-0.9.7-SNAPSHOT.jar" +echo "- Spark 4 (Databricks): pluginspark4databricks/target/scala-2.13/dataflint-spark4-databricks_2.13-0.9.7-SNAPSHOT.jar" diff --git a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala index cb940215..a36d47f6 100644 --- a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala +++ b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala @@ -39,7 +39,11 @@ object MetricsUtils { def getSizeMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = { name -> { try { - SQLMetrics.createSizeMetric(sparkContext, name) + // Pass initValue explicitly to avoid emitting a `createSizeMetric$default$3()` + // bytecode call. Databricks Runtime 17.x rewrites SQLMetrics with explicit + // overloads instead of Scala default args, so the $default$3 helper is missing + // and the call would NoSuchMethodError before ever reaching createSizeMetric. + SQLMetrics.createSizeMetric(sparkContext, name, -1L) } catch { case _: NoSuchMethodError => try { @@ -84,7 +88,9 @@ object MetricsUtils { def getTimingMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = { name -> { try { - SQLMetrics.createTimingMetric(sparkContext, name) + // See note on getSizeMetric: pass initValue explicitly so the bytecode skips + // the missing `createTimingMetric$default$3()` helper on Databricks runtimes. 
+ SQLMetrics.createTimingMetric(sparkContext, name, -1L) } catch { case _: NoSuchMethodError => try { diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala new file mode 100644 index 00000000..497ca53c --- /dev/null +++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala @@ -0,0 +1,22 @@ +package org.apache.spark.dataflint + +import org.apache.spark.SparkContext +import org.apache.spark.dataflint.api.Spark4DatabricksPageFactory +import org.apache.spark.ui.SparkUI + +/** + * Databricks variant of the Spark 4 loader. Identical to the pluginspark4 + * loader except it instantiates `Spark4DatabricksPageFactory`, which + * inverts the Databricks UI gate so the shaded jar serves UI only on DBR. + * Same FQN as the upstream loader so the shared `SparkDataflintPlugin` + * entrypoint resolves it without any per-flavor wiring. Each method builds a new `DataflintSparkUICommonInstaller` and hands it the shared page factory.
+ */ +object DataflintSparkUILoader { + private val pageFactory = new Spark4DatabricksPageFactory() + + def install(context: SparkContext): String = + new org.apache.spark.dataflint.DataflintSparkUICommonInstaller().install(context, pageFactory) + + def loadUI(ui: SparkUI): String = + new org.apache.spark.dataflint.DataflintSparkUICommonInstaller().loadUI(ui, pageFactory) +} diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala new file mode 100644 index 00000000..b55b82c5 --- /dev/null +++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala @@ -0,0 +1,16 @@ +package org.apache.spark.dataflint.api + +import org.apache.spark.ui.SparkUI + +/** + * Databricks variant of [[Spark4PageFactory]]. The parent class skips the + * DataFlint UI on any Databricks runtime to avoid the jakarta.servlet + * NoClassDefFoundError on DBR 17.3 (see issue #47). This subclass inverts + * the check: enable UI ONLY on Databricks (where the javax-shaded bytecode + * matches the runtime). If this jar is installed on stock Spark 4 by + * mistake, the UI is silently skipped instead of crashing. A Databricks runtime is detected by the presence of the `spark.databricks.clusterUsageTags.cloudProvider` conf key. + */ +class Spark4DatabricksPageFactory extends Spark4PageFactory { + override def isUISupported(ui: SparkUI): Boolean = + ui.conf.getOption("spark.databricks.clusterUsageTags.cloudProvider").isDefined +} diff --git a/spark-ui/src/utils/FormatUtils.ts b/spark-ui/src/utils/FormatUtils.ts index c4778973..6d5f6a6b 100644 --- a/spark-ui/src/utils/FormatUtils.ts +++ b/spark-ui/src/utils/FormatUtils.ts @@ -91,7 +91,12 @@ export function timeStringToMilliseconds( case "h": return duration(value, "hours").asMilliseconds(); default: - throw new Error(`Unsupported time unit: ${unit}`); + // Some Spark forks (e.g.
Databricks) return certain duration-like metrics as + // bare numbers without a unit suffix, so the sliced-off "unit" is just a pair of digits. Return + // undefined instead of throwing — every caller already handles undefined + // with `?? 0` and the rest of the page keeps rendering. + console.warn(`timeStringToMilliseconds: unsupported time unit "${unit}" in "${timeString}"`); + return undefined; } }