From bedec4f5e95388537d2da1df89e19d22972cb03a Mon Sep 17 00:00:00 2001 From: Avi Minsky Date: Thu, 7 May 2026 17:37:30 +0300 Subject: [PATCH 1/4] add dataflint-spark4-databricks shaded artifact for DBR 17.3+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Databricks Runtime 17.3 ships javax.servlet instead of jakarta.servlet, crashing the standard Spark 4 plugin at startup with NoClassDefFoundError on jakarta/servlet/Servlet (issue #47). Add a parallel SBT module pluginspark4databricks that source-shares with pluginspark4 but applies ShadeRule.rename("jakarta.servlet.**" -> "javax.servlet.@1") at assembly time, producing io.dataflint:dataflint-spark4-databricks_2.13. A Spark4DatabricksPageFactory subclass inverts the Databricks UI gate so the new jar enables the UI only on DBR (and silently degrades to listeners-only if accidentally installed on stock Spark 4); the original Spark4PageFactory is unchanged. Drop the Maven-Central verify step in cd.yml — it only checked spark_2.12 and didn't work for snapshots. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/cd.yml | 29 ---------- README.md | 2 +- spark-plugin/build.sbt | 58 ++++++++++++++++++- spark-plugin/clean-and-setup.sh | 5 ++ .../dataflint/DataflintSparkUILoader.scala | 22 +++++++ .../api/Spark4DatabricksPageFactory.scala | 16 +++++ 6 files changed, 101 insertions(+), 31 deletions(-) create mode 100644 spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala create mode 100644 spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index e96fc41..24b60d9 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -95,35 +95,6 @@ jobs: SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} - - name: Verify artifact published - run: | - VERSION=$(grep 'lazy val versionNum' build.sbt | grep -oE '"[0-9.]+"' | tr -d '"') - if [[ "$IS_RELEASE" == "true" ]]; then - URL="https://repo1.maven.org/maven2/io/dataflint/spark_2.12/${VERSION}/spark_2.12-${VERSION}.pom" - AUTH_ARGS="" - EXPECTED_VERSION="${VERSION}" - else - URL="https://central.sonatype.com/repository/maven-snapshots/io/dataflint/spark_2.12/maven-metadata.xml" - AUTH_ARGS="-u ${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" - EXPECTED_VERSION="${VERSION}-SNAPSHOT" - fi - echo "Verifying $EXPECTED_VERSION at: $URL" - sleep 15 - for i in $(seq 1 10); do - if curl -sf $AUTH_ARGS "$URL" | grep -q "${EXPECTED_VERSION}"; then - echo "Artifact published successfully" - exit 0 - fi - echo "Attempt $i/10 — waiting 30s..." 
- sleep 30 - done - echo "Artifact not found after 5 minutes" - exit 1 - working-directory: ./spark-plugin - env: - SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} - SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} - - name: Create Release if: github.event.inputs.release_type == 'official' || startsWith(github.ref, 'refs/tags/v') run: | diff --git a/README.md b/README.md index e30a3f5..0777a52 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ spark-submit * There is also support for scala 2.13, if your spark cluster is using scala 2.13 change package name to io.dataflint:spark_**2.13**:0.9.6 * For more installation options, including for **python** and **k8s spark-operator**, see [Install on Spark docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark) * For installing DataFlint OSS in **spark history server** for observability on completed runs see [install on spark history server docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark-history-server) -* For installing DataFlint OSS on **DataBricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks) +* For installing DataFlint OSS on **DataBricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks). Databricks Runtime 17.3+ ships `javax.servlet` instead of `jakarta.servlet`, so use the dedicated shaded artifact `io.dataflint:dataflint-spark4-databricks_2.13` (same plugin class — only the jar coordinate differs). 
## How it Works diff --git a/spark-plugin/build.sbt b/spark-plugin/build.sbt index a99e5e1..27a2f66 100644 --- a/spark-plugin/build.sbt +++ b/spark-plugin/build.sbt @@ -12,6 +12,7 @@ lazy val dataflint = project plugin, pluginspark3, pluginspark4, + pluginspark4databricks, example_3_1_3, example_3_2_4, example_3_3_3, @@ -162,11 +163,66 @@ lazy val pluginspark4 = (project in file("pluginspark4")) // Include source from plugin directory for self-contained build Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala", - + // Include resources from plugin directory for static UI files Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value ) +lazy val pluginspark4databricks = (project in file("pluginspark4databricks")) + .enablePlugins(AssemblyPlugin) + .settings( + name := "dataflint-spark4-databricks", + organization := "io.dataflint", + scalaVersion := scala213, + crossScalaVersions := List(scala213), // Only Scala 2.13 for Spark 4.x + version := (if (git.gitCurrentTags.value.exists(_.startsWith("v"))) { + versionNum + } else { + versionNum + "-SNAPSHOT" + }), + libraryDependencies += "org.apache.spark" %% "spark-core" % "4.0.1" % "provided", + libraryDependencies += "org.apache.spark" %% "spark-sql" % "4.0.1" % "provided", + libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.12.470" % "provided", + libraryDependencies += "org.apache.iceberg" %% "iceberg-spark-runtime-3.5" % "1.5.0" % "provided", + libraryDependencies += "io.delta" %% "delta-spark" % "3.2.0" % "provided", + + // Source-share with pluginspark4 + plugin so we don't duplicate code. 
+ Compile / unmanagedSourceDirectories += (pluginspark4 / Compile / sourceDirectory).value / "scala", + Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala", + Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value, + + // Drop the upstream DataflintSparkUILoader so our local copy (which uses + // Spark4DatabricksPageFactory) is the one compiled. + Compile / unmanagedSources / excludeFilter := { + val upstreamLoader = (pluginspark4 / Compile / sourceDirectory).value / + "scala" / "org" / "apache" / "spark" / "dataflint" / "DataflintSparkUILoader.scala" + new sbt.io.SimpleFileFilter(_.getCanonicalPath == upstreamLoader.getCanonicalPath) + }, + + assembly / assemblyJarName := s"${name.value}_${scalaBinaryVersion.value}-${version.value}.jar", + assembly / assemblyOption := (assembly / assemblyOption).value.withIncludeScala(false), + // Rewrite jakarta.servlet → javax.servlet in our bytecode so the artifact + // loads on Databricks Runtime 17.3, which ships javax instead of jakarta. 
+ assembly / assemblyShadeRules := Seq( + ShadeRule.rename("jakarta.servlet.**" -> "javax.servlet.@1").inAll + ), + assembly / assemblyMergeStrategy := { + case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat + case PathList("META-INF", xs @ _*) => MergeStrategy.discard + case "application.conf" => MergeStrategy.concat + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + }, + + Compile / packageBin := assembly.value, + publishTo := { + if (isSnapshot.value) + Some("snapshots" at "https://central.sonatype.com/repository/maven-snapshots/") + else + sonatypePublishToBundle.value + } + ) + lazy val example_3_1_3 = (project in file("example_3_1_3")) .settings( name := "DataflintSparkExample313", diff --git a/spark-plugin/clean-and-setup.sh b/spark-plugin/clean-and-setup.sh index fcd97e1..d9b3bf5 100755 --- a/spark-plugin/clean-and-setup.sh +++ b/spark-plugin/clean-and-setup.sh @@ -31,6 +31,10 @@ sbt pluginspark3/assembly echo "Building Spark 4 fat JAR..." sbt ++2.13.16 pluginspark4/assembly +# Build Spark 4 Databricks fat JAR (jakarta.servlet → javax.servlet shaded) +echo "Building Spark 4 Databricks fat JAR..." +sbt ++2.13.16 pluginspark4databricks/assembly + echo "✅ Setup complete!" 
echo "" echo "📋 Next steps:" @@ -40,4 +44,5 @@ echo "" echo "📦 Fat JARs created:" echo "- Spark 3.x: pluginspark3/target/scala-2.12/dataflint-spark3_2.12-0.9.7-SNAPSHOT.jar" echo "- Spark 4.x: pluginspark4/target/scala-2.13/dataflint-spark4_2.13-0.9.7-SNAPSHOT.jar" +echo "- Spark 4 (Databricks): pluginspark4databricks/target/scala-2.13/dataflint-spark4-databricks_2.13-0.9.7-SNAPSHOT.jar" diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala new file mode 100644 index 0000000..56ed1e7 --- /dev/null +++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala @@ -0,0 +1,22 @@ +package org.apache.spark.dataflint + +import org.apache.spark.SparkContext +import org.apache.spark.dataflint.api.Spark4DatabricksPageFactory +import org.apache.spark.ui.SparkUI + +/** + * Databricks variant of the Spark 4 loader. Identical to the pluginspark4 + * loader except it instantiates [[Spark4DatabricksPageFactory]], which + * inverts the Databricks UI gate so the shaded jar serves UI only on DBR. + * Same FQN as the upstream loader so the shared SparkDataflintPlugin + * entrypoint resolves it without any per-flavor wiring. 
+ */ +object DataflintSparkUILoader { + private val pageFactory = new Spark4DatabricksPageFactory() + + def install(context: SparkContext): String = + new org.apache.spark.dataflint.DataflintSparkUICommonInstaller().install(context, pageFactory) + + def loadUI(ui: SparkUI): String = + new org.apache.spark.dataflint.DataflintSparkUICommonInstaller().loadUI(ui, pageFactory) +} diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala new file mode 100644 index 0000000..b55b82c --- /dev/null +++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala @@ -0,0 +1,16 @@ +package org.apache.spark.dataflint.api + +import org.apache.spark.ui.SparkUI + +/** + * Databricks variant of [[Spark4PageFactory]]. The parent class skips the + * DataFlint UI on any Databricks runtime to avoid the jakarta.servlet + * NoClassDefFoundError on DBR 17.3 (see issue #47). This subclass inverts + * the check: enable UI ONLY on Databricks (where the javax-shaded bytecode + * matches the runtime). If this jar is installed on stock Spark 4 by + * mistake, the UI is silently skipped instead of crashing. 
+ */ +class Spark4DatabricksPageFactory extends Spark4PageFactory { + override def isUISupported(ui: SparkUI): Boolean = + ui.conf.getOption("spark.databricks.clusterUsageTags.cloudProvider").isDefined +} From 2fea52d4cbe6fe655112d3576017262fcddc57bb Mon Sep 17 00:00:00 2001 From: Avi Minsky Date: Thu, 7 May 2026 19:03:36 +0300 Subject: [PATCH 2/4] pass explicit initValue to SQLMetrics factories so DBR loads timing/size metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Databricks Runtime 17.x rewrites SQLMetrics with explicit overloads instead of Scala default args, so the bytecode-level helpers createTimingMetric$default$3 and createSizeMetric$default$3 don't exist on DBR. Calling SQLMetrics.createTimingMetric(sc, name) compiles to a $default$3 fetch + 3-arg call — the helper fetch throws NoSuchMethodError before the metric is ever created, the catch falls through to step 3 (SQLMetrics.createMetric, a SUM metric), and the TimedExec "duration" surfaces in the Spark UI as a bare number with no unit (e.g. "1058") instead of "5s (1s, 2s, 3s)". The DataFlint React UI then crashes parsing the bare number with "Unsupported time unit: 58". Pass -1L explicitly so the bytecode emits a direct 3-arg invokevirtual, matching the runtime overload that exists on both stock Spark 4 and DBR 17.x. -1L is the same value stock Spark uses as the default — semantics unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/apache/spark/dataflint/MetricsUtils.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala index cb94021..a36d47f 100644 --- a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala +++ b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala @@ -39,7 +39,11 @@ object MetricsUtils { def getSizeMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = { name -> { try { - SQLMetrics.createSizeMetric(sparkContext, name) + // Pass initValue explicitly to avoid emitting a `createSizeMetric$default$3()` + // bytecode call. Databricks Runtime 17.x rewrites SQLMetrics with explicit + // overloads instead of Scala default args, so the $default$3 helper is missing + // and the call would NoSuchMethodError before ever reaching createSizeMetric. + SQLMetrics.createSizeMetric(sparkContext, name, -1L) } catch { case _: NoSuchMethodError => try { @@ -84,7 +88,9 @@ object MetricsUtils { def getTimingMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = { name -> { try { - SQLMetrics.createTimingMetric(sparkContext, name) + // See note on getSizeMetric: pass initValue explicitly so the bytecode skips + // the missing `createTimingMetric$default$3()` helper on Databricks runtimes. 
+ SQLMetrics.createTimingMetric(sparkContext, name, -1L) } catch { case _: NoSuchMethodError => try { From 4842285b68a3f9885d4e823b7eaa2e6b2b85ed9b Mon Sep 17 00:00:00 2001 From: Avi Minsky Date: Thu, 7 May 2026 19:10:01 +0300 Subject: [PATCH 3/4] ui: stop crashing on unsupported time-unit suffix in metric strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit timeStringToMilliseconds slices the last 2 chars of a metric value as the unit and used to throw "Unsupported time unit: ${unit}" if it wasn't ms/s/m/h. Some Spark forks (Databricks) return duration-named metrics as bare numbers with no unit suffix — the slice then picks up digit pairs (e.g. "58") and the thrown error bubbles up through the React render, blanking the SQL plan page. Return undefined instead. Every caller (SqlReducer, GraphDurationAttribution) already handles undefined with `?? 0`, so missing duration data degrades gracefully and the rest of the page keeps rendering. Log a console warning so the malformed value is still discoverable in DevTools. Co-Authored-By: Claude Opus 4.7 (1M context) --- spark-ui/src/utils/FormatUtils.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spark-ui/src/utils/FormatUtils.ts b/spark-ui/src/utils/FormatUtils.ts index c477897..6d5f6a6 100644 --- a/spark-ui/src/utils/FormatUtils.ts +++ b/spark-ui/src/utils/FormatUtils.ts @@ -91,7 +91,12 @@ export function timeStringToMilliseconds( case "h": return duration(value, "hours").asMilliseconds(); default: - throw new Error(`Unsupported time unit: ${unit}`); + // Some Spark forks (e.g. Databricks) return certain duration-like metrics as + // bare numbers without a unit suffix, so the unit slice yields a digit pair. Return + // undefined instead of throwing — every caller already handles undefined + // with `?? 0` and the rest of the page keeps rendering. 
+ console.warn(`timeStringToMilliseconds: unsupported time unit "${unit}" in "${timeString}"`); + return undefined; } } From 09e00514afc0b2fa3130a496b32bab68af19afe2 Mon Sep 17 00:00:00 2001 From: Avi Minsky Date: Thu, 7 May 2026 19:12:41 +0300 Subject: [PATCH 4/4] fix scaladoc link warning on DataflintSparkUILoader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scaladoc couldn't resolve [[Spark4DatabricksPageFactory]] when generating docs in CD because of how the new module compiles via source-share. Use backticks (markdown code) instead of doc links — same readability, no linker pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/apache/spark/dataflint/DataflintSparkUILoader.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala index 56ed1e7..497ca53 100644 --- a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala +++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala @@ -6,9 +6,9 @@ import org.apache.spark.ui.SparkUI /** * Databricks variant of the Spark 4 loader. Identical to the pluginspark4 - * loader except it instantiates [[Spark4DatabricksPageFactory]], which + * loader except it instantiates `Spark4DatabricksPageFactory`, which * inverts the Databricks UI gate so the shaded jar serves UI only on DBR. - * Same FQN as the upstream loader so the shared SparkDataflintPlugin + * Same FQN as the upstream loader so the shared `SparkDataflintPlugin` * entrypoint resolves it without any per-flavor wiring. */ object DataflintSparkUILoader {