From bedec4f5e95388537d2da1df89e19d22972cb03a Mon Sep 17 00:00:00 2001
From: Avi Minsky <minsky.a@gmail.com>
Date: Thu, 7 May 2026 17:37:30 +0300
Subject: [PATCH 1/4] add dataflint-spark4-databricks shaded artifact for DBR
 17.3+
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Databricks Runtime 17.3 ships javax.servlet instead of jakarta.servlet,
crashing the standard Spark 4 plugin at startup with NoClassDefFoundError
on jakarta/servlet/Servlet (issue #47). Add a parallel SBT module
pluginspark4databricks that source-shares with pluginspark4 but applies
ShadeRule.rename("jakarta.servlet.**" -> "javax.servlet.@1") at assembly
time, producing io.dataflint:dataflint-spark4-databricks_2.13. A
Spark4DatabricksPageFactory subclass inverts the Databricks UI gate so
the new jar enables the UI only on DBR (and silently degrades to
listeners-only if accidentally installed on stock Spark 4); the original
Spark4PageFactory is unchanged. Drop the Maven-Central verify step in
cd.yml — it only checked spark_2.12 and didn't work for snapshots.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/cd.yml                      | 29 ----------
 README.md                                     |  2 +-
 spark-plugin/build.sbt                        | 58 ++++++++++++++++++-
 spark-plugin/clean-and-setup.sh               |  5 ++
 .../dataflint/DataflintSparkUILoader.scala    | 22 +++++++
 .../api/Spark4DatabricksPageFactory.scala     | 16 +++++
 6 files changed, 101 insertions(+), 31 deletions(-)
 create mode 100644 spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
 create mode 100644 spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index e96fc41..24b60d9 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -95,35 +95,6 @@ jobs:
           SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
           SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
 
-      - name: Verify artifact published
-        run: |
-          VERSION=$(grep 'lazy val versionNum' build.sbt | grep -oE '"[0-9.]+"' | tr -d '"')
-          if [[ "$IS_RELEASE" == "true" ]]; then
-            URL="https://repo1.maven.org/maven2/io/dataflint/spark_2.12/${VERSION}/spark_2.12-${VERSION}.pom"
-            AUTH_ARGS=""
-            EXPECTED_VERSION="${VERSION}"
-          else
-            URL="https://central.sonatype.com/repository/maven-snapshots/io/dataflint/spark_2.12/maven-metadata.xml"
-            AUTH_ARGS="-u ${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}"
-            EXPECTED_VERSION="${VERSION}-SNAPSHOT"
-          fi
-          echo "Verifying $EXPECTED_VERSION at: $URL"
-          sleep 15
-          for i in $(seq 1 10); do
-            if curl -sf $AUTH_ARGS "$URL" | grep -q "${EXPECTED_VERSION}"; then
-              echo "Artifact published successfully"
-              exit 0
-            fi
-            echo "Attempt $i/10 — waiting 30s..."
-            sleep 30
-          done
-          echo "Artifact not found after 5 minutes"
-          exit 1
-        working-directory: ./spark-plugin
-        env:
-          SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
-          SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
-
       - name: Create Release
         if: github.event.inputs.release_type == 'official' || startsWith(github.ref, 'refs/tags/v')
         run: |
diff --git a/README.md b/README.md
index e30a3f5..0777a52 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ spark-submit
 * There is also support for scala 2.13, if your spark cluster is using scala 2.13 change package name to io.dataflint:spark_**2.13**:0.9.6
 * For more installation options, including for **python** and **k8s spark-operator**, see [Install on Spark docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark)
 * For installing DataFlint OSS in **spark history server** for observability on completed runs see [install on spark history server docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-spark-history-server)
-* For installing DataFlint OSS on **DataBricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks)
+* For installing DataFlint OSS on **DataBricks** see [install on databricks docs](https://dataflint.gitbook.io/dataflint-for-spark/getting-started/install-on-databricks). Databricks Runtime 17.3+ ships `javax.servlet` instead of `jakarta.servlet`, so use the dedicated shaded artifact `io.dataflint:dataflint-spark4-databricks_2.13` (same plugin class — only the jar coordinate differs).
 
 ## How it Works
 
diff --git a/spark-plugin/build.sbt b/spark-plugin/build.sbt
index a99e5e1..27a2f66 100644
--- a/spark-plugin/build.sbt
+++ b/spark-plugin/build.sbt
@@ -12,6 +12,7 @@ lazy val dataflint = project
     plugin,
     pluginspark3,
     pluginspark4,
+    pluginspark4databricks,
     example_3_1_3,
     example_3_2_4,
     example_3_3_3,
@@ -162,11 +163,66 @@ lazy val pluginspark4 = (project in file("pluginspark4"))
     
     // Include source from plugin directory for self-contained build
     Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala",
-    
+
     // Include resources from plugin directory for static UI files
     Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value
   )
 
+lazy val pluginspark4databricks = (project in file("pluginspark4databricks"))
+  .enablePlugins(AssemblyPlugin)
+  .settings(
+    name := "dataflint-spark4-databricks",
+    organization := "io.dataflint",
+    scalaVersion := scala213,
+    crossScalaVersions := List(scala213), // Only Scala 2.13 for Spark 4.x
+    version      := (if (git.gitCurrentTags.value.exists(_.startsWith("v"))) {
+      versionNum
+    } else {
+      versionNum + "-SNAPSHOT"
+    }),
+    libraryDependencies += "org.apache.spark" %% "spark-core" % "4.0.1" % "provided",
+    libraryDependencies += "org.apache.spark" %% "spark-sql" % "4.0.1"  % "provided",
+    libraryDependencies +=  "com.amazonaws" % "aws-java-sdk-s3" % "1.12.470" % "provided",
+    libraryDependencies += "org.apache.iceberg" %% "iceberg-spark-runtime-3.5" % "1.5.0" % "provided",
+    libraryDependencies += "io.delta" %% "delta-spark" % "3.2.0" % "provided",
+
+    // Source-share with pluginspark4 + plugin so we don't duplicate code.
+    Compile / unmanagedSourceDirectories += (pluginspark4 / Compile / sourceDirectory).value / "scala",
+    Compile / unmanagedSourceDirectories += (plugin / Compile / sourceDirectory).value / "scala",
+    Compile / unmanagedResourceDirectories += (plugin / Compile / resourceDirectory).value,
+
+    // Exclude the upstream DataflintSparkUILoader so the local Databricks copy is the one
+    // compiled; `:=` replaces the default filter, so HiddenFileFilter is re-added explicitly.
+    Compile / unmanagedSources / excludeFilter := {
+      val upstreamLoader = (pluginspark4 / Compile / sourceDirectory).value /
+        "scala" / "org" / "apache" / "spark" / "dataflint" / "DataflintSparkUILoader.scala"
+      HiddenFileFilter || new sbt.io.SimpleFileFilter(_.getCanonicalPath == upstreamLoader.getCanonicalPath)
+    },
+
+    assembly / assemblyJarName := s"${name.value}_${scalaBinaryVersion.value}-${version.value}.jar",
+    assembly / assemblyOption := (assembly / assemblyOption).value.withIncludeScala(false),
+    // Rewrite jakarta.servlet → javax.servlet in our bytecode so the artifact
+    // loads on Databricks Runtime 17.3, which ships javax instead of jakarta.
+    assembly / assemblyShadeRules := Seq(
+      ShadeRule.rename("jakarta.servlet.**" -> "javax.servlet.@1").inAll
+    ),
+    assembly / assemblyMergeStrategy := {
+      case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat
+      case PathList("META-INF", xs @ _*) => MergeStrategy.discard
+      case "application.conf" => MergeStrategy.concat
+      case "reference.conf" => MergeStrategy.concat
+      case _ => MergeStrategy.first
+    },
+
+    Compile / packageBin := assembly.value,
+    publishTo := {
+      if (isSnapshot.value)
+        Some("snapshots" at "https://central.sonatype.com/repository/maven-snapshots/")
+      else
+        sonatypePublishToBundle.value
+    }
+  )
+
 lazy val example_3_1_3 = (project in file("example_3_1_3"))
   .settings(
     name := "DataflintSparkExample313",
diff --git a/spark-plugin/clean-and-setup.sh b/spark-plugin/clean-and-setup.sh
index fcd97e1..d9b3bf5 100755
--- a/spark-plugin/clean-and-setup.sh
+++ b/spark-plugin/clean-and-setup.sh
@@ -31,6 +31,10 @@ sbt pluginspark3/assembly
 echo "Building Spark 4 fat JAR..."
 sbt ++2.13.16 pluginspark4/assembly
 
+# Build Spark 4 Databricks fat JAR (jakarta.servlet → javax.servlet shaded)
+echo "Building Spark 4 Databricks fat JAR..."
+sbt ++2.13.16 pluginspark4databricks/assembly
+
 echo "✅ Setup complete!"
 echo ""
 echo "📋 Next steps:"
@@ -40,4 +44,5 @@ echo ""
 echo "📦 Fat JARs created:"
 echo "- Spark 3.x: pluginspark3/target/scala-2.12/dataflint-spark3_2.12-0.9.7-SNAPSHOT.jar"
 echo "- Spark 4.x: pluginspark4/target/scala-2.13/dataflint-spark4_2.13-0.9.7-SNAPSHOT.jar"
+echo "- Spark 4 (Databricks): pluginspark4databricks/target/scala-2.13/dataflint-spark4-databricks_2.13-0.9.7-SNAPSHOT.jar"
 
diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
new file mode 100644
index 0000000..56ed1e7
--- /dev/null
+++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
@@ -0,0 +1,22 @@
+package org.apache.spark.dataflint
+
+import org.apache.spark.SparkContext
+import org.apache.spark.dataflint.api.Spark4DatabricksPageFactory
+import org.apache.spark.ui.SparkUI
+
+/**
+ * Databricks variant of the Spark 4 loader. Identical to the pluginspark4
+ * loader except it instantiates [[Spark4DatabricksPageFactory]], which
+ * inverts the Databricks UI gate so the shaded jar serves UI only on DBR.
+ * Same FQN as the upstream loader so the shared SparkDataflintPlugin
+ * entrypoint resolves it without any per-flavor wiring.
+ */
+object DataflintSparkUILoader {
+  private val pageFactory = new Spark4DatabricksPageFactory()
+
+  // A fresh installer per call, always paired with the Databricks page factory.
+  private def newInstaller = new DataflintSparkUICommonInstaller()
+
+  def install(context: SparkContext): String = newInstaller.install(context, pageFactory)
+  def loadUI(ui: SparkUI): String = newInstaller.loadUI(ui, pageFactory)
+}
diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala
new file mode 100644
index 0000000..b55b82c
--- /dev/null
+++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/api/Spark4DatabricksPageFactory.scala
@@ -0,0 +1,16 @@
+package org.apache.spark.dataflint.api
+
+import org.apache.spark.ui.SparkUI
+
+/**
+ * [[Spark4PageFactory]] flavour shipped in the javax-shaded Databricks jar.
+ * The parent class disables the DataFlint UI on every Databricks runtime to
+ * dodge the jakarta.servlet NoClassDefFoundError seen on DBR 17.3 (issue #47).
+ * Here the gate is flipped: the UI is enabled ONLY when the Databricks
+ * cloud-provider usage tag is present in the Spark conf. If this jar lands on
+ * stock Spark 4 by mistake, the UI is skipped quietly rather than crashing.
+ */
+class Spark4DatabricksPageFactory extends Spark4PageFactory {
+  override def isUISupported(ui: SparkUI): Boolean =
+    ui.conf.getOption("spark.databricks.clusterUsageTags.cloudProvider").nonEmpty
+}

From 2fea52d4cbe6fe655112d3576017262fcddc57bb Mon Sep 17 00:00:00 2001
From: Avi Minsky <minsky.a@gmail.com>
Date: Thu, 7 May 2026 19:03:36 +0300
Subject: [PATCH 2/4] pass explicit initValue to SQLMetrics factories so DBR
 loads timing/size metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Databricks Runtime 17.x rewrites SQLMetrics with explicit overloads instead
of Scala default args, so the bytecode-level helpers createTimingMetric$default$3
and createSizeMetric$default$3 don't exist on DBR. Calling
SQLMetrics.createTimingMetric(sc, name) compiles to a $default$3 fetch +
3-arg call — the helper fetch throws NoSuchMethodError before the metric is
ever created, the catch falls through to step 3 (SQLMetrics.createMetric, a SUM
metric), and the TimedExec "duration" surfaces in the Spark UI as a bare
number with no unit (e.g. "1058") instead of "5s (1s, 2s, 3s)". The DataFlint
React UI then crashes parsing the bare number with "Unsupported time unit: 58".

Pass -1L explicitly so the bytecode emits a direct 3-arg invokevirtual,
matching the runtime overload that exists on both stock Spark 4 and DBR 17.x.
-1L is the same value stock Spark uses as the default — semantics unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../org/apache/spark/dataflint/MetricsUtils.scala      | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala
index cb94021..a36d47f 100644
--- a/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala
+++ b/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/MetricsUtils.scala
@@ -39,7 +39,11 @@ object MetricsUtils {
   def getSizeMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = {
     name -> {
       try {
-        SQLMetrics.createSizeMetric(sparkContext, name)
+        // Pass initValue explicitly to avoid emitting a `createSizeMetric$default$3()`
+        // bytecode call. Databricks Runtime 17.x rewrites SQLMetrics with explicit
+        // overloads instead of Scala default args, so the $default$3 helper is missing
+        // and the call would throw NoSuchMethodError before ever reaching createSizeMetric.
+        SQLMetrics.createSizeMetric(sparkContext, name, -1L)
       } catch {
         case _: NoSuchMethodError =>
           try {
@@ -84,7 +88,9 @@ object MetricsUtils {
   def getTimingMetric(name: String)(implicit sparkContext: SparkContext): (String, SQLMetric) = {
     name -> {
       try {
-        SQLMetrics.createTimingMetric(sparkContext, name)
+        // See note on getSizeMetric: pass initValue explicitly so the bytecode skips
+        // the missing `createTimingMetric$default$3()` helper on Databricks runtimes.
+        SQLMetrics.createTimingMetric(sparkContext, name, -1L)
       } catch {
         case _: NoSuchMethodError =>
           try {

From 4842285b68a3f9885d4e823b7eaa2e6b2b85ed9b Mon Sep 17 00:00:00 2001
From: Avi Minsky <minsky.a@gmail.com>
Date: Thu, 7 May 2026 19:10:01 +0300
Subject: [PATCH 3/4] ui: stop crashing on unsupported time-unit suffix in
 metric strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

timeStringToMilliseconds slices the last 2 chars of a metric value as the
unit and used to throw "Unsupported time unit: ${unit}" if it wasn't ms/s/m/h.
Some Spark forks (Databricks) return duration-named metrics as bare numbers
with no unit suffix — the slice then picks up digit pairs (e.g. "58") and
the throw bubbles up through the React render, blanking the SQL plan page.

Return undefined instead. Every caller (SqlReducer, GraphDurationAttribution)
already handles undefined with `?? 0`, so missing duration data degrades
gracefully and the rest of the page keeps rendering. The function also logs a
console warning so the malformed value is still discoverable in DevTools.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 spark-ui/src/utils/FormatUtils.ts | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/spark-ui/src/utils/FormatUtils.ts b/spark-ui/src/utils/FormatUtils.ts
index c477897..6d5f6a6 100644
--- a/spark-ui/src/utils/FormatUtils.ts
+++ b/spark-ui/src/utils/FormatUtils.ts
@@ -91,7 +91,12 @@ export function timeStringToMilliseconds(
     case "h":
       return duration(value, "hours").asMilliseconds();
     default:
-      throw new Error(`Unsupported time unit: ${unit}`);
+      // Some Spark forks (e.g. Databricks) return certain duration-like metrics as
+      // bare numbers without a unit suffix, so the parsed "unit" is just trailing
+      // digits (e.g. "58"). Return undefined instead of throwing — every caller
+      // already handles undefined with `?? 0` and the rest of the page keeps rendering.
+      console.warn(`timeStringToMilliseconds: unsupported time unit "${unit}" in "${timeString}"`);
+      return undefined;
   }
 }
 

From 09e00514afc0b2fa3130a496b32bab68af19afe2 Mon Sep 17 00:00:00 2001
From: Avi Minsky <minsky.a@gmail.com>
Date: Thu, 7 May 2026 19:12:41 +0300
Subject: [PATCH 4/4] fix scaladoc link warning on DataflintSparkUILoader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scaladoc couldn't resolve [[Spark4DatabricksPageFactory]] when generating
docs in CD because of how the new module compiles via source-share. Use
backticks (markdown code) instead of doc links — same readability, no
linker pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../org/apache/spark/dataflint/DataflintSparkUILoader.scala   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
index 56ed1e7..497ca53 100644
--- a/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
+++ b/spark-plugin/pluginspark4databricks/src/main/scala/org/apache/spark/dataflint/DataflintSparkUILoader.scala
@@ -6,9 +6,9 @@ import org.apache.spark.ui.SparkUI
 
 /**
  * Databricks variant of the Spark 4 loader. Identical to the pluginspark4
- * loader except it instantiates [[Spark4DatabricksPageFactory]], which
+ * loader except it instantiates `Spark4DatabricksPageFactory`, which
  * inverts the Databricks UI gate so the shaded jar serves UI only on DBR.
- * Same FQN as the upstream loader so the shared SparkDataflintPlugin
+ * Same FQN as the upstream loader so the shared `SparkDataflintPlugin`
  * entrypoint resolves it without any per-flavor wiring.
  */
 object DataflintSparkUILoader {