From 492ea595c4ac6b5ff80faec83382a975a8a0aa5f Mon Sep 17 00:00:00 2001 From: menishmueli Date: Sat, 11 Apr 2026 23:15:55 +0300 Subject: [PATCH 1/2] Add Apache Gluten/Velox support to DataFlint UI - Add Gluten/Velox node type classification, display names, and accelerator badges (Velox, Photon, RAPIDS, DataFusion) in the SQL plan flow view - Fix stage identification for Gluten's WholeStageCodegenTransformer nodes by inferring codegen-to-node mapping and handling AQE codegen renumbering - Split ColumnarExchange into write/read visual nodes across stage boundaries - Propagate stages through Gluten-specific boundary nodes (VeloxResizeBatches, RowToVeloxColumnar, TakeOrderedAndProjectExecTransformer, etc.) - Show Velox native timing metrics (aggregation/filter/sort/window time, peak memory, spill) on plan nodes - Strip Gluten class name prefixes from plan descriptions in parsers - Add Docker environment and example app for running Gluten/Velox on Spark 3.5 - Add unit test for Gluten stage assignment with real fixture data --- docker/gluten/.gitignore | 3 + docker/gluten/Dockerfile | 61 ++ docker/gluten/docker-compose.yml | 16 + docker/gluten/run-gluten-example.sh | 143 ++++ .../example/GlutenVeloxExample.scala | 116 +++ .../components/SqlFlow/SqlLayoutService.ts | 2 +- spark-ui/src/components/SqlFlow/StageNode.tsx | 36 + spark-ui/src/reducers/PlanGraphUtils.ts | 2 +- .../reducers/PlanParsers/ExchangeParser.ts | 2 +- .../src/reducers/PlanParsers/FilterParser.ts | 1 + .../src/reducers/PlanParsers/ProjectParser.ts | 8 +- .../src/reducers/PlanParsers/WindowParser.ts | 2 +- spark-ui/src/reducers/SQLNodeStageReducer.ts | 62 +- spark-ui/src/reducers/SqlReducer.ts | 55 +- spark-ui/src/reducers/SqlReducerUtils.ts | 164 +++- .../__tests__/GlutenStageAssignment.spec.ts | 157 ++++ .../__tests__/gluten-sql4-fixture.json | 744 ++++++++++++++++++ 17 files changed, 1551 insertions(+), 23 deletions(-) create mode 100644 docker/gluten/.gitignore create mode 100644 docker/gluten/Dockerfile create mode 100644 docker/gluten/docker-compose.yml create mode 100755 docker/gluten/run-gluten-example.sh create mode 100644 spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/GlutenVeloxExample.scala create mode 100644 spark-ui/src/reducers/__tests__/GlutenStageAssignment.spec.ts create mode 100644 spark-ui/src/reducers/__tests__/gluten-sql4-fixture.json diff --git a/docker/gluten/.gitignore b/docker/gluten/.gitignore new file mode 100644 index 00000000..e486dd31 --- /dev/null +++ b/docker/gluten/.gitignore @@ -0,0 +1,3 @@ +jars/ +test_data/ +spark-events/ diff --git a/docker/gluten/Dockerfile b/docker/gluten/Dockerfile new file mode 100644 index 00000000..f78dc570 --- /dev/null +++ b/docker/gluten/Dockerfile @@ -0,0 +1,61 @@ +# Spark + Gluten/Velox + DataFlint example runner +# +# Build arguments: +# SPARK_VERSION - Spark version (default: 3.5.7) +# GLUTEN_JAR - Filename of the Gluten bundle jar in jars/ directory +# +# Usage: +# ./run-gluten-example.sh (recommended — builds everything and runs) +# docker compose up --build (if jars are already in jars/) + +ARG SPARK_VERSION=3.5.7 + +FROM apache/spark:${SPARK_VERSION} + +ARG SPARK_VERSION=3.5.7 + +USER root + +# Create directories for event logs and test data +RUN mkdir -p /tmp/spark-events && \ + chown -R spark:spark /tmp/spark-events && \ + mkdir -p /opt/spark/work-dir/test_data && \ + chown -R spark:spark /opt/spark/work-dir/test_data + +# Copy all jars (Gluten bundle + DataFlint plugin + example) into Spark's jars dir +COPY jars/*.jar /opt/spark/jars/ + +# 
Copy test data +COPY test_data/ /opt/spark/work-dir/test_data/ + +# Configure Spark defaults for Gluten +# The --add-opens flags are required because the Gluten nightly (JDK8 target) uses +# sun.misc.Unsafe / DirectByteBuffer internals that are module-restricted on Java 11+. +RUN mkdir -p /opt/spark/conf && \ + echo "spark.plugins=io.dataflint.spark.SparkDataflintPlugin,org.apache.gluten.GlutenPlugin" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.memory.offHeap.enabled=true" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.memory.offHeap.size=4g" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.eventLog.enabled=true" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.eventLog.dir=/tmp/spark-events" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.ui.port=10000" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.dataflint.telemetry.enabled=false" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.sql.maxMetadataStringLength=10000" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.sql.adaptive.enabled=true" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.misc=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED" >> /opt/spark/conf/spark-defaults.conf && \ + echo "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.misc=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED" >> /opt/spark/conf/spark-defaults.conf + +USER spark + +EXPOSE 10000 + +WORKDIR /opt/spark/work-dir + +ENV _JAVA_OPTIONS="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.misc=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED -Dio.netty.tryReflectionSetAccessible=true" + +# Run the Gluten example via spark-submit +CMD ["/opt/spark/bin/spark-submit", \ + "--master", "local[*]", \ + "--class", "io.dataflint.example.GlutenVeloxExample", \ + "--driver-memory", "2g", \ + "/opt/spark/jars/example.jar"] diff --git a/docker/gluten/docker-compose.yml b/docker/gluten/docker-compose.yml new file mode 100644 index 00000000..4ac470e8 --- /dev/null +++ b/docker/gluten/docker-compose.yml @@ -0,0 +1,16 @@ +services: + spark-gluten-example: + build: + context: . + dockerfile: Dockerfile + args: + SPARK_VERSION: ${SPARK_VERSION:-3.5.7} + image: dataflint-gluten-example:${SPARK_VERSION:-3.5.7} + container_name: dataflint-gluten-example + ports: + - "${SPARK_UI_PORT:-10000}:10000" + volumes: + - ${SPARK_EVENTS_DIR:-./spark-events}:/tmp/spark-events + environment: + - SPARK_NO_DAEMONIZE=true + restart: "no" diff --git a/docker/gluten/run-gluten-example.sh b/docker/gluten/run-gluten-example.sh new file mode 100755 index 00000000..a34e5cf4 --- /dev/null +++ b/docker/gluten/run-gluten-example.sh @@ -0,0 +1,143 @@ +#!/bin/bash +set -e + +# Run DataFlint Gluten/Velox Example +# +# This script: +# 1. Builds the DataFlint UI and plugin jar +# 2. Packages the Gluten example app +# 3. Downloads the Gluten nightly bundle jar (cached) +# 4. 
Builds and runs the Docker container +# +# Prerequisites: Node.js 20+, Java 8+, sbt, Docker +# +# Usage: +# ./run-gluten-example.sh # full build + run +# ./run-gluten-example.sh --skip-build # skip sbt/npm, just rebuild Docker +# ./run-gluten-example.sh --amd64 # force x86_64 (Rosetta 2 emulation) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +JARS_DIR="$SCRIPT_DIR/jars" +TEST_DATA_DIR="$SCRIPT_DIR/test_data" +SPARK_EVENTS_DIR="$SCRIPT_DIR/spark-events" + +SPARK_VERSION="${SPARK_VERSION:-3.5.7}" +SCALA_VERSION="${SCALA_VERSION:-2.12}" + +SKIP_BUILD=false +FORCE_AMD64=false + +for arg in "$@"; do + case $arg in + --skip-build) SKIP_BUILD=true ;; + --amd64) FORCE_AMD64=true ;; + esac +done + +# Detect architecture for Gluten jar download +ARCH=$(uname -m) +if [ "$FORCE_AMD64" = true ]; then + GLUTEN_ARCH="linux_amd64" + DOCKER_PLATFORM="--platform linux/amd64" +elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then + GLUTEN_ARCH="linux_aarch64" + DOCKER_PLATFORM="" +else + GLUTEN_ARCH="linux_amd64" + DOCKER_PLATFORM="" +fi + +GLUTEN_JAR_NAME="gluten-velox-bundle-spark3.5_2.12-${GLUTEN_ARCH}-1.7.0-SNAPSHOT.jar" +GLUTEN_JAR_URL="https://nightlies.apache.org/gluten/nightly-release-jdk8/${GLUTEN_JAR_NAME}" + +echo "=== DataFlint Gluten/Velox Example ===" +echo "Project root: $PROJECT_ROOT" +echo "Spark version: $SPARK_VERSION" +echo "Architecture: $GLUTEN_ARCH" +echo "Gluten jar: $GLUTEN_JAR_NAME" +echo "" + +mkdir -p "$JARS_DIR" +mkdir -p "$SPARK_EVENTS_DIR" + +# --- Step 1: Download Gluten nightly jar (cached) --- +echo "=== Step 1: Downloading Gluten nightly jar ===" +if [ -f "$JARS_DIR/$GLUTEN_JAR_NAME" ]; then + echo "Gluten jar already cached: $JARS_DIR/$GLUTEN_JAR_NAME" +else + echo "Downloading: $GLUTEN_JAR_URL" + curl -fSL -o "$JARS_DIR/$GLUTEN_JAR_NAME" "$GLUTEN_JAR_URL" + echo "Downloaded successfully." +fi + +if [ "$SKIP_BUILD" = false ]; then + # --- Step 2: Build DataFlint UI --- + echo "" + echo "=== Step 2: Building DataFlint UI ===" + cd "$PROJECT_ROOT/spark-ui" + if [ ! -d "node_modules" ]; then + echo "Installing npm dependencies..." + npm ci + fi + echo "Building and deploying UI into plugin resources..." + npm run deploy + + # --- Step 3: Build DataFlint plugin jar --- + echo "" + echo "=== Step 3: Building DataFlint plugin jar ===" + cd "$PROJECT_ROOT/spark-plugin" + export SBT_OPTS="-Xmx4G -Xss2M -XX:+UseG1GC" + sbt "pluginspark3/assembly" + + # --- Step 4: Package example jar --- + echo "" + echo "=== Step 4: Packaging example jar ===" + sbt "example_3_5_1/package" +fi + +# --- Step 5: Copy jars to docker context --- +echo "" +echo "=== Step 5: Copying jars to Docker context ===" + +# DataFlint plugin jar +PLUGIN_JAR=$(find "$PROJECT_ROOT/spark-plugin/pluginspark3/target/scala-${SCALA_VERSION}" -name "spark_${SCALA_VERSION}-*.jar" -type f | head -1) +if [ -z "$PLUGIN_JAR" ]; then + echo "ERROR: DataFlint plugin jar not found. Run without --skip-build first." + exit 1 +fi +cp "$PLUGIN_JAR" "$JARS_DIR/dataflint-plugin.jar" +echo "Copied DataFlint plugin: $(basename "$PLUGIN_JAR")" + +# Example jar +EXAMPLE_JAR=$(find "$PROJECT_ROOT/spark-plugin/example_3_5_1/target/scala-${SCALA_VERSION}" -name "DataflintSparkExample351_${SCALA_VERSION}-*.jar" -type f | head -1) +if [ -z "$EXAMPLE_JAR" ]; then + echo "ERROR: Example jar not found. Run without --skip-build first." 
+ exit 1 +fi +cp "$EXAMPLE_JAR" "$JARS_DIR/example.jar" +echo "Copied example jar: $(basename "$EXAMPLE_JAR")" + +echo "Gluten jar: $GLUTEN_JAR_NAME" + +# --- Step 6: Copy test data --- +echo "" +echo "=== Step 6: Copying test data ===" +rm -rf "$TEST_DATA_DIR" +cp -r "$PROJECT_ROOT/spark-plugin/test_data" "$TEST_DATA_DIR" +echo "Copied test_data/" + +# --- Step 7: Build and run Docker --- +echo "" +echo "=== Step 7: Building and running Docker container ===" +cd "$SCRIPT_DIR" + +# Stop any previous container +docker compose down 2>/dev/null || true + +# Build with platform flag if needed +if [ -n "$DOCKER_PLATFORM" ]; then + DOCKER_DEFAULT_PLATFORM=linux/amd64 docker compose up --build +else + docker compose up --build +fi diff --git a/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/GlutenVeloxExample.scala b/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/GlutenVeloxExample.scala new file mode 100644 index 00000000..f435e8a1 --- /dev/null +++ b/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/GlutenVeloxExample.scala @@ -0,0 +1,116 @@ +package io.dataflint.example + +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions._ + +object GlutenVeloxExample extends App { + val spark = SparkSession + .builder() + .appName("GlutenVeloxExample") + .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin,org.apache.gluten.GlutenPlugin") + .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + .config("spark.memory.offHeap.enabled", "true") + .config("spark.memory.offHeap.size", "4g") + .config("spark.ui.port", "10000") + .config("spark.eventLog.enabled", "true") + .config("spark.eventLog.dir", "/tmp/spark-events") + .config("spark.dataflint.telemetry.enabled", value = false) + .config("spark.sql.maxMetadataStringLength", "10000") + .config("spark.sql.adaptive.enabled", "true") + .master("local[*]") + .getOrCreate() + + import spark.implicits._ + + def shakespeareDF: DataFrame = spark.read + .format("csv") + .option("sep", ";") + .option("inferSchema", true) + .load("./test_data/will_play_text.csv") + .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") + + // --- Filter + Project --- + spark.sparkContext.setJobDescription("Filter and Select") + val filtered = shakespeareDF + .filter($"speaker".isNotNull && $"line_id" > 100) + .select($"play_name", $"speaker", $"text_entry", $"speech_number") + filtered.show(10, truncate = false) + + // --- Aggregation (GroupBy + Count/Sum) --- + spark.sparkContext.setJobDescription("GroupBy Aggregation") + val linesPerSpeaker = shakespeareDF + .filter($"speaker".isNotNull) + .groupBy("play_name", "speaker") + .agg( + count("*").alias("line_count"), + sum("speech_number").alias("total_speech_numbers"), + avg("speech_number").alias("avg_speech_number") + ) + linesPerSpeaker.show(20, truncate = false) + + // --- Sort --- + spark.sparkContext.setJobDescription("Sort by line count") + val sortedSpeakers = linesPerSpeaker + .orderBy(col("line_count").desc) + sortedSpeakers.show(20, truncate = false) + + // --- Broadcast Hash Join --- + spark.sparkContext.setJobDescription("Broadcast Hash Join") + val topSpeakers = linesPerSpeaker + .filter($"line_count" > 50) + .select($"speaker".alias("top_speaker"), $"play_name".alias("top_play")) + val broadcastJoined = shakespeareDF + .join(broadcast(topSpeakers), $"speaker" === $"top_speaker" && $"play_name" === $"top_play") + 
println(s"Broadcast join result count: ${broadcastJoined.count()}") + + // --- Sort Merge Join (disable broadcast to force SMJ) --- + spark.sparkContext.setJobDescription("Sort Merge Join") + spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) + val plays1 = shakespeareDF + .groupBy("play_name") + .agg(count("*").alias("total_lines")) + .repartition(10) + val plays2 = shakespeareDF + .groupBy("play_name") + .agg(countDistinct("speaker").alias("unique_speakers")) + .repartition(10) + val smjResult = plays1.join(plays2, Seq("play_name")) + smjResult.show(20, truncate = false) + spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10485760) + + // --- Window Functions --- + spark.sparkContext.setJobDescription("Window Functions") + val speakerWindow = Window.partitionBy("play_name").orderBy(col("line_count").desc) + val rankedSpeakers = linesPerSpeaker + .withColumn("rank", rank().over(speakerWindow)) + .withColumn("dense_rank", dense_rank().over(speakerWindow)) + .withColumn("total_in_play", sum("line_count").over(Window.partitionBy("play_name"))) + .withColumn("pct", round(col("line_count") / col("total_in_play") * 100, 2)) + .filter(col("rank") <= 3) + .orderBy("play_name", "rank") + rankedSpeakers.show(30, truncate = false) + + // --- Explode / Generate --- + spark.sparkContext.setJobDescription("Explode words from text") + val words = shakespeareDF + .filter($"text_entry".isNotNull) + .select($"play_name", $"speaker", explode(split($"text_entry", "\\s+")).alias("word")) + .filter(length($"word") > 0) + val wordCounts = words + .groupBy("word") + .agg(count("*").alias("word_count")) + .orderBy(col("word_count").desc) + wordCounts.show(20, truncate = false) + + // --- Union + distinct --- + spark.sparkContext.setJobDescription("Union and Distinct") + val hamlet = shakespeareDF.filter($"play_name" === "Hamlet").select("speaker") + val macbeth = shakespeareDF.filter($"play_name" === "macbeth").select("speaker") + val allSpeakers = hamlet.union(macbeth).distinct() + println(s"Distinct speakers in Hamlet + Macbeth: ${allSpeakers.count()}") + + println("GlutenVeloxExample completed. 
Spark UI available at http://localhost:10000") + println("Press Ctrl+C to stop.") + Thread.sleep(Long.MaxValue) +} diff --git a/spark-ui/src/components/SqlFlow/SqlLayoutService.ts b/spark-ui/src/components/SqlFlow/SqlLayoutService.ts index b6ce8175..b1276a01 100644 --- a/spark-ui/src/components/SqlFlow/SqlLayoutService.ts +++ b/spark-ui/src/components/SqlFlow/SqlLayoutService.ts @@ -593,7 +593,7 @@ class SqlLayoutService { const splitExchangeNodeIds = new Set(); for (const nodeId of nodesIds) { const node = nodeMap.get(nodeId); - if (node?.nodeName === "Exchange") { + if (node?.nodeName === "Exchange" || node?.nodeName === "ColumnarExchange") { splitExchangeNodeIds.add(nodeId.toString()); } } diff --git a/spark-ui/src/components/SqlFlow/StageNode.tsx b/spark-ui/src/components/SqlFlow/StageNode.tsx index 6abd5859..1bdb8c55 100644 --- a/spark-ui/src/components/SqlFlow/StageNode.tsx +++ b/spark-ui/src/components/SqlFlow/StageNode.tsx @@ -1,5 +1,6 @@ import ErrorIcon from "@mui/icons-material/Error"; import FlagIcon from "@mui/icons-material/Flag"; +import RocketLaunchIcon from "@mui/icons-material/RocketLaunch"; import WarningIcon from "@mui/icons-material/Warning"; import { Alert, AlertTitle, Box, Tooltip, Typography } from "@mui/material"; import React, { FC, memo, useMemo } from "react"; @@ -7,6 +8,7 @@ import { useSearchParams } from "react-router-dom"; import { Handle, Position } from "reactflow"; import { Alert as AppAlert, EnrichedSqlNode, SQLNodeExchangeStageData, SQLNodeStageData } from "../../interfaces/AppStore"; import { humanFileSize, parseBytesString } from "../../utils/FormatUtils"; +import { getNodeAccelerator } from "../../reducers/SqlReducerUtils"; import { TransperantTooltip } from "../AlertBadge/AlertBadge"; import MetricDisplay, { MetricWithTooltip } from "./MetricDisplay"; import { @@ -49,6 +51,7 @@ const StageNodeComponent: FC = ({ data }) => { + // Memoized computations for better performance const { isHighlighted, allMetrics, hasDeltaOptimizeWrite, displayName, variantStage, variantDuration, variantDurationPercentage } = useMemo(() => { // Parse nodeIds from URL parameters @@ -328,6 +331,39 @@ const StageNodeComponent: FC = ({ data }) => { )} + {/* Accelerator badge - bottom left corner */} + {(() => { + const accel = getNodeAccelerator(data.node.nodeName); + if (!accel) return null; + return ( + + + + + {accel.label} + + + + ); + })()} + {/* Alert badge */} {sqlNodeAlert && ( { if ( - node.nodeName == "CollectLimit" || - node.nodeName === "BroadcastExchange" + node.nodeName === "CollectLimit" || + node.nodeName === "ColumnarCollectLimit" || + node.nodeName === "BroadcastExchange" || + node.nodeName === "ColumnarBroadcastExchange" || + node.nodeName === "VeloxResizeBatches" || + node.nodeName === "RowToVeloxColumnar" ) { + if (node.stage !== undefined) return node; + const previousNode = findPreviousNode(node.nodeId); + if (previousNode !== undefined && previousNode.stage !== undefined) { + return { ...node, stage: previousNode.stage }; + } + } + return node; + }); + rebuildNodeMap(); + // TakeOrderedAndProjectExecTransformer and VeloxColumnarToRow: inherit from next node + nodes = nodes.map((node) => { + if ( + node.nodeName === "TakeOrderedAndProjectExecTransformer" || + node.nodeName === "VeloxColumnarToRow" + ) { + if (node.stage !== undefined) return node; const previousNode = findPreviousNode(node.nodeId); if (previousNode !== undefined && previousNode.stage !== undefined) { return { ...node, stage: previousNode.stage }; @@ -108,7 +128,7 @@ export function 
calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta return node; }); nodes = nodes.map((node) => { - if (node.nodeName === "AQEShuffleRead" || node.nodeName === "Coalesce" || + if (node.nodeName === "AQEShuffleRead" || node.nodeName === "Coalesce" || node.nodeName === "CoalesceExecTransformer" || node.nodeName === "BatchEvalPython" || node.nodeName === "DataFlintBatchEvalPython" || node.nodeName === "MapInPandas" || node.nodeName === "DataFlintMapInPandas" || node.nodeName === "MapInArrow" || node.nodeName === "PythonMapInArrow" || node.nodeName === "DataFlintMapInArrow" || @@ -116,7 +136,7 @@ export function calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta node.nodeName === "FlatMapGroupsInPandas" || node.nodeName === "DataFlintFlatMapGroupsInPandas" || node.nodeName === "FlatMapCoGroupsInPandas" || node.nodeName === "DataFlintFlatMapCoGroupsInPandas" || node.nodeName === "WindowInPandas" || node.nodeName === "DataFlintWindowInPandas" || node.nodeName === "DataFlintArrowWindowPython" || - node.nodeName === "Window" || node.nodeName === "DataFlintWindow") { + node.nodeName === "Window" || node.nodeName === "DataFlintWindow" || node.nodeName === "WindowExecTransformer") { const nextNode = findNextNode(node.nodeId); if (nextNode !== undefined && nextNode.stage !== undefined) { return { ...node, stage: nextNode.stage }; @@ -128,7 +148,7 @@ export function calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta nodes = nodes.map((node) => { // Convert Exchange nodes to exchange stage type if they have adjacent nodes with stage info // This handles both nodes without stage data and nodes with onestage type that should be exchange type - if (node.nodeName === "Exchange" && (node.stage === undefined || node.stage.type === "onestage")) { + if ((node.nodeName === "Exchange" || node.nodeName === "ColumnarExchange") && (node.stage === undefined || node.stage.type === "onestage")) { const nextNode = findNextNode(node.nodeId); const previousNode = findPreviousNode(node.nodeId); const metricsExchangeStageIds = findExchangeStageIds(node.metrics); @@ -202,7 +222,7 @@ export function calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta return node; }); nodes = nodes.map((node) => { - if (node.nodeName === "Window" && node.stage === undefined) { + if ((node.nodeName === "Window" || node.nodeName === "WindowExecTransformer") && node.stage === undefined) { // For Window nodes, try to find stage from next node first, then previous node const nextNode = findNextNode(node.nodeId); if (nextNode !== undefined && nextNode.stage !== undefined) { @@ -216,7 +236,7 @@ export function calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta return node; }); nodes = nodes.map((node) => { - if (node.nodeName === "Union" && node.stage === undefined) { + if ((node.nodeName === "Union" || node.nodeName === "ColumnarUnion") && node.stage === undefined) { const nextNode = findNextNode(node.nodeId); if (nextNode !== undefined && nextNode.stage !== undefined) { return { ...node, stage: nextNode.stage }; @@ -402,6 +422,18 @@ export function calculateSqlStage( } } + // Collect stage IDs that have WholeStageCodegenTransformer in their RDD data + const stageCodegenNames = new Map(); + for (const stage of sqlStages) { + if (stage.stagesRdd !== undefined) { + for (const value of Object.values(stage.stagesRdd)) { + if (typeof value === "string" && value.startsWith("WholeStageCodegenTransformer")) { + stageCodegenNames.set(stage.stageId, value); + } + } + } + } + const 
codegenNodes = sql.codegenNodes.map((node) => { const stageIdByName = rddValueToStageId.get(node.nodeName); const stageIdByRddScope = node.rddScopeId !== undefined ? rddKeyToStageId.get(node.rddScopeId) : undefined; @@ -411,6 +443,20 @@ export function calculateSqlStage( }; }); + // Fallback: AQE may renumber codegen IDs at runtime (e.g., plan has codegen (2) but + // the actual stage has codegen (3)). Match unmatched codegen nodes to unmatched stages + // by ordering. + const matchedStageIds = new Set(codegenNodes.filter(cg => cg.stage !== undefined).map(cg => cg.stage!.type === "onestage" ? cg.stage!.stageId : -1)); + const unmatchedCodegens = codegenNodes.filter(cg => cg.stage === undefined); + const unmatchedStages = Array.from(stageCodegenNames.keys()).filter(sid => !matchedStageIds.has(sid)).sort((a, b) => a - b); + + if (unmatchedCodegens.length > 0 && unmatchedStages.length > 0) { + const sortedUnmatched = [...unmatchedCodegens].sort((a, b) => (a.wholeStageCodegenId ?? 0) - (b.wholeStageCodegenId ?? 0)); + for (let i = 0; i < Math.min(sortedUnmatched.length, unmatchedStages.length); i++) { + sortedUnmatched[i].stage = stageDataFromStage(unmatchedStages[i], stages); + } + } + // Build codegen lookup map, excluding duplicate codegen IDs // If the same codegen ID appears multiple times, we can't reliably determine which stage it belongs to const codegenByWholeStageId = new Map(); @@ -485,7 +531,7 @@ export function calculateSqlStage( readArr.push(node); exchangeReadByStageId.set(node.stage.readStage, readArr); } - if (node.nodeName === "BroadcastExchange" && node?.stage?.type === "onestage") { + if ((node.nodeName === "BroadcastExchange" || node.nodeName === "ColumnarBroadcastExchange") && node?.stage?.type === "onestage") { const arr = broadcastByStageId.get(node.stage.stageId) ?? 
[]; arr.push(node); broadcastByStageId.set(node.stage.stageId, arr); diff --git a/spark-ui/src/reducers/SqlReducer.ts b/spark-ui/src/reducers/SqlReducer.ts index 9ee57a2a..27f95d28 100644 --- a/spark-ui/src/reducers/SqlReducer.ts +++ b/spark-ui/src/reducers/SqlReducer.ts @@ -103,22 +103,27 @@ export function parseNodePlan( case "HashAggregate": case "SortAggregate": case "ObjectHashAggregate": + case "FlushableHashAggregateExecTransformer": + case "RegularHashAggregateExecTransformer": return { type: "HashAggregate", plan: parseHashAggregate(plan.planDescription), }; case "TakeOrderedAndProject": + case "TakeOrderedAndProjectExecTransformer": return { type: "TakeOrderedAndProject", plan: parseTakeOrderedAndProject(plan.planDescription), }; case "CollectLimit": + case "ColumnarCollectLimit": return { type: "CollectLimit", plan: parseCollectLimit(plan.planDescription), }; case "Coalesce": + case "CoalesceExecTransformer": return { type: "Coalesce", plan: parseCoalesce(plan.planDescription), @@ -150,6 +155,7 @@ export function parseNodePlan( case "GpuFilter": case "CometFilter": case "Filter": + case "FilterExecTransformer": return { type: "Filter", plan: parseFilter(plan.planDescription), @@ -158,6 +164,8 @@ export function parseNodePlan( case "CometExchange": case "CometColumnarExchange": case "GpuColumnarExchange": + case "ColumnarExchange": + case "ColumnarBroadcastExchange": return { type: "Exchange", plan: parseExchange(plan.planDescription), @@ -166,6 +174,7 @@ export function parseNodePlan( case "GpuProject": case "CometFilter": case "Project": + case "ProjectExecTransformer": return { type: "Project", plan: parseProject(plan.planDescription), @@ -173,6 +182,7 @@ export function parseNodePlan( case "GpuSort": case "CometSort": case "Sort": + case "SortExecTransformer": return { type: "Sort", plan: parseSort(plan.planDescription), @@ -182,6 +192,7 @@ export function parseNodePlan( case "WindowInPandas": case "DataFlintWindowInPandas": case "DataFlintArrowWindowPython": + case "WindowExecTransformer": return { type: "Window", plan: parseWindow(plan.planDescription), @@ -204,11 +215,13 @@ export function parseNodePlan( plan: parseBatchEvalPython(plan.planDescription), }; case "Generate": + case "GenerateExecTransformer": return { type: "Generate", plan: parseGenerate(plan.planDescription), }; case "Expand": + case "ExpandExecTransformer": return { type: "Expand", plan: parseExpand(plan.planDescription), @@ -318,11 +331,43 @@ function calculateSql( function extractCodegenId(): number | undefined { return parseInt( - node.nodeName.replace("WholeStageCodegen (", "").replace(")", ""), + node.nodeName + .replace("WholeStageCodegenTransformer (", "") + .replace("WholeStageCodegen (", "") + .replace(")", ""), ); } }); + // For Gluten/Velox: WholeStageCodegenTransformer nodes are disconnected orphans in the graph + // and Spark doesn't set wholeStageCodegenId on their child nodes. Infer it from node ID ordering: + // a codegen node at ID X contains the pipeline nodes at IDs X+1, X+2, ... until hitting + // a stage boundary (exchange, AQE, scan) or another codegen node. 
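  // Worked example (from the gluten-sql4 test fixture): WholeStageCodegenTransformer (1)
  // at nodeId 9 covers nodeIds 10-14 (Project, FlushableHashAggregate, Project, Filter,
  // InputIteratorTransformer); RowToVeloxColumnar at nodeId 15 is a boundary and resets
  // the current codegen ID, so the Scan at nodeId 16 is left without one.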
+ const hasGlutenCodegen = typeEnrichedNodes.some( + (n) => n.isCodegenNode && n.nodeName.includes("Transformer"), + ); + if (hasGlutenCodegen) { + const stageBoundaryNames = new Set([ + "ColumnarExchange", "ColumnarBroadcastExchange", "Exchange", "BroadcastExchange", + "AQEShuffleRead", "VeloxResizeBatches", "RowToVeloxColumnar", "VeloxColumnarToRow", + "ColumnarCollectLimit", "AdaptiveSparkPlan", "ColumnarUnion", + ]); + const sorted = [...typeEnrichedNodes].sort((a, b) => a.nodeId - b.nodeId); + let currentCodegenId: number | undefined = undefined; + for (const node of sorted) { + if (node.isCodegenNode) { + currentCodegenId = node.wholeStageCodegenId; + } else if (stageBoundaryNames.has(node.nodeName) || node.nodeName.includes("Scan")) { + currentCodegenId = undefined; + } else if ( + currentCodegenId !== undefined && + node.wholeStageCodegenId === undefined + ) { + node.wholeStageCodegenId = currentCodegenId; + } + } + } + const onlyCodeGenNodes = typeEnrichedNodes .filter((node) => node.isCodegenNode) .map((node) => { @@ -626,8 +671,10 @@ function calcCodegenDuration(metrics: EnrichedSqlMetric[]): number | undefined { function calcExchangeMetrics(nodeName: string, metrics: EnrichedSqlMetric[]) { var exchangeMetrics: ExchangeMetrics | undefined = undefined; - if (nodeName == "Exchange") { - const writeDuration = getMetricDuration("shuffle write time", metrics) ?? 0; + if (nodeName === "Exchange" || nodeName === "ColumnarExchange") { + const writeDuration = + (getMetricDuration("shuffle write time", metrics) ?? 0) + + (getMetricDuration("shuffle wall time", metrics) ?? 0); const readDuration = (getMetricDuration("fetch wait time", metrics) ?? 0) + (getMetricDuration("remote reqs duration", metrics) ?? 0) + @@ -645,7 +692,7 @@ function calcBroadcastExchangeDuration( nodeName: string, metrics: EnrichedSqlMetric[], ): number | undefined { - if (nodeName == "BroadcastExchange") { + if (nodeName === "BroadcastExchange" || nodeName === "ColumnarBroadcastExchange") { const duration = getMetricDuration("time to broadcast", metrics) ?? 0; +(getMetricDuration("time to build", metrics) ?? 0) + (getMetricDuration("time to collect", metrics) ?? 
0); diff --git a/spark-ui/src/reducers/SqlReducerUtils.ts b/spark-ui/src/reducers/SqlReducerUtils.ts index 4e3b14f9..e0a80cf7 100644 --- a/spark-ui/src/reducers/SqlReducerUtils.ts +++ b/spark-ui/src/reducers/SqlReducerUtils.ts @@ -34,7 +34,14 @@ const metricAllowlist: Record> = { "total number of files merged by ZOrderBy", "total bytes in files merged by ZOrderBy", ], - join: ["number of output rows", "output columnar batches"], + join: [ + "number of output rows", + "output columnar batches", + "number of hash build input rows", + "number of hash probe input rows", + "time of hash build", + "time of hash probe", + ], transformation: [ "number of output rows", "output columnar batches", @@ -42,6 +49,13 @@ const metricAllowlist: Record> = { "data sent to Python workers", "data returned from Python workers", "duration", + "number of input rows", + "time of aggregation", + "time of filter", + "time of window", + "time of generate", + "number of spilled bytes", + "peak memory bytes", ], shuffle: [ "number of partitions", @@ -57,10 +71,19 @@ const metricAllowlist: Record> = { "remote bytes read", "fetch wait time", "data size", + "number of input rows", + "number of input batches", + "number of output batches", ], broadcast: ["number of output rows", "data size", "output columnar batches"], - sort: ["spill size", "output columnar batches"], + sort: [ + "spill size", + "output columnar batches", + "time of sort", + "number of spilled bytes", + "peak memory bytes", + ], other: [], }; @@ -86,6 +109,19 @@ const metricsValueTransformer: Record< "remote bytes read": extractTotalFromStatisticsMetric, "fetch wait time": extractTotalFromStatisticsMetric, "data size": extractTotalFromStatisticsMetric, + "time of aggregation": extractTotalFromStatisticsMetric, + "time of filter": extractTotalFromStatisticsMetric, + "time of sort": extractTotalFromStatisticsMetric, + "time of window": extractTotalFromStatisticsMetric, + "time of generate": extractTotalFromStatisticsMetric, + "time of hash build": extractTotalFromStatisticsMetric, + "time of hash probe": extractTotalFromStatisticsMetric, + "number of spilled bytes": (value: string) => { + const total = extractTotalFromStatisticsMetric(value); + if (total === undefined || total === "0.0 B" || total === "0 B") return undefined; + return total; + }, + "peak memory bytes": extractTotalFromStatisticsMetric, "number of dynamic part": (value: string) => { // if dynamic part is 0 we want to remove it from metrics if (value === "0") { @@ -134,6 +170,20 @@ const metricsRenamer: Record = { "number of read streams": "number of read streams", "parsing time for BQ": "parsing time", "number of BQ bytes read": "bytes read", + "number of input rows": "input rows", + "number of input batches": "input batches", + "number of output batches": "output batches", + "number of hash build input rows": "build input rows", + "number of hash probe input rows": "probe input rows", + "time of aggregation": "aggregation time", + "time of filter": "filter time", + "time of sort": "sort time", + "time of window": "window time", + "time of generate": "generate time", + "time of hash build": "hash build time", + "time of hash probe": "hash probe time", + "number of spilled bytes": "spill", + "peak memory bytes": "peak memory", }; const nodeTypeDict: Record = { @@ -220,6 +270,31 @@ const nodeTypeDict: Record = { DataFlintWindow: "transformation", Generate: "transformation", Expand: "transformation", + FilterExecTransformer: "transformation", + ProjectExecTransformer: "transformation", + 
FlushableHashAggregateExecTransformer: "transformation", + RegularHashAggregateExecTransformer: "transformation", + SortExecTransformer: "sort", + BroadcastHashJoinExecTransformer: "join", + ShuffledHashJoinExecTransformer: "join", + SortMergeJoinExecTransformer: "join", + ColumnarExchange: "shuffle", + ColumnarBroadcastExchange: "broadcast", + WindowExecTransformer: "transformation", + GenerateExecTransformer: "transformation", + TakeOrderedAndProjectExecTransformer: "output", + ColumnarCollectLimit: "output", + ColumnarUnion: "join", + VeloxColumnarToRow: "other", + RowToVeloxColumnar: "other", + VeloxResizeBatches: "other", + InputIteratorTransformer: "other", + BatchScanExecTransformer: "input", + FileSourceScanExecTransformer: "input", + ExpandExecTransformer: "transformation", + CoalesceExecTransformer: "shuffle", + LimitTransformer: "output", + CartesianProductExecTransformer: "join", }; const nodeRenamerDict: Record = { @@ -311,6 +386,31 @@ const nodeRenamerDict: Record = { DataFlintWindowInPandas: "Window (with Pandas UDF)", DataFlintArrowWindowPython: "Window (with Arrow UDF)", Expand: "Expand", + FilterExecTransformer: "Filter (Velox)", + ProjectExecTransformer: "Select (Velox)", + FlushableHashAggregateExecTransformer: "Aggregate Within Partition (Velox)", + RegularHashAggregateExecTransformer: "Aggregate By Merge (Velox)", + SortExecTransformer: "Sort (Velox)", + BroadcastHashJoinExecTransformer: "Join (Broadcast Hash) (Velox)", + ShuffledHashJoinExecTransformer: "Join (Shuffled Hash) (Velox)", + SortMergeJoinExecTransformer: "Join (Sort Merge) (Velox)", + ColumnarExchange: "Repartition (Velox)", + ColumnarBroadcastExchange: "Broadcast (Velox)", + WindowExecTransformer: "Window (Velox)", + GenerateExecTransformer: "Generate (Velox)", + TakeOrderedAndProjectExecTransformer: "Take Ordered (Velox)", + ColumnarCollectLimit: "Collect (Velox)", + ColumnarUnion: "Union (Velox)", + VeloxColumnarToRow: "Columnar To Row", + RowToVeloxColumnar: "Row To Columnar", + VeloxResizeBatches: "Resize Batches", + InputIteratorTransformer: "Input Iterator", + BatchScanExecTransformer: "Read (Velox)", + FileSourceScanExecTransformer: "Read (Velox)", + ExpandExecTransformer: "Expand (Velox)", + CoalesceExecTransformer: "Coalesce (Velox)", + LimitTransformer: "Limit (Velox)", + CartesianProductExecTransformer: "Join (Cartesian Product) (Velox)", }; export function extractTotalFromStatisticsMetric( @@ -579,7 +679,9 @@ export const EXCHANGE_NODE_TYPES = [ "CometColumnarExchange", "PhotonBroadcastExchange", "PhotonShuffleExchangeSink", - "PhotonShuffleExchangeSource" + "PhotonShuffleExchangeSource", + "ColumnarExchange", + "ColumnarBroadcastExchange", ]; /** @@ -663,6 +765,8 @@ export const AGGREGATE_NODE_NAMES = [ "HashAggregate", "SortAggregate", "ObjectHashAggregate", + "FlushableHashAggregateExecTransformer", + "RegularHashAggregateExecTransformer", ]; /** @@ -673,3 +777,57 @@ export const AGGREGATE_NODE_NAMES = [ export function isAggregateNode(nodeName: string): boolean { return AGGREGATE_NODE_NAMES.includes(nodeName); } + +export type AcceleratorType = "velox" | "photon" | "rapids" | "comet" | undefined; + +export interface AcceleratorInfo { + type: AcceleratorType; + label: string; + tooltip: string; + gradientFrom: string; + gradientTo: string; +} + +const ACCELERATOR_MAP: Record = {}; + +const VELOX_INFO: AcceleratorInfo = { type: "velox", label: "Velox", tooltip: "Accelerated by Apache Gluten (Velox native engine)", gradientFrom: "#e65100", gradientTo: "#ff6d00" }; +const PHOTON_INFO: 
AcceleratorInfo = { type: "photon", label: "Photon", tooltip: "Accelerated by Databricks Photon engine", gradientFrom: "#6a1b9a", gradientTo: "#ab47bc" }; +const RAPIDS_INFO: AcceleratorInfo = { type: "rapids", label: "RAPIDS", tooltip: "Accelerated by NVIDIA RAPIDS GPU engine", gradientFrom: "#1b5e20", gradientTo: "#43a047" }; +const COMET_INFO: AcceleratorInfo = { type: "comet", label: "DataFusion", tooltip: "Accelerated by Apache DataFusion Comet engine", gradientFrom: "#01579b", gradientTo: "#0288d1" }; + +[ + "FilterExecTransformer", "ProjectExecTransformer", + "FlushableHashAggregateExecTransformer", "RegularHashAggregateExecTransformer", + "SortExecTransformer", "BroadcastHashJoinExecTransformer", + "ShuffledHashJoinExecTransformer", "SortMergeJoinExecTransformer", + "WindowExecTransformer", "GenerateExecTransformer", + "TakeOrderedAndProjectExecTransformer", "ColumnarCollectLimit", + "ColumnarExchange", "ColumnarBroadcastExchange", "ColumnarUnion", + "ExpandExecTransformer", "CoalesceExecTransformer", "LimitTransformer", + "CartesianProductExecTransformer", "BatchScanExecTransformer", "FileSourceScanExecTransformer", +].forEach(n => ACCELERATOR_MAP[n] = VELOX_INFO); + +[ + "PhotonProject", "PhotonGroupingAgg", "PhotonShuffleExchangeSink", + "PhotonShuffleExchangeSource", "PhotonTopK", "PhotonFilter", + "PhotonBroadcastExchange", "PhotonBroadcastHashJoin", +].forEach(n => ACCELERATOR_MAP[n] = PHOTON_INFO); + +[ + "GpuFilter", "GpuBroadcastHashJoin", "GpuCoalesceBatches", + "GpuBroadcastExchange", "GpuProject", "GpuHashAggregate", + "GpuColumnarExchange", "GpuCustomShuffleReader", "GpuTopN", + "GpuShuffleCoalesce", "GpuSort", "GpuShuffledSymmetricHashJoin", + "GpuBroadcastNestedLoopJoin", +].forEach(n => ACCELERATOR_MAP[n] = RAPIDS_INFO); + +[ + "CometColumnarExchange", "CometHashAggregate", "CometExchange", + "CometProject", "CometFilter", "CometSort", + "CometHashJoin", "CometBroadcastHashJoin", "CometSortMergeJoin", +].forEach(n => ACCELERATOR_MAP[n] = COMET_INFO); + +export function getNodeAccelerator(nodeName: string): AcceleratorInfo | undefined { + return ACCELERATOR_MAP[nodeName]; +} + diff --git a/spark-ui/src/reducers/__tests__/GlutenStageAssignment.spec.ts b/spark-ui/src/reducers/__tests__/GlutenStageAssignment.spec.ts new file mode 100644 index 00000000..0929fe20 --- /dev/null +++ b/spark-ui/src/reducers/__tests__/GlutenStageAssignment.spec.ts @@ -0,0 +1,157 @@ +import fixture from "./gluten-sql4-fixture.json"; +import { EnrichedSparkSQL, EnrichedSqlEdge, EnrichedSqlNode, SparkStagesStore } from "../../interfaces/AppStore"; +import { calcNodeType } from "../SqlReducerUtils"; +import { calculateSqlStage } from "../SQLNodeStageReducer"; + +function buildEnrichedSql(): { sql: EnrichedSparkSQL; stages: SparkStagesStore; jobs: typeof fixture.jobs } { + const stageBoundaryNames = new Set([ + "ColumnarExchange", "ColumnarBroadcastExchange", "Exchange", "BroadcastExchange", + "AQEShuffleRead", "VeloxResizeBatches", "RowToVeloxColumnar", "VeloxColumnarToRow", + "ColumnarCollectLimit", "AdaptiveSparkPlan", "ColumnarUnion", + ]); + + // Step 1: Enrich nodes with type and wholeStageCodegenId (mimics calculateSql) + const rawNodes = fixture.sql.nodes.map((node) => { + const type = calcNodeType(node.nodeName); + const isCodegenNode = node.nodeName.includes("WholeStageCodegen"); + let wholeStageCodegenId: number | undefined = undefined; + if (isCodegenNode) { + wholeStageCodegenId = parseInt( + node.nodeName + .replace("WholeStageCodegenTransformer (", "") + .replace("WholeStageCodegen 
(", "") + .replace(")", ""), + ); + } + return { + ...node, + type, + isCodegenNode, + wholeStageCodegenId, + enrichedName: node.nodeName, + metrics: node.metrics.map(m => ({ ...m, stageId: undefined as number | undefined })), + } as unknown as EnrichedSqlNode; + }); + + // Step 2: Gluten codegen ID inference (mimics the logic in SqlReducer.ts) + const hasGlutenCodegen = rawNodes.some(n => n.isCodegenNode && n.nodeName.includes("Transformer")); + if (hasGlutenCodegen) { + const sorted = [...rawNodes].sort((a, b) => a.nodeId - b.nodeId); + let currentCodegenId: number | undefined = undefined; + for (const node of sorted) { + if (node.isCodegenNode) { + currentCodegenId = node.wholeStageCodegenId; + } else if (stageBoundaryNames.has(node.nodeName) || node.nodeName.includes("Scan")) { + currentCodegenId = undefined; + } else if (currentCodegenId !== undefined && node.wholeStageCodegenId === undefined) { + (node as any).wholeStageCodegenId = currentCodegenId; + } + } + } + + // Step 3: Separate codegen vs graph nodes + const codegenNodes = rawNodes + .filter(n => n.isCodegenNode) + .map(n => ({ ...n, codegenDuration: undefined as number | undefined, nodeIdFromMetrics: undefined as number | undefined })); + const graphNodes = rawNodes.filter(n => !n.isCodegenNode); + + // Mark last visible node as output if none exists + const hasOutput = graphNodes.some(n => n.type === "output"); + if (!hasOutput) { + const filtered = graphNodes.filter(n => n.nodeName !== "AdaptiveSparkPlan" && n.nodeName !== "ResultQueryStage"); + if (filtered.length > 0) { + filtered[filtered.length - 1].type = "output"; + } + } + + const edges: EnrichedSqlEdge[] = fixture.sql.edges.map(e => ({ fromId: e.fromId, toId: e.toId })); + + // Build stages store + const stages: SparkStagesStore = fixture.stages.map(s => ({ + stageId: s.stageId, + attemptId: s.attemptId, + name: "", + status: s.status, + numTasks: s.numTasks, + completedTasks: s.numCompleteTasks, + failedTasks: s.numFailedTasks, + activeTasks: s.numActiveTasks, + pendingTasks: s.numTasks - s.numCompleteTasks - s.numFailedTasks - s.numActiveTasks, + stageRealTimeDurationMs: undefined, + stagesRdd: fixture.stagesRdd[String(s.stageId) as keyof typeof fixture.stagesRdd], + durationDistribution: [0, 0, 0, 0, 0], + outputDistribution: [0, 0, 0, 0, 0], + outputRowsDistribution: [0, 0, 0, 0, 0], + inputDistribution: [0, 0, 0, 0, 0], + inputRowsDistribution: [0, 0, 0, 0, 0], + spillDiskDistriution: [0, 0, 0, 0, 0], + shuffleReadDistribution: [0, 0, 0, 0, 0], + shuffleWriteDistribution: [0, 0, 0, 0, 0], + stageProgress: 100, + metrics: { executorRunTime: s.executorRunTime }, + } as any)); + + const sql: EnrichedSparkSQL = { + id: fixture.sql.id, + description: fixture.sql.description, + successJobIds: fixture.sql.successJobIds, + runningJobIds: fixture.sql.runningJobIds, + failedJobIds: fixture.sql.failedJobIds, + nodes: graphNodes, + edges, + codegenNodes, + metricUpdateId: "test", + } as any; + + return { sql, stages, jobs: fixture.jobs }; +} + +describe("Gluten/Velox Stage Assignment - SQL 4 (Sort by line count)", () => { + it("should assign correct stages to all nodes", () => { + const { sql, stages, jobs } = buildEnrichedSql(); + + const result = calculateSqlStage(sql, stages, jobs as any); + + // Debug: print all node stages + for (const node of result.nodes) { + const stageInfo = node.stage + ? node.stage.type === "onestage" ? `onestage:${(node.stage as any).stageId}` + : node.stage.type === "exchange" ? 
`exchange:w=${(node.stage as any).writeStage},r=${(node.stage as any).readStage}` + : `${node.stage.type}` + : "NONE"; + console.log(` Node ${node.nodeId}: ${node.nodeName.padEnd(45)} stage=${stageInfo} wcid=${node.wholeStageCodegenId}`); + } + + // Pre-shuffle nodes should be in stage 8 + const scanNode = result.nodes.find(n => n.nodeName === "Scan csv"); + expect(scanNode?.stage?.type).toBe("onestage"); + expect((scanNode?.stage as any)?.stageId).toBe(8); + + const filterNode = result.nodes.find(n => n.nodeName === "FilterExecTransformer"); + expect(filterNode?.stage?.type).toBe("onestage"); + expect((filterNode?.stage as any)?.stageId).toBe(8); + + const flushableAgg = result.nodes.find(n => n.nodeName === "FlushableHashAggregateExecTransformer"); + expect(flushableAgg?.stage?.type).toBe("onestage"); + expect((flushableAgg?.stage as any)?.stageId).toBe(8); + + // ColumnarExchange should be split: write=8, read=10 + const exchange = result.nodes.find(n => n.nodeName === "ColumnarExchange"); + expect(exchange?.stage?.type).toBe("exchange"); + expect((exchange?.stage as any)?.writeStage).toBe(8); + expect((exchange?.stage as any)?.readStage).toBe(10); + + // Post-shuffle nodes should be in stage 10 + const aqeRead = result.nodes.find(n => n.nodeName === "AQEShuffleRead"); + expect(aqeRead?.stage?.type).toBe("onestage"); + expect((aqeRead?.stage as any)?.stageId).toBe(10); + + const regularAgg = result.nodes.find(n => n.nodeName === "RegularHashAggregateExecTransformer"); + expect(regularAgg?.stage?.type).toBe("onestage"); + expect((regularAgg?.stage as any)?.stageId).toBe(10); + + const takeOrdered = result.nodes.find(n => n.nodeName === "TakeOrderedAndProjectExecTransformer"); + expect(takeOrdered?.stage?.type).toBe("onestage"); + expect((takeOrdered?.stage as any)?.stageId).toBe(10); + }); +}); diff --git a/spark-ui/src/reducers/__tests__/gluten-sql4-fixture.json b/spark-ui/src/reducers/__tests__/gluten-sql4-fixture.json new file mode 100644 index 00000000..b8506ce0 --- /dev/null +++ b/spark-ui/src/reducers/__tests__/gluten-sql4-fixture.json @@ -0,0 +1,744 @@ +{ + "sql": { + "id": "4", + "description": "Sort by line count", + "successJobIds": [ + 7, + 8 + ], + "runningJobIds": [], + "failedJobIds": [], + "nodes": [ + { + "nodeId": 16, + "nodeName": "Scan csv", + "metrics": [ + { + "name": "number of output rows", + "value": "111,389" + }, + { + "name": "number of files read", + "value": "1" + }, + { + "name": "metadata time", + "value": "0 ms" + }, + { + "name": "size of files read", + "value": "9.7 MiB" + } + ] + }, + { + "nodeId": 15, + "nodeName": "RowToVeloxColumnar", + "metrics": [ + { + "name": "number of input rows", + "value": "111,389" + }, + { + "name": "number of output batches", + "value": "29" + }, + { + "name": "time to convert", + "value": "total (min, med, max (stageId: taskId))\n43 ms (7 ms, 17 ms, 19 ms (stage 8.0: task 14))" + } + ] + }, + { + "nodeId": 14, + "nodeName": "InputIteratorTransformer", + "metrics": [ + { + "name": "cpu wall time count", + "value": "70" + }, + { + "name": "time of operator input", + "value": "total (min, med, max (stageId: taskId))\n153 ms (34 ms, 59 ms, 60 ms (stage 8.0: task 13))" + }, + { + "name": "number of output rows", + "value": "111,389" + }, + { + "name": "number of output vectors", + "value": "29" + } + ] + }, + { + "nodeId": 13, + "nodeName": "FilterExecTransformer", + "metrics": [ + { + "name": "time of filter", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + }, + { + "name": 
"number of output bytes", + "value": "total (min, med, max (stageId: taskId))\n7.3 MiB (1388.7 KiB, 3.0 MiB, 3.0 MiB (stage 8.0: task 14))" + }, + { + "name": "cpu wall time count", + "value": "0" + }, + { + "name": "number of output vectors", + "value": "29" + }, + { + "name": "peak memory bytes", + "value": "total (min, med, max (stageId: taskId))\n0.0 B (0.0 B, 0.0 B, 0.0 B (stage 8.0: task 15))" + }, + { + "name": "number of output rows", + "value": "111,389" + }, + { + "name": "number of memory allocations", + "value": "0" + }, + { + "name": "time of loading lazy vectors", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + } + ] + }, + { + "nodeId": 12, + "nodeName": "ProjectExecTransformer", + "metrics": [ + { + "name": "time of project", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 13))" + }, + { + "name": "number of output bytes", + "value": "total (min, med, max (stageId: taskId))\n7.3 MiB (1388.7 KiB, 3.0 MiB, 3.0 MiB (stage 8.0: task 14))" + }, + { + "name": "cpu wall time count", + "value": "262" + }, + { + "name": "number of output vectors", + "value": "29" + }, + { + "name": "peak memory bytes", + "value": "total (min, med, max (stageId: taskId))\n0.0 B (0.0 B, 0.0 B, 0.0 B (stage 8.0: task 15))" + }, + { + "name": "number of output rows", + "value": "111,389" + }, + { + "name": "number of memory allocations", + "value": "0" + }, + { + "name": "time of loading lazy vectors", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + } + ] + }, + { + "nodeId": 11, + "nodeName": "FlushableHashAggregateExecTransformer", + "metrics": [ + { + "name": "number of final output vectors", + "value": "0" + }, + { + "name": "time of extraction", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 13))" + }, + { + "name": "rowConstruction cpu wall time count", + "value": "0" + }, + { + "name": "number of memory allocations", + "value": "97" + }, + { + "name": "number of output vectors", + "value": "3" + }, + { + "name": "number of spilled bytes", + "value": "total (min, med, max (stageId: taskId))\n0.0 B (0.0 B, 0.0 B, 0.0 B (stage 8.0: task 15))" + }, + { + "name": "number of final output rows", + "value": "0" + }, + { + "name": "bloom filter blocks byte size", + "value": "0.0 B" + }, + { + "name": "number of output rows", + "value": "1,338" + }, + { + "name": "number of pushdown aggregations", + "value": "0" + }, + { + "name": "number of output bytes", + "value": "total (min, med, max (stageId: taskId))\n1437.8 KiB (479.3 KiB, 479.3 KiB, 479.3 KiB (stage 8.0: task 15))" + }, + { + "name": "number of spilled files", + "value": "0" + }, + { + "name": "time of aggregation", + "value": "total (min, med, max (stageId: taskId))\n4 ms (0 ms, 1 ms, 1 ms (stage 8.0: task 13))" + }, + { + "name": "peak memory bytes", + "value": "total (min, med, max (stageId: taskId))\n3.6 MiB (1176.1 KiB, 1216.1 KiB, 1260.1 KiB (stage 8.0: task 14))" + }, + { + "name": "number of spilled rows", + "value": "0" + }, + { + "name": "cpu wall time count", + "value": "254" + }, + { + "name": "number of spilled partitions", + "value": "0" + }, + { + "name": "time of loading lazy vectors", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + }, + { + "name": "time of rowConstruction", + "value": "0 ms" + }, + { + "name": "number of flushed rows", + "value": "0" + }, + { + "name": "extraction cpu 
wall time count", + "value": "113" + } + ] + }, + { + "nodeId": 10, + "nodeName": "ProjectExecTransformer", + "metrics": [ + { + "name": "time of project", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 13))" + }, + { + "name": "number of output bytes", + "value": "total (min, med, max (stageId: taskId))\n1444.5 KiB (480.2 KiB, 482.2 KiB, 482.2 KiB (stage 8.0: task 14))" + }, + { + "name": "cpu wall time count", + "value": "84" + }, + { + "name": "number of output vectors", + "value": "3" + }, + { + "name": "peak memory bytes", + "value": "total (min, med, max (stageId: taskId))\n7.0 KiB (1024.0 B, 3.0 KiB, 3.0 KiB (stage 8.0: task 14))" + }, + { + "name": "number of output rows", + "value": "1,338" + }, + { + "name": "number of memory allocations", + "value": "3" + }, + { + "name": "time of loading lazy vectors", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + } + ] + }, + { + "nodeId": 9, + "nodeName": "WholeStageCodegenTransformer (1)", + "metrics": [ + { + "name": "duration", + "value": "total (min, med, max (stageId: taskId))\n184 ms (43 ms, 70 ms, 71 ms (stage 8.0: task 13))" + } + ] + }, + { + "nodeId": 8, + "nodeName": "VeloxResizeBatches", + "metrics": [ + { + "name": "number of output batches", + "value": "3" + }, + { + "name": "number of input rows", + "value": "1,338" + }, + { + "name": "time to convert batches", + "value": "total (min, med, max (stageId: taskId))\n1 ms (0 ms, 0 ms, 1 ms (stage 8.0: task 15))" + }, + { + "name": "number of input batches", + "value": "3" + }, + { + "name": "number of output rows", + "value": "1,338" + } + ] + }, + { + "nodeId": 7, + "nodeName": "ColumnarExchange", + "metrics": [ + { + "name": "shuffle records written", + "value": "1,338" + }, + { + "name": "local merged chunks fetched", + "value": "0" + }, + { + "name": "shuffle write time", + "value": "total (min, med, max (stageId: taskId))\n1 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 14))" + }, + { + "name": "remote merged bytes read", + "value": "0.0 B" + }, + { + "name": "time to compress", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + }, + { + "name": "local merged blocks fetched", + "value": "0" + }, + { + "name": "time to split", + "value": "total (min, med, max (stageId: taskId))\n8 ms (2 ms, 2 ms, 3 ms (stage 8.0: task 13))" + }, + { + "name": "corrupt merged block chunks", + "value": "0" + }, + { + "name": "shuffle wall time", + "value": "total (min, med, max (stageId: taskId))\n9 ms (2 ms, 2 ms, 4 ms (stage 8.0: task 13))" + }, + { + "name": "number of input rows", + "value": "1,338" + }, + { + "name": "time to decompress", + "value": "0 ms" + }, + { + "name": "remote merged reqs duration", + "value": "0 ms" + }, + { + "name": "remote merged blocks fetched", + "value": "0" + }, + { + "name": "time to spill", + "value": "total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 8.0: task 15))" + }, + { + "name": "records read", + "value": "1,338" + }, + { + "name": "local bytes read", + "value": "142.9 KiB" + }, + { + "name": "dictionary size", + "value": "total (min, med, max (stageId: taskId))\n0.0 B (0.0 B, 0.0 B, 0.0 B (stage 8.0: task 15))" + }, + { + "name": "fetch wait time", + "value": "0 ms" + }, + { + "name": "remote bytes read", + "value": "0.0 B" + }, + { + "name": "time to deserialize", + "value": "0 ms" + }, + { + "name": "merged fetch fallback count", + "value": "0" + }, + { + "name": "avg read batch num 
rows", + "value": "2.5" + }, + { + "name": "batches read", + "value": "515" + }, + { + "name": "shuffle bytes spilled", + "value": "total (min, med, max (stageId: taskId))\n0.0 B (0.0 B, 0.0 B, 0.0 B (stage 8.0: task 15))" + }, + { + "name": "number of input batches", + "value": "3" + }, + { + "name": "avg dictionary fields", + "value": "0" + }, + { + "name": "number of output rows", + "value": "1,338" + }, + { + "name": "local blocks read", + "value": "3" + }, + { + "name": "remote merged chunks fetched", + "value": "0" + }, + { + "name": "remote blocks read", + "value": "0" + }, + { + "name": "data size", + "value": "total (min, med, max (stageId: taskId))\n81.6 KiB (12.6 KiB, 33.0 KiB, 35.9 KiB (stage 8.0: task 13))" + }, + { + "name": "local merged bytes read", + "value": "0.0 B" + }, + { + "name": "peak bytes allocated", + "value": "total (min, med, max (stageId: taskId))\n128.8 MiB (35.3 MiB, 46.1 MiB, 47.4 MiB (stage 8.0: task 13))" + }, + { + "name": "number of partitions", + "value": "200" + }, + { + "name": "remote reqs duration", + "value": "0 ms" + }, + { + "name": "remote bytes read to disk", + "value": "0.0 B" + }, + { + "name": "shuffle bytes written", + "value": "total (min, med, max (stageId: taskId))\n142.9 KiB (29.1 KiB, 55.2 KiB, 58.6 KiB (stage 8.0: task 13))" + } + ] + }, + { + "nodeId": 6, + "nodeName": "AQEShuffleRead", + "metrics": [ + { + "name": "number of partitions", + "value": "1" + }, + { + "name": "partition data size", + "value": "149.6 KiB" + }, + { + "name": "number of coalesced partitions", + "value": "1" + } + ] + }, + { + "nodeId": 5, + "nodeName": "InputIteratorTransformer", + "metrics": [ + { + "name": "cpu wall time count", + "value": "1,034" + }, + { + "name": "time of operator input", + "value": "3 ms" + }, + { + "name": "number of output rows", + "value": "1,338" + }, + { + "name": "number of output vectors", + "value": "515" + } + ] + }, + { + "nodeId": 4, + "nodeName": "RegularHashAggregateExecTransformer", + "metrics": [ + { + "name": "number of final output vectors", + "value": "0" + }, + { + "name": "time of extraction", + "value": "0 ms" + }, + { + "name": "rowConstruction cpu wall time count", + "value": "4,130" + }, + { + "name": "number of memory allocations", + "value": "41" + }, + { + "name": "number of output vectors", + "value": "1" + }, + { + "name": "number of spilled bytes", + "value": "0.0 B" + }, + { + "name": "number of final output rows", + "value": "0" + }, + { + "name": "bloom filter blocks byte size", + "value": "0.0 B" + }, + { + "name": "number of output rows", + "value": "1,326" + }, + { + "name": "number of pushdown aggregations", + "value": "0" + }, + { + "name": "number of output bytes", + "value": "431.3 KiB" + }, + { + "name": "number of spilled files", + "value": "0" + }, + { + "name": "time of aggregation", + "value": "2 ms" + }, + { + "name": "peak memory bytes", + "value": "1004.6 KiB" + }, + { + "name": "number of spilled rows", + "value": "0" + }, + { + "name": "cpu wall time count", + "value": "3,622" + }, + { + "name": "number of spilled partitions", + "value": "0" + }, + { + "name": "time of loading lazy vectors", + "value": "0 ms" + }, + { + "name": "time of rowConstruction", + "value": "2 ms" + }, + { + "name": "number of flushed rows", + "value": "0" + }, + { + "name": "extraction cpu wall time count", + "value": "0" + } + ] + }, + { + "nodeId": 3, + "nodeName": "WholeStageCodegenTransformer (2)", + "metrics": [] + }, + { + "nodeId": 2, + "nodeName": "TakeOrderedAndProjectExecTransformer", + "metrics": 
[] + }, + { + "nodeId": 1, + "nodeName": "VeloxColumnarToRow", + "metrics": [ + { + "name": "number of output rows", + "value": "21" + }, + { + "name": "number of input batches", + "value": "1" + }, + { + "name": "time to convert", + "value": "0 ms" + } + ] + }, + { + "nodeId": 0, + "nodeName": "AdaptiveSparkPlan", + "metrics": [] + } + ], + "edges": [ + { + "fromId": 1, + "toId": 0 + }, + { + "fromId": 2, + "toId": 1 + }, + { + "fromId": 4, + "toId": 2 + }, + { + "fromId": 5, + "toId": 4 + }, + { + "fromId": 6, + "toId": 5 + }, + { + "fromId": 7, + "toId": 6 + }, + { + "fromId": 8, + "toId": 7 + }, + { + "fromId": 10, + "toId": 8 + }, + { + "fromId": 11, + "toId": 10 + }, + { + "fromId": 12, + "toId": 11 + }, + { + "fromId": 13, + "toId": 12 + }, + { + "fromId": 14, + "toId": 13 + }, + { + "fromId": 15, + "toId": 14 + }, + { + "fromId": 16, + "toId": 15 + } + ] + }, + "jobs": [ + { + "jobId": 8, + "stageIds": [ + 9, + 10 + ] + }, + { + "jobId": 7, + "stageIds": [ + 8 + ] + } + ], + "stages": [ + { + "stageId": 10, + "status": "COMPLETE", + "numTasks": 1, + "numCompleteTasks": 1, + "numFailedTasks": 0, + "numActiveTasks": 0, + "executorRunTime": 14, + "attemptId": 0 + }, + { + "stageId": 8, + "status": "COMPLETE", + "numTasks": 3, + "numCompleteTasks": 3, + "numFailedTasks": 0, + "numActiveTasks": 0, + "executorRunTime": 203, + "attemptId": 0 + } + ], + "stagesRdd": { + "8": { + "100": "Scan csv ", + "90": "ColumnarExchange", + "99": "RowToVeloxColumnar", + "92": "WholeStageCodegenTransformer (1)", + "91": "VeloxResizeBatches" + }, + "9": {}, + "10": { + "107": "AQEShuffleRead", + "108": "WholeStageCodegenTransformer (3)", + "101": "VeloxColumnarToRow", + "115": "mapPartitionsInternal" + } + } +} \ No newline at end of file From 1ef4c56c98f0a5b55c816f756ae8b77659ae519a Mon Sep 17 00:00:00 2001 From: menishmueli Date: Mon, 13 Apr 2026 21:33:10 -0400 Subject: [PATCH 2/2] Fix Comet/DataFusion exchange split and aggregate enrichment - Add CometExchange, CometColumnarExchange, GpuColumnarExchange to exchange visual split, stage assignment, and shuffle metrics calculation - Add CometHashAggregate to aggregate node parsing and naming - Support Comet plan description format (Keys:/Functions:) in parser - Re-add fallback plan description parsing from SQL-level planDescription for native engines where DataFlint custom endpoint returns empty --- .../example/DataFusionCometExample.scala | 2 +- .../components/SqlFlow/SqlLayoutService.ts | 4 +- .../PlanParsers/hashAggregateParser.ts | 4 +- spark-ui/src/reducers/SQLNodeStageReducer.ts | 4 +- spark-ui/src/reducers/SqlReducer.ts | 72 ++++++++++++++++++- spark-ui/src/reducers/SqlReducerUtils.ts | 1 + 6 files changed, 80 insertions(+), 7 deletions(-) diff --git a/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala b/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala index 0823dbd6..38e3d072 100644 --- a/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala +++ b/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala @@ -43,7 +43,7 @@ object DataFusionCometExample extends App { println(s"number of unique words : $uniqueWords") - spark.read.load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales").filter($"ss_quantity" > 1).count() +// spark.read.load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales").filter($"ss_quantity" > 1).count() scala.io.StdIn.readLine() 
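
The same five exchange node names are special-cased at three separate sites in this patch (SqlLayoutService.ts, SQLNodeStageReducer.ts, and calcExchangeMetrics in SqlReducer.ts). A minimal TypeScript sketch of the shared predicate those sites inline; EXCHANGE_LIKE_NODE_NAMES and isExchangeLikeNode are hypothetical names, not part of the patch:

const EXCHANGE_LIKE_NODE_NAMES: ReadonlySet<string> = new Set([
  "Exchange",              // vanilla Spark shuffle
  "ColumnarExchange",      // Gluten/Velox
  "CometExchange",         // Comet (DataFusion)
  "CometColumnarExchange", // Comet (DataFusion)
  "GpuColumnarExchange",   // RAPIDS
]);

// True when a plan node marks a shuffle boundary in any of the supported engines.
function isExchangeLikeNode(nodeName: string): boolean {
  return EXCHANGE_LIKE_NODE_NAMES.has(nodeName);
}
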
spark.stop() diff --git a/spark-ui/src/components/SqlFlow/SqlLayoutService.ts b/spark-ui/src/components/SqlFlow/SqlLayoutService.ts index b1276a01..beaa3c8d 100644 --- a/spark-ui/src/components/SqlFlow/SqlLayoutService.ts +++ b/spark-ui/src/components/SqlFlow/SqlLayoutService.ts @@ -593,7 +593,9 @@ class SqlLayoutService { const splitExchangeNodeIds = new Set(); for (const nodeId of nodesIds) { const node = nodeMap.get(nodeId); - if (node?.nodeName === "Exchange" || node?.nodeName === "ColumnarExchange") { + if (node?.nodeName === "Exchange" || node?.nodeName === "ColumnarExchange" || + node?.nodeName === "CometExchange" || node?.nodeName === "CometColumnarExchange" || + node?.nodeName === "GpuColumnarExchange") { splitExchangeNodeIds.add(nodeId.toString()); } } diff --git a/spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts b/spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts index 2b90403a..cb899e84 100644 --- a/spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts +++ b/spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts @@ -3,8 +3,8 @@ import { bracedSplit, hashNumbersRemover, onlyUnique } from "./PlanParserUtils"; export function parseHashAggregate(input: string): ParsedHashAggregatePlan { const cleanInput = hashNumbersRemover(input); - const keysMatch = cleanInput.match(/keys=\[([^\]]+)\]/); - const functionsMatch = cleanInput.match(/functions=\[([^\]]+)\]/); + const keysMatch = cleanInput.match(/keys=\[([^\]]+)\]/) ?? cleanInput.match(/Keys:\s*\[([^\]]+)\]/); + const functionsMatch = cleanInput.match(/functions=\[([^\]]+)\]/) ?? cleanInput.match(/Functions\s*\[\d+\]:\s*\[([^\]]+)\]/); let keys: string[] = []; let functions: string[] = []; diff --git a/spark-ui/src/reducers/SQLNodeStageReducer.ts b/spark-ui/src/reducers/SQLNodeStageReducer.ts index da3e1f73..d12fadee 100644 --- a/spark-ui/src/reducers/SQLNodeStageReducer.ts +++ b/spark-ui/src/reducers/SQLNodeStageReducer.ts @@ -148,7 +148,9 @@ export function calculateSQLNodeStage(sql: EnrichedSparkSQL, sqlStages: SparkSta nodes = nodes.map((node) => { // Convert Exchange nodes to exchange stage type if they have adjacent nodes with stage info // This handles both nodes without stage data and nodes with onestage type that should be exchange type - if ((node.nodeName === "Exchange" || node.nodeName === "ColumnarExchange") && (node.stage === undefined || node.stage.type === "onestage")) { + if ((node.nodeName === "Exchange" || node.nodeName === "ColumnarExchange" || + node.nodeName === "CometExchange" || node.nodeName === "CometColumnarExchange" || + node.nodeName === "GpuColumnarExchange") && (node.stage === undefined || node.stage.type === "onestage")) { const nextNode = findNextNode(node.nodeId); const previousNode = findPreviousNode(node.nodeId); const metricsExchangeStageIds = findExchangeStageIds(node.metrics); diff --git a/spark-ui/src/reducers/SqlReducer.ts b/spark-ui/src/reducers/SqlReducer.ts index 27f95d28..3ae75268 100644 --- a/spark-ui/src/reducers/SqlReducer.ts +++ b/spark-ui/src/reducers/SqlReducer.ts @@ -100,6 +100,7 @@ export function parseNodePlan( case "PhotonGroupingAgg": case "GpuHashAggregate": case "!CometGpuHashAggregate": + case "CometHashAggregate": case "HashAggregate": case "SortAggregate": case "ObjectHashAggregate": @@ -268,6 +269,57 @@ export function getMetricDuration( return duration; } +/** + * When the DataFlint custom plan endpoint returns empty (e.g., for Gluten/Velox or Comet), + * fall back to parsing per-node descriptions from the SQL-level planDescription text. 
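+ * The description is split on "(n) OperatorName" headers into sections; only the
+ * "Arguments:", "Keys:", and "Functions" detail lines of each section are kept, and a
+ * section is consumed at most once, so repeated operator names map to successive
+ * sections in plan order.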
+ * Matches plan sections like "(26) WindowExecTransformer\nArguments: [...]" to SQL nodes by name.
+ */
+function buildFallbackPlanDescriptions(
+  sqlPlanDescription: string,
+  nodes: { nodeId: number; nodeName: string }[],
+): Map<number, string> {
+  const result = new Map<number, string>();
+  if (!sqlPlanDescription) return result;
+
+  const lines = sqlPlanDescription.split("\n");
+  const sections: { name: string; body: string }[] = [];
+  let currentName: string | undefined;
+  let currentBody: string[] = [];
+
+  for (const line of lines) {
+    const headerMatch = line.match(/^\((\d+)\)\s+(\S+)/);
+    if (headerMatch) {
+      if (currentName !== undefined && currentBody.length > 0) {
+        sections.push({ name: currentName, body: currentBody.join(" ") });
+      }
+      currentName = headerMatch[2];
+      currentBody = [];
+    } else if (currentName !== undefined) {
+      const trimmed = line.trim();
+      if (trimmed.startsWith("Arguments:") || trimmed.startsWith("Keys:") || trimmed.startsWith("Functions")) {
+        currentBody.push(trimmed);
+      }
+    }
+  }
+  if (currentName !== undefined && currentBody.length > 0) {
+    sections.push({ name: currentName, body: currentBody.join(" ") });
+  }
+
+  const usedSections = new Set<number>();
+  for (const node of nodes) {
+    for (let i = 0; i < sections.length; i++) {
+      if (usedSections.has(i)) continue;
+      if (sections[i].name === node.nodeName && sections[i].body) {
+        result.set(node.nodeId, `${node.nodeName} ${sections[i].body}`);
+        usedSections.add(i);
+        break;
+      }
+    }
+  }
+
+  return result;
+}
+
 function calculateSql(
   sql: SparkSQL,
   plan: SQLPlan | undefined,
@@ -277,13 +329,27 @@ function calculateSql(
 ): EnrichedSparkSQL {
   const enrichedSql = sql as EnrichedSparkSQL;
   const originalNumOfNodes = enrichedSql.nodes.length;
+
+  const hasPlanData = plan !== undefined && plan.nodesPlan.length > 0;
+  const fallbackDescs = hasPlanData
+    ? new Map<number, string>()
+    : buildFallbackPlanDescriptions(sql.planDescription, enrichedSql.nodes);
+
   const typeEnrichedNodes = enrichedSql.nodes.map((node) => {
     const type = calcNodeType(node.nodeName);
     const nodePlan = plan?.nodesPlan.find(
       (planNode) => planNode.id === node.nodeId,
     );
-    const parsedPlan =
+    let parsedPlan =
       nodePlan !== undefined ? parseNodePlan(node, nodePlan) : undefined;
+
+    if (parsedPlan === undefined) {
+      const fallbackDesc = fallbackDescs.get(node.nodeId);
+      if (fallbackDesc) {
+        parsedPlan = parseNodePlan(node, { id: node.nodeId, planDescription: fallbackDesc, rddScopeId: undefined });
+      }
+    }
+
     const isCodegenNode = node.nodeName.includes("WholeStageCodegen");

     // Find the Delta Lake scan that matches this node's table location
@@ -671,7 +737,9 @@ function calcCodegenDuration(metrics: EnrichedSqlMetric[]): number | undefined {

 function calcExchangeMetrics(nodeName: string, metrics: EnrichedSqlMetric[]) {
   var exchangeMetrics: ExchangeMetrics | undefined = undefined;
-  if (nodeName === "Exchange" || nodeName === "ColumnarExchange") {
+  if (nodeName === "Exchange" || nodeName === "ColumnarExchange" ||
+      nodeName === "CometExchange" || nodeName === "CometColumnarExchange" ||
+      nodeName === "GpuColumnarExchange") {
     const writeDuration =
       (getMetricDuration("shuffle write time", metrics) ?? 0) +
       (getMetricDuration("shuffle wall time", metrics) ?? 0);
diff --git a/spark-ui/src/reducers/SqlReducerUtils.ts b/spark-ui/src/reducers/SqlReducerUtils.ts
index e0a80cf7..8dce0fa7 100644
--- a/spark-ui/src/reducers/SqlReducerUtils.ts
+++ b/spark-ui/src/reducers/SqlReducerUtils.ts
@@ -762,6 +762,7 @@ export const AGGREGATE_NODE_NAMES = [
   "PhotonGroupingAgg",
   "GpuHashAggregate",
   "!CometGpuHashAggregate",
+  "CometHashAggregate",
   "HashAggregate",
   "SortAggregate",
   "ObjectHashAggregate",
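
For reference, a small TypeScript illustration of the two plan-description layouts the updated parseHashAggregate accepts: the classic Spark keys=[...]/functions=[...] layout next to Comet's "Keys: / Functions [n]:" layout. The sample strings are invented for illustration; only the regular expressions come from the patch:

// Classic Spark layout vs. Comet's layout; each string matches exactly one alternative.
const classic = "HashAggregate(keys=[ss_store_sk], functions=[sum(ss_quantity)])";
const comet = "CometHashAggregate Keys: [ss_store_sk] Functions [1]: [sum(ss_quantity)]";

const keysOf = (plan: string) =>
  (plan.match(/keys=\[([^\]]+)\]/) ?? plan.match(/Keys:\s*\[([^\]]+)\]/))?.[1];
const functionsOf = (plan: string) =>
  (plan.match(/functions=\[([^\]]+)\]/) ?? plan.match(/Functions\s*\[\d+\]:\s*\[([^\]]+)\]/))?.[1];

console.log(keysOf(classic), functionsOf(classic)); // ss_store_sk sum(ss_quantity)
console.log(keysOf(comet), functionsOf(comet));     // ss_store_sk sum(ss_quantity)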