Merged
3 changes: 0 additions & 3 deletions .github/workflows/cd.yml
@@ -144,9 +144,6 @@ jobs:
sed -i "s/lazy val versionNum: String = \"${VERSION}\"/lazy val versionNum: String = \"${NEW_VERSION}\"/" build.sbt
sed -i "s/\"version\": \"${VERSION}\"/\"version\": \"${NEW_VERSION}\"/" ../spark-ui/package.json
sed -i "s/${VERSION}/${NEW_VERSION}/g" clean-and-setup.sh
sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/dataflint_pyspark_example.py
sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/dataflint_pyspark_example_test.py
sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/README.md
OLD_README_VERSION=$(grep -oE '[0-9]+\.[0-9]+\.[0-9]+' ../README.md | head -1)
if [ -n "$OLD_README_VERSION" ] && [ "$OLD_README_VERSION" != "$VERSION" ]; then
sed -i "s/${OLD_README_VERSION}/${VERSION}/g" ../README.md
31 changes: 2 additions & 29 deletions spark-plugin/pyspark-testing/dataflint_pyspark_example.py
@@ -27,35 +27,8 @@ def sleep(seconds):
     if SLEEP_ENABLED:
         time.sleep(seconds)
 
-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    import re
-    m = re.search(r'[/_-](\d+)\.\d', spark_home)
-    if m:
-        spark_major_version = int(m.group(1))
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.9.6.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.9.6.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()
 instrument = "true"
 spark = SparkSession \
     .builder \
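The inline block removed above (and the new helper introduced below) detect the Spark major version with the same `[/_-](\d+)\.\d` regex against SPARK_HOME. For intuition, a quick standalone illustration of what that pattern extracts; the sample paths are hypothetical:

import re

# Hypothetical SPARK_HOME values: the pattern pulls the major version out of
# a "/", "_", or "-" separated "N.x" segment, defaulting to 3 when nothing matches.
for path in ["/opt/spark-3.5.1", "/usr/local/spark_4.0", "/home/user/spark"]:
    m = re.search(r"[/_-](\d+)\.\d", path)
    print(path, "->", int(m.group(1)) if m else 3)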
31 changes: 2 additions & 29 deletions spark-plugin/pyspark-testing/dataflint_pyspark_example_short.py
@@ -27,35 +27,8 @@ def sleep(seconds):
     if SLEEP_ENABLED:
         time.sleep(seconds)
 
-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    import re
-    m = re.search(r'[/_-](\d+)\.\d', spark_home)
-    if m:
-        spark_major_version = int(m.group(1))
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.8.9.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.8.9.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()
 
 spark = SparkSession \
     .builder \
32 changes: 2 additions & 30 deletions spark-plugin/pyspark-testing/dataflint_pyspark_example_test.py
@@ -26,36 +26,8 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
 
-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    # Try to extract version from SPARK_HOME path
-    if '4.0' in spark_home or 'spark-4' in spark_home:
-        spark_major_version = 4
-    elif '3.' in spark_home or 'spark-3' in spark_home:
-        spark_major_version = 3
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.9.6.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.9.6.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()
 
 spark = SparkSession \
     .builder \
53 changes: 53 additions & 0 deletions spark-plugin/pyspark-testing/find_plugin_jar.py
@@ -0,0 +1,53 @@
"""
Locate the DataFlint plugin JAR for the current Spark version.

Usage:
from find_plugin_jar import find_plugin_jar
plugin_jar = find_plugin_jar()
"""
import os
import re
from pathlib import Path


def find_plugin_jar() -> Path:
"""Find the DataFlint plugin JAR, supporting both release and SNAPSHOT builds."""
project_root = Path(__file__).resolve().parent.parent

# Detect Spark major version from SPARK_HOME
spark_home = os.environ.get("SPARK_HOME", "")
spark_major_version = 3 # default

if spark_home:
m = re.search(r"[/_-](\d+)\.\d", spark_home)
if m:
spark_major_version = int(m.group(1))

# Select module and artifact pattern based on Spark version
if spark_major_version == 4:
module = "pluginspark4"
scala_dir = "scala-2.13"
artifact_prefix = "dataflint-spark4_2.13-"
else:
module = "pluginspark3"
scala_dir = "scala-2.12"
artifact_prefix = "spark_2.12-"

jar_dir = project_root / module / "target" / scala_dir

# Look for JARs: prefer release, fall back to SNAPSHOT
if jar_dir.exists():
jars = sorted(jar_dir.glob(f"{artifact_prefix}*.jar"), reverse=True)
# Filter out sources/javadoc JARs
jars = [j for j in jars if "-sources" not in j.name and "-javadoc" not in j.name]
# Prefer non-SNAPSHOT
release_jars = [j for j in jars if "SNAPSHOT" not in j.name]
if release_jars:
return release_jars[0]
if jars:
return jars[0]

raise FileNotFoundError(
f"Plugin JAR not found in {jar_dir}\n"
f"Run: cd {project_root} && sbt {module}/assembly"
)
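Each example diff above truncates at `.builder \`, so for completeness, here is a minimal sketch of how the returned path is typically wired into a session. It assumes the standard `spark.jars` and `spark.plugins` config keys; the plugin class name follows DataFlint's README, and the app name is hypothetical:

# Minimal usage sketch, assuming standard spark.jars/spark.plugins config;
# the plugin class name is taken from DataFlint's README.
from pyspark.sql import SparkSession
from find_plugin_jar import find_plugin_jar

_plugin_jar = find_plugin_jar()
spark = (
    SparkSession.builder
    .appName("dataflint-example")  # hypothetical app name
    .config("spark.jars", str(_plugin_jar))
    .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
    .getOrCreate()
)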