From 6d34328b384692ad69ad597b99cb8ead52c50bb3 Mon Sep 17 00:00:00 2001
From: Avi Minsky
Date: Wed, 29 Apr 2026 09:11:40 +0300
Subject: [PATCH] Merge the pyspark test scripts' plugin-JAR lookup into a
 shared helper and drop the hard-coded version so releases no longer need a
 version bump

---
 .github/workflows/cd.yml                   |  3 --
 .../dataflint_pyspark_example.py           | 31 +----
 .../dataflint_pyspark_example_short.py     | 31 +----
 .../dataflint_pyspark_example_test.py      | 32 +----
 .../pyspark-testing/find_plugin_jar.py     | 53 +++++++++++++++++++
 5 files changed, 59 insertions(+), 91 deletions(-)
 create mode 100644 spark-plugin/pyspark-testing/find_plugin_jar.py

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index f4827cd..50ec7bd 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -144,9 +144,6 @@ jobs:
           sed -i "s/lazy val versionNum: String = \"${VERSION}\"/lazy val versionNum: String = \"${NEW_VERSION}\"/" build.sbt
           sed -i "s/\"version\": \"${VERSION}\"/\"version\": \"${NEW_VERSION}\"/" ../spark-ui/package.json
           sed -i "s/${VERSION}/${NEW_VERSION}/g" clean-and-setup.sh
-          sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/dataflint_pyspark_example.py
-          sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/dataflint_pyspark_example_test.py
-          sed -i "s/${VERSION}/${NEW_VERSION}/g" pyspark-testing/README.md
           OLD_README_VERSION=$(grep -oE '[0-9]+\.[0-9]+\.[0-9]+' ../README.md | head -1)
           if [ -n "$OLD_README_VERSION" ] && [ "$OLD_README_VERSION" != "$VERSION" ]; then
             sed -i "s/${OLD_README_VERSION}/${VERSION}/g" ../README.md
diff --git a/spark-plugin/pyspark-testing/dataflint_pyspark_example.py b/spark-plugin/pyspark-testing/dataflint_pyspark_example.py
index 3416294..5687edf 100755
--- a/spark-plugin/pyspark-testing/dataflint_pyspark_example.py
+++ b/spark-plugin/pyspark-testing/dataflint_pyspark_example.py
@@ -27,35 +27,8 @@ def sleep(seconds):
     if SLEEP_ENABLED:
         time.sleep(seconds)

-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    import re
-    m = re.search(r'[/_-](\d+)\.\d', spark_home)
-    if m:
-        spark_major_version = int(m.group(1))
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.9.6.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.9.6.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()
 instrument = "true"
 spark = SparkSession \
     .builder \
diff --git a/spark-plugin/pyspark-testing/dataflint_pyspark_example_short.py b/spark-plugin/pyspark-testing/dataflint_pyspark_example_short.py
index 9d1ea92..0254d5e 100755
--- a/spark-plugin/pyspark-testing/dataflint_pyspark_example_short.py
+++ b/spark-plugin/pyspark-testing/dataflint_pyspark_example_short.py
@@ -27,35 +27,8 @@ def sleep(seconds):
     if SLEEP_ENABLED:
         time.sleep(seconds)

-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    import re
-    m = re.search(r'[/_-](\d+)\.\d', spark_home)
-    if m:
-        spark_major_version = int(m.group(1))
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.8.9.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.8.9.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()

 spark = SparkSession \
     .builder \
diff --git a/spark-plugin/pyspark-testing/dataflint_pyspark_example_test.py b/spark-plugin/pyspark-testing/dataflint_pyspark_example_test.py
index 520ff67..e943323 100755
--- a/spark-plugin/pyspark-testing/dataflint_pyspark_example_test.py
+++ b/spark-plugin/pyspark-testing/dataflint_pyspark_example_test.py
@@ -26,36 +26,8 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

-# Resolve the plugin JAR path relative to this script's location
-# Detect Spark version from environment to load the correct JAR
-_script_dir = Path(__file__).resolve().parent
-_project_root = _script_dir.parent
-
-# Try to detect Spark version from SPARK_HOME
-import os
-spark_home = os.environ.get('SPARK_HOME', '')
-spark_major_version = 3  # default to Spark 3
-
-if spark_home:
-    # Try to extract version from SPARK_HOME path
-    if '4.0' in spark_home or 'spark-4' in spark_home:
-        spark_major_version = 4
-    elif '3.' in spark_home or 'spark-3' in spark_home:
-        spark_major_version = 3
-
-# Select the appropriate plugin JAR based on Spark version
-if spark_major_version == 4:
-    _plugin_jar = _project_root / "pluginspark4" / "target" / "scala-2.13" / "dataflint-spark4_2.13-0.9.6.jar"
-    _plugin_module = "pluginspark4"
-else:
-    _plugin_jar = _project_root / "pluginspark3" / "target" / "scala-2.12" / "spark_2.12-0.9.6.jar"
-    _plugin_module = "pluginspark3"
-
-if not _plugin_jar.exists():
-    raise FileNotFoundError(
-        f"Plugin JAR not found at {_plugin_jar}\n"
-        f"Run: cd {_project_root} && sbt {_plugin_module}/assembly"
-    )
+from find_plugin_jar import find_plugin_jar
+_plugin_jar = find_plugin_jar()

 spark = SparkSession \
     .builder \
diff --git a/spark-plugin/pyspark-testing/find_plugin_jar.py b/spark-plugin/pyspark-testing/find_plugin_jar.py
new file mode 100644
index 0000000..f78ea19
--- /dev/null
+++ b/spark-plugin/pyspark-testing/find_plugin_jar.py
@@ -0,0 +1,53 @@
+"""
+Locate the DataFlint plugin JAR for the current Spark version.
+
+Usage:
+    from find_plugin_jar import find_plugin_jar
+    plugin_jar = find_plugin_jar()
+"""
+import os
+import re
+from pathlib import Path
+
+
+def find_plugin_jar() -> Path:
+    """Find the DataFlint plugin JAR, supporting both release and SNAPSHOT builds."""
+    project_root = Path(__file__).resolve().parent.parent
+
+    # Detect Spark major version from SPARK_HOME
+    spark_home = os.environ.get("SPARK_HOME", "")
+    spark_major_version = 3  # default to Spark 3
+
+    if spark_home:
+        m = re.search(r"[/_-](\d+)\.\d", spark_home)
+        if m:
+            spark_major_version = int(m.group(1))
+
+    # Select module and artifact pattern based on Spark version
+    if spark_major_version == 4:
+        module = "pluginspark4"
+        scala_dir = "scala-2.13"
+        artifact_prefix = "dataflint-spark4_2.13-"
+    else:
+        module = "pluginspark3"
+        scala_dir = "scala-2.12"
+        artifact_prefix = "spark_2.12-"
+
+    jar_dir = project_root / module / "target" / scala_dir
+
+    # Look for JARs, newest first (numeric version sort, so 0.10.0 outranks 0.9.6)
+    if jar_dir.exists():
+        jars = sorted(jar_dir.glob(f"{artifact_prefix}*.jar"), key=lambda p: [int(n) for n in re.findall(r"\d+", p.name)], reverse=True)
+        # Filter out sources/javadoc JARs
+        jars = [j for j in jars if "-sources" not in j.name and "-javadoc" not in j.name]
+        # Prefer non-SNAPSHOT
+        release_jars = [j for j in jars if "SNAPSHOT" not in j.name]
+        if release_jars:
+            return release_jars[0]
+        if jars:
+            return jars[0]
+
+    raise FileNotFoundError(
+        f"Plugin JAR not found in {jar_dir}\n"
+        f"Run: cd {project_root} && sbt {module}/assembly"
+    )
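
The version detection above hinges on the regex [/_-](\d+)\.\d, which picks the major number out of the first "separator plus major.minor" pair in SPARK_HOME. A quick standalone check of that behavior; the detect_major helper and the sample paths are hypothetical, for illustration only:

import re

def detect_major(spark_home: str, default: int = 3) -> int:
    # Same pattern as find_plugin_jar(): '/', '_' or '-' followed by "<major>.<digit>"
    m = re.search(r"[/_-](\d+)\.\d", spark_home)
    return int(m.group(1)) if m else default

assert detect_major("/opt/spark-4.0.0") == 4
assert detect_major("/usr/local/spark_3.5") == 3
assert detect_major("/opt/spark-3.5.3-bin-hadoop3") == 3
assert detect_major("/home/ci/spark") == 3  # no version in the path, falls back to the default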
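With the helper in place, every example script reduces to the same two lines followed by the usual session setup. A minimal sketch of a consuming script, assuming DataFlint's standard wiring of the JAR onto spark.jars and the plugin class onto spark.plugins; the plugin class name and app name below are assumptions, since the builder configuration falls outside the hunks above:

from pyspark.sql import SparkSession

from find_plugin_jar import find_plugin_jar

# Resolves the JAR for the detected Spark major version, or raises
# FileNotFoundError with an sbt build hint if it has not been built yet.
_plugin_jar = find_plugin_jar()

spark = (
    SparkSession.builder
    .appName("dataflint-example")  # assumed name, for illustration
    .config("spark.jars", str(_plugin_jar))
    .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")  # assumed class name
    .getOrCreate()
)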