From fd9ca8d26348b41870ea60b044ea7b1d85348b50 Mon Sep 17 00:00:00 2001 From: Conrad Date: Sat, 13 Jun 2026 06:07:30 -0400 Subject: [PATCH 1/4] feat: Add target model and resolve dialect param to a target Promote the transpile() dialect parameter into a first-class target selector. Introduce a Capabilities descriptor and Generic, DuckDB, and DataFusion target classes, each carrying a capability set and the sqlglot output dialect, plus a resolve_target() helper. The dialect string now resolves to a Target, and the DuckDB IEJoin gate is keyed off the range_join_strategy capability rather than a literal dialect comparison. Adds "datafusion" as an accepted target that routes through the generic binned path. Behaviour-preserving: existing dialect values emit identical SQL. --- src/giql/targets.py | 157 ++++++++++++++++++++++++++++++++++++++++++ src/giql/transpile.py | 30 ++++---- 2 files changed, 174 insertions(+), 13 deletions(-) create mode 100644 src/giql/targets.py diff --git a/src/giql/targets.py b/src/giql/targets.py new file mode 100644 index 0000000..ec38bb8 --- /dev/null +++ b/src/giql/targets.py @@ -0,0 +1,157 @@ +"""Target-engine model for GIQL transpilation. + +A :class:`Target` is a first-class SQL target engine: it carries a +:class:`Capabilities` set describing what the engine supports and the +sqlglot output dialect used to serialize standard AST for that engine. + +This is step 1 of epic #137. The targets and capability descriptors defined +here are the foundation that later steps build on — the operator-expander +registry is keyed by ``(target, operator)`` and emission choices become +capability lookups rather than scattered ``if dialect == ...`` branches. +At this step the model is wired into :func:`giql.transpile.transpile` only +to resolve the ``dialect`` parameter and drive the existing DuckDB IEJoin +gate; no emission behaviour changes. +""" + +from dataclasses import dataclass +from typing import Literal + +RangeJoinStrategy = Literal["binned", "iejoin"] + + +@dataclass(frozen=True) +class Capabilities: + """Feature set of a SQL target engine. + + Each field is a portable choice that later steps of epic #137 turn into + a capability lookup instead of a hardcoded dialect branch. + + Parameters + ---------- + supports_lateral : bool + Whether the engine supports ``LATERAL`` / correlated joins. Drives + the NEAREST LATERAL-vs-window-function strategy (#142). + supports_star_replace : bool + Whether the engine supports ``SELECT * REPLACE (...)``. Drives the + coordinate-canonicalization output: ``* REPLACE`` where supported, + an explicit portable projection otherwise (#143). Supported by + DuckDB / BigQuery / Snowflake / ClickHouse; not by PostgreSQL, + SQLite, or DataFusion. + supports_qualify : bool + Whether the engine supports the ``QUALIFY`` clause. + range_join_strategy : RangeJoinStrategy + The plan used for column-to-column INTERSECTS joins: ``"binned"`` + for the generic binned equi-join, ``"iejoin"`` for DuckDB's + per-partition IEJoin plan. + """ + + supports_lateral: bool + supports_star_replace: bool + supports_qualify: bool + range_join_strategy: RangeJoinStrategy + + +class Target: + """A SQL target engine. + + Subclasses declare the engine ``name``, the ``sqlglot_dialect`` used to + serialize AST for that engine (``None`` selects sqlglot's default + generic serialization), and the engine ``capabilities``. + """ + + name: str + sqlglot_dialect: str | None + capabilities: Capabilities + + +class GenericTarget(Target): + """Portable SQL-92-ish target with no engine-specific features. + + This is the default target (``dialect=None``). Its capabilities are the + conservative, maximally portable baseline that matches today's + :class:`giql.generators.base.BaseGIQLGenerator` output. + """ + + name = "generic" + sqlglot_dialect = None + capabilities = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + +class DuckDBTarget(Target): + """DuckDB target. + + Serializes through sqlglot's ``duckdb`` dialect and uses the IEJoin + per-partition plan for column-to-column INTERSECTS joins. + """ + + name = "duckdb" + sqlglot_dialect = "duckdb" + capabilities = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + + +class DataFusionTarget(Target): + """Apache DataFusion target. + + sqlglot has no DataFusion dialect, so serialization falls back to the + generic form (``sqlglot_dialect = None``) for now; #145 finalizes + DataFusion serialization. The capability values below are conservative + and provisional — they are validated against a real DataFusion engine + when the operator migrations exercise them (#142, #145). DataFusion + supports ``* EXCEPT`` / ``* EXCLUDE`` but not ``* REPLACE``. + """ + + name = "datafusion" + sqlglot_dialect = None + capabilities = Capabilities( + supports_lateral=False, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + +_TARGETS_BY_NAME: dict[str, type[Target]] = { + GenericTarget.name: GenericTarget, + DuckDBTarget.name: DuckDBTarget, + DataFusionTarget.name: DataFusionTarget, +} + + +def resolve_target(dialect: str | None) -> Target: + """Resolve a ``dialect`` parameter to a :class:`Target` instance. + + Parameters + ---------- + dialect : str | None + The target dialect name. ``None`` resolves to :class:`GenericTarget`; + ``"duckdb"`` and ``"datafusion"`` resolve to their respective targets. + + Returns + ------- + Target + The resolved target instance. + + Raises + ------ + ValueError + If *dialect* is not a recognized target name. + """ + if dialect is None: + return GenericTarget() + + target_cls = _TARGETS_BY_NAME.get(dialect) + if target_cls is None or target_cls is GenericTarget: + raise ValueError( + f"Unknown dialect: {dialect!r}. Supported: 'duckdb', 'datafusion', or None." + ) + return target_cls() diff --git a/src/giql/transpile.py b/src/giql/transpile.py index 5bfe894..912c430 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -17,6 +17,7 @@ from giql.resolver import resolve_operator_refs from giql.table import Table from giql.table import Tables +from giql.targets import resolve_target from giql.transformer import ClusterTransformer from giql.transformer import IntersectsBinnedJoinTransformer from giql.transformer import IntersectsDuckDBIEJoinTransformer @@ -28,7 +29,7 @@ def transpile( giql: str, tables: list[str | Table] | None = None, *, - dialect: None = None, + dialect: Literal["datafusion"] | None = None, intersects_bin_size: int | None = None, ) -> str: ... @@ -47,7 +48,7 @@ def transpile( giql: str, tables: list[str | Table] | None = None, *, - dialect: Literal["duckdb"] | None = None, + dialect: Literal["duckdb", "datafusion"] | None = None, intersects_bin_size: int | None = None, ) -> str: """Transpile a GIQL query to SQL. @@ -64,16 +65,19 @@ def transpile( Table configurations. Strings use default column mappings (chrom, start, end, strand). :class:`Table` objects provide custom column name mappings. - dialect : Literal["duckdb"] | None - Optional target dialect. When set to ``"duckdb"``, column-to-column + dialect : Literal["duckdb", "datafusion"] | None + Optional target engine. Resolves to a :class:`giql.targets.Target` + carrying the engine's capability set; ``None`` selects the generic + portable target. When set to ``"duckdb"``, column-to-column ``INTERSECTS`` joins (INNER, SEMI, or ANTI) are transpiled into a per-chromosome dynamic-SQL pattern (``SET VARIABLE`` + ``query(getvariable(...))``) that DuckDB plans through its - range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``). - Mutually exclusive with ``intersects_bin_size``. Defaults to - ``None`` (the generic binned equi-join path). Hard-error projection - shapes raise ``ValueError`` at transpile time; see the performance - guide for the full enumeration. + range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``); this + IEJoin plan is mutually exclusive with ``intersects_bin_size``. + ``"datafusion"`` and ``None`` use the generic binned equi-join path + and accept ``intersects_bin_size``. Hard-error projection shapes + raise ``ValueError`` at transpile time; see the performance guide + for the full enumeration. intersects_bin_size : int | None Bin size for INTERSECTS equi-join optimization. When a query contains a full-table column-to-column INTERSECTS join, the @@ -136,9 +140,9 @@ def transpile( dialect="duckdb", ) """ - if dialect is not None and dialect != "duckdb": - raise ValueError(f"Unknown dialect: {dialect!r}. Supported: 'duckdb' or None.") - if dialect == "duckdb" and intersects_bin_size is not None: + target = resolve_target(dialect) + uses_iejoin = target.capabilities.range_join_strategy == "iejoin" + if uses_iejoin and intersects_bin_size is not None: raise ValueError( "intersects_bin_size has no effect with dialect='duckdb'; " "the DuckDB dialect uses an IEJoin per-partition plan instead " @@ -153,7 +157,7 @@ def transpile( # Falls back to the binned plan for unsupported shapes — see # IntersectsDuckDBIEJoinTransformer.transform_to_sql for the complete # fallback set. - if dialect == "duckdb": + if uses_iejoin: duckdb_transformer = IntersectsDuckDBIEJoinTransformer(tables_container) with _reraise_as_value_error("Transformation error"): duckdb_sql = duckdb_transformer.transform_to_sql(ast) From 451eb154eeffa1cbf1e4d4741d95b0a9a0446dc3 Mon Sep 17 00:00:00 2001 From: Conrad Date: Sat, 13 Jun 2026 06:07:46 -0400 Subject: [PATCH 2/4] test: Cover the target model and DataFusion dialect routing Add unit tests for Capabilities, the three target classes, and resolve_target, including parametrized and property-based rejection of unsupported dialect names. Extend the transpile dialect tests with datafusion-equals-generic identity across the operator spread and the duckdb bin-size guard boundaries. Add a DataFusion integration smoke suite that executes dialect="datafusion" output against a real DataFusion engine, covering the literal-predicate and binned-join routing seams. --- tests/integration/datafusion/__init__.py | 0 tests/integration/datafusion/conftest.py | 49 +++ .../datafusion/test_datafusion_target.py | 123 ++++++++ tests/test_targets.py | 295 ++++++++++++++++++ tests/test_transpile.py | 192 ++++++++++++ 5 files changed, 659 insertions(+) create mode 100644 tests/integration/datafusion/__init__.py create mode 100644 tests/integration/datafusion/conftest.py create mode 100644 tests/integration/datafusion/test_datafusion_target.py create mode 100644 tests/test_targets.py diff --git a/tests/integration/datafusion/__init__.py b/tests/integration/datafusion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/datafusion/conftest.py b/tests/integration/datafusion/conftest.py new file mode 100644 index 0000000..f57d7bc --- /dev/null +++ b/tests/integration/datafusion/conftest.py @@ -0,0 +1,49 @@ +"""Fixtures for DataFusion target execution smoke tests (issue #132). + +These tests prove that SQL produced via ``transpile(..., dialect="datafusion")`` +parses and executes on a real DataFusion engine. The broad cross-target result +oracle and the full DataFusion integration lane are deferred to #139. +""" + +import pytest + +pytest.importorskip("datafusion") +pytest.importorskip("pyarrow") + +pytestmark = pytest.mark.integration + + +@pytest.fixture +def datafusion_ctx(): + """Return a builder that registers interval tables into a SessionContext. + + The returned callable accepts ``name=[(chrom, start, end), ...]`` keyword + arguments and yields a DataFusion ``SessionContext`` with each table + registered under the default ``chrom``/``start``/``end`` schema (matching + the default :class:`giql.Table` column mapping). + """ + import pyarrow as pa + from datafusion import SessionContext + + schema = pa.schema( + [ + ("chrom", pa.utf8()), + ("start", pa.int64()), + ("end", pa.int64()), + ] + ) + + def _build(**tables): + ctx = SessionContext() + for name, rows in tables.items(): + arrays = { + "chrom": [r[0] for r in rows], + "start": [r[1] for r in rows], + "end": [r[2] for r in rows], + } + ctx.register_record_batches( + name, [pa.table(arrays, schema=schema).to_batches()] + ) + return ctx + + return _build diff --git a/tests/integration/datafusion/test_datafusion_target.py b/tests/integration/datafusion/test_datafusion_target.py new file mode 100644 index 0000000..8c267f5 --- /dev/null +++ b/tests/integration/datafusion/test_datafusion_target.py @@ -0,0 +1,123 @@ +"""DataFusion execution smoke tests for the ``dialect="datafusion"`` target. + +Issue #132 makes ``"datafusion"`` a routing key in :func:`giql.transpile`. The +existing ``TestBinnedJoinDataFusion`` suite proves the binned plan executes on +DataFusion, but always with ``dialect`` omitted (``None``); nothing executes the +``dialect="datafusion"`` entry point end-to-end. These tests close that seam. +""" + +import pytest + +from giql import transpile + +pytestmark = pytest.mark.integration + + +class TestDataFusionTargetExecution: + """End-to-end execution of dialect="datafusion" output on DataFusion.""" + + def test_intersects_literal_returns_overlapping_row(self, datafusion_ctx): + """Test a literal INTERSECTS query through the datafusion target. + + Given: + A peaks table in a real DataFusion context with one overlapping + interval, one non-overlapping interval, and one on another + chromosome. + When: + A literal-range INTERSECTS query is transpiled with + dialect="datafusion" and executed. + Then: + It should parse, execute, and return exactly the overlapping + chr1 row. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[ + ("chr1", 1500, 1800), # overlaps chr1:1000-2000 + ("chr1", 5000, 6000), # same chrom, no overlap + ("chr2", 1500, 1800), # overlapping position, wrong chrom + ] + ) + sql = transpile( + 'SELECT chrom, start, "end" FROM peaks ' + "WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Act + df = ctx.sql(sql).to_pandas() + + # Assert + assert len(df) == 1 + assert df.iloc[0]["chrom"] == "chr1" + assert df.iloc[0]["start"] == 1500 + + def test_intersects_literal_no_overlap_returns_zero_rows(self, datafusion_ctx): + """Test that a non-overlapping literal INTERSECTS yields no rows. + + Given: + A peaks table in a real DataFusion context with no interval + overlapping the query range. + When: + The literal INTERSECTS query is transpiled with + dialect="datafusion" and executed. + Then: + It should execute and return zero rows. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[ + ("chr1", 5000, 6000), + ("chr2", 1500, 1800), + ] + ) + sql = transpile( + 'SELECT chrom, start, "end" FROM peaks ' + "WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Act + df = ctx.sql(sql).to_pandas() + + # Assert + assert len(df) == 0 + + def test_intersects_join_with_bin_size_dedups_across_bins(self, datafusion_ctx): + """Test the datafusion-routed binned join honours an explicit bin size. + + Given: + Two interval tables in a real DataFusion context whose single + intervals overlap and span several bins under a small bin size. + When: + A column-to-column INTERSECTS join is transpiled with + dialect="datafusion" and an explicit intersects_bin_size, then + executed. + Then: + It should be accepted (not rejected as it would be for duckdb), + route through the binned path, and return exactly one row — the + overlap, de-duplicated across the shared bins. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[("chr1", 0, 5000)], + genes=[("chr1", 500, 4500)], + ) + sql = transpile( + 'SELECT a.chrom, a.start, a."end", ' + 'b.start AS b_start, b."end" AS b_end ' + "FROM peaks a JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="datafusion", + intersects_bin_size=1000, + ) + + # Act + df = ctx.sql(sql).to_pandas() + + # Assert + assert len(df) == 1 + assert df.iloc[0]["start"] == 0 + assert df.iloc[0]["b_start"] == 500 diff --git a/tests/test_targets.py b/tests/test_targets.py new file mode 100644 index 0000000..1049ada --- /dev/null +++ b/tests/test_targets.py @@ -0,0 +1,295 @@ +"""Tests for the target-engine model.""" + +from dataclasses import FrozenInstanceError + +import pytest +from hypothesis import given +from hypothesis import strategies as st + +from giql.targets import Capabilities +from giql.targets import DataFusionTarget +from giql.targets import DuckDBTarget +from giql.targets import GenericTarget +from giql.targets import resolve_target + + +class TestCapabilities: + """Tests for the Capabilities dataclass.""" + + def test___init___stores_all_descriptors(self): + """Test that Capabilities records each descriptor. + + Given: + A full set of capability descriptor values. + When: + A Capabilities instance is constructed. + Then: + It should expose each value on the corresponding attribute. + """ + # Act + caps = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=True, + range_join_strategy="binned", + ) + + # Assert + assert caps.supports_lateral is True + assert caps.supports_star_replace is False + assert caps.supports_qualify is True + assert caps.range_join_strategy == "binned" + + def test___eq___with_identical_values(self): + """Test value equality of Capabilities. + + Given: + Two Capabilities built from identical descriptor values. + When: + They are compared for equality. + Then: + It should treat them as equal (frozen value semantics). + """ + # Arrange + first = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + second = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + + # Act & assert + assert first == second + + def test___eq___with_one_differing_field(self): + """Test value inequality of Capabilities. + + Given: + Two Capabilities identical in every descriptor except one. + When: + They are compared for equality. + Then: + It should treat them as unequal (per-field value semantics). + """ + # Arrange + base = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + differing = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="binned", + ) + + # Act & assert + assert base != differing + + def test___setattr___is_frozen(self): + """Test that Capabilities is immutable. + + Given: + A constructed Capabilities instance. + When: + An attribute is reassigned. + Then: + It should raise FrozenInstanceError. + """ + # Arrange + caps = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + # Act & assert + with pytest.raises(FrozenInstanceError): + caps.supports_lateral = False # type: ignore[misc] + + +class TestGenericTarget: + """Tests for the GenericTarget.""" + + def test___init___exposes_portable_baseline(self): + """Test the generic target's identity and capabilities. + + Given: + The GenericTarget class. + When: + An instance is constructed. + Then: + It should carry the portable SQL-92 baseline: no engine dialect, + lateral supported, no star-REPLACE, no QUALIFY, binned joins. + """ + # Act + target = GenericTarget() + + # Assert + assert target.name == "generic" + assert target.sqlglot_dialect is None + assert target.capabilities.supports_lateral is True + assert target.capabilities.supports_star_replace is False + assert target.capabilities.supports_qualify is False + assert target.capabilities.range_join_strategy == "binned" + + +class TestDuckDBTarget: + """Tests for the DuckDBTarget.""" + + def test___init___exposes_duckdb_features(self): + """Test the DuckDB target's identity and capabilities. + + Given: + The DuckDBTarget class. + When: + An instance is constructed. + Then: + It should serialize through the duckdb dialect and enable + lateral, star-REPLACE, QUALIFY, and the IEJoin strategy. + """ + # Act + target = DuckDBTarget() + + # Assert + assert target.name == "duckdb" + assert target.sqlglot_dialect == "duckdb" + assert target.capabilities.supports_lateral is True + assert target.capabilities.supports_star_replace is True + assert target.capabilities.supports_qualify is True + assert target.capabilities.range_join_strategy == "iejoin" + + +class TestDataFusionTarget: + """Tests for the DataFusionTarget.""" + + def test___init___exposes_conservative_capabilities(self): + """Test the DataFusion target's identity and capabilities. + + Given: + The DataFusionTarget class. + When: + An instance is constructed. + Then: + It should fall back to generic serialization (no sqlglot + DataFusion dialect), disable star-REPLACE and QUALIFY, and use + the binned join strategy. + """ + # Act + target = DataFusionTarget() + + # Assert + assert target.name == "datafusion" + assert target.sqlglot_dialect is None + assert target.capabilities.supports_lateral is False + assert target.capabilities.supports_star_replace is False + assert target.capabilities.supports_qualify is False + assert target.capabilities.range_join_strategy == "binned" + + +def test_resolve_target_with_none_returns_generic(): + """Test resolution of the default dialect. + + Given: + A None dialect argument. + When: + resolve_target is called. + Then: + It should return a GenericTarget. + """ + # Act + target = resolve_target(None) + + # Assert + assert isinstance(target, GenericTarget) + + +def test_resolve_target_with_duckdb_returns_duckdb(): + """Test resolution of the duckdb dialect. + + Given: + The dialect name "duckdb". + When: + resolve_target is called. + Then: + It should return a DuckDBTarget. + """ + # Act + target = resolve_target("duckdb") + + # Assert + assert isinstance(target, DuckDBTarget) + + +def test_resolve_target_with_datafusion_returns_datafusion(): + """Test resolution of the datafusion dialect. + + Given: + The dialect name "datafusion". + When: + resolve_target is called. + Then: + It should return a DataFusionTarget. + """ + # Act + target = resolve_target("datafusion") + + # Assert + assert isinstance(target, DataFusionTarget) + + +@pytest.mark.parametrize( + "dialect", + [ + "postgres", # plain unknown engine + "sqlite", # second unknown — guards against a hardcoded message + "DuckDB", # case-variant of a real target — lookup is case-sensitive + "", # empty string is distinct from None + "generic", # the internal target name — None is the only public path + ], +) +def test_resolve_target_with_unsupported_dialect_raises(dialect): + """Test resolution of dialect names that are not public targets. + + Given: + A dialect string that is not one of the supported public names + (an unknown engine, a case-variant, the empty string, or the + internal "generic" name). + When: + resolve_target is called. + Then: + It should raise ValueError naming the offending value and the + supported targets. + """ + # Act & assert + with pytest.raises( + ValueError, + match=rf"Unknown dialect: {dialect!r}\..*'duckdb', 'datafusion', or None\.", + ): + resolve_target(dialect) + + +@given(st.text().filter(lambda s: s not in ("duckdb", "datafusion"))) +def test_resolve_target_with_arbitrary_unsupported_string_raises(dialect): + """Test resolution over the open domain of unsupported strings. + + Given: + Any string that is not a registered public dialect name (the only + public string names are "duckdb" and "datafusion"). + When: + resolve_target is called. + Then: + It should raise ValueError rather than returning a target. + """ + # Act & assert + with pytest.raises(ValueError, match="Unknown dialect"): + resolve_target(dialect) diff --git a/tests/test_transpile.py b/tests/test_transpile.py index e0f54fe..e43a1e0 100644 --- a/tests/test_transpile.py +++ b/tests/test_transpile.py @@ -389,6 +389,198 @@ def test_invalid_syntax(self): with pytest.raises(ValueError, match="Parse error"): transpile("SELECT * FORM peaks") # typo: FORM instead of FROM + def test_unknown_dialect(self): + """Test transpilation with an unrecognized dialect. + + Given: + A GIQL query and a dialect that is not a supported target. + When: + Transpiling. + Then: + It should raise ValueError naming the supported targets. + """ + # Act & assert + with pytest.raises(ValueError, match="Unknown dialect: 'postgres'"): + transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="postgres", # type: ignore[call-overload] + ) + + +class TestTranspileDialects: + """Tests for dialect/target selection.""" + + def test_transpile_datafusion_accepted(self): + """Test that the datafusion dialect is a valid target. + + Given: + A GIQL query and dialect="datafusion". + When: + Transpiling. + Then: + It should return SQL referencing the table without raising. + """ + # Act + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Assert + assert "SELECT" in sql + assert "peaks" in sql + + @pytest.mark.parametrize( + "query, tables", + [ + ( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + ["peaks"], + ), + ("SELECT * FROM peaks WHERE interval CONTAINS 'chr1:1500'", ["peaks"]), + ( + "SELECT * FROM peaks WHERE interval WITHIN 'chr1:1000-2000'", + ["peaks"], + ), + ( + "SELECT * FROM NEAREST(genes, reference := 'chr1:1000-2000', k := 3)", + ["genes"], + ), + ( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + ["peaks", "genes"], + ), + ( + "SELECT * FROM peaks WHERE interval INTERSECTS " + "ANY('chr1:1000-2000', 'chr1:5000-6000')", + ["peaks"], + ), + ], + ids=["intersects_literal", "contains", "within", "nearest", "join", "any"], + ) + def test_transpile_datafusion_matches_generic_output(self, query, tables): + """Test that datafusion routes through the generic path for now. + + Given: + A GIQL query across the operator spread, transpiled with + dialect=None and with dialect="datafusion". + When: + Comparing the two outputs. + Then: + It should produce byte-identical SQL, since datafusion has no + engine-specific expansion yet. + """ + # Act + generic_sql = transpile(query, tables=tables) + datafusion_sql = transpile(query, tables=tables, dialect="datafusion") + + # Assert + assert datafusion_sql == generic_sql + + def test_transpile_datafusion_accepts_intersects_bin_size(self): + """Test that datafusion honours the binned-join bin size identically. + + Given: + A column-to-column INTERSECTS join with an explicit + intersects_bin_size, transpiled with dialect=None and + dialect="datafusion". + When: + Comparing the two outputs. + Then: + It should produce byte-identical SQL — datafusion takes the same + binned-join path and honours the bin size. + """ + # Arrange + query = ( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval" + ) + + # Act + generic_sql = transpile( + query, tables=["peaks", "genes"], intersects_bin_size=50000 + ) + datafusion_sql = transpile( + query, + tables=["peaks", "genes"], + dialect="datafusion", + intersects_bin_size=50000, + ) + + # Assert + assert datafusion_sql == generic_sql + + def test_transpile_duckdb_returns_sql_for_column_join(self): + """Test the duckdb happy path for a column-to-column INTERSECTS join. + + Given: + A column-to-column INTERSECTS join and dialect="duckdb". + When: + Transpiling. + Then: + It should return SQL referencing the joined tables without raising. + """ + # Act + sql = transpile( + "SELECT a.chrom, a.start, b.start FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + ) + + # Assert + assert "SELECT" in sql + assert "peaks" in sql + assert "genes" in sql + + def test_transpile_duckdb_rejects_intersects_bin_size(self): + """Test that duckdb and intersects_bin_size are mutually exclusive. + + Given: + dialect="duckdb" combined with an explicit intersects_bin_size. + When: + Transpiling. + Then: + It should raise ValueError explaining the IEJoin plan ignores + the bin size. + """ + # Act & assert + with pytest.raises( + ValueError, + match=r"intersects_bin_size has no effect.*Pass one or the other, not both\.", + ): + transpile( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + intersects_bin_size=50000, + ) + + def test_transpile_duckdb_rejects_zero_intersects_bin_size(self): + """Test that a falsy-but-set bin size still triggers the duckdb guard. + + Given: + dialect="duckdb" combined with intersects_bin_size=0. + When: + Transpiling. + Then: + It should raise ValueError — the guard is `is not None`, so 0 + (falsy) still conflicts with the IEJoin plan. + """ + # Act & assert + with pytest.raises(ValueError, match="intersects_bin_size has no effect"): + transpile( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + intersects_bin_size=0, + ) + class TestModuleExports: """Tests for module-level exports.""" From 241d3104b7bb2f500bd2dbc69f2942953d9d1cbe Mon Sep 17 00:00:00 2001 From: Conrad Date: Sat, 13 Jun 2026 21:17:21 -0400 Subject: [PATCH 3/4] refactor: Make targets value-equal and harden dialect resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert Target and its subclasses to frozen dataclasses so resolved targets are value-equal and hashable — the contract the operator-expander registry will key on. Drop the generic entry from the name lookup so unknown dialects are rejected with a plain membership check instead of an identity special-case. Interpolate the target name into the bin-size mutual-exclusion error so it stays accurate as more IEJoin targets are added, and tighten the capability docstrings. --- src/giql/targets.py | 46 +++++++++++++++++++++++++++++-------------- src/giql/transpile.py | 4 ++-- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/giql/targets.py b/src/giql/targets.py index ec38bb8..6fd29d3 100644 --- a/src/giql/targets.py +++ b/src/giql/targets.py @@ -29,8 +29,11 @@ class Capabilities: Parameters ---------- supports_lateral : bool - Whether the engine supports ``LATERAL`` / correlated joins. Drives - the NEAREST LATERAL-vs-window-function strategy (#142). + Whether the engine supports ``LATERAL`` / correlated joins. Will + drive the NEAREST LATERAL-vs-window-function strategy (#142). Until + then, :attr:`giql.generators.base.BaseGIQLGenerator.SUPPORTS_LATERAL` + remains the live source of truth at generation time; #142 reconciles + the two. supports_star_replace : bool Whether the engine supports ``SELECT * REPLACE (...)``. Drives the coordinate-canonicalization output: ``* REPLACE`` where supported, @@ -38,11 +41,14 @@ class Capabilities: DuckDB / BigQuery / Snowflake / ClickHouse; not by PostgreSQL, SQLite, or DataFusion. supports_qualify : bool - Whether the engine supports the ``QUALIFY`` clause. + Whether the engine supports the ``QUALIFY`` clause. Reserved: no + emission path consumes it yet (a future window-function operator + port would). range_join_strategy : RangeJoinStrategy The plan used for column-to-column INTERSECTS joins: ``"binned"`` for the generic binned equi-join, ``"iejoin"`` for DuckDB's - per-partition IEJoin plan. + per-partition IEJoin plan. The IEJoin path covers INNER / SEMI / + ANTI joins, with binned fallback for unsupported shapes. """ supports_lateral: bool @@ -51,12 +57,18 @@ class Capabilities: range_join_strategy: RangeJoinStrategy +@dataclass(frozen=True) class Target: """A SQL target engine. Subclasses declare the engine ``name``, the ``sqlglot_dialect`` used to serialize AST for that engine (``None`` selects sqlglot's default generic serialization), and the engine ``capabilities``. + + Targets are frozen, value-equal, and hashable: two ``DuckDBTarget()`` + instances compare equal and hash alike, so the operator-expander registry + (#138) can key on a resolved target by value. Equality is class-scoped — + ``GenericTarget() != DataFusionTarget()`` even where their fields overlap. """ name: str @@ -64,6 +76,7 @@ class Target: capabilities: Capabilities +@dataclass(frozen=True) class GenericTarget(Target): """Portable SQL-92-ish target with no engine-specific features. @@ -72,9 +85,9 @@ class GenericTarget(Target): :class:`giql.generators.base.BaseGIQLGenerator` output. """ - name = "generic" - sqlglot_dialect = None - capabilities = Capabilities( + name: str = "generic" + sqlglot_dialect: str | None = None + capabilities: Capabilities = Capabilities( supports_lateral=True, supports_star_replace=False, supports_qualify=False, @@ -82,6 +95,7 @@ class GenericTarget(Target): ) +@dataclass(frozen=True) class DuckDBTarget(Target): """DuckDB target. @@ -89,9 +103,9 @@ class DuckDBTarget(Target): per-partition plan for column-to-column INTERSECTS joins. """ - name = "duckdb" - sqlglot_dialect = "duckdb" - capabilities = Capabilities( + name: str = "duckdb" + sqlglot_dialect: str | None = "duckdb" + capabilities: Capabilities = Capabilities( supports_lateral=True, supports_star_replace=True, supports_qualify=True, @@ -99,6 +113,7 @@ class DuckDBTarget(Target): ) +@dataclass(frozen=True) class DataFusionTarget(Target): """Apache DataFusion target. @@ -110,9 +125,9 @@ class DataFusionTarget(Target): supports ``* EXCEPT`` / ``* EXCLUDE`` but not ``* REPLACE``. """ - name = "datafusion" - sqlglot_dialect = None - capabilities = Capabilities( + name: str = "datafusion" + sqlglot_dialect: str | None = None + capabilities: Capabilities = Capabilities( supports_lateral=False, supports_star_replace=False, supports_qualify=False, @@ -120,8 +135,9 @@ class DataFusionTarget(Target): ) +# Public dialect names only. ``generic`` is intentionally absent: ``None`` is +# the sole public way to select it (see :func:`resolve_target`). _TARGETS_BY_NAME: dict[str, type[Target]] = { - GenericTarget.name: GenericTarget, DuckDBTarget.name: DuckDBTarget, DataFusionTarget.name: DataFusionTarget, } @@ -150,7 +166,7 @@ def resolve_target(dialect: str | None) -> Target: return GenericTarget() target_cls = _TARGETS_BY_NAME.get(dialect) - if target_cls is None or target_cls is GenericTarget: + if target_cls is None: raise ValueError( f"Unknown dialect: {dialect!r}. Supported: 'duckdb', 'datafusion', or None." ) diff --git a/src/giql/transpile.py b/src/giql/transpile.py index 912c430..f5462b4 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -144,8 +144,8 @@ def transpile( uses_iejoin = target.capabilities.range_join_strategy == "iejoin" if uses_iejoin and intersects_bin_size is not None: raise ValueError( - "intersects_bin_size has no effect with dialect='duckdb'; " - "the DuckDB dialect uses an IEJoin per-partition plan instead " + f"intersects_bin_size has no effect with dialect={target.name!r}; " + f"the {target.name} target uses an IEJoin per-partition plan instead " "of the binned equi-join. Pass one or the other, not both." ) From 0866e62bd0d72f7cff378157e66edf0b094aacbb Mon Sep 17 00:00:00 2001 From: Conrad Date: Sat, 13 Jun 2026 21:17:27 -0400 Subject: [PATCH 4/4] test: Pin target value semantics and harden datafusion bin-size check Add tests asserting targets are value-equal and hashable as registry keys. Harden the unsupported-dialect regex with re.escape and relabel the datafusion-equals-generic test as the alias regression net it is. Make the datafusion bin-size integration test assert the explicit bin size changes emission, so a dropped parameter would fail it. --- .../datafusion/test_datafusion_target.py | 18 ++++-- tests/test_targets.py | 62 +++++++++++++++++-- tests/test_transpile.py | 17 ++--- 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/tests/integration/datafusion/test_datafusion_target.py b/tests/integration/datafusion/test_datafusion_target.py index 8c267f5..96d32ea 100644 --- a/tests/integration/datafusion/test_datafusion_target.py +++ b/tests/integration/datafusion/test_datafusion_target.py @@ -97,27 +97,33 @@ def test_intersects_join_with_bin_size_dedups_across_bins(self, datafusion_ctx): executed. Then: It should be accepted (not rejected as it would be for duckdb), - route through the binned path, and return exactly one row — the - overlap, de-duplicated across the shared bins. + the explicit bin size should reach emission (changing the SQL vs + the default), and execution should return exactly one row — the + overlap, de-duplicated by the pairs CTE across the shared bins. """ # Arrange ctx = datafusion_ctx( peaks=[("chr1", 0, 5000)], genes=[("chr1", 500, 4500)], ) - sql = transpile( + query = ( 'SELECT a.chrom, a.start, a."end", ' 'b.start AS b_start, b."end" AS b_end ' - "FROM peaks a JOIN genes b ON a.interval INTERSECTS b.interval", + "FROM peaks a JOIN genes b ON a.interval INTERSECTS b.interval" + ) + default_sql = transpile(query, tables=["peaks", "genes"], dialect="datafusion") + sized_sql = transpile( + query, tables=["peaks", "genes"], dialect="datafusion", intersects_bin_size=1000, ) # Act - df = ctx.sql(sql).to_pandas() + df = ctx.sql(sized_sql).to_pandas() # Assert - assert len(df) == 1 + assert sized_sql != default_sql # the explicit bin size reaches emission + assert len(df) == 1 # de-duplicated across the shared bins assert df.iloc[0]["start"] == 0 assert df.iloc[0]["b_start"] == 500 diff --git a/tests/test_targets.py b/tests/test_targets.py index 1049ada..14d10a6 100644 --- a/tests/test_targets.py +++ b/tests/test_targets.py @@ -1,5 +1,6 @@ """Tests for the target-engine model.""" +import re from dataclasses import FrozenInstanceError import pytest @@ -196,6 +197,57 @@ def test___init___exposes_conservative_capabilities(self): assert target.capabilities.range_join_strategy == "binned" +class TestTarget: + """Tests for the shared Target value semantics.""" + + def test___eq___same_target_type(self): + """Test that two instances of the same target are value-equal. + + Given: + Two separately constructed DuckDBTarget instances. + When: + They are compared for equality. + Then: + It should treat them as equal (frozen value semantics), so a + resolved target is a stable registry key. + """ + # Act & assert + assert DuckDBTarget() == DuckDBTarget() + + def test___eq___different_target_types(self): + """Test that distinct target classes are never equal. + + Given: + A GenericTarget and a DataFusionTarget (whose fields partly + overlap). + When: + They are compared for equality. + Then: + It should treat them as unequal, since equality is class-scoped. + """ + # Act & assert + assert GenericTarget() != DataFusionTarget() + + def test___hash___supports_set_and_dict_keys(self): + """Test that targets are hashable and usable as keys. + + Given: + Value-equal and value-distinct target instances. + When: + They are placed in a set and used as dict keys. + Then: + It should dedup equal instances and resolve a key built from a + separately constructed equal instance — the contract the + operator-expander registry (#138) relies on. + """ + # Arrange + registry = {(DuckDBTarget(), "Intersects"): "expander"} + + # Act & assert + assert len({DuckDBTarget(), DuckDBTarget(), GenericTarget()}) == 2 + assert registry[(DuckDBTarget(), "Intersects")] == "expander" + + def test_resolve_target_with_none_returns_generic(): """Test resolution of the default dialect. @@ -271,10 +323,12 @@ def test_resolve_target_with_unsupported_dialect_raises(dialect): supported targets. """ # Act & assert - with pytest.raises( - ValueError, - match=rf"Unknown dialect: {dialect!r}\..*'duckdb', 'datafusion', or None\.", - ): + pattern = ( + re.escape(f"Unknown dialect: {dialect!r}.") + + r".*" + + re.escape("'duckdb', 'datafusion', or None.") + ) + with pytest.raises(ValueError, match=pattern): resolve_target(dialect) diff --git a/tests/test_transpile.py b/tests/test_transpile.py index e43a1e0..ea770a8 100644 --- a/tests/test_transpile.py +++ b/tests/test_transpile.py @@ -462,16 +462,19 @@ def test_transpile_datafusion_accepted(self): ids=["intersects_literal", "contains", "within", "nearest", "join", "any"], ) def test_transpile_datafusion_matches_generic_output(self, query, tables): - """Test that datafusion routes through the generic path for now. + """Test that datafusion is currently a pure alias for the generic target. Given: - A GIQL query across the operator spread, transpiled with - dialect=None and with dialect="datafusion". + A GIQL query transpiled with dialect=None and with + dialect="datafusion". The operator spread is a regression net, + not codepath coverage: datafusion has no engine-specific + expansion yet, so every query funnels through the same generic + path. When: Comparing the two outputs. Then: - It should produce byte-identical SQL, since datafusion has no - engine-specific expansion yet. + It should produce byte-identical SQL for every query, pinning + datafusion as a generic alias until later steps diverge it. """ # Act generic_sql = transpile(query, tables=tables) @@ -557,7 +560,7 @@ def test_transpile_duckdb_rejects_intersects_bin_size(self): "JOIN genes b ON a.interval INTERSECTS b.interval", tables=["peaks", "genes"], dialect="duckdb", - intersects_bin_size=50000, + intersects_bin_size=50000, # type: ignore[call-overload] ) def test_transpile_duckdb_rejects_zero_intersects_bin_size(self): @@ -578,7 +581,7 @@ def test_transpile_duckdb_rejects_zero_intersects_bin_size(self): "JOIN genes b ON a.interval INTERSECTS b.interval", tables=["peaks", "genes"], dialect="duckdb", - intersects_bin_size=0, + intersects_bin_size=0, # type: ignore[call-overload] )