diff --git a/src/giql/targets.py b/src/giql/targets.py new file mode 100644 index 0000000..6fd29d3 --- /dev/null +++ b/src/giql/targets.py @@ -0,0 +1,173 @@ +"""Target-engine model for GIQL transpilation. + +A :class:`Target` is a first-class SQL target engine: it carries a +:class:`Capabilities` set describing what the engine supports and the +sqlglot output dialect used to serialize standard AST for that engine. + +This is step 1 of epic #137. The targets and capability descriptors defined +here are the foundation that later steps build on — the operator-expander +registry is keyed by ``(target, operator)`` and emission choices become +capability lookups rather than scattered ``if dialect == ...`` branches. +At this step the model is wired into :func:`giql.transpile.transpile` only +to resolve the ``dialect`` parameter and drive the existing DuckDB IEJoin +gate; no emission behaviour changes. +""" + +from dataclasses import dataclass +from typing import Literal + +RangeJoinStrategy = Literal["binned", "iejoin"] + + +@dataclass(frozen=True) +class Capabilities: + """Feature set of a SQL target engine. + + Each field is a portable choice that later steps of epic #137 turn into + a capability lookup instead of a hardcoded dialect branch. + + Parameters + ---------- + supports_lateral : bool + Whether the engine supports ``LATERAL`` / correlated joins. Will + drive the NEAREST LATERAL-vs-window-function strategy (#142). Until + then, :attr:`giql.generators.base.BaseGIQLGenerator.SUPPORTS_LATERAL` + remains the live source of truth at generation time; #142 reconciles + the two. + supports_star_replace : bool + Whether the engine supports ``SELECT * REPLACE (...)``. Drives the + coordinate-canonicalization output: ``* REPLACE`` where supported, + an explicit portable projection otherwise (#143). Supported by + DuckDB / BigQuery / Snowflake / ClickHouse; not by PostgreSQL, + SQLite, or DataFusion. + supports_qualify : bool + Whether the engine supports the ``QUALIFY`` clause. Reserved: no + emission path consumes it yet (a future window-function operator + port would). + range_join_strategy : RangeJoinStrategy + The plan used for column-to-column INTERSECTS joins: ``"binned"`` + for the generic binned equi-join, ``"iejoin"`` for DuckDB's + per-partition IEJoin plan. The IEJoin path covers INNER / SEMI / + ANTI joins, with binned fallback for unsupported shapes. + """ + + supports_lateral: bool + supports_star_replace: bool + supports_qualify: bool + range_join_strategy: RangeJoinStrategy + + +@dataclass(frozen=True) +class Target: + """A SQL target engine. + + Subclasses declare the engine ``name``, the ``sqlglot_dialect`` used to + serialize AST for that engine (``None`` selects sqlglot's default + generic serialization), and the engine ``capabilities``. + + Targets are frozen, value-equal, and hashable: two ``DuckDBTarget()`` + instances compare equal and hash alike, so the operator-expander registry + (#138) can key on a resolved target by value. Equality is class-scoped — + ``GenericTarget() != DataFusionTarget()`` even where their fields overlap. + """ + + name: str + sqlglot_dialect: str | None + capabilities: Capabilities + + +@dataclass(frozen=True) +class GenericTarget(Target): + """Portable SQL-92-ish target with no engine-specific features. + + This is the default target (``dialect=None``). Its capabilities are the + conservative, maximally portable baseline that matches today's + :class:`giql.generators.base.BaseGIQLGenerator` output. + """ + + name: str = "generic" + sqlglot_dialect: str | None = None + capabilities: Capabilities = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + +@dataclass(frozen=True) +class DuckDBTarget(Target): + """DuckDB target. + + Serializes through sqlglot's ``duckdb`` dialect and uses the IEJoin + per-partition plan for column-to-column INTERSECTS joins. + """ + + name: str = "duckdb" + sqlglot_dialect: str | None = "duckdb" + capabilities: Capabilities = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + + +@dataclass(frozen=True) +class DataFusionTarget(Target): + """Apache DataFusion target. + + sqlglot has no DataFusion dialect, so serialization falls back to the + generic form (``sqlglot_dialect = None``) for now; #145 finalizes + DataFusion serialization. The capability values below are conservative + and provisional — they are validated against a real DataFusion engine + when the operator migrations exercise them (#142, #145). DataFusion + supports ``* EXCEPT`` / ``* EXCLUDE`` but not ``* REPLACE``. + """ + + name: str = "datafusion" + sqlglot_dialect: str | None = None + capabilities: Capabilities = Capabilities( + supports_lateral=False, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + +# Public dialect names only. ``generic`` is intentionally absent: ``None`` is +# the sole public way to select it (see :func:`resolve_target`). +_TARGETS_BY_NAME: dict[str, type[Target]] = { + DuckDBTarget.name: DuckDBTarget, + DataFusionTarget.name: DataFusionTarget, +} + + +def resolve_target(dialect: str | None) -> Target: + """Resolve a ``dialect`` parameter to a :class:`Target` instance. + + Parameters + ---------- + dialect : str | None + The target dialect name. ``None`` resolves to :class:`GenericTarget`; + ``"duckdb"`` and ``"datafusion"`` resolve to their respective targets. + + Returns + ------- + Target + The resolved target instance. + + Raises + ------ + ValueError + If *dialect* is not a recognized target name. + """ + if dialect is None: + return GenericTarget() + + target_cls = _TARGETS_BY_NAME.get(dialect) + if target_cls is None: + raise ValueError( + f"Unknown dialect: {dialect!r}. Supported: 'duckdb', 'datafusion', or None." + ) + return target_cls() diff --git a/src/giql/transpile.py b/src/giql/transpile.py index 5bfe894..f5462b4 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -17,6 +17,7 @@ from giql.resolver import resolve_operator_refs from giql.table import Table from giql.table import Tables +from giql.targets import resolve_target from giql.transformer import ClusterTransformer from giql.transformer import IntersectsBinnedJoinTransformer from giql.transformer import IntersectsDuckDBIEJoinTransformer @@ -28,7 +29,7 @@ def transpile( giql: str, tables: list[str | Table] | None = None, *, - dialect: None = None, + dialect: Literal["datafusion"] | None = None, intersects_bin_size: int | None = None, ) -> str: ... @@ -47,7 +48,7 @@ def transpile( giql: str, tables: list[str | Table] | None = None, *, - dialect: Literal["duckdb"] | None = None, + dialect: Literal["duckdb", "datafusion"] | None = None, intersects_bin_size: int | None = None, ) -> str: """Transpile a GIQL query to SQL. @@ -64,16 +65,19 @@ def transpile( Table configurations. Strings use default column mappings (chrom, start, end, strand). :class:`Table` objects provide custom column name mappings. - dialect : Literal["duckdb"] | None - Optional target dialect. When set to ``"duckdb"``, column-to-column + dialect : Literal["duckdb", "datafusion"] | None + Optional target engine. Resolves to a :class:`giql.targets.Target` + carrying the engine's capability set; ``None`` selects the generic + portable target. When set to ``"duckdb"``, column-to-column ``INTERSECTS`` joins (INNER, SEMI, or ANTI) are transpiled into a per-chromosome dynamic-SQL pattern (``SET VARIABLE`` + ``query(getvariable(...))``) that DuckDB plans through its - range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``). - Mutually exclusive with ``intersects_bin_size``. Defaults to - ``None`` (the generic binned equi-join path). Hard-error projection - shapes raise ``ValueError`` at transpile time; see the performance - guide for the full enumeration. + range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``); this + IEJoin plan is mutually exclusive with ``intersects_bin_size``. + ``"datafusion"`` and ``None`` use the generic binned equi-join path + and accept ``intersects_bin_size``. Hard-error projection shapes + raise ``ValueError`` at transpile time; see the performance guide + for the full enumeration. intersects_bin_size : int | None Bin size for INTERSECTS equi-join optimization. When a query contains a full-table column-to-column INTERSECTS join, the @@ -136,12 +140,12 @@ def transpile( dialect="duckdb", ) """ - if dialect is not None and dialect != "duckdb": - raise ValueError(f"Unknown dialect: {dialect!r}. Supported: 'duckdb' or None.") - if dialect == "duckdb" and intersects_bin_size is not None: + target = resolve_target(dialect) + uses_iejoin = target.capabilities.range_join_strategy == "iejoin" + if uses_iejoin and intersects_bin_size is not None: raise ValueError( - "intersects_bin_size has no effect with dialect='duckdb'; " - "the DuckDB dialect uses an IEJoin per-partition plan instead " + f"intersects_bin_size has no effect with dialect={target.name!r}; " + f"the {target.name} target uses an IEJoin per-partition plan instead " "of the binned equi-join. Pass one or the other, not both." ) @@ -153,7 +157,7 @@ def transpile( # Falls back to the binned plan for unsupported shapes — see # IntersectsDuckDBIEJoinTransformer.transform_to_sql for the complete # fallback set. - if dialect == "duckdb": + if uses_iejoin: duckdb_transformer = IntersectsDuckDBIEJoinTransformer(tables_container) with _reraise_as_value_error("Transformation error"): duckdb_sql = duckdb_transformer.transform_to_sql(ast) diff --git a/tests/integration/datafusion/__init__.py b/tests/integration/datafusion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/datafusion/conftest.py b/tests/integration/datafusion/conftest.py new file mode 100644 index 0000000..f57d7bc --- /dev/null +++ b/tests/integration/datafusion/conftest.py @@ -0,0 +1,49 @@ +"""Fixtures for DataFusion target execution smoke tests (issue #132). + +These tests prove that SQL produced via ``transpile(..., dialect="datafusion")`` +parses and executes on a real DataFusion engine. The broad cross-target result +oracle and the full DataFusion integration lane are deferred to #139. +""" + +import pytest + +pytest.importorskip("datafusion") +pytest.importorskip("pyarrow") + +pytestmark = pytest.mark.integration + + +@pytest.fixture +def datafusion_ctx(): + """Return a builder that registers interval tables into a SessionContext. + + The returned callable accepts ``name=[(chrom, start, end), ...]`` keyword + arguments and yields a DataFusion ``SessionContext`` with each table + registered under the default ``chrom``/``start``/``end`` schema (matching + the default :class:`giql.Table` column mapping). + """ + import pyarrow as pa + from datafusion import SessionContext + + schema = pa.schema( + [ + ("chrom", pa.utf8()), + ("start", pa.int64()), + ("end", pa.int64()), + ] + ) + + def _build(**tables): + ctx = SessionContext() + for name, rows in tables.items(): + arrays = { + "chrom": [r[0] for r in rows], + "start": [r[1] for r in rows], + "end": [r[2] for r in rows], + } + ctx.register_record_batches( + name, [pa.table(arrays, schema=schema).to_batches()] + ) + return ctx + + return _build diff --git a/tests/integration/datafusion/test_datafusion_target.py b/tests/integration/datafusion/test_datafusion_target.py new file mode 100644 index 0000000..96d32ea --- /dev/null +++ b/tests/integration/datafusion/test_datafusion_target.py @@ -0,0 +1,129 @@ +"""DataFusion execution smoke tests for the ``dialect="datafusion"`` target. + +Issue #132 makes ``"datafusion"`` a routing key in :func:`giql.transpile`. The +existing ``TestBinnedJoinDataFusion`` suite proves the binned plan executes on +DataFusion, but always with ``dialect`` omitted (``None``); nothing executes the +``dialect="datafusion"`` entry point end-to-end. These tests close that seam. +""" + +import pytest + +from giql import transpile + +pytestmark = pytest.mark.integration + + +class TestDataFusionTargetExecution: + """End-to-end execution of dialect="datafusion" output on DataFusion.""" + + def test_intersects_literal_returns_overlapping_row(self, datafusion_ctx): + """Test a literal INTERSECTS query through the datafusion target. + + Given: + A peaks table in a real DataFusion context with one overlapping + interval, one non-overlapping interval, and one on another + chromosome. + When: + A literal-range INTERSECTS query is transpiled with + dialect="datafusion" and executed. + Then: + It should parse, execute, and return exactly the overlapping + chr1 row. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[ + ("chr1", 1500, 1800), # overlaps chr1:1000-2000 + ("chr1", 5000, 6000), # same chrom, no overlap + ("chr2", 1500, 1800), # overlapping position, wrong chrom + ] + ) + sql = transpile( + 'SELECT chrom, start, "end" FROM peaks ' + "WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Act + df = ctx.sql(sql).to_pandas() + + # Assert + assert len(df) == 1 + assert df.iloc[0]["chrom"] == "chr1" + assert df.iloc[0]["start"] == 1500 + + def test_intersects_literal_no_overlap_returns_zero_rows(self, datafusion_ctx): + """Test that a non-overlapping literal INTERSECTS yields no rows. + + Given: + A peaks table in a real DataFusion context with no interval + overlapping the query range. + When: + The literal INTERSECTS query is transpiled with + dialect="datafusion" and executed. + Then: + It should execute and return zero rows. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[ + ("chr1", 5000, 6000), + ("chr2", 1500, 1800), + ] + ) + sql = transpile( + 'SELECT chrom, start, "end" FROM peaks ' + "WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Act + df = ctx.sql(sql).to_pandas() + + # Assert + assert len(df) == 0 + + def test_intersects_join_with_bin_size_dedups_across_bins(self, datafusion_ctx): + """Test the datafusion-routed binned join honours an explicit bin size. + + Given: + Two interval tables in a real DataFusion context whose single + intervals overlap and span several bins under a small bin size. + When: + A column-to-column INTERSECTS join is transpiled with + dialect="datafusion" and an explicit intersects_bin_size, then + executed. + Then: + It should be accepted (not rejected as it would be for duckdb), + the explicit bin size should reach emission (changing the SQL vs + the default), and execution should return exactly one row — the + overlap, de-duplicated by the pairs CTE across the shared bins. + """ + # Arrange + ctx = datafusion_ctx( + peaks=[("chr1", 0, 5000)], + genes=[("chr1", 500, 4500)], + ) + query = ( + 'SELECT a.chrom, a.start, a."end", ' + 'b.start AS b_start, b."end" AS b_end ' + "FROM peaks a JOIN genes b ON a.interval INTERSECTS b.interval" + ) + default_sql = transpile(query, tables=["peaks", "genes"], dialect="datafusion") + sized_sql = transpile( + query, + tables=["peaks", "genes"], + dialect="datafusion", + intersects_bin_size=1000, + ) + + # Act + df = ctx.sql(sized_sql).to_pandas() + + # Assert + assert sized_sql != default_sql # the explicit bin size reaches emission + assert len(df) == 1 # de-duplicated across the shared bins + assert df.iloc[0]["start"] == 0 + assert df.iloc[0]["b_start"] == 500 diff --git a/tests/test_targets.py b/tests/test_targets.py new file mode 100644 index 0000000..14d10a6 --- /dev/null +++ b/tests/test_targets.py @@ -0,0 +1,349 @@ +"""Tests for the target-engine model.""" + +import re +from dataclasses import FrozenInstanceError + +import pytest +from hypothesis import given +from hypothesis import strategies as st + +from giql.targets import Capabilities +from giql.targets import DataFusionTarget +from giql.targets import DuckDBTarget +from giql.targets import GenericTarget +from giql.targets import resolve_target + + +class TestCapabilities: + """Tests for the Capabilities dataclass.""" + + def test___init___stores_all_descriptors(self): + """Test that Capabilities records each descriptor. + + Given: + A full set of capability descriptor values. + When: + A Capabilities instance is constructed. + Then: + It should expose each value on the corresponding attribute. + """ + # Act + caps = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=True, + range_join_strategy="binned", + ) + + # Assert + assert caps.supports_lateral is True + assert caps.supports_star_replace is False + assert caps.supports_qualify is True + assert caps.range_join_strategy == "binned" + + def test___eq___with_identical_values(self): + """Test value equality of Capabilities. + + Given: + Two Capabilities built from identical descriptor values. + When: + They are compared for equality. + Then: + It should treat them as equal (frozen value semantics). + """ + # Arrange + first = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + second = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + + # Act & assert + assert first == second + + def test___eq___with_one_differing_field(self): + """Test value inequality of Capabilities. + + Given: + Two Capabilities identical in every descriptor except one. + When: + They are compared for equality. + Then: + It should treat them as unequal (per-field value semantics). + """ + # Arrange + base = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="iejoin", + ) + differing = Capabilities( + supports_lateral=True, + supports_star_replace=True, + supports_qualify=True, + range_join_strategy="binned", + ) + + # Act & assert + assert base != differing + + def test___setattr___is_frozen(self): + """Test that Capabilities is immutable. + + Given: + A constructed Capabilities instance. + When: + An attribute is reassigned. + Then: + It should raise FrozenInstanceError. + """ + # Arrange + caps = Capabilities( + supports_lateral=True, + supports_star_replace=False, + supports_qualify=False, + range_join_strategy="binned", + ) + + # Act & assert + with pytest.raises(FrozenInstanceError): + caps.supports_lateral = False # type: ignore[misc] + + +class TestGenericTarget: + """Tests for the GenericTarget.""" + + def test___init___exposes_portable_baseline(self): + """Test the generic target's identity and capabilities. + + Given: + The GenericTarget class. + When: + An instance is constructed. + Then: + It should carry the portable SQL-92 baseline: no engine dialect, + lateral supported, no star-REPLACE, no QUALIFY, binned joins. + """ + # Act + target = GenericTarget() + + # Assert + assert target.name == "generic" + assert target.sqlglot_dialect is None + assert target.capabilities.supports_lateral is True + assert target.capabilities.supports_star_replace is False + assert target.capabilities.supports_qualify is False + assert target.capabilities.range_join_strategy == "binned" + + +class TestDuckDBTarget: + """Tests for the DuckDBTarget.""" + + def test___init___exposes_duckdb_features(self): + """Test the DuckDB target's identity and capabilities. + + Given: + The DuckDBTarget class. + When: + An instance is constructed. + Then: + It should serialize through the duckdb dialect and enable + lateral, star-REPLACE, QUALIFY, and the IEJoin strategy. + """ + # Act + target = DuckDBTarget() + + # Assert + assert target.name == "duckdb" + assert target.sqlglot_dialect == "duckdb" + assert target.capabilities.supports_lateral is True + assert target.capabilities.supports_star_replace is True + assert target.capabilities.supports_qualify is True + assert target.capabilities.range_join_strategy == "iejoin" + + +class TestDataFusionTarget: + """Tests for the DataFusionTarget.""" + + def test___init___exposes_conservative_capabilities(self): + """Test the DataFusion target's identity and capabilities. + + Given: + The DataFusionTarget class. + When: + An instance is constructed. + Then: + It should fall back to generic serialization (no sqlglot + DataFusion dialect), disable star-REPLACE and QUALIFY, and use + the binned join strategy. + """ + # Act + target = DataFusionTarget() + + # Assert + assert target.name == "datafusion" + assert target.sqlglot_dialect is None + assert target.capabilities.supports_lateral is False + assert target.capabilities.supports_star_replace is False + assert target.capabilities.supports_qualify is False + assert target.capabilities.range_join_strategy == "binned" + + +class TestTarget: + """Tests for the shared Target value semantics.""" + + def test___eq___same_target_type(self): + """Test that two instances of the same target are value-equal. + + Given: + Two separately constructed DuckDBTarget instances. + When: + They are compared for equality. + Then: + It should treat them as equal (frozen value semantics), so a + resolved target is a stable registry key. + """ + # Act & assert + assert DuckDBTarget() == DuckDBTarget() + + def test___eq___different_target_types(self): + """Test that distinct target classes are never equal. + + Given: + A GenericTarget and a DataFusionTarget (whose fields partly + overlap). + When: + They are compared for equality. + Then: + It should treat them as unequal, since equality is class-scoped. + """ + # Act & assert + assert GenericTarget() != DataFusionTarget() + + def test___hash___supports_set_and_dict_keys(self): + """Test that targets are hashable and usable as keys. + + Given: + Value-equal and value-distinct target instances. + When: + They are placed in a set and used as dict keys. + Then: + It should dedup equal instances and resolve a key built from a + separately constructed equal instance — the contract the + operator-expander registry (#138) relies on. + """ + # Arrange + registry = {(DuckDBTarget(), "Intersects"): "expander"} + + # Act & assert + assert len({DuckDBTarget(), DuckDBTarget(), GenericTarget()}) == 2 + assert registry[(DuckDBTarget(), "Intersects")] == "expander" + + +def test_resolve_target_with_none_returns_generic(): + """Test resolution of the default dialect. + + Given: + A None dialect argument. + When: + resolve_target is called. + Then: + It should return a GenericTarget. + """ + # Act + target = resolve_target(None) + + # Assert + assert isinstance(target, GenericTarget) + + +def test_resolve_target_with_duckdb_returns_duckdb(): + """Test resolution of the duckdb dialect. + + Given: + The dialect name "duckdb". + When: + resolve_target is called. + Then: + It should return a DuckDBTarget. + """ + # Act + target = resolve_target("duckdb") + + # Assert + assert isinstance(target, DuckDBTarget) + + +def test_resolve_target_with_datafusion_returns_datafusion(): + """Test resolution of the datafusion dialect. + + Given: + The dialect name "datafusion". + When: + resolve_target is called. + Then: + It should return a DataFusionTarget. + """ + # Act + target = resolve_target("datafusion") + + # Assert + assert isinstance(target, DataFusionTarget) + + +@pytest.mark.parametrize( + "dialect", + [ + "postgres", # plain unknown engine + "sqlite", # second unknown — guards against a hardcoded message + "DuckDB", # case-variant of a real target — lookup is case-sensitive + "", # empty string is distinct from None + "generic", # the internal target name — None is the only public path + ], +) +def test_resolve_target_with_unsupported_dialect_raises(dialect): + """Test resolution of dialect names that are not public targets. + + Given: + A dialect string that is not one of the supported public names + (an unknown engine, a case-variant, the empty string, or the + internal "generic" name). + When: + resolve_target is called. + Then: + It should raise ValueError naming the offending value and the + supported targets. + """ + # Act & assert + pattern = ( + re.escape(f"Unknown dialect: {dialect!r}.") + + r".*" + + re.escape("'duckdb', 'datafusion', or None.") + ) + with pytest.raises(ValueError, match=pattern): + resolve_target(dialect) + + +@given(st.text().filter(lambda s: s not in ("duckdb", "datafusion"))) +def test_resolve_target_with_arbitrary_unsupported_string_raises(dialect): + """Test resolution over the open domain of unsupported strings. + + Given: + Any string that is not a registered public dialect name (the only + public string names are "duckdb" and "datafusion"). + When: + resolve_target is called. + Then: + It should raise ValueError rather than returning a target. + """ + # Act & assert + with pytest.raises(ValueError, match="Unknown dialect"): + resolve_target(dialect) diff --git a/tests/test_transpile.py b/tests/test_transpile.py index e0f54fe..ea770a8 100644 --- a/tests/test_transpile.py +++ b/tests/test_transpile.py @@ -389,6 +389,201 @@ def test_invalid_syntax(self): with pytest.raises(ValueError, match="Parse error"): transpile("SELECT * FORM peaks") # typo: FORM instead of FROM + def test_unknown_dialect(self): + """Test transpilation with an unrecognized dialect. + + Given: + A GIQL query and a dialect that is not a supported target. + When: + Transpiling. + Then: + It should raise ValueError naming the supported targets. + """ + # Act & assert + with pytest.raises(ValueError, match="Unknown dialect: 'postgres'"): + transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="postgres", # type: ignore[call-overload] + ) + + +class TestTranspileDialects: + """Tests for dialect/target selection.""" + + def test_transpile_datafusion_accepted(self): + """Test that the datafusion dialect is a valid target. + + Given: + A GIQL query and dialect="datafusion". + When: + Transpiling. + Then: + It should return SQL referencing the table without raising. + """ + # Act + sql = transpile( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + tables=["peaks"], + dialect="datafusion", + ) + + # Assert + assert "SELECT" in sql + assert "peaks" in sql + + @pytest.mark.parametrize( + "query, tables", + [ + ( + "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", + ["peaks"], + ), + ("SELECT * FROM peaks WHERE interval CONTAINS 'chr1:1500'", ["peaks"]), + ( + "SELECT * FROM peaks WHERE interval WITHIN 'chr1:1000-2000'", + ["peaks"], + ), + ( + "SELECT * FROM NEAREST(genes, reference := 'chr1:1000-2000', k := 3)", + ["genes"], + ), + ( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + ["peaks", "genes"], + ), + ( + "SELECT * FROM peaks WHERE interval INTERSECTS " + "ANY('chr1:1000-2000', 'chr1:5000-6000')", + ["peaks"], + ), + ], + ids=["intersects_literal", "contains", "within", "nearest", "join", "any"], + ) + def test_transpile_datafusion_matches_generic_output(self, query, tables): + """Test that datafusion is currently a pure alias for the generic target. + + Given: + A GIQL query transpiled with dialect=None and with + dialect="datafusion". The operator spread is a regression net, + not codepath coverage: datafusion has no engine-specific + expansion yet, so every query funnels through the same generic + path. + When: + Comparing the two outputs. + Then: + It should produce byte-identical SQL for every query, pinning + datafusion as a generic alias until later steps diverge it. + """ + # Act + generic_sql = transpile(query, tables=tables) + datafusion_sql = transpile(query, tables=tables, dialect="datafusion") + + # Assert + assert datafusion_sql == generic_sql + + def test_transpile_datafusion_accepts_intersects_bin_size(self): + """Test that datafusion honours the binned-join bin size identically. + + Given: + A column-to-column INTERSECTS join with an explicit + intersects_bin_size, transpiled with dialect=None and + dialect="datafusion". + When: + Comparing the two outputs. + Then: + It should produce byte-identical SQL — datafusion takes the same + binned-join path and honours the bin size. + """ + # Arrange + query = ( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval" + ) + + # Act + generic_sql = transpile( + query, tables=["peaks", "genes"], intersects_bin_size=50000 + ) + datafusion_sql = transpile( + query, + tables=["peaks", "genes"], + dialect="datafusion", + intersects_bin_size=50000, + ) + + # Assert + assert datafusion_sql == generic_sql + + def test_transpile_duckdb_returns_sql_for_column_join(self): + """Test the duckdb happy path for a column-to-column INTERSECTS join. + + Given: + A column-to-column INTERSECTS join and dialect="duckdb". + When: + Transpiling. + Then: + It should return SQL referencing the joined tables without raising. + """ + # Act + sql = transpile( + "SELECT a.chrom, a.start, b.start FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + ) + + # Assert + assert "SELECT" in sql + assert "peaks" in sql + assert "genes" in sql + + def test_transpile_duckdb_rejects_intersects_bin_size(self): + """Test that duckdb and intersects_bin_size are mutually exclusive. + + Given: + dialect="duckdb" combined with an explicit intersects_bin_size. + When: + Transpiling. + Then: + It should raise ValueError explaining the IEJoin plan ignores + the bin size. + """ + # Act & assert + with pytest.raises( + ValueError, + match=r"intersects_bin_size has no effect.*Pass one or the other, not both\.", + ): + transpile( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + intersects_bin_size=50000, # type: ignore[call-overload] + ) + + def test_transpile_duckdb_rejects_zero_intersects_bin_size(self): + """Test that a falsy-but-set bin size still triggers the duckdb guard. + + Given: + dialect="duckdb" combined with intersects_bin_size=0. + When: + Transpiling. + Then: + It should raise ValueError — the guard is `is not None`, so 0 + (falsy) still conflicts with the IEJoin plan. + """ + # Act & assert + with pytest.raises(ValueError, match="intersects_bin_size has no effect"): + transpile( + "SELECT a.chrom, b.chrom FROM peaks a " + "JOIN genes b ON a.interval INTERSECTS b.interval", + tables=["peaks", "genes"], + dialect="duckdb", + intersects_bin_size=0, # type: ignore[call-overload] + ) + class TestModuleExports: """Tests for module-level exports."""