Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions src/giql/targets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""Target-engine model for GIQL transpilation.

A :class:`Target` is a first-class SQL target engine: it carries a
:class:`Capabilities` set describing what the engine supports and the
sqlglot output dialect used to serialize standard AST for that engine.

This is step 1 of epic #137. The targets and capability descriptors defined
here are the foundation that later steps build on — the operator-expander
registry is keyed by ``(target, operator)`` and emission choices become
capability lookups rather than scattered ``if dialect == ...`` branches.
At this step the model is wired into :func:`giql.transpile.transpile` only
to resolve the ``dialect`` parameter and drive the existing DuckDB IEJoin
gate; no emission behaviour changes.
"""

from dataclasses import dataclass
from typing import Literal

RangeJoinStrategy = Literal["binned", "iejoin"]


@dataclass(frozen=True)
class Capabilities:
    """Feature set of a SQL target engine.

    Each field is a portable choice that later steps of epic #137 turn into
    a capability lookup instead of a hardcoded dialect branch.

    Instances are immutable (``frozen=True``) and compare by field value.
    No field has a default, so every target must state each capability
    explicitly.

    Parameters
    ----------
    supports_lateral : bool
        Whether the engine supports ``LATERAL`` / correlated joins. Will
        drive the NEAREST LATERAL-vs-window-function strategy (#142). Until
        then, :attr:`giql.generators.base.BaseGIQLGenerator.SUPPORTS_LATERAL`
        remains the live source of truth at generation time; #142 reconciles
        the two.
    supports_star_replace : bool
        Whether the engine supports ``SELECT * REPLACE (...)``. Drives the
        coordinate-canonicalization output: ``* REPLACE`` where supported,
        an explicit portable projection otherwise (#143). Supported by
        DuckDB / BigQuery / Snowflake / ClickHouse; not by PostgreSQL,
        SQLite, or DataFusion.
    supports_qualify : bool
        Whether the engine supports the ``QUALIFY`` clause. Reserved: no
        emission path consumes it yet (a future window-function operator
        port would).
    range_join_strategy : RangeJoinStrategy
        The plan used for column-to-column INTERSECTS joins: ``"binned"``
        for the generic binned equi-join, ``"iejoin"`` for DuckDB's
        per-partition IEJoin plan. The IEJoin path covers INNER / SEMI /
        ANTI joins, with binned fallback for unsupported shapes.
    """

    # Declaration order is also the positional constructor order; reordering
    # these fields would change the meaning of positional calls.
    supports_lateral: bool
    supports_star_replace: bool
    supports_qualify: bool
    range_join_strategy: RangeJoinStrategy


@dataclass(frozen=True)
class Target:
    """A SQL target engine.

    Subclasses declare the engine ``name``, the ``sqlglot_dialect`` used to
    serialize AST for that engine (``None`` selects sqlglot's default
    generic serialization), and the engine ``capabilities``.

    Targets are frozen, value-equal, and hashable: two ``DuckDBTarget()``
    instances compare equal and hash alike, so the operator-expander registry
    (#138) can key on a resolved target by value. Equality is class-scoped —
    ``GenericTarget() != DataFusionTarget()`` even where their fields overlap.

    Parameters
    ----------
    name : str
        The engine name.
    sqlglot_dialect : str | None
        The sqlglot output dialect used to serialize AST for the engine;
        ``None`` selects sqlglot's default generic serialization.
    capabilities : Capabilities
        The feature set the engine supports.
    """

    # The base class declares no defaults; each concrete subclass supplies
    # its own values for all three fields.
    name: str
    sqlglot_dialect: str | None
    capabilities: Capabilities


@dataclass(frozen=True)
class GenericTarget(Target):
    """Engine-agnostic target emitting portable, SQL-92-ish output.

    Selected by ``dialect=None``, which makes it the default target. It
    enables no engine-specific features: the capability set is the
    conservative, maximally portable baseline and mirrors what
    :class:`giql.generators.base.BaseGIQLGenerator` produces today.
    """

    name: str = "generic"
    sqlglot_dialect: str | None = None
    capabilities: Capabilities = Capabilities(
        range_join_strategy="binned",
        supports_lateral=True,
        supports_star_replace=False,
        supports_qualify=False,
    )


@dataclass(frozen=True)
class DuckDBTarget(Target):
    """Target for the DuckDB engine.

    AST is serialized with sqlglot's ``duckdb`` dialect, and
    column-to-column INTERSECTS joins use the per-partition IEJoin plan
    rather than the binned equi-join.
    """

    name: str = "duckdb"
    sqlglot_dialect: str | None = "duckdb"
    capabilities: Capabilities = Capabilities(
        range_join_strategy="iejoin",
        supports_lateral=True,
        supports_star_replace=True,
        supports_qualify=True,
    )


@dataclass(frozen=True)
class DataFusionTarget(Target):
    """Target for the Apache DataFusion engine.

    There is no DataFusion dialect in sqlglot, so for now serialization
    uses the generic form (``sqlglot_dialect = None``); #145 finalizes
    DataFusion serialization. The capabilities declared here are
    conservative and provisional: they get validated against a real
    DataFusion engine once the operator migrations exercise them (#142,
    #145). DataFusion supports ``* EXCEPT`` / ``* EXCLUDE`` but not
    ``* REPLACE``.
    """

    name: str = "datafusion"
    sqlglot_dialect: str | None = None
    capabilities: Capabilities = Capabilities(
        range_join_strategy="binned",
        supports_lateral=False,
        supports_star_replace=False,
        supports_qualify=False,
    )


# Registry of the publicly selectable dialect names, in declaration order.
# ``generic`` is deliberately left out: passing ``None`` is the only public
# way to select it (see :func:`resolve_target`).
_TARGETS_BY_NAME: dict[str, type[Target]] = {
    target_cls.name: target_cls for target_cls in (DuckDBTarget, DataFusionTarget)
}


def resolve_target(dialect: str | None) -> Target:
    """Resolve a ``dialect`` parameter to a :class:`Target` instance.

    Parameters
    ----------
    dialect : str | None
        The target dialect name. ``None`` resolves to :class:`GenericTarget`;
        every other accepted value is a key of the module-level target
        registry (currently ``"duckdb"`` and ``"datafusion"``).

    Returns
    -------
    Target
        A fresh instance of the resolved target class.

    Raises
    ------
    ValueError
        If *dialect* is not a recognized target name. This includes
        non-string values: an unhashable argument such as a list is
        reported as an unknown dialect rather than leaking ``TypeError``
        from the registry lookup.
    """
    if dialect is None:
        return GenericTarget()

    # Only strings can be registry keys. Checking the type first keeps an
    # unhashable argument from raising TypeError inside ``dict.get``, so
    # callers see the documented ValueError for every bad input.
    target_cls = _TARGETS_BY_NAME.get(dialect) if isinstance(dialect, str) else None
    if target_cls is None:
        # Derive the list from the registry so the message cannot drift when
        # a target is added; dict order matches declaration order.
        supported = ", ".join(repr(name) for name in _TARGETS_BY_NAME)
        raise ValueError(
            f"Unknown dialect: {dialect!r}. Supported: {supported}, or None."
        )
    return target_cls()
34 changes: 19 additions & 15 deletions src/giql/transpile.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from giql.resolver import resolve_operator_refs
from giql.table import Table
from giql.table import Tables
from giql.targets import resolve_target
from giql.transformer import ClusterTransformer
from giql.transformer import IntersectsBinnedJoinTransformer
from giql.transformer import IntersectsDuckDBIEJoinTransformer
Expand All @@ -28,7 +29,7 @@ def transpile(
giql: str,
tables: list[str | Table] | None = None,
*,
dialect: None = None,
dialect: Literal["datafusion"] | None = None,
intersects_bin_size: int | None = None,
) -> str: ...

Expand All @@ -47,7 +48,7 @@ def transpile(
giql: str,
tables: list[str | Table] | None = None,
*,
dialect: Literal["duckdb"] | None = None,
dialect: Literal["duckdb", "datafusion"] | None = None,
intersects_bin_size: int | None = None,
) -> str:
"""Transpile a GIQL query to SQL.
Expand All @@ -64,16 +65,19 @@ def transpile(
Table configurations. Strings use default column mappings
(chrom, start, end, strand). :class:`Table` objects provide
custom column name mappings.
dialect : Literal["duckdb"] | None
Optional target dialect. When set to ``"duckdb"``, column-to-column
dialect : Literal["duckdb", "datafusion"] | None
Optional target engine. Resolves to a :class:`giql.targets.Target`
carrying the engine's capability set; ``None`` selects the generic
portable target. When set to ``"duckdb"``, column-to-column
``INTERSECTS`` joins (INNER, SEMI, or ANTI) are transpiled into a
per-chromosome dynamic-SQL pattern (``SET VARIABLE`` +
``query(getvariable(...))``) that DuckDB plans through its
range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``).
Mutually exclusive with ``intersects_bin_size``. Defaults to
``None`` (the generic binned equi-join path). Hard-error projection
shapes raise ``ValueError`` at transpile time; see the performance
guide for the full enumeration.
range-join family (``IE_JOIN`` / ``PIECEWISE_MERGE_JOIN``); this
IEJoin plan is mutually exclusive with ``intersects_bin_size``.
``"datafusion"`` and ``None`` use the generic binned equi-join path
and accept ``intersects_bin_size``. Hard-error projection shapes
raise ``ValueError`` at transpile time; see the performance guide
for the full enumeration.
intersects_bin_size : int | None
Bin size for INTERSECTS equi-join optimization. When a query
contains a full-table column-to-column INTERSECTS join, the
Expand Down Expand Up @@ -136,12 +140,12 @@ def transpile(
dialect="duckdb",
)
"""
if dialect is not None and dialect != "duckdb":
raise ValueError(f"Unknown dialect: {dialect!r}. Supported: 'duckdb' or None.")
if dialect == "duckdb" and intersects_bin_size is not None:
target = resolve_target(dialect)
uses_iejoin = target.capabilities.range_join_strategy == "iejoin"
if uses_iejoin and intersects_bin_size is not None:
raise ValueError(
"intersects_bin_size has no effect with dialect='duckdb'; "
"the DuckDB dialect uses an IEJoin per-partition plan instead "
f"intersects_bin_size has no effect with dialect={target.name!r}; "
f"the {target.name} target uses an IEJoin per-partition plan instead "
"of the binned equi-join. Pass one or the other, not both."
)

Expand All @@ -153,7 +157,7 @@ def transpile(
# Falls back to the binned plan for unsupported shapes — see
# IntersectsDuckDBIEJoinTransformer.transform_to_sql for the complete
# fallback set.
if dialect == "duckdb":
if uses_iejoin:
duckdb_transformer = IntersectsDuckDBIEJoinTransformer(tables_container)
with _reraise_as_value_error("Transformation error"):
duckdb_sql = duckdb_transformer.transform_to_sql(ast)
Expand Down
Empty file.
49 changes: 49 additions & 0 deletions tests/integration/datafusion/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Fixtures for DataFusion target execution smoke tests (issue #132).

These tests prove that SQL produced via ``transpile(..., dialect="datafusion")``
parses and executes on a real DataFusion engine. The broad cross-target result
oracle and the full DataFusion integration lane are deferred to #139.
"""

import pytest

# Both packages are optional dependencies; skip rather than error when either
# one cannot be imported.
pytest.importorskip("datafusion")
pytest.importorskip("pyarrow")

# NOTE(review): pytest documents ``pytestmark`` as marking the tests of the
# module that defines it. Confirm that setting it here in conftest.py really
# applies the ``integration`` mark to tests in sibling modules; if it does
# not, set ``pytestmark`` in each test module or add the marker from a
# ``pytest_collection_modifyitems`` hook.
pytestmark = pytest.mark.integration


@pytest.fixture
def datafusion_ctx():
    """Provide a factory that loads interval tables into a SessionContext.

    The factory takes ``name=[(chrom, start, end), ...]`` keyword arguments
    and returns a new DataFusion ``SessionContext`` in which every named
    table is registered with the default ``chrom``/``start``/``end`` schema
    (the same columns as the default :class:`giql.Table` mapping).
    """
    import pyarrow as pa
    from datafusion import SessionContext

    interval_schema = pa.schema(
        [("chrom", pa.utf8()), ("start", pa.int64()), ("end", pa.int64())]
    )

    def _build_context(**tables):
        session = SessionContext()
        for table_name, intervals in tables.items():
            # Pivot the row tuples into one column list per schema field.
            columns = {
                column: [interval[position] for interval in intervals]
                for position, column in enumerate(("chrom", "start", "end"))
            }
            batches = pa.table(columns, schema=interval_schema).to_batches()
            # Registration takes a list of partitions; all batches go into
            # a single partition.
            session.register_record_batches(table_name, [batches])
        return session

    return _build_context
Loading
Loading