diff --git a/src/giql/constants.py b/src/giql/constants.py index 0daf016..51a0943 100644 --- a/src/giql/constants.py +++ b/src/giql/constants.py @@ -12,3 +12,12 @@ # Default bin size for INTERSECTS binned equi-join optimization DEFAULT_BIN_SIZE = 10_000 + +#: The reserved alias prefix naming DISJOIN's internal CTEs (``__giql_dj_ref`` / +#: ``__giql_dj_tgt`` / ``__giql_dj_cuts`` / ...). Follows the same +#: reserved-prefix scheme as :data:`giql.canonicalizer.CANON_PREFIX`; the leading +#: double underscore keeps +#: the namespace clear of user identifiers. A single source of truth so the +#: resolver's reserved-prefix guard and the DISJOIN expander's CTE names and +#: guard cannot drift apart. +DJ_PREFIX = "__giql_dj_" diff --git a/src/giql/expanders/disjoin.py b/src/giql/expanders/disjoin.py new file mode 100644 index 0000000..f784805 --- /dev/null +++ b/src/giql/expanders/disjoin.py @@ -0,0 +1,348 @@ +"""The DISJOIN operator expander (epic #137, wave 3 — issue #143). + +DISJOIN splits each target interval at every reference breakpoint strictly +interior to it, passing the full target row through and appending the +sub-interval as ``disjoin_chrom`` / ``disjoin_start`` / ``disjoin_end`` in the +target table's declared coordinate system. A coverage filter drops sub-intervals +overlapping no reference interval; an omitted reference defaults to the target +set. + +This module is the AST-expansion replacement for the legacy +``BaseGIQLGenerator.giqldisjoin_sql`` emitter. It assembles the same WITH-CTE +subquery, parses it back into a sqlglot :class:`~sqlglot.expressions.Expression`, +and returns that node so the active target's serializer renders it — dissolving +the emit-time string special-case left by epic #114. + +Three behaviours are capability-driven / corrected here relative to the legacy +emitter: + +* **star-REPLACE portability (#143).** The full-row passthrough de-canonicalizes + the interval columns back into the target's declared encoding. 
On a target that + supports ``SELECT * REPLACE (...)`` (``supports_star_replace`` — DuckDB) the + passthrough emits ``t.* REPLACE (...)``. On a target without it (the generic + baseline and the DataFusion family) it emits the portable + ``t.* EXCEPT (...)`` form plus the two recomputed interval columns. This + portable form is **not** SQL-92 — it requires ``SELECT * EXCEPT (...)`` + support, which the DataFusion family provides but a strict SQL-92 engine (and + DuckDB, which prefers ``* REPLACE``) does not — so the generic non-canonical + passthrough is **not DuckDB-runnable** and runs only on an ``* EXCEPT``-capable + engine. +* **passthrough column order diverges across targets (issue #143).** The two + passthrough forms are *row-equivalent* but **not column-order-equivalent**. + ``* REPLACE`` substitutes the start/end columns *in place*, so the row keeps + its original column order. ``* EXCEPT (start, end)`` drops those two columns + and the recomputed ones are *re-appended* at the end of the projection, so on + the generic / DataFusion path the start/end columns move to the right of any + passthrough column. A ``SELECT *`` over a non-canonical DISJOIN is therefore + column-order-divergent between DuckDB and the generic target; only the row + *set* is identical. Select **explicit** columns (in a chosen order) when the + output column order must agree across targets. +* **duplicate-column fix (#153).** Every UNION branch of the ``__giql_dj_cuts`` + CTE aliases all four projected columns (``kc`` / ``ks`` / ``ke`` / ``pos``), so + the ``t."end"`` column and the end-cut expression never collide on an output + name when ``end`` de-canonicalizes to the bare physical column (the default + 0-based half-open identity case). DuckDB tolerated the prior unaliased + duplicate; DataFusion rejected it as a non-unique projection name. 
+ +Input canonicalization is owned by ``CanonicalizeCoordinates`` (pass 2, #122): +every non-canonical interval-bearing operand is rewritten to a canonical +``__giql_canon_*`` CTE before this pass runs, so the expander consumes +already-canonical 0-based half-open columns and applies no input-canonicalization +arithmetic. The output round-trip back to the target's declared encoding stays +here, driven by the original encoding pass 2 preserves on the resolution. +""" + +from __future__ import annotations + +from sqlglot import exp +from sqlglot import parse_one + +from giql.canonical import decanonical_end +from giql.canonical import decanonical_start +from giql.constants import DJ_PREFIX +from giql.dialect import GIQLDialect +from giql.expander import ExpansionContext +from giql.expander import register +from giql.expressions import GIQLDisjoin +from giql.resolver import OperatorResolution +from giql.resolver import ResolvedRef +from giql.table import Table +from giql.targets import GenericTarget + + +@register(GenericTarget, GIQLDisjoin) +def expand_disjoin(node: GIQLDisjoin, ctx: ExpansionContext) -> exp.Expression: + """Expand a DISJOIN node to its WITH-CTE subquery AST for the active target. + + Registered for :class:`~giql.targets.GenericTarget`, so it is the portable + fallback every target resolves to through the registry's generic chain. The + one capability branch — ``ctx.capabilities.supports_star_replace`` — selects + the passthrough projection form, so a single expander covers both the + ``* REPLACE`` (DuckDB) and ``* EXCEPT`` (DataFusion / generic) targets. + + Parameters + ---------- + node : GIQLDisjoin + The :class:`~giql.expressions.GIQLDisjoin` node being expanded. + ctx : ExpansionContext + The expansion context carrying the node's pass-1/pass-2 resolution + metadata, the active target and its capabilities, and the registered + tables. 
+ + Returns + ------- + exp.Expression + The parsed AST of the parenthesized WITH-CTE subquery that replaces the + DISJOIN node. + """ + sql = _build_disjoin_sql(node, ctx) + return parse_one(sql, dialect=GIQLDialect) + + +def _build_disjoin_sql(node: GIQLDisjoin, ctx: ExpansionContext) -> str: + """Assemble the DISJOIN WITH-CTE subquery SQL string for the active target. + + Mirrors the legacy ``BaseGIQLGenerator.giqldisjoin_sql`` shape one named + ``__giql_dj_*`` CTE at a time, with the #153 alias fix and the + capability-driven passthrough applied. + """ + resolution = ctx.resolution + target_ref, ref, ref_from = _disjoin_resolution(node, resolution) + target_name = target_ref.name + target_chrom, target_start, target_end = target_ref.cols + + ref_chrom, ref_start, ref_end = ref.cols + is_self_reference = ref.coverage_skippable + + # The target's *declared* encoding, which disjoin_* output and the + # passed-through interval round-trip back into. Pass 2 preserves it on the + # resolution when it wraps a non-canonical target (the slot's own Table is + # then None); a canonical target keeps the (identity) encoding on its slot. + output_table = _disjoin_output_encoding(resolution, target_ref) + + # Post-pass every operand is canonical 0-based half-open, so the physical + # columns are consumed verbatim with no input-canonicalization arithmetic. + t_chrom = f't."{target_chrom}"' + t_start = f't."{target_start}"' + t_end = f't."{target_end}"' + + # Reference endpoints: unqualified for the breakpoint CTE, qualified by 'r' + # for the coverage EXISTS filter. + bp_start = f'"{ref_start}"' + bp_end = f'"{ref_end}"' + r_start = f'r."{ref_start}"' + r_end = f'r."{ref_end}"' + + # disjoin_start / disjoin_end are emitted in the target's declared coordinate + # system so an output row carries one convention; the cut math stays canonical + # internally. 
+ out_start = decanonical_start("s.seg_start", output_table) + out_end = decanonical_end("s.seg_end", output_table) + passthrough = _disjoin_passthrough(ctx, target_start, target_end, output_table) + + # Build the WITH clause one named fragment per __giql_dj_* CTE so each block + # reads on its own. The `seg_end > seg_start` guard in the final WHERE is + # belt-and-suspenders: UNION already dedupes cut positions, so LEAD cannot + # produce a zero-length segment unless it becomes UNION ALL. + ref_cte = f"__giql_dj_ref AS (SELECT * FROM {ref_from})" + tgt_cte = f"__giql_dj_tgt AS (SELECT * FROM {target_name})" + bp_cte = ( + "__giql_dj_bp AS (" + f'SELECT "{ref_chrom}" AS chrom, {bp_start} AS pos FROM __giql_dj_ref ' + "UNION " + f'SELECT "{ref_chrom}" AS chrom, {bp_end} AS pos FROM __giql_dj_ref)' + ) + # #153: alias all four columns in EVERY UNION branch. The output names already + # come from branch 1, so this is behaviour-preserving on DuckDB and makes each + # branch's projection internally unique for strict engines (DataFusion), where + # an unaliased t."end" would otherwise collide with the end-cut t."end". 
+ cuts_cte = ( + "__giql_dj_cuts AS (" + f'SELECT t."{target_chrom}" AS kc, t."{target_start}" AS ks, ' + f't."{target_end}" AS ke, {t_start} AS pos FROM __giql_dj_tgt AS t ' + "UNION " + f'SELECT t."{target_chrom}" AS kc, t."{target_start}" AS ks, ' + f't."{target_end}" AS ke, {t_end} AS pos FROM __giql_dj_tgt AS t ' + "UNION " + f'SELECT t."{target_chrom}" AS kc, t."{target_start}" AS ks, ' + f't."{target_end}" AS ke, bp.pos AS pos ' + "FROM __giql_dj_tgt AS t JOIN __giql_dj_bp AS bp " + f"ON bp.chrom = {t_chrom} AND bp.pos > {t_start} " + f"AND bp.pos < {t_end})" + ) + segs_cte = ( + "__giql_dj_segs AS (" + "SELECT kc, ks, ke, pos AS seg_start, " + "LEAD(pos) OVER (PARTITION BY kc, ks, ke ORDER BY pos) AS seg_end " + "FROM __giql_dj_cuts)" + ) + # In self-reference mode the coverage EXISTS is provably always true: every + # emitted segment lies inside its parent target row, and that row is itself a + # member of the reference set. Skip the clause so the planner does not waste + # work on a no-op semi-join. The __giql_dj_ref CTE itself stays live because + # __giql_dj_bp still draws breakpoints from it. 
+ where_clauses = ["s.seg_end IS NOT NULL", "s.seg_end > s.seg_start"] + if not is_self_reference: + where_clauses.append( + f'EXISTS (SELECT 1 FROM __giql_dj_ref AS r WHERE r."{ref_chrom}" = s.kc ' + f"AND {r_start} <= s.seg_start AND {r_end} > s.seg_start)" + ) + where_sql = " AND ".join(where_clauses) + final_select = ( + f"SELECT {passthrough}, s.kc AS disjoin_chrom, " + f"{out_start} AS disjoin_start, " + f"{out_end} AS disjoin_end FROM __giql_dj_tgt AS t " + f'JOIN __giql_dj_segs AS s ON t."{target_chrom}" = s.kc ' + f'AND t."{target_start}" = s.ks AND t."{target_end}" = s.ke ' + f"WHERE {where_sql}" + ) + return ( + f"(WITH {ref_cte}, {tgt_cte}, {bp_cte}, " + f"{cuts_cte}, {segs_cte} {final_select})" + ) + + +def _disjoin_passthrough( + ctx: ExpansionContext, + target_start: str, + target_end: str, + output_table: Table | None, +) -> str: + """Project the target's full row, de-canonicalizing the interval columns. + + When the target's declared encoding is canonical 0-based half-open the row + passes through as a plain ``t.*`` — the identity fast path, portable on every + engine. When it is non-canonical the interval columns (canonical inside + ``__giql_dj_tgt``) are de-canonicalized back into that encoding: + + * ``t.* REPLACE (...)`` on a target that supports it (``supports_star_replace`` + — DuckDB), substituting start/end **in place**; + * the portable ``t.* EXCEPT (start, end), <start>, <end>`` form otherwise + (the generic baseline / DataFusion family), which every ``* EXCEPT``-capable + engine plans. This form is **not** SQL-92 and is **not DuckDB-runnable**: it + requires ``SELECT * EXCEPT`` support. + + Column-order divergence (issue #143) + ------------------------------------ + The two forms produce row-equivalent but **not column-order-equivalent** + output. 
``* REPLACE`` keeps the original column order; ``* EXCEPT`` drops + start/end and **re-appends** the recomputed columns at the end of the + projection, so they move to the right of any passthrough column on the + generic / DataFusion path. A ``SELECT *`` over a non-canonical DISJOIN is + therefore order-divergent across targets — only the row set agrees. Select + explicit columns when the cross-target column order must match. + """ + if output_table is None or ( + output_table.coordinate_system == "0based" + and output_table.interval_type == "half_open" + ): + return "t.*" + pt_start = decanonical_start(f't."{target_start}"', output_table) + pt_end = decanonical_end(f't."{target_end}"', output_table) + if ctx.capabilities.supports_star_replace: + return ( + f't.* REPLACE ({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' + ) + # Portable substitution: drop the two interval columns from the star and + # re-project them recomputed. EXCEPT removes them from the row, the trailing + # projections add them back in the target's encoding under their own names. + return ( + f't.* EXCEPT ("{target_start}", "{target_end}"), ' + f'{pt_start} AS "{target_start}", {pt_end} AS "{target_end}"' + ) + + +def _disjoin_output_encoding( + resolution: OperatorResolution | None, target_ref: ResolvedRef +) -> Table | None: + """Return the target's declared encoding for DISJOIN's output round-trip. + + ``CanonicalizeCoordinates`` (pass 2) records the original + :class:`~giql.table.Table` on the resolution when it wraps a non-canonical + target (blanking the slot's own ``table``). For an unwrapped target — a + canonical registered table, or any target when the pass did not run — the + slot's own ``table`` carries the (identity) encoding. 
+ """ + if isinstance(resolution, OperatorResolution): + preserved = resolution.output_tables.get("this") + if preserved is not None: + return preserved + return target_ref.table + + +def _disjoin_resolution( + node: GIQLDisjoin, resolution: OperatorResolution | None +) -> tuple[ResolvedRef, ResolvedRef, str]: + """Unpack the DISJOIN resolution attached by ResolveOperatorRefs (pass 1). + + Returns ``(target_ref, ref, ref_from)`` where ``ref_from`` is the text + following ``FROM`` inside the ``__giql_dj_ref`` CTE. A subquery reference + carries no name, so it is rendered from the AST node as an aliased derived + table; registered tables and CTEs are selected from by name. + + The resolver pass deliberately leaves unresolvable slots unresolved; for + those, and for a resolved name using the reserved ``__giql_dj_`` prefix, this + re-raises DISJOIN's historical diagnostics verbatim. + """ + target_ref = ( + resolution.slot("this") + if isinstance(resolution, OperatorResolution) + else None + ) + + # An unresolved target means it is not a registered table. + if target_ref is None: + target = node.this + if isinstance(target, exp.Table): + target_name = target.name + elif isinstance(target, exp.Column): + target_name = target.table if target.table else str(target.this) + else: + target_name = str(target) + raise ValueError( + f"Target table '{target_name}' not found in tables. " + "Register the table before transpiling." + ) + + # The __giql_dj_ prefix names the operator's internal CTEs; a target table + # using it would collide with them. + if target_ref.name.startswith(DJ_PREFIX): + raise ValueError( + f"DISJOIN target {target_ref.name!r} uses the reserved " + "'__giql_dj_' prefix, which names the operator's internal " + "CTEs. Rename the table." 
+ ) + + reference = node.args.get("reference") + ref = resolution.slot("reference") + if ref is not None: + if ref.kind == "subquery": + # Serializing the subquery with sqlglot's stock generator (not + # BaseGIQLGenerator) is safe because nested GIQL operators expand + # deepest-first: any GIQL construct inside this operand is already + # rewritten to standard AST by the time this outer DISJOIN expands, + # so the stock generator only ever round-trips plain SQL here. + ref_sql = reference.sql(dialect=GIQLDialect) + return target_ref, ref, f"{ref_sql} AS __giql_dj_rs" + return target_ref, ref, ref.name + + # Unresolved reference: re-classify it and raise the matching historical + # diagnostic. An omitted reference always resolves, so reference is non-None. + if not isinstance(reference, (exp.Table, exp.Column, exp.Identifier)): + raise ValueError( + "DISJOIN reference must be a table name, a CTE, or a " + f"(SELECT ...) subquery; got {type(reference).__name__}: " + f"{reference}" + ) + ref_name = reference.name + if ref_name.startswith(DJ_PREFIX): + raise ValueError( + f"DISJOIN reference {ref_name!r} uses the reserved " + "'__giql_dj_' prefix, which names the operator's internal " + "CTEs. Rename the reference relation." + ) + raise ValueError( + f"DISJOIN reference {ref_name!r} is neither a registered table " + "nor a CTE defined in this query. Register the table or define " + "the CTE before transpiling." + ) diff --git a/src/giql/expressions.py b/src/giql/expressions.py index 2e8fd0c..6a0f440 100644 --- a/src/giql/expressions.py +++ b/src/giql/expressions.py @@ -438,7 +438,11 @@ class GIQLDisjoin(exp.Func): #: (0-based half-open) operands are left unwrapped and the emitted SQL stays #: byte-identical. GIQL_CANONICALIZE = True - GIQL_EXPAND = _EXPAND + #: Opt DISJOIN into the ExpandOperators pass (epic #137, issue #143). 
DISJOIN + #: is migrated to a registered expander (``giql.expanders.disjoin``), so the + #: pass replaces the node with the expander's AST and the legacy + #: ``giqldisjoin_sql`` emitter is removed. + GIQL_EXPAND = True GIQL_SLOTS = ( SlotSpec("this", frozenset({"registered_table"}), required=True), diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py index 599eb0d..b0f3f79 100644 --- a/src/giql/generators/base.py +++ b/src/giql/generators/base.py @@ -4,7 +4,6 @@ from giql.canonical import decanonical_end from giql.canonical import decanonical_start from giql.dialect import GIQLDialect -from giql.expressions import GIQLDisjoin from giql.expressions import GIQLNearest from giql.range_parser import RangeParser from giql.resolver import META_KEY @@ -106,187 +105,17 @@ def _nearest_passthrough( return f"{table_name}.*" pt_start = decanonical_start(f'{table_name}."{target_start}"', output_table) pt_end = decanonical_end(f'{table_name}."{target_end}"', output_table) + # TODO(#142): this emits an unconditional ``* REPLACE`` (DuckDB-only). + # When DataFusion gains correlated LATERAL, adopt the capability branch the + # DISJOIN expander uses (``giql.expanders.disjoin._disjoin_passthrough``): + # ``* REPLACE`` where ``supports_star_replace`` holds, the portable + # ``* EXCEPT`` form otherwise, so a non-canonical NEAREST passthrough runs + # on the DataFusion family too. return ( f"{table_name}.* REPLACE " f'({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' ) - def giqldisjoin_sql(self, expression: GIQLDisjoin) -> str: - """Generate SQL for the DISJOIN table function. - - DISJOIN splits each target interval at every reference breakpoint - strictly interior to it. The full target row passes through unchanged; - the sub-interval is appended as ``disjoin_chrom`` / ``disjoin_start`` / - ``disjoin_end`` in the target table's coordinate system. A coverage - filter drops sub-intervals overlapping no reference interval. 
When no - ``reference`` is given it defaults to the target set. - - Input canonicalization is owned by ``CanonicalizeCoordinates`` (pass 2, - issue #122): every non-canonical interval-bearing operand is rewritten to - a canonical ``__giql_canon_*`` CTE before generation, so this emitter - consumes already-canonical 0-based half-open columns and applies no - in-emitter canonicalization arithmetic. The output round-trip back to the - target's declared encoding stays here — the ``disjoin_*`` columns and the - passed-through interval are synthesized at generation time and cannot be - reached by a pass-2 outermost-projection rewrite — driven by the original - encoding the pass preserves on the resolution. - - :param expression: - GIQLDisjoin expression node - :return: - SQL string (a parenthesized WITH-CTE subquery) for the DISJOIN table - """ - # Unpack the resolution metadata attached by ResolveOperatorRefs (pass 1) - # and rewritten by CanonicalizeCoordinates (pass 2). - target_ref, ref, ref_from = self._disjoin_resolution(expression) - target_name = target_ref.name - target_chrom, target_start, target_end = target_ref.cols - ref_chrom, ref_start, ref_end = ref.cols - is_self_reference = ref.coverage_skippable - - # The target's *declared* encoding, which disjoin_* output and the - # passed-through interval must round-trip back into. Pass 2 preserves it - # on the resolution when it wraps a non-canonical target (the slot's own - # Table is then None); a canonical target is left unwrapped and its slot - # Table carries the (identity) encoding. - output_table = self._disjoin_output_encoding(expression, target_ref) - - # Post-pass every operand is canonical 0-based half-open (a registered - # table is either identity-encoded or rewritten to a canonical CTE), so - # the physical columns are consumed verbatim with no canonicalization. 
- t_chrom = f't."{target_chrom}"' - t_start = f't."{target_start}"' - t_end = f't."{target_end}"' - - # Reference endpoints: unqualified for the breakpoint CTE, qualified by - # 'r' for the coverage EXISTS filter. - bp_start = f'"{ref_start}"' - bp_end = f'"{ref_end}"' - r_start = f'r."{ref_start}"' - r_end = f'r."{ref_end}"' - - # disjoin_start / disjoin_end are emitted in the target's declared - # coordinate system so an output row carries one convention; the cut math - # stays canonical internally. - out_start = decanonical_start("s.seg_start", output_table) - out_end = decanonical_end("s.seg_end", output_table) - passthrough = self._disjoin_passthrough(target_start, target_end, output_table) - - # Build the WITH clause one named fragment per __giql_dj_* CTE so each - # block reads on its own. The `seg_end > seg_start` guard in the final - # WHERE is belt-and-suspenders: UNION already dedupes cut positions, so - # LEAD cannot produce a zero-length segment unless it becomes UNION ALL. 
- ref_cte = f"__giql_dj_ref AS (SELECT * FROM {ref_from})" - tgt_cte = f"__giql_dj_tgt AS (SELECT * FROM {target_name})" - bp_cte = ( - "__giql_dj_bp AS (" - f'SELECT "{ref_chrom}" AS chrom, {bp_start} AS pos FROM __giql_dj_ref ' - "UNION " - f'SELECT "{ref_chrom}" AS chrom, {bp_end} AS pos FROM __giql_dj_ref)' - ) - cuts_cte = ( - "__giql_dj_cuts AS (" - f'SELECT t."{target_chrom}" AS kc, t."{target_start}" AS ks, ' - f't."{target_end}" AS ke, {t_start} AS pos FROM __giql_dj_tgt AS t ' - "UNION " - f'SELECT t."{target_chrom}", t."{target_start}", t."{target_end}", ' - f"{t_end} FROM __giql_dj_tgt AS t " - "UNION " - f'SELECT t."{target_chrom}", t."{target_start}", t."{target_end}", ' - "bp.pos FROM __giql_dj_tgt AS t JOIN __giql_dj_bp AS bp " - f"ON bp.chrom = {t_chrom} AND bp.pos > {t_start} " - f"AND bp.pos < {t_end})" - ) - segs_cte = ( - "__giql_dj_segs AS (" - "SELECT kc, ks, ke, pos AS seg_start, " - "LEAD(pos) OVER (PARTITION BY kc, ks, ke ORDER BY pos) AS seg_end " - "FROM __giql_dj_cuts)" - ) - # In self-reference mode the coverage EXISTS is provably always true: - # every emitted segment lies inside its parent target row, and that - # row is itself a member of the reference set. Skip the clause so the - # planner does not waste work on a no-op semi-join. The __giql_dj_ref - # CTE itself stays live because __giql_dj_bp still draws breakpoints - # from it. 
- where_clauses = ["s.seg_end IS NOT NULL", "s.seg_end > s.seg_start"] - if not is_self_reference: - where_clauses.append( - f'EXISTS (SELECT 1 FROM __giql_dj_ref AS r WHERE r."{ref_chrom}" = s.kc ' - f"AND {r_start} <= s.seg_start AND {r_end} > s.seg_start)" - ) - where_sql = " AND ".join(where_clauses) - final_select = ( - f"SELECT {passthrough}, s.kc AS disjoin_chrom, " - f"{out_start} AS disjoin_start, " - f"{out_end} AS disjoin_end FROM __giql_dj_tgt AS t " - f'JOIN __giql_dj_segs AS s ON t."{target_chrom}" = s.kc ' - f'AND t."{target_start}" = s.ks AND t."{target_end}" = s.ke ' - f"WHERE {where_sql}" - ) - return ( - f"(WITH {ref_cte}, {tgt_cte}, {bp_cte}, " - f"{cuts_cte}, {segs_cte} {final_select})" - ) - - def _disjoin_output_encoding( - self, expression: GIQLDisjoin, target_ref: ResolvedRef - ) -> Table | None: - """Return the target's declared encoding for DISJOIN's output round-trip. - - ``CanonicalizeCoordinates`` (pass 2) records the original - :class:`~giql.table.Table` on the resolution when it wraps a non-canonical - target (blanking the slot's own ``table``). For an unwrapped target — a - canonical registered table, or any target when the pass did not run — the - slot's own ``table`` carries the (identity) encoding. - - :param expression: - GIQLDisjoin expression node - :param target_ref: - The resolved target reference (post pass 2) - :return: - The target's declared :class:`~giql.table.Table`, or ``None`` - """ - resolution = expression.meta.get(META_KEY) - if isinstance(resolution, OperatorResolution): - preserved = resolution.output_tables.get("this") - if preserved is not None: - return preserved - return target_ref.table - - def _disjoin_passthrough( - self, target_start: str, target_end: str, output_table: Table | None - ) -> str: - """Project the target's full row, de-canonicalizing the interval columns. 
- - When the target's declared encoding is canonical 0-based half-open the - row passes through as a plain ``t.*`` — the byte-identical identity fast - path. When it is non-canonical the interval columns, canonical inside - ``__giql_dj_tgt``, are de-canonicalized back into that encoding via a star - ``REPLACE`` so the passed-through interval matches the target's own - convention. (Only non-canonical targets are wrapped, so the ``REPLACE`` - appears only where a canonical CTE already shapes the SQL.) - - :param target_start: - Physical start column name - :param target_end: - Physical end column name - :param output_table: - The target's declared :class:`~giql.table.Table`, or ``None`` - :return: - The passthrough projection fragment (``t.*`` or a star ``REPLACE``) - """ - if output_table is None or ( - output_table.coordinate_system == "0based" - and output_table.interval_type == "half_open" - ): - return "t.*" - pt_start = decanonical_start(f't."{target_start}"', output_table) - pt_end = decanonical_end(f't."{target_end}"', output_table) - return ( - f't.* REPLACE ({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' - ) - @staticmethod def _generate_distance_case( chrom_a: str, @@ -481,92 +310,6 @@ def _raise_nearest_reference_error( ) raise ValueError(f"Could not parse reference genomic range: {range_str}.") - def _disjoin_resolution( - self, expression: GIQLDisjoin - ) -> tuple[ResolvedRef, ResolvedRef, str]: - """Unpack the DISJOIN resolution attached by ResolveOperatorRefs (pass 1). - - Reads the :class:`~giql.resolver.OperatorResolution` from - ``expression.meta`` and returns ``(target_ref, ref, ref_from)`` where - ``ref_from`` is the text following ``FROM`` inside the ``__giql_dj_ref`` - CTE. A subquery reference carries no name, so it is rendered from the - AST node as an aliased derived table; registered tables and CTEs are - selected from by name. 
- - The resolver pass deliberately leaves unresolvable slots unresolved - (unregistered target; unsupported reference node type; reference name - using the reserved ``__giql_dj_`` prefix or matching neither a - registered table nor an in-query CTE). For those, and for a target name - using the reserved prefix (which the resolver does resolve), this - re-raises the generator's historical diagnostics verbatim. - - :param expression: - GIQLDisjoin expression node - :return: - Tuple of ``(target_ref, ref, ref_from)`` - :raises ValueError: - If the target or reference slot is unresolved, or a resolved name - uses the reserved ``__giql_dj_`` prefix. - """ - resolution = expression.meta.get(META_KEY) - target_ref = ( - resolution.slot("this") - if isinstance(resolution, OperatorResolution) - else None - ) - - # An unresolved target means it is not a registered table. - if target_ref is None: - target = expression.this - if isinstance(target, exp.Table): - target_name = target.name - elif isinstance(target, exp.Column): - target_name = target.table if target.table else str(target.this) - else: - target_name = str(target) - raise ValueError( - f"Target table '{target_name}' not found in tables. " - "Register the table before transpiling." - ) - - # The __giql_dj_ prefix names the operator's internal CTEs; a target - # table using it would collide with them. - if target_ref.name.startswith("__giql_dj_"): - raise ValueError( - f"DISJOIN target {target_ref.name!r} uses the reserved " - "'__giql_dj_' prefix, which names the operator's internal " - "CTEs. Rename the table." - ) - - reference = expression.args.get("reference") - ref = resolution.slot("reference") - if ref is not None: - if ref.kind == "subquery": - return target_ref, ref, f"{self.sql(reference)} AS __giql_dj_rs" - return target_ref, ref, ref.name - - # Unresolved reference: re-classify it and raise the matching - # historical diagnostic. 
An omitted reference always resolves (to the - # target set), so reference is non-None here. - if not isinstance(reference, (exp.Table, exp.Column, exp.Identifier)): - raise ValueError( - "DISJOIN reference must be a table name, a CTE, or a " - f"(SELECT ...) subquery; got {type(reference).__name__}: " - f"{reference}" - ) - ref_name = reference.name - if ref_name.startswith("__giql_dj_"): - raise ValueError( - f"DISJOIN reference {ref_name!r} uses the reserved " - "'__giql_dj_' prefix, which names the operator's internal " - "CTEs. Rename the reference relation." - ) - raise ValueError( - f"DISJOIN reference {ref_name!r} is neither a registered table " - "nor a CTE defined in this query. Register the table or define " - "the CTE before transpiling." - ) - @staticmethod def _extract_bool_param(param_expr: exp.Expression | None) -> bool: """Extract boolean value from a parameter expression. diff --git a/src/giql/resolver.py b/src/giql/resolver.py index 80d0969..7869d28 100644 --- a/src/giql/resolver.py +++ b/src/giql/resolver.py @@ -24,8 +24,8 @@ Scope note (epic #114, steps 1-3) --------------------------------- -The pass is behavior-preserving. DISJOIN's emitter -(``BaseGIQLGenerator.giqldisjoin_sql``, step 2) and NEAREST's emitter +The pass is behavior-preserving. DISJOIN's expander +(``giql.expanders.disjoin``, step 2) and NEAREST's emitter (``BaseGIQLGenerator.giqlnearest_sql``, step 3) consume the attached metadata; DISTANCE and the spatial predicates still use the generator's legacy resolver paths and ignore everything attached here until their port issues land. 
The @@ -73,6 +73,7 @@ from giql.constants import DEFAULT_END_COL from giql.constants import DEFAULT_START_COL from giql.constants import DEFAULT_STRAND_COL +from giql.constants import DJ_PREFIX from giql.expressions import Contains from giql.expressions import GIQLDisjoin from giql.expressions import GIQLDistance @@ -758,7 +759,7 @@ def _resolve_disjoin_reference( ref_name = reference.name # The __giql_dj_ prefix names the operator's internal CTEs. - if ref_name.startswith("__giql_dj_"): + if ref_name.startswith(DJ_PREFIX): return None # A CTE from an enclosing WITH shadows a registered table of the same name diff --git a/src/giql/targets.py b/src/giql/targets.py index 825a6d2..88e4eb7 100644 --- a/src/giql/targets.py +++ b/src/giql/targets.py @@ -85,6 +85,15 @@ class GenericTarget(Target): This is the default target (``dialect=None``). Its capabilities are the conservative, maximally portable baseline that matches today's :class:`giql.generators.base.BaseGIQLGenerator` output. + + "SQL-92-ish", not strict SQL-92: because ``supports_star_replace=False``, the + DISJOIN passthrough over a **non-canonical** target falls back to a + ``SELECT * EXCEPT (...)`` projection (re-appending the de-canonicalized + interval columns). ``* EXCEPT`` is **not** SQL-92 and is **not + DuckDB-runnable** — it is a DataFusion-family extension — so the generic + target's non-canonical DISJOIN output runs only on an ``* EXCEPT``-capable + engine. A canonical (0-based half-open) target passes the row through as a + plain, fully portable ``SELECT *``. """ name: str = "generic" diff --git a/src/giql/transpile.py b/src/giql/transpile.py index b5568e2..3ff986a 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -230,7 +230,7 @@ def transpile( # Pass 1 of the normalization pipeline (epic #114): attach resolution # metadata to every GIQL operator slot ahead of generation. 
DISJOIN's - # emitter consumes this metadata (step 2); the remaining operators still + # expander consumes this metadata (step 2); the remaining operators still # use the generator's legacy resolver paths until their ports land. with _reraise_as_value_error("Resolution error"): ast = resolve_operator_refs(ast, tables_container) diff --git a/tests/integration/coordinate_space/conftest.py b/tests/integration/coordinate_space/conftest.py index 52f8d62..8b966d1 100644 --- a/tests/integration/coordinate_space/conftest.py +++ b/tests/integration/coordinate_space/conftest.py @@ -40,7 +40,10 @@ def giql_query(duckdb_connection): def _run(query: str, *, tables: list[Table], **table_data): for name, rows in table_data.items(): load_intervals(duckdb_connection, name, rows) - sql = transpile(query, tables=tables) + # Transpile for the DuckDB target since the SQL executes on DuckDB: a + # non-canonical DISJOIN passthrough emits DuckDB's ``* REPLACE`` here + # rather than the portable ``* EXCEPT`` form generic targets use. + sql = transpile(query, tables=tables, dialect="duckdb") return duckdb_connection.execute(sql).fetchall() return _run diff --git a/tests/integration/datafusion/test_cross_target_oracle.py b/tests/integration/datafusion/test_cross_target_oracle.py index f67ff9b..ba4a927 100644 --- a/tests/integration/datafusion/test_cross_target_oracle.py +++ b/tests/integration/datafusion/test_cross_target_oracle.py @@ -3,9 +3,9 @@ These exercise the reusable oracle (``tests/integration/conftest.py``) over the operators that already emit identical generic SQL across Generic and DataFusion and run correctly on DuckDB: INTERSECTS (literal + column-to-column join), -CONTAINS, WITHIN, and standalone NEAREST. The spatial predicates have since been -migrated to the expander registry (#141, epic #137); this lane locks in the -verification path that migration and every later one (#142-#144) consume. +CONTAINS, WITHIN, and standalone NEAREST. 
DISTANCE, the spatial predicates, +NEAREST, and DISJOIN have since been migrated to the expander registry (epic +#137); this lane locks in the verification path each migration consumes. For the non-join operators (DISTANCE, CONTAINS, WITHIN, ANY/ALL, CLUSTER, MERGE) the generic and datafusion targets emit byte-identical SQL and both run @@ -26,8 +26,10 @@ exposes the fallback's reserved ``__giql_x_*`` rank/key columns, a divergent output schema from the LATERAL form's — a known limitation pinned by the ``xfail`` ``test_correlated_nearest_star_projection_diverges_on_datafusion`` case -below and tracked for a query-level fix by #160 (dependent on #146). DISJOIN has -an analogous pending-#153 gap (duplicate ``end`` output names). +below and tracked for a query-level fix by #160 (dependent on #146). DISJOIN is +migrated onto the expander registry (#143) and its previously pending-#153 gap +(duplicate ``end`` output names) is closed, so its full three-target oracle now +runs as a real cross-target identity test. """ import pytest @@ -861,11 +863,12 @@ def test_merge_empty_input_returns_zero_rows(self, cross_target_oracle): class TestCrossTargetOracleDisjoin: - """DISJOIN: a DuckDB-only case plus the DataFusion duplicate-``end`` gap (#153). + """DISJOIN cross-target identity across Generic, DataFusion, and DuckDB (#143). - The DataFusion gap is the unaliased duplicate ``t."end"`` columns in the - ``__giql_dj_cuts`` CTE UNION branches, which DataFusion rejects as non-unique - projection names (DuckDB tolerates them). + DISJOIN is migrated onto the expander registry (#143), and the duplicate + unaliased ``t."end"`` columns in the ``__giql_dj_cuts`` CTE UNION branches are + aliased in every branch (#153), so DataFusion no longer rejects the projection + for non-unique names. The full three-target oracle therefore runs and agrees. 
""" def test_disjoin_splits_overlaps_on_duckdb(self, cross_target_oracle): @@ -893,40 +896,79 @@ def test_disjoin_splits_overlaps_on_duckdb(self, cross_target_oracle): targets=("duckdb",), ) - def test_disjoin_on_datafusion_unsupported_pending_153(self, cross_target_oracle): - """Test the full DISJOIN oracle raises DataFusion's duplicate-name error. + def test_disjoin_agrees_across_all_targets(self, cross_target_oracle): + """Test the full DISJOIN oracle returns identical rows on every target. Given: - The same two overlapping intervals. + Two overlapping intervals on chr1. + When: + The oracle runs all three targets — generic and datafusion execute the + registry-expanded DISJOIN (with every ``__giql_dj_cuts`` UNION branch + aliased, #153) on DataFusion, and duckdb runs on DuckDB. + Then: + Every target should return the same sub-segments, proving DISJOIN now + executes on DataFusion (the #153 duplicate-name gap is closed) and + agrees with DuckDB. + """ + # Arrange / Act / Assert + cross_target_oracle( + 'SELECT chrom, start, "end", disjoin_start, disjoin_end FROM DISJOIN(peaks)', + tables=[Table("peaks")], + peaks=[("chr1", 0, 100), ("chr1", 50, 150)], + expected=[ + ("chr1", 0, 100, 0, 50), + ("chr1", 0, 100, 50, 100), + ("chr1", 50, 150, 50, 100), + ("chr1", 50, 150, 100, 150), + ], + ) + + def test_disjoin_non_canonical_passthrough_agrees_across_targets( + self, cross_target_oracle + ): + """Test the non-canonical EXCEPT passthrough agrees with DuckDB REPLACE. + + Given: + Two overlapping intervals on chr1 in a table declared **1-based + closed** (non-canonical), each carrying an extra ``name`` passthrough + column. When: - The oracle runs all three targets — the datafusion target executes - DISJOIN's ``__giql_dj_cuts`` CTE, whose UNION branches project the - unaliased ``t."end"`` column twice. 
+ The oracle runs all three targets — generic and datafusion exercise + the portable ``SELECT * EXCEPT (start, end), ...`` passthrough on + DataFusion (the #143 headline path, which runs on no engine without + this case), and duckdb runs the in-place ``* REPLACE`` form on DuckDB. Then: - DataFusion should reject the projection for non-unique expression - names (DuckDB tolerates it). This pins the known #153 gap: the - ``match`` narrows to the duplicate-output-name signature so an - unrelated/reworded DataFusion error fails loudly, and a closed gap - (no exception) trips pytest's "DID NOT RAISE", forcing this to be - converted into a real cross-target identity test when the duplicate - ``end`` columns in the ``__giql_dj_cuts`` UNION branches are aliased. + Every target should return the same rows: the de-canonicalized + (1-based closed) interval columns, the passthrough ``name``, and the + disjoin sub-segments. The projection lists **explicit** columns so the + row comparison is order-agnostic across the EXCEPT-re-append vs. + REPLACE-in-place column orders the two forms produce. 
""" # Arrange / Act / Assert - with pytest.raises( - Exception, match="Projections require unique expression names" - ): - cross_target_oracle( - 'SELECT chrom, start, "end", disjoin_start, disjoin_end ' - "FROM DISJOIN(peaks)", - tables=[Table("peaks")], - peaks=[("chr1", 0, 100), ("chr1", 50, 150)], - expected=[ - ("chr1", 0, 100, 0, 50), - ("chr1", 0, 100, 50, 100), - ("chr1", 50, 150, 50, 100), - ("chr1", 50, 150, 100, 150), - ], - ) + cross_target_oracle( + 'SELECT name, start, "end", disjoin_start, disjoin_end ' + "FROM DISJOIN(feats)", + tables=[ + Table( + "feats", + coordinate_system="1based", + interval_type="closed", + ) + ], + columns=( + ("chrom", "utf8"), + ("start", "int64"), + ("end", "int64"), + ("name", "utf8"), + ), + feats=[("chr1", 1, 100, "a"), ("chr1", 50, 150, "b")], + expected=[ + ("a", 1, 100, 1, 49), + ("a", 1, 100, 50, 100), + ("b", 50, 150, 50, 100), + ("b", 50, 150, 101, 150), + ], + ) class TestCrossTargetOracleDataShapes: diff --git a/tests/test_canonicalizer.py b/tests/test_canonicalizer.py index afea3e5..57b751c 100644 --- a/tests/test_canonicalizer.py +++ b/tests/test_canonicalizer.py @@ -21,6 +21,7 @@ from giql.canonicalizer import CANON_PREFIX from giql.canonicalizer import canonicalize_coordinates from giql.dialect import GIQLDialect +from giql.expander import ExpandOperators from giql.expressions import GIQLDisjoin from giql.expressions import GIQLDistance from giql.expressions import GIQLNearest @@ -30,6 +31,7 @@ from giql.resolver import resolve_operator_refs from giql.table import Table from giql.table import Tables +from giql.targets import GenericTarget from giql.transpile import transpile hypothesis = pytest.importorskip("hypothesis") @@ -102,6 +104,10 @@ def test_canonical_table_sql_unchanged(self): query = "SELECT * FROM DISJOIN(variants)" tables = _tables(("0based", "half_open")) ast = resolve_operator_refs(parse_one(query, dialect=GIQLDialect), tables) + # Bypass pass 2 (canonicalization) but keep pass 3 (expansion): 
DISJOIN + # now goes through its registered expander, so the pass-bypassed reference + # must too, isolating pass 2's contribution to nothing. + ast = ExpandOperators(GenericTarget(), tables).transform(ast) expected = BaseGIQLGenerator(tables=tables).generate(ast) # Act @@ -130,6 +136,10 @@ def test_non_canonical_table_sql_unchanged(self, monkeypatch): tables = Tables() tables.register("variants", variants) ast = resolve_operator_refs(parse_one(query, dialect=GIQLDialect), tables) + # Bypass pass 2 (canonicalization) but keep pass 3 (expansion): DISJOIN + # now expands through its registry entry, so the pass-bypassed reference + # must run pass 3 too for the byte-identical comparison to isolate pass 2. + ast = ExpandOperators(GenericTarget(), tables).transform(ast) expected = BaseGIQLGenerator(tables=tables).generate(ast) # Act diff --git a/tests/test_disjoin_transpilation.py b/tests/test_disjoin_transpilation.py index 34412b9..426adf0 100644 --- a/tests/test_disjoin_transpilation.py +++ b/tests/test_disjoin_transpilation.py @@ -32,6 +32,29 @@ def test_giqldisjoin_sql_should_emit_with_cte_subquery(self): assert "UNION" in sql assert "LEAD(" in sql + def test_transpile_should_alias_every_cuts_union_branch(self): + """Test that every __giql_dj_cuts UNION branch aliases its columns (#153). + + Given: + A default (canonical 0-based half-open) DISJOIN, whose ``end`` + de-canonicalizes to the bare physical ``t."end"`` column. + When: + Transpiling to SQL. + Then: + Each of the three ``__giql_dj_cuts`` UNION branches should project a + uniquely aliased ``ke`` (start/end/breakpoint), and the unaliased + ``t."end", t."end" FROM`` collision shape the #153 fix removed should + be absent — so DataFusion accepts the projection's unique names. + """ + # Arrange & act + sql = transpile("SELECT * FROM DISJOIN(features)", tables=["features"]) + + # Assert: every cuts branch aliases the end-column projection as ``ke``. 
+ assert sql.count("AS kc") == 3 + assert sql.count("AS ke") == 3 + # The pre-#153 unaliased duplicate-end collision must not survive. + assert 't."end", t."end" FROM' not in sql + def test_giqldisjoin_sql_should_emit_disjoin_columns(self): """Test that DISJOIN appends the sub-interval columns. @@ -309,7 +332,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_reference_omitted(self): sql = transpile("SELECT * FROM DISJOIN(features)", tables=["features"]) # Assert - assert "EXISTS (" not in sql + assert "EXISTS(" not in sql def test_giqldisjoin_sql_should_skip_exists_clause_when_reference_resolves_to_target( self, @@ -331,7 +354,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_reference_resolves_to_ta ) # Assert - assert "EXISTS (" not in sql + assert "EXISTS(" not in sql def test_giqldisjoin_sql_should_emit_exists_clause_when_reference_is_different_table( self, @@ -354,7 +377,7 @@ def test_giqldisjoin_sql_should_emit_exists_clause_when_reference_is_different_t ) # Assert - assert "EXISTS (" in sql + assert "EXISTS(" in sql def test_giqldisjoin_sql_should_emit_exists_clause_when_cte_shadows_target_name( self, @@ -377,7 +400,7 @@ def test_giqldisjoin_sql_should_emit_exists_clause_when_cte_shadows_target_name( ) # Assert - assert "EXISTS (" in sql + assert "EXISTS(" in sql def test_giqldisjoin_sql_should_emit_exists_clause_when_reference_is_subquery(self): """Test that a subquery reference keeps the EXISTS clause. 
@@ -397,7 +420,7 @@ def test_giqldisjoin_sql_should_emit_exists_clause_when_reference_is_subquery(se ) # Assert - assert "EXISTS (" in sql + assert "EXISTS(" in sql def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_one_based_closed_encoding( self, @@ -425,7 +448,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_one_based_cl ) # Assert - assert "EXISTS (" not in sql + assert "EXISTS(" not in sql assert '("start" - 1) AS "start"' in sql def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_custom_column_names( @@ -450,7 +473,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_custom_colum ) # Assert - assert "EXISTS (" not in sql + assert "EXISTS(" not in sql assert 'r."seqid"' not in sql assert 'r."lo"' not in sql assert 'r."hi"' not in sql @@ -483,7 +506,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_explicit_self_reference_ ) # Assert - assert "EXISTS (" not in sql + assert "EXISTS(" not in sql assert '("start" - 1) AS "start"' in sql # Target and explicit self-reference dedupe to a single wrapper CTE: one # CTE definition plus the two FROM references (target + reference). @@ -501,7 +524,7 @@ def test_giqldisjoin_sql_should_emit_one_exists_clause_when_query_mixes_self_and When: Transpiling to SQL. Then: - It should produce SQL containing exactly one ``EXISTS (`` + It should produce SQL containing exactly one ``EXISTS(`` occurrence — only the distinct-reference call retains the coverage filter. 
""" @@ -514,7 +537,7 @@ def test_giqldisjoin_sql_should_emit_one_exists_clause_when_query_mixes_self_and ) # Assert - assert sql.count("EXISTS (") == 1 + assert sql.count("EXISTS(") == 1 @pytest.mark.parametrize( ("query", "tables", "skips_exists"), @@ -571,7 +594,7 @@ def test_giqldisjoin_sql_should_pin_coverage_skippable_across_reference_shapes( sql = transpile(query, tables=tables) # Assert - assert ("EXISTS (" not in sql) is skips_exists + assert ("EXISTS(" not in sql) is skips_exists def test_giqldisjoin_sql_should_emit_exists_clause_when_enclosing_cte_shadows_target_from_outer_scope( self, @@ -599,7 +622,7 @@ def test_giqldisjoin_sql_should_emit_exists_clause_when_enclosing_cte_shadows_ta ) # Assert - assert sql.count("EXISTS (") == 1 + assert sql.count("EXISTS(") == 1 def test_giqldisjoin_sql_should_resolve_recursive_cte_reference(self): """Test that a WITH RECURSIVE CTE reference resolves to the CTE. @@ -627,7 +650,7 @@ def test_giqldisjoin_sql_should_resolve_recursive_cte_reference(self): # Assert assert "__giql_dj_ref AS (SELECT * FROM refs)" in sql - assert "EXISTS (" in sql + assert "EXISTS(" in sql assert '("start" - 1)' not in sql def test_giqldisjoin_sql_should_resolve_set_operation_attached_cte_reference(self): @@ -656,7 +679,7 @@ def test_giqldisjoin_sql_should_resolve_set_operation_attached_cte_reference(sel # Assert assert "__giql_dj_ref AS (SELECT * FROM refs)" in sql - assert sql.count("EXISTS (") == 1 + assert sql.count("EXISTS(") == 1 def test_giqldisjoin_sql_should_resolve_redeclared_inner_cte_reference(self): """Test that an inner WITH redeclaring an outer CTE name still resolves. 
@@ -683,7 +706,7 @@ def test_giqldisjoin_sql_should_resolve_redeclared_inner_cte_reference(self): # Assert assert "__giql_dj_ref AS (SELECT * FROM refs)" in sql - assert sql.count("EXISTS (") == 1 + assert sql.count("EXISTS(") == 1 def test_giqldisjoin_sql_should_not_see_cte_declared_in_sibling_derived_table( self, @@ -810,17 +833,50 @@ def test_giqldisjoin_sql_should_dedupe_self_reference_to_one_canon_cte(self): assert sql.count("AS (SELECT * REPLACE") == 1 assert sql.count("__giql_canon_") == 3 - def test_giqldisjoin_sql_should_emit_passthrough_interval_in_target_encoding(self): - """Test that the passed-through interval is de-canonicalized to the target. + def test_transpile_should_emit_portable_passthrough_interval_on_generic_target( + self, + ): + """Test that the passthrough uses a portable EXCEPT projection by default. + + Given: + A self-mode DISJOIN over a 1-based closed target whose row passes + through canonical inside the wrapper CTE, transpiled for the generic + target (which does not support ``SELECT * REPLACE``). + When: + Transpiling to SQL. + Then: + It should de-canonicalize the passed-through start/end back into the + target's encoding via a portable ``* EXCEPT`` projection that drops + the interval columns and re-projects them recomputed. + """ + # Arrange & act + sql = transpile( + "SELECT * FROM DISJOIN(features)", + tables=[ + Table("features", coordinate_system="1based", interval_type="closed") + ], + ) + + # Assert + assert ( + 't.* EXCEPT ("start", "end"), (t."start" + 1) AS "start", ' + 't."end" AS "end"' in sql + ) + + def test_transpile_should_emit_star_replace_passthrough_on_duckdb_target( + self, + ): + """Test that the passthrough uses ``* REPLACE`` on a REPLACE-capable target. Given: A self-mode DISJOIN over a 1-based closed target whose row passes - through canonical inside the wrapper CTE. + through canonical inside the wrapper CTE, transpiled for the DuckDB + target (which supports ``SELECT * REPLACE``). When: Transpiling to SQL. 
Then: It should de-canonicalize the passed-through start/end back into the - target's encoding via a star REPLACE on the final projection. + target's encoding via a star ``REPLACE`` on the final projection. """ # Arrange & act sql = transpile( @@ -828,6 +884,7 @@ def test_giqldisjoin_sql_should_emit_passthrough_interval_in_target_encoding(sel tables=[ Table("features", coordinate_system="1based", interval_type="closed") ], + dialect="duckdb", ) # Assert diff --git a/tests/test_usage_patterns.py b/tests/test_usage_patterns.py index c82dca7..7b3dd29 100644 --- a/tests/test_usage_patterns.py +++ b/tests/test_usage_patterns.py @@ -89,9 +89,12 @@ def _execute(usage: OperatorUsage, query: str, engine: str) -> list: The descriptor's `tables` yields a `Table` config for any fixture with a custom physical layout, so GIQL resolves custom column names and - coordinate systems during transpilation. + coordinate systems during transpilation. The query is transpiled for the + engine's own target dialect so engine-specific emission (e.g. DISJOIN's + capability-driven non-canonical passthrough, which uses ``* REPLACE`` on + DuckDB) executes on the engine it is shaped for. """ - sql = transpile(query, tables=list(usage.tables)) + sql = transpile(query, tables=list(usage.tables), dialect=engine) rows = ENGINES[engine](usage.fixtures, sql) return sorted(([list(row) for row in rows]), key=repr)