diff --git a/docs/transpilation/schema-mapping.rst b/docs/transpilation/schema-mapping.rst index 453c289..2cba2ea 100644 --- a/docs/transpilation/schema-mapping.rst +++ b/docs/transpilation/schema-mapping.rst @@ -173,6 +173,23 @@ If your data uses 1-based coordinates (like VCF or GFF), configure the ], ) +.. note:: + + **Non-canonical encodings currently require a DuckDB-compatible engine.** + When a table declares an encoding other than the default 0-based half-open + (for example ``coordinate_system="1based"`` or ``interval_type="closed"``), + GIQL canonicalizes its coordinates by wrapping the relation in a hidden CTE + that uses ``SELECT * REPLACE (...)`` syntax. That syntax is supported by + DuckDB, BigQuery, Snowflake, and ClickHouse, but **not** by PostgreSQL, + SQLite, or DataFusion. Tables in the default 0-based half-open encoding are + unaffected -- they take an identity fast path that emits portable SQL. + + To target a non-``REPLACE`` engine today, store your data in 0-based + half-open form, or convert it explicitly in a CTE and reference that CTE + (which GIQL treats as already canonical). Making canonicalization emit + portable SQL on every engine is tracked in + `#132 <https://github.com/abdenlab/giql/issues/132>`_. + Working with Point Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/giql/canonicalizer.py b/src/giql/canonicalizer.py index bb0e34f..714a007 100644 --- a/src/giql/canonicalizer.py +++ b/src/giql/canonicalizer.py @@ -34,6 +34,20 @@ tradeoff the epic calls out (only synthesize a wrapper when canonicalization actually changes columns). +Engine portability (known limitation) +------------------------------------- +The wrapper projection uses ``SELECT * REPLACE (...)`` to canonicalize the +interval columns in place while passing every other source column through +untouched (the registry declares only the genomic columns, so an explicit +full-column projection is not available). 
``* REPLACE`` is supported by DuckDB, +BigQuery, Snowflake, and ClickHouse, but **not** by PostgreSQL, SQLite, or +DataFusion — so a non-canonical encoding currently transpiles to +engine-incompatible SQL on those targets. Identity-encoded (default 0-based +half-open) relations are unaffected: they skip wrapping entirely and emit +portable SQL. Making the emit strategy dialect-aware (an explicit portable +projection when the target lacks ``REPLACE`` or the full schema is declared) is +tracked in https://github.com/abdenlab/giql/issues/132. + Gating (epic #114, step 6) -------------------------- The pass is gated per operator by a ``GIQL_CANONICALIZE`` class attribute on the @@ -48,11 +62,17 @@ De-canonicalization hook ------------------------- -The outermost ``SELECT`` projection receives a de-canonicalization rewrite for -any output column that a migrated operator emitted in canonical form but that -must land in the user's preferred encoding. With no operator migrated in this -issue that rewrite has nothing to act on; :func:`_decanonicalize_outputs` is the -designed-but-inert hook the port issues will fill in. +A migrated operator's *output* columns must land back in the target relation's +declared encoding. Epic #114 step 6 envisioned a rewrite of the outermost +``SELECT`` projection, but that placement is wrong for a table function: DISJOIN +synthesizes its ``disjoin_*`` output and its passed-through interval at +*generation* time, so those columns do not exist as AST in this pass, and a +``SELECT *`` consumer hides them from any outer-projection rewrite. So +:func:`_decanonicalize_outputs` instead records each wrapped slot's *original* +:class:`~giql.table.Table` on the operator's +:class:`~giql.resolver.OperatorResolution`, and the operator's emitter reads it +to de-canonicalize those synthesized columns where it generates them (DISJOIN, +issue #122). 
""" from __future__ import annotations @@ -246,22 +266,39 @@ def _fresh_name(next_name, taken: set[str]) -> str: def _canonical_projection(ref: ResolvedRef) -> exp.Select: """Build the ``SELECT`` body that projects *ref*'s table to canonical form. - The projection exposes the canonical ``chrom`` / ``start`` / ``end`` columns - under their original physical names, with ``start`` / ``end`` rewritten by - the :mod:`giql.canonical` arithmetic for the table's declared encoding. This - is the interval contract every CTE / subquery reference is assumed to satisfy - (canonical 0-based half-open ``chrom`` / ``start`` / ``end``); operator port - issues #122 / #123 may extend it with pass-through columns as their emitters - require. + The projection is a **full-row passthrough**: ``SELECT *`` keeps every + physical column of the source relation, and a star ``REPLACE`` rewrites only + the two interval columns — ``start`` / ``end``, under their original physical + names — with the :mod:`giql.canonical` arithmetic for the table's declared + encoding. ``chrom`` and every non-interval column flow through the star + untouched. + + The full row (rather than a bare ``chrom`` / ``start`` / ``end`` triple) is + required by table-function operators whose final projection passes the whole + source row through — DISJOIN's ``SELECT t.*`` (#122) — and by their join-back + semantics, which key on the source's physical columns. A CTE / subquery + reference that only needs the canonical interval triple still reads those + three columns from the same wrapper. 
""" - chrom, start, end = ref.cols + _chrom, start, end = ref.cols table = ref.table relation = ref.name - return exp.select( - exp.alias_(exp.column(chrom), chrom), - exp.alias_(_canonical_start_expr(start, table), start), - exp.alias_(_canonical_end_expr(end, table), end), - ).from_(exp.to_table(relation)) + # Quote the interval identifiers: the canonical column names are physical and + # routinely reserved words (the default genomic layout's ``start`` / ``end``), + # so the executed wrapper must quote them. + star = exp.Star( + replace=[ + exp.alias_( + _canonical_start_expr(start, table), + exp.to_identifier(start, quoted=True), + ), + exp.alias_( + _canonical_end_expr(end, table), + exp.to_identifier(end, quoted=True), + ), + ] + ) + return exp.Select(expressions=[star]).from_(exp.to_table(relation)) def _canonical_start_expr(start: str, table: Table | None) -> exp.Expression: @@ -272,7 +309,7 @@ def _canonical_start_expr(start: str, table: Table | None) -> exp.Expression: - ``0based``: ``start`` (identity) - ``1based``: ``start - 1`` """ - col = exp.column(start) + col = exp.column(exp.to_identifier(start, quoted=True)) if table is None or table.coordinate_system == "0based": return col return exp.paren(exp.Sub(this=col, expression=exp.Literal.number(1))) @@ -288,7 +325,7 @@ def _canonical_end_expr(end: str, table: Table | None) -> exp.Expression: - ``1based`` / ``half_open``: ``end - 1`` - ``1based`` / ``closed``: ``end`` (identity) """ - col = exp.column(end) + col = exp.column(exp.to_identifier(end, quoted=True)) if table is None: return col key = (table.coordinate_system, table.interval_type) @@ -343,13 +380,27 @@ def _decanonicalize_outputs( expression: exp.Expression, targets: list[tuple[exp.Expression, str, ResolvedRef]], ) -> None: - """De-canonicalize migrated operator outputs in the outermost projection. - - Inert hook (epic #114, step 6). 
The outermost ``SELECT`` projection list - should rewrite any output column a migrated operator emitted in canonical - form back into the user's preferred encoding. No operator is migrated in - issue #121, so there is nothing to rewrite; the operator port issues (#122, - #123) fill this in alongside flipping their ``GIQL_CANONICALIZE`` flags. + """Preserve each wrapped slot's original encoding for the emitter's output. + + A wrapped slot's :class:`~giql.resolver.ResolvedRef` is rewritten to a + ``Table``-free canonical-CTE ref, which would otherwise lose the + (non-canonical) encoding the operator's *output* must round-trip back into. + + The de-canonicalization itself cannot be applied on the AST in this pass for + a table-function operator: DISJOIN synthesizes its ``disjoin_*`` columns and + its passed-through interval at *generation* time, so those columns do not + exist as AST here, and a ``SELECT *`` consumer hides them from any + outer-projection rewrite. The originally-envisioned outermost-projection + rewrite (epic #114, step 6) is therefore wrong for projected + table-function columns; instead this hook records the per-slot original + :class:`~giql.table.Table` on the :class:`~giql.resolver.OperatorResolution`, + and the operator's emitter reads it to de-canonicalize those synthesized + columns where it generates them (see :issue:`122`). + + *targets* carries the original (pre-rewrite) refs, so ``ref.table`` is the + source relation's declared encoding. """ - # Intentionally empty until an operator opts in (see module docstring). 
- return None + for node, arg, ref in targets: + resolution = node.meta.get(META_KEY) + if isinstance(resolution, OperatorResolution): + resolution.output_tables[arg] = ref.table diff --git a/src/giql/expressions.py b/src/giql/expressions.py index 4364fce..52e5fce 100644 --- a/src/giql/expressions.py +++ b/src/giql/expressions.py @@ -336,6 +336,15 @@ class GIQLDisjoin(exp.Func): "reference": False, # Optional: reference table/CTE name or subquery } + #: Opt DISJOIN into the CanonicalizeCoordinates pass (epic #114 step 7, + #: issue #122). With this flag set, pass 2 wraps every non-canonical + #: interval-bearing operand in a canonical ``__giql_canon_*`` CTE and + #: rewrites the slot to point at it, so the emitter consumes already-canonical + #: 0-based half-open columns instead of canonicalizing inline. Identity + #: (0-based half-open) operands are left unwrapped and the emitted SQL stays + #: byte-identical. + GIQL_CANONICALIZE = True + GIQL_SLOTS = ( SlotSpec("this", frozenset({"registered_table"}), required=True), SlotSpec( diff --git a/src/giql/generators/base.py b/src/giql/generators/base.py index 189a14e..2bbf918 100644 --- a/src/giql/generators/base.py +++ b/src/giql/generators/base.py @@ -280,41 +280,56 @@ def giqldisjoin_sql(self, expression: GIQLDisjoin) -> str: filter drops sub-intervals overlapping no reference interval. When no ``reference`` is given it defaults to the target set. - Coordinate-system round-tripping is handled by - :func:`giql.canonical.decanonical_start` / - :func:`giql.canonical.decanonical_end`. + Input canonicalization is owned by ``CanonicalizeCoordinates`` (pass 2, + issue #122): every non-canonical interval-bearing operand is rewritten to + a canonical ``__giql_canon_*`` CTE before generation, so this emitter + consumes already-canonical 0-based half-open columns and applies no + in-emitter canonicalization arithmetic. 
The output round-trip back to the + target's declared encoding stays here — the ``disjoin_*`` columns and the + passed-through interval are synthesized at generation time and cannot be + reached by a pass-2 outermost-projection rewrite — driven by the original + encoding the pass preserves on the resolution. :param expression: GIQLDisjoin expression node :return: SQL string (a parenthesized WITH-CTE subquery) for the DISJOIN table """ - # Unpack the resolution metadata attached by ResolveOperatorRefs (pass 1). + # Unpack the resolution metadata attached by ResolveOperatorRefs (pass 1) + # and rewritten by CanonicalizeCoordinates (pass 2). target_ref, ref, ref_from = self._disjoin_resolution(expression) target_name = target_ref.name target_chrom, target_start, target_end = target_ref.cols - target_table = target_ref.table ref_chrom, ref_start, ref_end = ref.cols - ref_table = ref.table is_self_reference = ref.coverage_skippable - # Canonical target endpoints, qualified by the __giql_dj_tgt alias. + # The target's *declared* encoding, which disjoin_* output and the + # passed-through interval must round-trip back into. Pass 2 preserves it + # on the resolution when it wraps a non-canonical target (the slot's own + # Table is then None); a canonical target is left unwrapped and its slot + # Table carries the (identity) encoding. + output_table = self._disjoin_output_encoding(expression, target_ref) + + # Post-pass every operand is canonical 0-based half-open (a registered + # table is either identity-encoded or rewritten to a canonical CTE), so + # the physical columns are consumed verbatim with no canonicalization. t_chrom = f't."{target_chrom}"' - t_start = canonical_start(f't."{target_start}"', target_table) - t_end = canonical_end(f't."{target_end}"', target_table) - - # Canonical reference endpoints: unqualified for the breakpoint CTE, - # qualified by 'r' for the coverage EXISTS filter. 
- bp_start = canonical_start(f'"{ref_start}"', ref_table) - bp_end = canonical_end(f'"{ref_end}"', ref_table) - r_start = canonical_start(f'r."{ref_start}"', ref_table) - r_end = canonical_end(f'r."{ref_end}"', ref_table) - - # disjoin_start / disjoin_end are emitted in the target table's - # coordinate system so an output row carries one convention; the cut - # math above stays canonical internally. - out_start = decanonical_start("s.seg_start", target_table) - out_end = decanonical_end("s.seg_end", target_table) + t_start = f't."{target_start}"' + t_end = f't."{target_end}"' + + # Reference endpoints: unqualified for the breakpoint CTE, qualified by + # 'r' for the coverage EXISTS filter. + bp_start = f'"{ref_start}"' + bp_end = f'"{ref_end}"' + r_start = f'r."{ref_start}"' + r_end = f'r."{ref_end}"' + + # disjoin_start / disjoin_end are emitted in the target's declared + # coordinate system so an output row carries one convention; the cut math + # stays canonical internally. + out_start = decanonical_start("s.seg_start", output_table) + out_end = decanonical_end("s.seg_end", output_table) + passthrough = self._disjoin_passthrough(target_start, target_end, output_table) # Build the WITH clause one named fragment per __giql_dj_* CTE so each # block reads on its own. 
The `seg_end > seg_start` guard in the final @@ -361,7 +376,8 @@ def giqldisjoin_sql(self, expression: GIQLDisjoin) -> str: ) where_sql = " AND ".join(where_clauses) final_select = ( - f"SELECT t.*, s.kc AS disjoin_chrom, {out_start} AS disjoin_start, " + f"SELECT {passthrough}, s.kc AS disjoin_chrom, " + f"{out_start} AS disjoin_start, " f"{out_end} AS disjoin_end FROM __giql_dj_tgt AS t " f'JOIN __giql_dj_segs AS s ON t."{target_chrom}" = s.kc ' f'AND t."{target_start}" = s.ks AND t."{target_end}" = s.ke ' @@ -372,6 +388,64 @@ def giqldisjoin_sql(self, expression: GIQLDisjoin) -> str: f"{cuts_cte}, {segs_cte} {final_select})" ) + def _disjoin_output_encoding( + self, expression: GIQLDisjoin, target_ref: ResolvedRef + ) -> Table | None: + """Return the target's declared encoding for DISJOIN's output round-trip. + + ``CanonicalizeCoordinates`` (pass 2) records the original + :class:`~giql.table.Table` on the resolution when it wraps a non-canonical + target (blanking the slot's own ``table``). For an unwrapped target — a + canonical registered table, or any target when the pass did not run — the + slot's own ``table`` carries the (identity) encoding. + + :param expression: + GIQLDisjoin expression node + :param target_ref: + The resolved target reference (post pass 2) + :return: + The target's declared :class:`~giql.table.Table`, or ``None`` + """ + resolution = expression.meta.get(META_KEY) + if isinstance(resolution, OperatorResolution): + preserved = resolution.output_tables.get("this") + if preserved is not None: + return preserved + return target_ref.table + + def _disjoin_passthrough( + self, target_start: str, target_end: str, output_table: Table | None + ) -> str: + """Project the target's full row, de-canonicalizing the interval columns. + + When the target's declared encoding is canonical 0-based half-open the + row passes through as a plain ``t.*`` — the byte-identical identity fast + path. 
When it is non-canonical the interval columns, canonical inside + ``__giql_dj_tgt``, are de-canonicalized back into that encoding via a star + ``REPLACE`` so the passed-through interval matches the target's own + convention. (Only non-canonical targets are wrapped, so the ``REPLACE`` + appears only where a canonical CTE already shapes the SQL.) + + :param target_start: + Physical start column name + :param target_end: + Physical end column name + :param output_table: + The target's declared :class:`~giql.table.Table`, or ``None`` + :return: + The passthrough projection fragment (``t.*`` or a star ``REPLACE``) + """ + if output_table is None or ( + output_table.coordinate_system == "0based" + and output_table.interval_type == "half_open" + ): + return "t.*" + pt_start = decanonical_start(f't."{target_start}"', output_table) + pt_end = decanonical_end(f't."{target_end}"', output_table) + return ( + f't.* REPLACE ({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")' + ) + def giqldistance_sql(self, expression: GIQLDistance) -> str: """Generate SQL CASE expression for DISTANCE function. diff --git a/src/giql/resolver.py b/src/giql/resolver.py index 2c6889a..9fa3104 100644 --- a/src/giql/resolver.py +++ b/src/giql/resolver.py @@ -346,12 +346,23 @@ class OperatorResolution: resolve (a literal range, or an unqualified column outside a current-table context) is left out and the generator raises its existing error. + output_tables : dict[str, Table] + Mapping from a slot's arg key to the *original* :class:`~giql.table.Table` + whose declared encoding that slot carried before + :func:`giql.canonicalizer.canonicalize_coordinates` (pass 2) wrapped it + in a canonical CTE and blanked its ``ResolvedRef.table``. 
The pass + populates this for every slot it wraps so the operator's emitter can + round-trip *synthesized* output columns (DISJOIN's ``disjoin_*`` and its + passed-through interval) back into that encoding — columns that do not + exist as AST in pass 2 and that a ``SELECT *`` consumer hides from any + outer-projection rewrite. Empty until pass 2 wraps a slot. """ operator: str slots: dict[str, ResolvedRef | ResolvedInterval] deferrals: dict[str, SlotDeferral] = field(default_factory=dict) columns: dict[str, ResolvedColumn] = field(default_factory=dict) + output_tables: dict[str, Table] = field(default_factory=dict) def slot(self, arg: str) -> ResolvedRef | ResolvedInterval | None: """Return the resolved metadata for slot *arg*, or ``None``.""" diff --git a/tests/test_canonicalizer.py b/tests/test_canonicalizer.py index efffb0f..6899e9a 100644 --- a/tests/test_canonicalizer.py +++ b/tests/test_canonicalizer.py @@ -108,18 +108,20 @@ def test_canonical_table_sql_unchanged(self): assert actual == expected assert CANON_PREFIX not in actual - def test_non_canonical_table_sql_unchanged(self): + def test_non_canonical_table_sql_unchanged(self, monkeypatch): """Test that a non-canonical table's SQL is byte-identical through the pass. Given: - A DISJOIN over a non-canonical (1based/closed) registered table and no - operator opted into canonicalization. + A DISJOIN over a non-canonical (1based/closed) registered table with + its GIQL_CANONICALIZE flag explicitly toggled off (the gating an + unmigrated operator relies on). When: Transpiling the query and comparing against a pass-bypassed run. Then: No wrapper CTE is synthesized and the SQL is unchanged. 
""" # Arrange + monkeypatch.setattr(GIQLDisjoin, "GIQL_CANONICALIZE", False, raising=False) query = "SELECT * FROM DISJOIN(variants)" variants = Table("variants", coordinate_system="1based", interval_type="closed") tables = Tables() @@ -134,17 +136,19 @@ def test_non_canonical_table_sql_unchanged(self): assert actual == expected assert CANON_PREFIX not in actual - def test_pass_returns_expression_with_no_with_added(self): + def test_pass_returns_expression_with_no_with_added(self, monkeypatch): """Test that the no-op pass adds no WITH clause to the AST. Given: - A non-canonical DISJOIN AST and no operator opted in. + A non-canonical DISJOIN AST with its GIQL_CANONICALIZE flag explicitly + toggled off. When: Running canonicalize_coordinates directly. Then: No canonical CTE is present on the returned tree. """ # Arrange + monkeypatch.setattr(GIQLDisjoin, "GIQL_CANONICALIZE", False, raising=False) tables = _tables(("1based", "closed")) # Act @@ -175,8 +179,8 @@ def test_zero_based_closed_projection(self, disjoin_opted_in): # Assert body = _canon_ctes(ast)[0].this.sql() - assert "start AS start" in body - assert "(end + 1) AS end" in body + assert '"start" AS "start"' in body + assert '("end" + 1) AS "end"' in body def test_one_based_half_open_projection(self, disjoin_opted_in): """Test the canonical projection for a 1based/half_open table. @@ -196,8 +200,8 @@ def test_one_based_half_open_projection(self, disjoin_opted_in): # Assert body = _canon_ctes(ast)[0].this.sql() - assert "(start - 1) AS start" in body - assert "(end - 1) AS end" in body + assert '("start" - 1) AS "start"' in body + assert '("end" - 1) AS "end"' in body def test_one_based_closed_projection(self, disjoin_opted_in): """Test the canonical projection for a 1based/closed table. 
@@ -217,8 +221,8 @@ def test_one_based_closed_projection(self, disjoin_opted_in): # Assert body = _canon_ctes(ast)[0].this.sql() - assert "(start - 1) AS start" in body - assert "end AS end" in body + assert '("start" - 1) AS "start"' in body + assert '"end" AS "end"' in body def test_projection_preserves_custom_column_names(self, disjoin_opted_in): """Test the projection uses a table's physical column names. @@ -228,7 +232,8 @@ def test_projection_preserves_custom_column_names(self, disjoin_opted_in): When: Running pass 2. Then: - The wrapper exposes the canonical interval under those same names. + The wrapper exposes the canonical interval under those same names; + chrom (never offset) flows through the star untouched. """ # Arrange variants = Table( @@ -247,9 +252,9 @@ def test_projection_preserves_custom_column_names(self, disjoin_opted_in): # Assert body = _canon_ctes(ast)[0].this.sql() - assert "chr AS chr" in body - assert "(lo - 1) AS lo" in body - assert "hi AS hi" in body + assert body.startswith("SELECT *") + assert '("lo" - 1) AS "lo"' in body + assert '"hi" AS "hi"' in body class TestPassThrough: @@ -548,8 +553,10 @@ def test_wrap_iff_non_canonical(self, coordinate_system, interval_type): wrapped slot always becomes a Table-free CTE ref. """ # Arrange - # A plain try/finally toggles the class flag: @given re-runs the body - # many times under one function-scoped fixture, so monkeypatch is unsafe. + # DISJOIN is opted into canonicalization by default (issue #122); a plain + # save/restore toggle pins it on, since @given re-runs the body many times + # under one function-scoped fixture, making monkeypatch unsafe. 
+ previous = GIQLDisjoin.GIQL_CANONICALIZE GIQLDisjoin.GIQL_CANONICALIZE = True tables = _tables((coordinate_system, interval_type)) is_canonical = coordinate_system == "0based" and interval_type == "half_open" @@ -558,7 +565,7 @@ def test_wrap_iff_non_canonical(self, coordinate_system, interval_type): try: ast = _prepare("SELECT * FROM DISJOIN(variants)", tables) finally: - del GIQLDisjoin.GIQL_CANONICALIZE + GIQLDisjoin.GIQL_CANONICALIZE = previous # Assert ctes = _canon_ctes(ast) @@ -580,18 +587,26 @@ def test_flags_off_is_always_noop(self, coordinate_system, interval_type): """Test that the pass never mutates the tree while flags are off. Given: - A DISJOIN over a registered table with any encoding and no operator - opted in. + A DISJOIN over a registered table with any encoding and its + GIQL_CANONICALIZE flag explicitly toggled off (the gating an + unmigrated operator relies on). When: Running passes 1 and 2. Then: No canonical CTE is ever synthesized. """ # Arrange + # Save/restore the class flag off: @given re-runs the body many times + # under one function-scoped fixture, making monkeypatch unsafe. + previous = GIQLDisjoin.GIQL_CANONICALIZE + GIQLDisjoin.GIQL_CANONICALIZE = False tables = _tables((coordinate_system, interval_type)) # Act - ast = _prepare("SELECT * FROM DISJOIN(variants)", tables) + try: + ast = _prepare("SELECT * FROM DISJOIN(variants)", tables) + finally: + GIQLDisjoin.GIQL_CANONICALIZE = previous # Assert assert _canon_ctes(ast) == [] diff --git a/tests/test_disjoin_transpilation.py b/tests/test_disjoin_transpilation.py index 16c2bc3..34412b9 100644 --- a/tests/test_disjoin_transpilation.py +++ b/tests/test_disjoin_transpilation.py @@ -115,7 +115,9 @@ def test_giqldisjoin_sql_should_canonicalize_one_based_closed_target(self): When: Transpiling a DISJOIN query. Then: - It should shift the start to canonical 0-based coordinates. 
+ It should shift the start to canonical 0-based coordinates in a + synthesized __giql_canon_ wrapper CTE, with no inline shift left in + the emitter's cut arithmetic. """ # Arrange & act sql = transpile( @@ -130,7 +132,11 @@ def test_giqldisjoin_sql_should_canonicalize_one_based_closed_target(self): ) # Assert - assert '(t."start" - 1)' in sql + # Canonicalization now lives in the CanonicalizeCoordinates wrapper CTE, + # not as inline emitter arithmetic. + assert "__giql_canon_" in sql + assert '("start" - 1) AS "start"' in sql + assert '(t."start" - 1)' not in sql def test_giqldisjoin_sql_should_emit_disjoin_start_in_target_encoding(self): """Test that disjoin_start is emitted in the target's encoding. @@ -403,8 +409,8 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_one_based_cl When: Transpiling to SQL. Then: - It should omit the coverage EXISTS subquery and still emit the - canonical 0-based shift on the target endpoints. + It should omit the coverage EXISTS subquery and still apply the + canonical 0-based shift on the target endpoints in the wrapper CTE. """ # Arrange & act sql = transpile( @@ -420,7 +426,7 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_one_based_cl # Assert assert "EXISTS (" not in sql - assert '(t."start" - 1)' in sql + assert '("start" - 1) AS "start"' in sql def test_giqldisjoin_sql_should_skip_exists_clause_when_target_uses_custom_column_names( self, @@ -460,8 +466,9 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_explicit_self_reference_ When: Transpiling to SQL. Then: - It should omit the coverage EXISTS subquery and still emit the - canonical 0-based shift on the breakpoint CTE endpoints. + It should omit the coverage EXISTS subquery and still apply the + canonical 0-based shift in the wrapper CTE shared by target and + (self-)reference. 
""" # Arrange & act sql = transpile( @@ -477,7 +484,11 @@ def test_giqldisjoin_sql_should_skip_exists_clause_when_explicit_self_reference_ # Assert assert "EXISTS (" not in sql - assert '("start" - 1)' in sql + assert '("start" - 1) AS "start"' in sql + # Target and explicit self-reference dedupe to a single wrapper CTE: one + # CTE definition plus the two FROM references (target + reference). + assert sql.count("__giql_canon_") == 3 + assert "SELECT * REPLACE" in sql def test_giqldisjoin_sql_should_emit_one_exists_clause_when_query_mixes_self_and_distinct_reference_disjoins( self, @@ -695,3 +706,129 @@ def test_giqldisjoin_sql_should_not_see_cte_declared_in_sibling_derived_table( "(WITH refs AS (SELECT 1) SELECT * FROM refs) AS sub", tables=["features"], ) + + +class TestDisjoinCanonicalization: + """DISJOIN consumes CanonicalizeCoordinates (pass 2) output (issue #122).""" + + def test_giqldisjoin_sql_should_wrap_non_canonical_target_in_canon_cte(self): + """Test that a non-canonical target is wrapped by the canonicalize pass. + + Given: + A self-mode DISJOIN over a 1-based closed target. + When: + Transpiling to SQL. + Then: + It should synthesize a __giql_canon_ wrapper CTE doing the canonical + shift via a star REPLACE, with no canonicalization arithmetic left + inline in the emitter's cut/breakpoint expressions. + """ + # Arrange & act + sql = transpile( + "SELECT * FROM DISJOIN(features)", + tables=[ + Table("features", coordinate_system="1based", interval_type="closed") + ], + ) + + # Assert + assert "__giql_canon_" in sql + assert "SELECT * REPLACE" in sql + assert '("start" - 1) AS "start"' in sql + # No inline canonicalization arithmetic survives in the emitter output. + assert '(t."start" - 1)' not in sql + assert '("start" - 1) AS pos' not in sql + + def test_giqldisjoin_sql_should_wrap_non_canonical_reference_in_canon_cte(self): + """Test that a non-canonical explicit reference is wrapped by the pass. 
+ + Given: + A two-table DISJOIN whose distinct reference table is 1-based closed + and whose target is canonical. + When: + Transpiling to SQL. + Then: + It should wrap the reference in a __giql_canon_ CTE and emit no inline + shift on the breakpoint endpoints. + """ + # Arrange & act + sql = transpile( + "SELECT * FROM DISJOIN(features, reference := refs)", + tables=[ + "features", + Table("refs", coordinate_system="1based", interval_type="closed"), + ], + ) + + # Assert + assert "__giql_canon_" in sql + assert "SELECT * REPLACE" in sql + # The breakpoint CTE reads the canonical endpoints verbatim, no inline shift. + assert '("start" - 1) AS pos' not in sql + + def test_giqldisjoin_sql_should_not_wrap_canonical_target(self): + """Test that a canonical target produces no wrapper CTE (identity fast path). + + Given: + A self-mode DISJOIN over a canonical 0-based half-open target. + When: + Transpiling to SQL. + Then: + It should synthesize no __giql_canon_ wrapper CTE and pass the row + through as a plain ``t.*`` with no star REPLACE. + """ + # Arrange & act + sql = transpile("SELECT * FROM DISJOIN(features)", tables=["features"]) + + # Assert + assert "__giql_canon_" not in sql + assert "REPLACE" not in sql + assert "t.*," in sql + + def test_giqldisjoin_sql_should_dedupe_self_reference_to_one_canon_cte(self): + """Test that an omitted self-reference shares one wrapper with the target. + + Given: + A self-mode (omitted reference) DISJOIN over a 1-based closed target, + where target and defaulted reference resolve to the same relation. + When: + Transpiling to SQL. + Then: + It should synthesize exactly one canonical wrapper CTE, referenced by + both the target and the reference CTEs (one definition plus two FROM + references). 
+ """ + # Arrange & act + sql = transpile( + "SELECT * FROM DISJOIN(features)", + tables=[ + Table("features", coordinate_system="1based", interval_type="closed") + ], + ) + + # Assert + assert sql.count("AS (SELECT * REPLACE") == 1 + assert sql.count("__giql_canon_") == 3 + + def test_giqldisjoin_sql_should_emit_passthrough_interval_in_target_encoding(self): + """Test that the passed-through interval is de-canonicalized to the target. + + Given: + A self-mode DISJOIN over a 1-based closed target whose row passes + through canonical inside the wrapper CTE. + When: + Transpiling to SQL. + Then: + It should de-canonicalize the passed-through start/end back into the + target's encoding via a star REPLACE on the final projection. + """ + # Arrange & act + sql = transpile( + "SELECT * FROM DISJOIN(features)", + tables=[ + Table("features", coordinate_system="1based", interval_type="closed") + ], + ) + + # Assert + assert 't.* REPLACE ((t."start" + 1) AS "start", t."end" AS "end")' in sql