Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/giql/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@

# Default bin size for INTERSECTS binned equi-join optimization
DEFAULT_BIN_SIZE = 10_000

#: The reserved alias prefix naming DISJOIN's internal CTEs (``__giql_dj_ref`` /
#: ``__giql_dj_tgt`` / ``__giql_dj_cuts`` / ...). Follows the same
#: reserved-prefix scheme as :data:`giql.canonicalizer.CANON_PREFIX`; the leading
#: double underscore keeps the namespace clear of user identifiers. It is defined
#: once here as the single source of truth, so that the resolver's
#: reserved-prefix guard and the DISJOIN expander's CTE names and guard cannot
#: drift apart.
DJ_PREFIX = "__giql_dj_"
348 changes: 348 additions & 0 deletions src/giql/expanders/disjoin.py

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion src/giql/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,11 @@ class GIQLDisjoin(exp.Func):
#: (0-based half-open) operands are left unwrapped and the emitted SQL stays
#: byte-identical.
GIQL_CANONICALIZE = True
GIQL_EXPAND = _EXPAND
#: Opt DISJOIN into the ExpandOperators pass (epic #137, issue #143). DISJOIN
#: is migrated to a registered expander (``giql.expanders.disjoin``), so the
#: pass replaces the node with the expander's AST and the legacy
#: ``giqldisjoin_sql`` emitter is removed.
GIQL_EXPAND = True

GIQL_SLOTS = (
SlotSpec("this", frozenset({"registered_table"}), required=True),
Expand Down
269 changes: 6 additions & 263 deletions src/giql/generators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from giql.canonical import decanonical_end
from giql.canonical import decanonical_start
from giql.dialect import GIQLDialect
from giql.expressions import GIQLDisjoin
from giql.expressions import GIQLNearest
from giql.range_parser import RangeParser
from giql.resolver import META_KEY
Expand Down Expand Up @@ -106,187 +105,17 @@ def _nearest_passthrough(
return f"{table_name}.*"
pt_start = decanonical_start(f'{table_name}."{target_start}"', output_table)
pt_end = decanonical_end(f'{table_name}."{target_end}"', output_table)
# TODO(#142): this emits an unconditional ``* REPLACE`` (DuckDB-only).
# When DataFusion gains correlated LATERAL, adopt the capability branch the
# DISJOIN expander uses (``giql.expanders.disjoin._disjoin_passthrough``):
# ``* REPLACE`` where ``supports_star_replace`` holds, the portable
# ``* EXCEPT`` form otherwise, so a non-canonical NEAREST passthrough runs
# on the DataFusion family too.
return (
f"{table_name}.* REPLACE "
f'({pt_start} AS "{target_start}", {pt_end} AS "{target_end}")'
)

def giqldisjoin_sql(self, expression: GIQLDisjoin) -> str:
    """Render the DISJOIN table function as a parenthesized WITH-CTE subquery.

    Each target interval is cut at every reference breakpoint that falls
    strictly inside it. The complete target row is passed through, and the
    resulting sub-interval is appended as ``disjoin_chrom`` /
    ``disjoin_start`` / ``disjoin_end``, expressed in the target table's own
    coordinate system. Sub-intervals that overlap no reference interval are
    removed by a coverage filter. With no explicit ``reference`` the target
    set acts as its own reference.

    Canonicalizing the *inputs* is the job of ``CanonicalizeCoordinates``
    (pass 2, issue #122): non-canonical interval-bearing operands arrive here
    already rewritten to canonical ``__giql_canon_*`` CTEs, so the columns are
    read as 0-based half-open values with no conversion arithmetic. Converting
    the *outputs* back to the target's declared encoding remains this
    emitter's responsibility, because the ``disjoin_*`` columns and the
    passed-through interval only come into existence at generation time; the
    conversion is driven by the original encoding that pass 2 keeps on the
    resolution.

    :param expression:
        GIQLDisjoin expression node
    :return:
        SQL string (a parenthesized WITH-CTE subquery) for the DISJOIN table
    """
    # Resolution metadata: attached by ResolveOperatorRefs (pass 1), then
    # rewritten by CanonicalizeCoordinates (pass 2).
    target_ref, ref, ref_from = self._disjoin_resolution(expression)
    relation_name = target_ref.name
    tgt_chrom_col, tgt_start_col, tgt_end_col = target_ref.cols
    ref_chrom_col, ref_start_col, ref_end_col = ref.cols
    skip_coverage = ref.coverage_skippable

    # The encoding the target was *declared* with; both the disjoin_* columns
    # and the passed-through interval are converted back into it.
    declared = self._disjoin_output_encoding(expression, target_ref)

    # Target columns qualified by the ``t`` alias. After pass 2 they hold
    # canonical values, so they are used as-is.
    t_chrom = f't."{tgt_chrom_col}"'
    t_start = f't."{tgt_start_col}"'
    t_end = f't."{tgt_end_col}"'
    t_key = f"{t_chrom}, {t_start}, {t_end}"

    # Segment bounds are computed canonically and only converted on output, so
    # every emitted row follows a single coordinate convention.
    seg_start_out = decanonical_start("s.seg_start", declared)
    seg_end_out = decanonical_end("s.seg_end", declared)
    row_projection = self._disjoin_passthrough(tgt_start_col, tgt_end_col, declared)

    # One fragment per __giql_dj_* CTE.
    ref_cte = f"__giql_dj_ref AS (SELECT * FROM {ref_from})"
    tgt_cte = f"__giql_dj_tgt AS (SELECT * FROM {relation_name})"

    # Breakpoints: every reference start and every reference end.
    endpoint_selects = [
        f'SELECT "{ref_chrom_col}" AS chrom, "{endpoint}" AS pos FROM __giql_dj_ref'
        for endpoint in (ref_start_col, ref_end_col)
    ]
    bp_cte = "__giql_dj_bp AS (" + " UNION ".join(endpoint_selects) + ")"

    # Cut positions per target row: its own start, its own end, and each
    # breakpoint strictly between the two.
    cut_branches = (
        f"SELECT {t_chrom} AS kc, {t_start} AS ks, {t_end} AS ke, "
        f"{t_start} AS pos FROM __giql_dj_tgt AS t",
        f"SELECT {t_key}, {t_end} FROM __giql_dj_tgt AS t",
        f"SELECT {t_key}, bp.pos FROM __giql_dj_tgt AS t "
        f"JOIN __giql_dj_bp AS bp ON bp.chrom = {t_chrom} "
        f"AND bp.pos > {t_start} AND bp.pos < {t_end}",
    )
    cuts_cte = "__giql_dj_cuts AS (" + " UNION ".join(cut_branches) + ")"

    # Consecutive cut positions within one target row form a segment.
    segs_cte = (
        "__giql_dj_segs AS ("
        "SELECT kc, ks, ke, pos AS seg_start, "
        "LEAD(pos) OVER (PARTITION BY kc, ks, ke ORDER BY pos) AS seg_end "
        "FROM __giql_dj_cuts)"
    )

    # ``seg_end > seg_start`` is a belt-and-suspenders guard: UNION already
    # de-duplicates cut positions, so LEAD can only yield a zero-length
    # segment if the UNION were ever changed to UNION ALL.
    predicate = "s.seg_end IS NOT NULL AND s.seg_end > s.seg_start"
    if not skip_coverage:
        # In self-reference mode this EXISTS is always true (each segment lies
        # inside its parent row, which is itself a reference row), so it is
        # only emitted for a distinct reference. __giql_dj_ref stays live
        # either way because __giql_dj_bp reads breakpoints from it.
        predicate += (
            " AND EXISTS (SELECT 1 FROM __giql_dj_ref AS r "
            f'WHERE r."{ref_chrom_col}" = s.kc '
            f'AND r."{ref_start_col}" <= s.seg_start '
            f'AND r."{ref_end_col}" > s.seg_start)'
        )

    final_select = (
        f"SELECT {row_projection}, s.kc AS disjoin_chrom, "
        f"{seg_start_out} AS disjoin_start, {seg_end_out} AS disjoin_end "
        "FROM __giql_dj_tgt AS t JOIN __giql_dj_segs AS s "
        f"ON {t_chrom} = s.kc AND {t_start} = s.ks AND {t_end} = s.ke "
        f"WHERE {predicate}"
    )
    with_clause = ", ".join((ref_cte, tgt_cte, bp_cte, cuts_cte, segs_cte))
    return f"(WITH {with_clause} {final_select})"

def _disjoin_output_encoding(
    self, expression: GIQLDisjoin, target_ref: ResolvedRef
) -> Table | None:
    """Look up the encoding DISJOIN's output must be converted back into.

    When ``CanonicalizeCoordinates`` (pass 2) wraps a non-canonical target it
    stores the original :class:`~giql.table.Table` on the resolution and
    blanks the slot's own ``table``. That stored table wins when present.
    Otherwise — a canonical registered table, or a run in which the pass did
    not execute — the slot's own ``table`` carries the (identity) encoding.

    :param expression:
        GIQLDisjoin expression node
    :param target_ref:
        The resolved target reference (post pass 2)
    :return:
        The target's declared :class:`~giql.table.Table`, or ``None``
    """
    resolution = expression.meta.get(META_KEY)
    # Only a real OperatorResolution can carry a preserved output table.
    recorded = (
        resolution.output_tables.get("this")
        if isinstance(resolution, OperatorResolution)
        else None
    )
    return target_ref.table if recorded is None else recorded

def _disjoin_passthrough(
    self, target_start: str, target_end: str, output_table: Table | None
) -> str:
    """Build the projection that passes the whole target row through.

    A target declared as canonical 0-based half-open (or with no declared
    table at all) yields a plain ``t.*`` — the byte-identical identity fast
    path. For any other encoding the interval columns, which are canonical
    inside ``__giql_dj_tgt``, are converted back to the declared encoding with
    a star ``REPLACE`` so the passed-through interval follows the target's own
    convention. (Only non-canonical targets are wrapped, so the ``REPLACE``
    shows up only where a canonical CTE already shapes the SQL.)

    :param target_start:
        Physical start column name
    :param target_end:
        Physical end column name
    :param output_table:
        The target's declared :class:`~giql.table.Table`, or ``None``
    :return:
        The passthrough projection fragment (``t.*`` or a star ``REPLACE``)
    """
    if output_table is None:
        return "t.*"
    if (
        output_table.coordinate_system == "0based"
        and output_table.interval_type == "half_open"
    ):
        return "t.*"

    # Non-canonical declaration: rewrite both interval columns in place.
    start_expr = decanonical_start(f't."{target_start}"', output_table)
    end_expr = decanonical_end(f't."{target_end}"', output_table)
    replacements = ", ".join(
        (
            f'{start_expr} AS "{target_start}"',
            f'{end_expr} AS "{target_end}"',
        )
    )
    return f"t.* REPLACE ({replacements})"

@staticmethod
def _generate_distance_case(
chrom_a: str,
Expand Down Expand Up @@ -481,92 +310,6 @@ def _raise_nearest_reference_error(
)
raise ValueError(f"Could not parse reference genomic range: {range_str}.")

def _disjoin_resolution(
    self, expression: GIQLDisjoin
) -> tuple[ResolvedRef, ResolvedRef, str]:
    """Extract DISJOIN's resolved operands from the node's metadata.

    The :class:`~giql.resolver.OperatorResolution` attached by
    ResolveOperatorRefs (pass 1) is read from ``expression.meta``. The result
    is ``(target_ref, ref, ref_from)``, where ``ref_from`` is the text placed
    after ``FROM`` inside the ``__giql_dj_ref`` CTE: a subquery reference has
    no name and is rendered from its AST node as an aliased derived table,
    while registered tables and CTEs are selected from by name.

    The resolver intentionally leaves some slots unresolved (an unregistered
    target; an unsupported reference node type; a reference name that uses
    the reserved ``__giql_dj_`` prefix or matches neither a registered table
    nor an in-query CTE). For those cases — and for a resolved target whose
    name uses the reserved prefix — the generator's historical diagnostics
    are raised verbatim.

    :param expression:
        GIQLDisjoin expression node
    :return:
        Tuple of ``(target_ref, ref, ref_from)``
    :raises ValueError:
        If the target or reference slot is unresolved, or a resolved name
        uses the reserved ``__giql_dj_`` prefix.
    """
    resolution = expression.meta.get(META_KEY)
    target_ref = None
    if isinstance(resolution, OperatorResolution):
        target_ref = resolution.slot("this")

    # No resolved target means the target is not a registered table.
    if target_ref is None:
        node = expression.this
        if isinstance(node, exp.Table):
            missing = node.name
        elif isinstance(node, exp.Column):
            missing = node.table or str(node.this)
        else:
            missing = str(node)
        raise ValueError(
            f"Target table '{missing}' not found in tables. "
            "Register the table before transpiling."
        )

    # The operator's internal CTEs live under the __giql_dj_ prefix, so a
    # target table named that way would collide with them.
    if target_ref.name.startswith("__giql_dj_"):
        raise ValueError(
            f"DISJOIN target {target_ref.name!r} uses the reserved "
            "'__giql_dj_' prefix, which names the operator's internal "
            "CTEs. Rename the table."
        )

    reference = expression.args.get("reference")
    ref = resolution.slot("reference")
    if ref is not None:
        ref_from = (
            f"{self.sql(reference)} AS __giql_dj_rs"
            if ref.kind == "subquery"
            else ref.name
        )
        return target_ref, ref, ref_from

    # The reference slot is unresolved: classify it again and raise the
    # matching historical diagnostic. An omitted reference always resolves
    # (to the target set), so ``reference`` is not None on this path.
    nameable = (exp.Table, exp.Column, exp.Identifier)
    if not isinstance(reference, nameable):
        raise ValueError(
            "DISJOIN reference must be a table name, a CTE, or a "
            f"(SELECT ...) subquery; got {type(reference).__name__}: "
            f"{reference}"
        )

    ref_name = reference.name
    if ref_name.startswith("__giql_dj_"):
        raise ValueError(
            f"DISJOIN reference {ref_name!r} uses the reserved "
            "'__giql_dj_' prefix, which names the operator's internal "
            "CTEs. Rename the reference relation."
        )
    raise ValueError(
        f"DISJOIN reference {ref_name!r} is neither a registered table "
        "nor a CTE defined in this query. Register the table or define "
        "the CTE before transpiling."
    )

@staticmethod
def _extract_bool_param(param_expr: exp.Expression | None) -> bool:
"""Extract boolean value from a parameter expression.
Expand Down
7 changes: 4 additions & 3 deletions src/giql/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@

Scope note (epic #114, steps 1-3)
---------------------------------
The pass is behavior-preserving. DISJOIN's emitter
(``BaseGIQLGenerator.giqldisjoin_sql``, step 2) and NEAREST's emitter
The pass is behavior-preserving. DISJOIN's expander
(``giql.expanders.disjoin``, step 2) and NEAREST's emitter
(``BaseGIQLGenerator.giqlnearest_sql``, step 3) consume the attached metadata;
DISTANCE and the spatial predicates still use the generator's legacy resolver
paths and ignore everything attached here until their port issues land. The
Expand Down Expand Up @@ -73,6 +73,7 @@
from giql.constants import DEFAULT_END_COL
from giql.constants import DEFAULT_START_COL
from giql.constants import DEFAULT_STRAND_COL
from giql.constants import DJ_PREFIX
from giql.expressions import Contains
from giql.expressions import GIQLDisjoin
from giql.expressions import GIQLDistance
Expand Down Expand Up @@ -758,7 +759,7 @@ def _resolve_disjoin_reference(
ref_name = reference.name

# The __giql_dj_ prefix names the operator's internal CTEs.
if ref_name.startswith("__giql_dj_"):
if ref_name.startswith(DJ_PREFIX):
return None

# A CTE from an enclosing WITH shadows a registered table of the same name
Expand Down
9 changes: 9 additions & 0 deletions src/giql/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ class GenericTarget(Target):
This is the default target (``dialect=None``). Its capabilities are the
conservative, maximally portable baseline that matches today's
:class:`giql.generators.base.BaseGIQLGenerator` output.

"SQL-92-ish", not strict SQL-92: because ``supports_star_replace=False``, the
DISJOIN passthrough over a **non-canonical** target falls back to a
``SELECT * EXCEPT (...)`` projection (re-appending the de-canonicalized
interval columns). ``* EXCEPT`` is **not** SQL-92 and is **not
DuckDB-runnable** — it is a DataFusion-family extension — so the generic
target's non-canonical DISJOIN output runs only on an ``* EXCEPT``-capable
engine. A canonical (0-based half-open) target passes the row through as a
plain, fully portable ``SELECT *``.
"""

name: str = "generic"
Expand Down
2 changes: 1 addition & 1 deletion src/giql/transpile.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def transpile(

# Pass 1 of the normalization pipeline (epic #114): attach resolution
# metadata to every GIQL operator slot ahead of generation. DISJOIN's
# emitter consumes this metadata (step 2); the remaining operators still
# expander consumes this metadata (step 2); the remaining operators still
# use the generator's legacy resolver paths until their ports land.
with _reraise_as_value_error("Resolution error"):
ast = resolve_operator_refs(ast, tables_container)
Expand Down
5 changes: 4 additions & 1 deletion tests/integration/coordinate_space/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ def giql_query(duckdb_connection):
def _run(query: str, *, tables: list[Table], **table_data):
    """Load interval rows, transpile ``query`` for DuckDB, and execute it.

    :param query:
        GIQL query text to transpile
    :param tables:
        Table declarations passed through to ``transpile``
    :param table_data:
        Mapping of table name to the rows loaded into DuckDB under that name
    :return:
        All result rows fetched from the DuckDB connection
    """
    for name, rows in table_data.items():
        load_intervals(duckdb_connection, name, rows)
    # Transpile exactly once, for the DuckDB target, since the SQL executes on
    # DuckDB: a non-canonical DISJOIN passthrough emits DuckDB's ``* REPLACE``
    # here rather than the portable ``* EXCEPT`` form generic targets use.
    # (A preceding generic-target transpile would be discarded and could emit
    # SQL DuckDB cannot run.)
    sql = transpile(query, tables=tables, dialect="duckdb")
    return duckdb_connection.execute(sql).fetchall()

return _run
Loading
Loading