Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
0f00f06
feat: Add GIQLCoverage expression node and parser registration
conradbzura Mar 11, 2026
08ffb4d
feat: Add CoverageTransformer for binned genome coverage
conradbzura Mar 11, 2026
a97f829
test: Add parsing and transpilation tests for COVERAGE operator
conradbzura Mar 11, 2026
38f9ac0
docs: Add COVERAGE operator reference and recipes
conradbzura Mar 11, 2026
76d36ba
feat: Support => (standard SQL) named parameter syntax in COVERAGE
conradbzura Mar 11, 2026
75bfd14
fix: Stop treating = as named parameter syntax in COVERAGE
conradbzura Mar 11, 2026
9a5a1fd
refactor: Remove dead code and fix LATERAL syntax for DuckDB compat
conradbzura Mar 12, 2026
8b8eaee
feat: Add target parameter and default alias to COVERAGE operator
conradbzura Mar 12, 2026
462e436
fix: Move COVERAGE WHERE clause into LEFT JOIN ON condition
conradbzura Mar 12, 2026
6e7b21b
test: Rewrite COVERAGE tests to spec with full API coverage
conradbzura Mar 12, 2026
4ddb5de
test: Add unit tests for bedtools test utilities
conradbzura Mar 25, 2026
ecf2b1a
test: Add unit tests for GIQL parsing, generation, and transpilation
conradbzura Mar 25, 2026
4a09eb7
test: Add bedtools integration tests for operator correctness
conradbzura Mar 25, 2026
76a988f
docs: Clarify score column reference and add sample output table
conradbzura Mar 25, 2026
67f8459
test: Add property-based tests for COVERAGE transpilation
conradbzura Mar 25, 2026
185b716
fix: Align unit tests with := named parameter syntax and fix CTE pres…
conradbzura Mar 26, 2026
1fba22a
fix: Compare only coordinates in merge-then-intersect workflow test
conradbzura Mar 26, 2026
c25a2ff
fix: Count non-null source column to preserve zero-coverage bins
conradbzura Apr 23, 2026
1adfd5d
fix: Propagate table alias into chroms subquery
conradbzura Apr 23, 2026
23205ed
fix: Preserve user CTEs in CoverageTransformer output
conradbzura Apr 23, 2026
2faa7c4
fix: Reject non-positive COVERAGE resolution at transpile time
conradbzura Apr 23, 2026
0966f27
refactor: Reuse _split_named_and_positional in GIQLCoverage
conradbzura Apr 23, 2026
47a5dd3
refactor: Delegate table and column lookup to ClusterTransformer
conradbzura Apr 23, 2026
b0c4507
style: Move public transform above private helpers in CoverageTransfo…
conradbzura Apr 23, 2026
368e812
fix: Clamp generate_series upper bound to avoid trailing empty bin
conradbzura Apr 23, 2026
63e3ac5
fix: Raise when COVERAGE FROM clause is not a named table
conradbzura Apr 23, 2026
2b698ab
fix: Require stat and target to be string literals in COVERAGE
conradbzura Apr 23, 2026
b427c53
docs: Clarify supported COVERAGE FROM clauses and CTE workaround
conradbzura Apr 23, 2026
e5297c0
docs: List COVERAGE in the dialect aggregation operators table
conradbzura Apr 23, 2026
97c7cd4
docs: Quote reserved column identifiers in 5' end counting recipe
conradbzura Apr 23, 2026
e82ae47
test: Make adjacent-neighbor nearest test honest about what it verifies
conradbzura Apr 23, 2026
1278d27
test: Execute full intersect/merge/nearest pipeline through GIQL
conradbzura Apr 23, 2026
3f2ced4
test: Register and propagate integration marker
conradbzura Apr 23, 2026
8a2f29b
test: Apply BDD naming, GWT docstrings, and AAA comments to integrati…
conradbzura Apr 23, 2026
806511e
test: Move bedtools helper tests next to the helpers they cover
conradbzura Apr 23, 2026
837927c
test: Consolidate COVERAGE tests into tests/unit/ and drop root-level…
conradbzura Apr 23, 2026
0df62b1
test: Move test_data_models alongside its target module
conradbzura Apr 23, 2026
91c19c4
test: Apply BDD naming, GWT docstrings, and AAA comments across unit …
conradbzura Apr 23, 2026
a3a8611
test: Set explicit max_examples on all Hypothesis property tests
conradbzura Apr 23, 2026
d1ac6be
test: Rename _transform_and_sql helper to reflect full pipeline scope
conradbzura Apr 23, 2026
e1d01c5
style: Apply small hygiene fixes to COVERAGE source files
conradbzura Apr 23, 2026
52f092a
refactor: Use typed SQLGlot aggregate nodes in COVERAGE transformer
conradbzura Apr 23, 2026
3d5f682
docs: Polish COVERAGE operator reference and recipes
conradbzura Apr 23, 2026
1f3fdd4
test: Strengthen integration-test rigor and extract random-interval h…
conradbzura Apr 23, 2026
54b8815
test: Tighten unit-test rigor for COVERAGE and bedtools helpers
conradbzura Apr 23, 2026
5ae4536
refactor: Scope COVERAGE to count statistic only
conradbzura Apr 24, 2026
93c1ff9
docs: Trim COVERAGE reference and recipes to count-only scope
conradbzura Apr 24, 2026
e3dd879
refactor: Rename COVERAGE operator to RASTERIZE
conradbzura Apr 24, 2026
539d5a7
docs: Rename COVERAGE references to RASTERIZE
conradbzura Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions docs/dialect/aggregation-operators.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,4 +328,128 @@ Related Operators
~~~~~~~~~~~~~~~~~

- :ref:`CLUSTER <cluster-operator>` - Assign cluster IDs without merging
- :ref:`RASTERIZE <rasterize-operator>` - Rasterize intervals onto a fixed bin grid
- :ref:`INTERSECTS <intersects-operator>` - Test for overlap between specific pairs

----

.. _rasterize-operator:

RASTERIZE
---------

Rasterize interval data onto a fixed-resolution bin grid, counting overlaps per bin.

Description
~~~~~~~~~~~

The ``RASTERIZE`` operator tiles the genome into fixed-width bins and counts the number of intervals overlapping each bin. It generates a bin grid using ``generate_series`` and joins it against the source table to count overlapping features per bin.

This is useful for:

- Summarising feature density at a user-defined resolution
- Creating fixed-resolution count tracks from interval data
- Quick visualisation of interval pile-ups across the genome

An interval that spans multiple bins is counted in each of the bins it overlaps, matching the ``bedtools coverage`` convention. As a result, the sum of bin counts is generally greater than the number of source intervals — bin counts answer "how many intervals touch this bin?", not "how are intervals partitioned across bins?".

The operator works as an aggregate function, returning one row per bin with the bin coordinates and the count.

.. note::

``RASTERIZE`` depends on ``LATERAL`` plus ``generate_series`` for bin generation, which DuckDB and PostgreSQL both support. SQLite does not currently provide either primitive, so this operator is not yet available on the SQLite backend.

.. note::

Only the ``count`` aggregation is supported in this release. Weighted summary statistics (mean, sum, min, max) over interval values raise non-trivial semantic questions when intervals span bin boundaries (full-value contribution vs. length-weighted vs. per-base depth) and are tracked as a follow-up.

Syntax
~~~~~~

.. code-block:: sql

-- Count overlapping intervals per bin
SELECT RASTERIZE(interval, <bin_width>) FROM features

-- Named resolution parameter
SELECT RASTERIZE(interval, resolution := 500) FROM features

Parameters
Comment thread
conradbzura marked this conversation as resolved.
~~~~~~~~~~

**interval**
A genomic column.

**resolution** *(required)*
Bin width in base pairs — must be a positive integer literal. Can be given as a positional or named parameter (``RASTERIZE(interval, 1000)`` or ``RASTERIZE(interval, resolution := 1000)``). Omitting it, or supplying a non-positive value, raises ``ValueError`` at transpile time.

Return Value
~~~~~~~~~~~~

Returns one row per genomic bin:

- ``chrom`` — Chromosome of the bin
- ``start`` — Start position of the bin
- ``end`` — End position of the bin
Comment thread
conradbzura marked this conversation as resolved.
- ``value`` — The count of intervals overlapping the bin (default alias; use ``AS`` to rename)

Examples
~~~~~~~~

**Basic Count:**

Count the number of features overlapping each 1 kb bin:

.. code-block:: sql

SELECT RASTERIZE(interval, 1000)
FROM features

**Named Alias:**

.. code-block:: sql

SELECT RASTERIZE(interval, 1000) AS depth
FROM reads

**With WHERE Filter:**

Assuming the source table includes a ``score`` column, count high-scoring features per bin:

.. code-block:: sql

SELECT RASTERIZE(interval, 1000) AS depth
FROM features
WHERE score > 10
Comment thread
conradbzura marked this conversation as resolved.

Supported FROM clauses
~~~~~~~~~~~~~~~~~~~~~~

``RASTERIZE`` requires a ``FROM`` clause that references a table or named CTE. Inline subqueries (``FROM (SELECT ...) AS sub``) and ``VALUES`` clauses are not supported — wrap the derivation in a ``WITH`` clause and select ``RASTERIZE(...)`` from the CTE by name:

.. code-block:: sql

-- Not supported: inline subquery in FROM
SELECT RASTERIZE(interval, 1000)
FROM (SELECT * FROM features WHERE score > 50) AS filtered

-- Supported: same derivation wrapped in a CTE
WITH filtered AS (
SELECT * FROM features WHERE score > 50
)
SELECT RASTERIZE(interval, 1000) FROM filtered

Any ``WITH`` clauses you declare are preserved alongside the internal ``__giql_bins`` CTE in the transpiled SQL.

Performance Notes
~~~~~~~~~~~~~~~~~

- The operator emits one row per bin on each chromosome, so smaller resolutions produce proportionally more rows
- A ``LEFT JOIN`` ensures bins with zero coverage are included in the output
- For very large genomes, consider restricting the query with a ``WHERE`` clause on chromosome

Related Operators
~~~~~~~~~~~~~~~~~

- :ref:`MERGE <merge-operator>` - Combine overlapping intervals into single regions
- :ref:`CLUSTER <cluster-operator>` - Assign cluster IDs to overlapping intervals
3 changes: 3 additions & 0 deletions docs/dialect/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ Combine and cluster genomic intervals.
* - :ref:`MERGE <merge-operator>`
- Combine overlapping intervals into unified regions
- ``SELECT MERGE(interval) FROM features``
* - :ref:`RASTERIZE <rasterize-operator>`
- Rasterize intervals onto a fixed bin grid with per-bin counts
- ``SELECT RASTERIZE(interval, 1000) FROM features``

See :doc:`aggregation-operators` for detailed documentation.

Expand Down
4 changes: 4 additions & 0 deletions docs/recipes/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ Recipe Categories
Clustering overlapping intervals, distance-based clustering,
merging intervals, and aggregating cluster statistics.

:doc:`rasterize`
Rasterizing intervals onto a fixed bin grid: per-bin counts,
strand-specific counts, normalisation, and 5' end counting.

:doc:`advanced`
Multi-range matching, complex filtering with joins, aggregate statistics,
window expansions, and multi-table queries.
Expand Down
145 changes: 145 additions & 0 deletions docs/recipes/rasterize.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
Rasterize
=========

This section covers patterns for projecting interval data onto a fixed-resolution bin grid using GIQL's ``RASTERIZE`` operator.

Basic Usage
-----------

Rasterized counts underpin most genome-wide signal summaries — read-pileup plots for ChIP-seq, exon-level depth in RNA-seq, and peak-density overviews across megabases. The recipes below start from a canonical per-bin count and build toward more specialised variants.

Count Overlapping Features
~~~~~~~~~~~~~~~~~~~~~~~~~~

Count the number of features overlapping each 1 kb bin across the genome:

.. code-block:: sql

SELECT RASTERIZE(interval, 1000) AS depth
FROM features

**Sample output:**

.. code-block:: text

┌────────┬────────┬────────┬───────┐
│ chrom │ start │ end │ depth │
├────────┼────────┼────────┼───────┤
│ chr1 │ 0 │ 1000 │ 3 │
│ chr1 │ 1000 │ 2000 │ 1 │
│ chr1 │ 2000 │ 3000 │ 0 │
│ ... │ ... │ ... │ ... │
└────────┴────────┴────────┴───────┘

Each row represents one genomic bin. Bins with no overlapping features appear with a count of zero. An interval that spans more than one bin is counted in each bin it overlaps (the ``bedtools coverage`` convention), so the sum of bin counts is generally greater than the number of source intervals.

**Use case:** Compute read depth or feature density at a fixed resolution.

Custom Bin Size
~~~~~~~~~~~~~~~

Use a finer resolution of 100 bp:

.. code-block:: sql

SELECT RASTERIZE(interval, 100) AS depth
FROM reads

**Use case:** High-resolution count tracks for visualisation.

Named Resolution Parameter
~~~~~~~~~~~~~~~~~~~~~~~~~~

The resolution can also be supplied by name:

.. code-block:: sql

SELECT RASTERIZE(interval, resolution := 500) AS depth
FROM features

Both ``:=`` and ``=>`` are accepted for named parameters.

.. note::

Weighted summary statistics (mean, sum, min, max over interval values, with bin-boundary-aware weighting) are not yet implemented. See the project tracker for the follow-up.

Filtered Rasterization
----------------------

Strand-Specific Counts
~~~~~~~~~~~~~~~~~~~~~~

Compute per-bin counts for each strand separately by filtering:

.. code-block:: sql

-- Plus strand
SELECT RASTERIZE(interval, 1000) AS depth
FROM features
WHERE strand = '+'

.. code-block:: sql

-- Minus strand
SELECT RASTERIZE(interval, 1000) AS depth
FROM features
WHERE strand = '-'

**Use case:** Strand-specific signal tracks for RNA-seq or stranded assays.

High-Scoring Features
~~~~~~~~~~~~~~~~~~~~~

Restrict counts to features above a quality threshold:

.. code-block:: sql

SELECT RASTERIZE(interval, 1000) AS depth
FROM features
WHERE score > 10

5' End Counting
~~~~~~~~~~~~~~~

To count only the 5' ends of features (e.g. TSS or read starts), first
create a view or CTE that trims each interval to its 5' end, then apply
``RASTERIZE``:

.. code-block:: sql

WITH five_prime AS (
SELECT chrom, "start", "start" + 1 AS "end"
FROM features
WHERE strand = '+'
UNION ALL
SELECT chrom, "end" - 1 AS "start", "end"
FROM features
WHERE strand = '-'
)
SELECT RASTERIZE(interval, 1000) AS tss_count
FROM five_prime

Normalised Counts
-----------------

RPM Normalisation
~~~~~~~~~~~~~~~~~

Normalise bin counts to reads per million (RPM) by scaling each count by
one million and dividing by the total number of reads:

.. code-block:: sql

WITH bins AS (
SELECT RASTERIZE(interval, 1000) AS depth
FROM reads
),
total AS (
SELECT COUNT(*) AS n FROM reads
)
SELECT
bins.chrom,
bins.start,
bins.end,
bins.depth * 1000000.0 / total.n AS rpm
FROM bins, total
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ path = "build-hooks/metadata.py"

[tool.pytest.ini_options]
addopts = "--cov --cov-config=.coveragerc"
markers = [
"integration: tests exercising real bedtools subprocesses and DuckDB I/O",
]

[tool.ruff]
line-length = 89
Expand Down
2 changes: 2 additions & 0 deletions src/giql/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from giql.expressions import Contains
from giql.expressions import GIQLCluster
from giql.expressions import GIQLRasterize
from giql.expressions import GIQLDistance
from giql.expressions import GIQLMerge
from giql.expressions import GIQLNearest
Expand Down Expand Up @@ -54,6 +55,7 @@ class Parser(Parser):
FUNCTIONS = {
**Parser.FUNCTIONS,
"CLUSTER": GIQLCluster.from_arg_list,
"RASTERIZE": GIQLRasterize.from_arg_list,
"MERGE": GIQLMerge.from_arg_list,
"DISTANCE": GIQLDistance.from_arg_list,
"NEAREST": GIQLNearest.from_arg_list,
Expand Down
27 changes: 27 additions & 0 deletions src/giql/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,33 @@ def from_arg_list(cls, args):
return cls(**kwargs)


class GIQLRasterize(exp.Func):
    """Expression node for the RASTERIZE aggregate.

    RASTERIZE lays a grid of fixed-width bins over the genome and reports,
    for every bin, how many intervals overlap it. Following the
    bedtools-coverage convention, an interval that straddles several bins
    contributes to the count of each one.

    Examples:
        RASTERIZE(interval, 1000)
        RASTERIZE(interval, resolution := 1000)
    """

    # Both arguments are mandatory; ``resolution`` may arrive either as the
    # second positional argument or as a named parameter.
    arg_types = {
        "this": True,  # genomic column
        "resolution": True,  # bin width (positional or named)
    }

    @classmethod
    def from_arg_list(cls, args):
        """Build the node from a parsed argument list.

        Named arguments are taken as-is; the first two positional arguments
        are then bound to ``this`` and ``resolution`` in that order. Any
        further positional arguments are ignored.
        """
        named, positional = _split_named_and_positional(args)
        # zip stops at the shorter sequence, so only the slots that actually
        # received a positional value are filled in.
        for slot, value in zip(("this", "resolution"), positional):
            named[slot] = value
        return cls(**named)


class GIQLDistance(exp.Func):
"""DISTANCE function for calculating genomic distances between intervals.

Expand Down
Loading
Loading