abdenlab · conradbzura · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/docs/dialect/aggregation-operators.rst b/docs/dialect/aggregation-operators.rst
@@ -40,6 +40,9 @@ Syntax
    -- Strand-specific clustering
    CLUSTER(interval, stranded := true) AS cluster_id
 
+   -- Predicate-gated clustering (run-length encoding on a column)
+   CLUSTER(interval, predicate := depth = PREV(depth)) AS cluster_id
+
    -- Combined parameters
    CLUSTER(interval, distance, stranded := true) AS cluster_id
 
@@ -56,6 +59,36 @@ Parameters
 **stranded** *(optional)*
    When ``true``, only cluster intervals on the same strand. Default: ``false``.
 
+**predicate** *(optional)*
+   A boolean expression evaluated between each interval and its sorted
+   predecessor. When supplied, the condition for staying in a cluster becomes
+   **adjacent AND predicate**: an interval stays in the current cluster only
+   when it is within ``distance`` of its predecessor *and* the predicate holds
+   between the two. A false predicate forces a new cluster, so an
+   equality predicate yields a run-length encoding of the input sequence.
+   Omitting the predicate preserves the default adjacency-only behavior.
+
+   Bare column references resolve to the *current* interval; the predecessor's
+   value of a column is referenced with ``PREV(column)``
+   (e.g. ``depth = PREV(depth)``). The predicate composes with ``distance`` and
+   ``stranded`` and is evaluated under the operator's existing per-chromosome
+   (and per-strand) partition and start-position order.
+
+   Two constraints apply:
+
+   - **References existing columns only.** The predicate *gates* merging on
+     columns already present on the input rows; it does not synthesize a
+     statistic. Coverage depth, for example, must already be a column on the
+     rows (typically produced upstream by :ref:`DISJOIN <disjoin-operator>` and
+     aggregation).
+   - **Pairwise only, with single-linkage drift.** The predicate compares each
+     interval to its immediate sorted predecessor (everything ``LAG`` can
+     express). Whole-cluster conditions are out of scope. When the predicate is
+     not an equivalence relation (e.g. ``ABS(score - PREV(score)) < 5``),
+     consecutive pairs may each satisfy it while the cluster's extremes do not
+     — the same single-linkage behavior that ``distance``-based clustering
+     already exhibits.
+
 Return Value
 ~~~~~~~~~~~~
 
@@ -101,6 +134,19 @@ Cluster intervals separately by strand:
    FROM features
    ORDER BY chrom, strand, start
 
+**Predicate-Gated Clustering:**
+
+Cut adjacent intervals into clusters wherever a column's value changes
+(run-length encoding). ``PREV(column)`` references the predecessor row's value:
+
+.. code-block:: sql
+
+   SELECT
+       *,
+       CLUSTER(interval, predicate := depth = PREV(depth)) AS cluster_id
+   FROM features
+   ORDER BY chrom, start
+
 **Analyze Cluster Statistics:**
 
 Count features per cluster:
@@ -194,6 +240,9 @@ Syntax
    -- Strand-specific merge
    SELECT MERGE(interval, stranded := true) FROM features
 
+   -- Predicate-gated merge (merge only equal-valued adjacent runs)
+   SELECT MERGE(interval, predicate := depth = PREV(depth)) FROM features
+
    -- Merge with additional aggregations
    SELECT
        MERGE(interval),
@@ -214,6 +263,16 @@ Parameters
 **stranded** *(optional)*
    When ``true``, merge intervals separately by strand. Default: ``false``.
 
+**predicate** *(optional)*
+   A boolean expression that further restricts which adjacent intervals are
+   merged. ``MERGE`` decomposes into :ref:`CLUSTER <cluster-operator>` plus a
+   ``GROUP BY`` over the cluster id, so it inherits predicate-aware boundaries
+   directly — see the :ref:`CLUSTER predicate <cluster-operator>` description
+   for the full semantics, the ``PREV(column)`` convention, the
+   references-existing-columns-only constraint, and the pairwise-only /
+   single-linkage caveat. Omitting the predicate preserves the default
+   adjacency-only merge.
+
 Return Value
 ~~~~~~~~~~~~
 
@@ -256,6 +315,24 @@ Merge intervals separately by strand:
    SELECT MERGE(interval, stranded := true)
    FROM features
 
+**Predicate-Gated Merge (coverage depth):**
+
+Merge only adjacent intervals that share the same coverage depth, reconstructing
+a re-clustered, depth-segmented partition from per-breakpoint segments produced
+by :ref:`DISJOIN <disjoin-operator>` and aggregation:
+
+.. code-block:: sql
+
+   SELECT MERGE(interval, predicate := depth = PREV(depth))
+   FROM (
+       SELECT disjoin_chrom AS chrom,
+              disjoin_start AS start,
+              disjoin_end AS end,
+              COUNT(*) AS depth
+       FROM DISJOIN(features)
+       GROUP BY disjoin_chrom, disjoin_start, disjoin_end
+   ) AS segments
+
 **Merge with Feature Count:**
 
 Count how many features were merged into each region:

diff --git a/docs/recipes/clustering.rst b/docs/recipes/clustering.rst
@@ -347,6 +347,86 @@ Compare raw vs merged coverage:
 
 **Use case:** Quantify the redundancy in your feature set.
 
+Predicate-Gated Clustering and Merging
+--------------------------------------
+
+Both ``CLUSTER`` and ``MERGE`` accept an optional ``predicate :=`` argument that
+further restricts which adjacent intervals are coalesced: an interval stays in
+the current cluster only when it is adjacent to its predecessor *and* the
+predicate holds between the two. Bare columns resolve to the current interval;
+the predecessor's value is referenced with ``PREV(column)``. The predicate
+references columns already present on the rows — it gates merging, it does not
+synthesize a statistic — and it compares each interval only to its immediate
+sorted predecessor (so non-equivalence predicates exhibit single-linkage drift,
+just like ``distance``-based clustering).
+
+Run-Length Encoding on a Column
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Merge only adjacent intervals that share the same value, cutting a new region
+wherever the value changes:
+
+.. code-block:: sql
+
+   SELECT MERGE(interval, predicate := depth = PREV(depth))
+   FROM segments
+
+**Use case:** Collapse a per-base or per-segment signal into maximal runs of
+constant value (e.g. equal coverage depth, same genotype, same annotation).
+
+Run-Length Encode with CLUSTER
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Assign a distinct cluster id to each maximal equal-valued run while keeping the
+individual rows:
+
+.. code-block:: sql
+
+   SELECT
+       *,
+       CLUSTER(interval, predicate := depth = PREV(depth)) AS run_id
+   FROM segments
+   ORDER BY chrom, start
+
+**Use case:** Label run boundaries for inspection before aggregating.
+
+Reconstruct disjoin() Coverage Segments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GIQL's :ref:`DISJOIN <disjoin-operator>` primitive splits overlapping intervals
+at every breakpoint but deliberately does not re-cluster the resulting
+sub-intervals. Pairing it with a predicate-gated ``MERGE`` closes that gap: cut
+the input at every breakpoint, aggregate coverage depth per segment, then merge
+back the contiguous runs of equal depth — reproducing the re-clustered,
+depth-annotated output Bioconductor's ``disjoin()`` produces:
+
+.. code-block:: sql
+
+   SELECT MERGE(interval, predicate := depth = PREV(depth))
+   FROM (
+       SELECT disjoin_chrom AS chrom,
+              disjoin_start AS start,
+              disjoin_end AS end,
+              COUNT(*) AS depth
+       FROM DISJOIN(features)
+       GROUP BY disjoin_chrom, disjoin_start, disjoin_end
+   ) AS segments
+
+**Use case:** Build a re-clustered coverage profile from overlapping intervals —
+the expression-based generalization of ``disjoin()`` to any pairwise condition.
+
+Multi-Column Predicate
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Gate merging on more than one column by combining comparisons with ``AND``:
+
+.. code-block:: sql
+
+   SELECT MERGE(interval, predicate := strand = PREV(strand) AND name = PREV(name))
+   FROM features
+
+**Use case:** Keep merged regions homogeneous across several attributes at once.
+
 Advanced Patterns
 -----------------
 

diff --git a/src/giql/expressions.py b/src/giql/expressions.py
@@ -199,17 +199,25 @@ class GIQLCluster(exp.Func):
 
     Implicitly partitions by chromosome and orders by start position.
 
+    The optional ``predicate`` argument is a boolean expression evaluated
+    between each interval and its sorted predecessor; intervals are only kept
+    in the same cluster when they are adjacent *and* the predicate holds. Bare
+    columns resolve to the current interval; the predecessor's value of a
+    column is referenced with ``PREV(column)``.
+
     Examples:
         CLUSTER(interval)
         CLUSTER(interval, 1000)
         CLUSTER(interval, stranded := true)
         CLUSTER(interval, 1000, stranded := true)
+        CLUSTER(interval, predicate := depth = PREV(depth))
     """
 
     arg_types = {
         "this": True,  # genomic column
         "distance": False,  # maximum distance between features
         "stranded": False,  # strand-specific clustering
+        "predicate": False,  # pairwise boolean gate (current row vs PREV(col))
     }
 
     @classmethod
@@ -232,16 +240,25 @@ class GIQLMerge(exp.Func):
     Merges overlapping or bookended intervals into single intervals.
     Built on top of CLUSTER operation.
 
+    The optional ``predicate`` argument gates merging on a pairwise boolean
+    expression between each interval and its sorted predecessor (see
+    :class:`GIQLCluster`); ``PREV(column)`` references the predecessor's value
+    of a column. When the predicate tests equality of a value this yields a
+    run-length encoding of the input interval sequence.
+
     Examples:
         MERGE(interval)
         MERGE(interval, 1000)
         MERGE(interval, stranded := true)
+        MERGE(interval, predicate := depth = PREV(depth))
+        MERGE(interval, predicate := strand = PREV(strand) AND name = PREV(name))
     """
 
     arg_types = {
         "this": True,  # genomic column
         "distance": False,  # maximum distance between features
         "stranded": False,  # strand-specific merging
+        "predicate": False,  # pairwise boolean gate (current row vs PREV(col))
     }
 
     @classmethod

diff --git a/src/giql/mcp/server.py b/src/giql/mcp/server.py
@@ -105,6 +105,14 @@
                 "description": "Max gap to consider same cluster (default: 0)",
             },
             {"name": "stranded", "description": "Cluster by strand (default: false)"},
+            {
+                "name": "predicate",
+                "description": (
+                    "Pairwise boolean gate; keep adjacent intervals together "
+                    "only when it holds. Use PREV(col) for the predecessor row's "
+                    "value (e.g. predicate := depth = PREV(depth)). Optional."
+                ),
+            },
         ],
         "returns": "Integer cluster ID",
         "example": "SELECT *, CLUSTER(interval) AS cluster_id FROM features",
@@ -118,6 +126,14 @@
             {"name": "interval", "description": "Genomic column to merge"},
             {"name": "distance", "description": "Max gap to merge (default: 0)"},
             {"name": "stranded", "description": "Merge by strand (default: false)"},
+            {
+                "name": "predicate",
+                "description": (
+                    "Pairwise boolean gate; merge adjacent intervals only when "
+                    "it holds. Use PREV(col) for the predecessor row's value "
+                    "(e.g. predicate := depth = PREV(depth)). Optional."
+                ),
+            },
         ],
         "returns": "Merged interval coordinates (chromosome, start_pos, end_pos)",
         "example": "SELECT MERGE(interval), COUNT(*) FROM features",