From 4cbface9131d82f5dba79b3985d564b86d6ddc7c Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 17:19:42 -0400
Subject: [PATCH 1/6] docs(plan): Phase 5 fusion (Viterbi + chord-aware) design

---
 docs/plans/2026-05-06-phase5-fusion-design.md | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 docs/plans/2026-05-06-phase5-fusion-design.md

diff --git a/docs/plans/2026-05-06-phase5-fusion-design.md b/docs/plans/2026-05-06-phase5-fusion-design.md
new file mode 100644
index 0000000..ac10615
--- /dev/null
+++ b/docs/plans/2026-05-06-phase5-fusion-design.md
@@ -0,0 +1,217 @@
+# Phase 5 — Fusion (Viterbi + chord-aware) Design
+
+**Date:** 2026-05-06
+**Author:** Patrick (brainstormed with Claude)
+**Status:** Proposed — pending sign-off
+**Spec source:** `SPEC.md` §5 Phase 5, §8 module contracts.
+**Branch:** `claude/refactor-eval` (forked from `refactor/v1`); merge back to `refactor/v1` on green.
+
+## 0. Status snapshot
+
+What `tabvision.fusion` looks like right now on `refactor/v1`:
+
+| Module | Lines | State |
+|---|---:|---|
+| `candidates.py` | 50 | **Done.** `candidate_positions(pitch, cfg) → list[Candidate]`. Used by Phase 1 audio-only fusion. |
+| `viterbi.py` | 119 | **Phase-1 placeholder.** `fuse(...)` raises `FusionError` whenever any `FrameFingering` carries non-zero logits ("video-aware fusion not implemented — this is a Phase 5 deliverable"). Greedy lowest-fret + continuity decoder works for the audio-only path (5 tests passing). |
+| `playability.py` | 9 | **Stub.** Docstring only. |
+| `chord.py` | 9 | **Stub.** Docstring only. |
+| CLI | — | `--fusion-lambda-vision` flag not yet exposed. |
+
+Phase 4 already produces `FrameFingering.marginal_string_fret() → (6, 25)` softmax per frame (`tabvision.video.hand.fingertip_to_fret`). Phase 5 consumes that.
+
+Legacy reference: `tabvision-server/app/fusion_engine.py` (2,216 lines, 23 functions) and `tabvision-server/app/chord_shapes.py` (790 lines). Per the SPEC §3.3 module-boundary plan, we **port selectively** (hand-span, slide, monophony heuristics) rather than wholesale-translate. The Apr-24 learned-fusion attempt (LightGBM ranker) **did not ship** (LOOCV +0.3 pp vs +5 pp gate per `tools/outputs/position_selector_report-2026-04-29.md`); the lesson is that small ML on top of weak features doesn't beat structured search with informative evidence — Phase 5 takes the structured-search path.
+
+## 1. Goal & acceptance bars
+
+From SPEC §5 Phase 5:
+
+- **Tab F1 ≥ 0.85** on the user eval set. Target 0.88 by Phase 9.
+- **Chord-instance accuracy ≥ 0.80**. Target 0.85 by Phase 9.
+- **Audio+vision must beat audio-only by ≥ 8 pp on Tab F1** (ablation report).
+
+The user eval set = the 20-video iPhone-recorded training set, plus whatever Phase 1.5 annotation tooling adds to the four difficulty tiers. Today's audio-only baseline on that set is **exact F1 ≈ 0.51** (per `errors-2026-04-28_185743.md`). Phase 5's 0.85 bar therefore needs both (a) better audio (Phase 2 SOTA backbone) and (b) the audio+vision boost. Phase 5 alone is on the hook for the **+8 pp audio+vision delta**, not the absolute number — that's the readable signal that the fusion is doing real work.
+
+## 2. Cost function
+
+We score a sequence of decoded `(string, fret)` picks by a sum of **emission** terms (per pick) and **transition** terms (between consecutive picks). Lower total cost wins. All terms are negative log-probs (or proportional to them) — i.e. dimensionally consistent.
+
+### 2.1 Emission cost per `Candidate c = (s, f)` for `AudioEvent ev`
+
+```
+E(c | ev, fingering_at_t) =
+      -log P_audio(c | ev)              # audio prior on string/fret
+  +   -λ_v · log P_vision(c | t)        # vision marginal at event time
+  +   α_open · 1[f == 0] · open_bonus   # negative if c is on an open string
+  +   α_low · f                          # mild lower-fret bias
+```
+
+- `P_audio(c | ev)`:
+  - If `ev.fret_prior` is provided (Phase 2's `tabcnn` backend, when present), use it directly. Otherwise uniform over candidates.
+  - Multiply by `ev.confidence` (the model's pitch posterior).
+- `P_vision(c | t)`:
+  - Look up the `FrameFingering` whose `t` is closest to `ev.onset_s`. Linear-interpolate between two adjacent frames if the gap is small (< 1 / fps).
+  - `marginal_string_fret()[s, f]` is the per-(string, fret) cell of the (6, 25) softmax.
+  - If no fingering carries evidence (`finger_pos_logits.size == 0` or all-zero) → fallback to uniform; `λ_v` is effectively zero for this event.
+- `λ_v`: tunable, default `1.0`, exposed as `--fusion-lambda-vision` (CLI) and `lambda_vision` kwarg on `fuse()`.
+- `open_bonus`: small constant (e.g. 0.5). Open strings are systematically under-represented in MediaPipe-derived `marginal_string_fret` because no fingertip is pressing — so we re-introduce them via this bonus.
+- `α_low`: lower-fret bias (e.g. 0.05/fret). Keeps the decoder honest when audio + vision are both flat across candidates.
+
+### 2.2 Transition cost between `prev = (s_p, f_p)` and `curr = (s_c, f_c)`
+
+```
+T(prev → curr) =
+      β_shift · |f_c - f_p| / span_norm        # position-shift penalty
+  +   β_span · max(0, |f_c - f_p| - max_span)  # hard hand-span barrier (kicks in beyond ~5 frets)
+  -   β_string · 1[s_c == s_p]                 # same-string continuity bonus
+```
+
+- `span_norm = 12` (one octave), `max_span = 5` frets — calibrated from the legacy `fusion_engine.py` anchor system.
+- `β_string` ≈ 0.5 — direct port of the existing `STRING_CONTINUITY_BONUS`.
+- A "muted" / X transition is permitted by skipping cost contribution (technique flag set on the `TabEvent`).
+
+### 2.3 Per-string monophony
+
+Hard constraint baked into the **chord cluster** state space (§3.2), not a soft cost. Single-line Viterbi (§3.1) is monophonic by construction.
+
+## 3. State spaces
+
+### 3.1 Single-line Viterbi (`viterbi.py`)
+
+Triggered when consecutive events are > 80 ms apart.
+
+- States at event `i`: `candidate_positions(events[i].pitch_midi, cfg)` — typically 2–6 per pitch.
+- Initial cost: `E(c_0)`.
+- Recurrence: `cost[i, c] = E(c) + min_{c'} (cost[i-1, c'] + T(c' → c))`.
+- Termination: pick the lowest-cost terminal state, backtrack.
+- Worst case: `O(N × K^2)` for `N` events, `K ≤ 6` candidates per event. `N` is hundreds; trivial.
+
+### 3.2 Chord cluster decode (`chord.py`)
+
+A **chord cluster** is a maximal run of consecutive `AudioEvent`s pairwise within 80 ms onset distance. (SPEC §5: "simultaneous events ≤ 80 ms apart".)
+
+For a cluster of `m` events:
+
+- A **chord state** is an ordered tuple of m candidates `(c_1, …, c_m)` with:
+  - **Per-string monophony:** all `s_i` distinct.
+  - **Hand-span constraint:** `max(f_i for f_i > 0) - min(f_i for f_i > 0) ≤ max_span` (open strings exempt).
+  - Order convention: low-pitch first (so the spelling is reproducible).
+- State enumeration: cartesian product of candidates, filtered by the two constraints. With `m ≤ 6` (six-string guitar) and `K ≤ 6` per pitch, worst case `6^6 = 46 656` raw tuples — pruned aggressively to a few hundred valid ones.
+- Emission cost for a chord state = sum of per-event emission costs.
+- Transition between two chord clusters: collapse each cluster to its **lowest-fret pressed note** (the natural anchor point) and apply `T(prev → curr)` from §2.2 — keeps the inter-chord cost compatible with single-line transitions.
+- Optional: `chord_shapes.py` templates from the legacy code give a prior over common shapes (open chords, barre, power). **Deferred to Step D below** — start without templates and only add if F1 demands.
+
+The chord-cluster decode is itself a Viterbi over chord-states between clusters; single-line events are degenerate clusters of size 1.
+
+## 4. Module responsibilities
+
+```
+tabvision.fusion.candidates   -- (done) candidate_positions, Candidate dataclass.
+tabvision.fusion.playability  -- emission + transition cost helpers (pure functions, fully unit-tested).
+tabvision.fusion.viterbi      -- (a) the public fuse() entrypoint; (b) single-line Viterbi; (c) dispatcher to chord.
+tabvision.fusion.chord        -- chord cluster grouping + chord-state Viterbi.
+```
+
+`viterbi.fuse(events, fingerings, cfg, session, lambda_vision=1.0)` stays as the single public entrypoint per SPEC §8; behaviour switches internally based on whether `fingerings` carry evidence and whether events fall into chord clusters.
+
+## 5. Port mapping (legacy → new)
+
+| Legacy (`tabvision-server/app/fusion_engine.py`) | New | Notes |
+|---|---|---|
+| `_score_position_heuristic` | `playability.emission_cost` | Drop hand-anchor side-channel; subsume into structured Viterbi. |
+| `_select_best_position` | replaced by single-line Viterbi | The greedy logic was the source of `wrong_position_same_pitch` errors. |
+| `_optimize_chord_positions` | `chord.decode_chord_state` | The legacy version is greedy with backtracking; the new version is exhaustive over the (already-small) feasible set. |
+| `_correct_slide_positions` | `playability.transition_cost` (built-in) | Slide/legato preference falls out of the same-string continuity bonus and the position-shift penalty — no separate post-pass. |
+| `_correct_melodic_segments` | not ported; subsumed by Viterbi | Subsumed. Confirm via ablation. |
+| `_postfilter_tab_notes` | not ported (yet) | Dedup + low-confidence isolated filter. Defer; revisit if Phase 5 has visible artifacts of this kind. |
+| `_detect_techniques` | shallow port | Hammer-on / pull-off / slide tag inference based on consecutive same-string events. Spec §5 leaves bend/vibrato to Phase 7. |
+| `chord_shapes.py` (templates) | optional Step D in `chord.py` | Defer — only adopt if needed. |
+| `fuse_audio_only` | already ported (Phase 1 path) | Keep. |
+| `fuse_audio_video` | replaced wholesale | The legacy version is the worst-performing module per `errors-2026-04-28_185743.md` (35.2% of loss is `wrong_position_same_pitch`). |
+
+## 6. Step-by-step phasing within Phase 5
+
+Each step is independently mergeable; each lands tests before behaviour.
+
+### Step A — `playability.py`: pure cost helpers (~½ day)
+
+Implement:
+- `emission_cost(candidate, event, fingering_at_t, cfg, *, lambda_vision=1.0) → float`
+- `transition_cost(prev, curr, cfg) → float`
+- Constants for the weight hyperparameters (named, documented).
+
+Tests (`tabvision/tests/unit/test_playability.py`, new):
+- Emission: pure-audio (no fingering) reproduces the existing greedy decoder's preferences.
+- Emission: vision evidence pulls a candidate that audio is indifferent on.
+- Emission: open-string bonus correctly recovers fret 0 when MediaPipe marginal is uniform.
+- Transition: same-string is cheaper than string-jump.
+- Transition: hand-span barrier triggers only past `max_span`.
+
+**Acceptance:** All new unit tests green. No change to `viterbi.fuse()` behaviour (Phase 1 tests still pass).
+
+### Step B — single-line Viterbi (~1 day)
+
+Replace `viterbi._greedy_audio_only` with a single-line Viterbi using `playability` costs. Keep the public `fuse()` signature.
+
+Tests (extend `test_fusion_audio_only.py`):
+- All five existing tests still pass (regression gate).
+- Add: 4-event sequence where greedy picks the wrong string at event 3 but Viterbi recovers it via lookahead.
+- Add: vision-uniform fingerings produce same output as no fingerings (sanity).
+- Add: vision-decisive fingering moves the pick to a non-lowest-fret candidate.
+
+**Acceptance:** All tests green. Run `tabvision/tests/eval/test_phase4_eval.py` (or its Phase 5 sibling, see Step E) and confirm no regression on the audio-only path.
+
+### Step C — chord cluster decode (~1–1½ days)
+
+Implement `chord.cluster_events(events, max_gap_ms=80)` and `chord.decode_clusters(clusters, fingerings, cfg, lambda_vision)` returning the per-event picks. Wire `viterbi.fuse()` to dispatch.
+
+Tests (`tabvision/tests/unit/test_chord_fusion.py`, new):
+- Two simultaneous events on the same string get one moved (per-string monophony).
+- A 3-note chord has all picks within `max_span` of each other (hand-span constraint).
+- A chord cluster with vision evidence prefers the vision-supported voicing.
+- An open-chord shape (open strings present) is preferred over a barre when both are reachable and vision is uniform.
+
+**Acceptance:** All tests green. Single-line tests still pass.
+
+### Step D — CLI integration & lambda sweep (~½ day)
+
+- Add `--fusion-lambda-vision FLOAT` to `tabvision.cli`. Default `1.0`. Pass through to `fuse()`.
+- Document in CLI `--help`.
+- Add `tabvision/tests/unit/test_cli_fusion_flag.py`: smoke that the flag round-trips into `fuse()`.
+
+### Step E — Phase 5 acceptance eval (~1 day)
+
+Add `tabvision/tests/eval/test_phase5_eval.py` modelled on `test_phase4_eval.py`. It:
+
+1. Runs the full pipeline (audio + video) on each video in the user eval set.
+2. Computes Tab F1 (string + fret + onset within ±50 ms) and chord-instance accuracy.
+3. Runs the audio-only ablation (`λ_v = 0`) on the same set.
+4. Asserts:
+   - `tab_f1 >= 0.85` (the §5 bar) — **may be marked `xfail` until Phase 2 SOTA backbone lands**, with the understanding that today's audio is the bottleneck.
+   - `tab_f1_audio_video - tab_f1_audio_only >= 0.08` — **the Phase-5-specific bar; this is the gate for "fusion is doing real work"**.
+   - `chord_accuracy >= 0.80`.
+5. Writes a markdown report to `tabvision-server/tools/outputs/phase5_eval-YYYY-MM-DD.md` summarising the ablation per video (mirrors the `finetune_baseline-*.md` convention).
+
+**Acceptance for Phase 5 as a whole:** the `tab_f1_audio_video - tab_f1_audio_only >= 0.08` assertion passes. The absolute-Tab-F1 bar may be deferred to Phase 7 if audio is still the bottleneck — but if it is, that's a material finding and should land in `DECISIONS.md`.
+
+## 7. Risks & open questions
+
+- **Risk:** `λ_v = 1.0` may be wrong by an order of magnitude. Mitigation: Step E sweeps `λ_v ∈ {0, 0.5, 1, 2, 5}` and reports best per video and aggregate. If best is `0`, vision evidence is genuinely uncalibrated → SPEC §5 decision tree's `C2` branch (return to Phase 4).
+- **Risk:** chord-state explosion on dense voicings. Mitigation: 6-string max plus monophony pruning bounds cardinality at 720 raw tuples; in practice the constraint cuts to <100. If a real video produces a worst-case cluster (>100 tuples), beam-search is a 5-line addition.
+- **Risk:** open-string bonus over-fires when the player is fingering a fret-0 chord (e.g. capo-0 G major shape) and MediaPipe correctly says "no fingertip on the low strings." Mitigation: chord-cluster decode considers the whole shape — bonus is per-event, but the chord-state's hand-span constraint pulls the rest of the shape into a coherent fingering.
+- **Open:** does Step C need `chord_shapes.py` templates as a prior? Plan says no — start without and add only if F1 demands. Tracked as a Step-C-follow-up if needed.
+- **Open:** what's "the user eval set" for Step E? Today: the 20-video iPhone training set. Phase 1.5's annotation tool will add labelled clips across four difficulty tiers — those should fold into the same eval as they land.
+
+## 8. Estimated effort
+
+Steps A → E total **~4 working days** of implementation + writeup. Acceptance eval (Step E) is the slowest because it requires running the full pipeline on the eval set, which is gated on Phase 4's video stack working end-to-end on the iPhone videos (probably true today but worth confirming as Step 0 below).
+
+## 9. Pre-flight (before Step A)
+
+A quick 15-min sanity check before any code:
+
+- Run `tabvision/tests/eval/test_phase4_eval.py` end-to-end on at least one iPhone video and confirm we get a non-empty `list[FrameFingering]` with non-uniform `marginal_string_fret`. If we don't, Step E is going to be useless and we should fix Phase 4's eval path first.
+
+---
+
+**For sign-off:** confirm (a) cost-function shape (§2), (b) module split (§4), (c) phasing/order of A–E. If those look right I'll start with Step A.

From 99f9df1d93965492bcc9b09dd2de21b8e6fe9d32 Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 17:41:19 -0400
Subject: [PATCH 2/6] feat(phase5): playability emission + transition cost
 helpers

---
 tabvision/tabvision/fusion/playability.py | 169 ++++++++++++++++-
 tabvision/tests/unit/test_playability.py  | 219 ++++++++++++++++++++++
 2 files changed, 382 insertions(+), 6 deletions(-)
 create mode 100644 tabvision/tests/unit/test_playability.py

diff --git a/tabvision/tabvision/fusion/playability.py b/tabvision/tabvision/fusion/playability.py
index 60df95c..2def9c5 100644
--- a/tabvision/tabvision/fusion/playability.py
+++ b/tabvision/tabvision/fusion/playability.py
@@ -1,9 +1,166 @@
-"""Playability transition costs — Phase 5 deliverable. Stub.
+"""Playability emission + transition costs — Phase 5 deliverable.
 
-Hand-span penalty, position-shift penalty, open-string bonus, same-string
-monophony enforcement.
+All functions return **negative log-probs** in nats: lower cost = better.
+Costs decompose into per-candidate emission terms (audio prior + vision
+evidence + open-string bonus + low-fret bias) and pairwise transition
+terms (string continuity + position shift + hand-span barrier).
 
-Port targets:
-- ``tabvision-server/app/fusion_engine.py`` (position scoring, melodic
-  segment correction, slide correction).
+See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §2 for the formulae
+and ``SPEC.md`` §5 for acceptance bars.
+
+Port targets: ``tabvision-server/app/fusion_engine.py`` —
+``_score_position_heuristic``, ``_correct_slide_positions``, the
+hand-anchor/position-shift logic.
 """
+
+from __future__ import annotations
+
+import math
+from typing import Sequence
+
+from tabvision.fusion.candidates import Candidate
+from tabvision.types import AudioEvent, FrameFingering, GuitarConfig
+
+# --- emission term weights ---
+LOW_FRET_BIAS = 0.10
+"""Cost added per fret index. Keeps the decoder honest when audio + vision
+are flat — picks the lower fret all else equal. Same magnitude as the legacy
+``viterbi.LOWER_FRET_BIAS``."""
+
+OPEN_STRING_BONUS = 0.5
+"""Cost subtracted when the candidate is an open string (fret 0).
+
+Open strings are systematically under-represented by MediaPipe-derived
+``marginal_string_fret`` because there is no fingertip pressing — this
+bonus re-introduces them. Magnitude calibrated to roughly cancel the
+vision-floor cost (``-log(VISION_FLOOR)`` over a uniform marginal)."""
+
+VISION_FLOOR = 1e-3
+"""Minimum probability used when computing ``-log P_vision``. Caps the
+vision evidence's contribution at ``-log(1e-3) ≈ 6.9`` per candidate so
+a confident wrong fingering can still be overridden by strong audio +
+playability evidence."""
+
+# --- transition term weights ---
+SAME_STRING_BONUS = 0.5
+"""Cost subtracted when ``prev.string_idx == curr.string_idx``. Direct
+port of legacy ``STRING_CONTINUITY_BONUS``."""
+
+POSITION_SHIFT_COST = 0.05
+"""Cost per fret of ``|curr.fret - prev.fret|`` (after normalisation by
+``SPAN_NORM``). Mild — encourages staying close on the neck without
+forbidding jumps."""
+
+SPAN_NORM = 12
+"""Normalisation for ``POSITION_SHIFT_COST`` — one octave."""
+
+MAX_HAND_SPAN = 5
+"""Frets — beyond this distance the hand-span barrier kicks in."""
+
+HAND_SPAN_BARRIER = 5.0
+"""Cost added per fret of overshoot beyond ``MAX_HAND_SPAN``. Steep
+enough to act as a soft hard-constraint while still allowing a jump
+when audio + vision agree strongly."""
+
+EPS = 1e-9
+
+
+def find_fingering_at(
+    t: float, fingerings: Sequence[FrameFingering]
+) -> FrameFingering | None:
+    """Return the ``FrameFingering`` whose ``.t`` is closest to ``t``.
+
+    Returns ``None`` when ``fingerings`` is empty or no entry carries
+    evidence (logits None, empty, or all-zero). Ties broken by earliest.
+    """
+    if not fingerings:
+        return None
+    best: FrameFingering | None = None
+    best_dt = math.inf
+    for f in fingerings:
+        if f.finger_pos_logits is None or f.finger_pos_logits.size == 0:
+            continue
+        if not (f.finger_pos_logits != 0).any():
+            continue
+        dt = abs(f.t - t)
+        if dt < best_dt:
+            best = f
+            best_dt = dt
+    return best
+
+
+def emission_cost(
+    candidate: Candidate,
+    event: AudioEvent,
+    fingering: FrameFingering | None,
+    cfg: GuitarConfig,
+    *,
+    lambda_vision: float = 1.0,
+) -> float:
+    """Emission cost (negative log-prob) for ``candidate`` given ``event``.
+
+    Decomposition (lower = better):
+
+    - ``-log(event.confidence)`` — per-event constant (does not affect
+      ranking within a single event but matters across events).
+    - ``-log(event.fret_prior[s, f])`` — only when the audio backend
+      provides a per-position prior (e.g. Phase 2 ``tabcnn``).
+    - ``lambda_vision * -log(P_vision[s, f])`` — vision marginal at
+      ``event.onset_s``. Skipped when ``fingering is None``.
+    - ``LOW_FRET_BIAS * fret`` — gentle low-fret preference.
+    - ``-OPEN_STRING_BONUS`` when ``fret == 0``.
+    """
+    cost = -math.log(max(event.confidence, EPS))
+
+    if event.fret_prior is not None:
+        prior = float(event.fret_prior[candidate.string_idx, candidate.fret])
+        cost += -math.log(max(prior, EPS))
+
+    if fingering is not None:
+        marginal = fingering.marginal_string_fret()
+        p = float(marginal[candidate.string_idx, candidate.fret])
+        cost += lambda_vision * (-math.log(max(p, VISION_FLOOR)))
+
+    cost += LOW_FRET_BIAS * candidate.fret
+    if candidate.fret == 0:
+        cost -= OPEN_STRING_BONUS
+
+    return cost
+
+
+def transition_cost(
+    prev: Candidate, curr: Candidate, cfg: GuitarConfig
+) -> float:
+    """Transition cost from ``prev`` to ``curr``.
+
+    - String continuity: ``-SAME_STRING_BONUS`` when on the same string.
+    - Position shift: ``POSITION_SHIFT_COST * |Δfret| / SPAN_NORM``.
+    - Hand-span barrier: ``HAND_SPAN_BARRIER * max(0, |Δfret| - MAX_HAND_SPAN)``.
+
+    ``cfg`` is reserved for future use (e.g. instrument-specific span
+    limits); pass the same value used elsewhere in the decode.
+    """
+    del cfg  # unused for now; reserved.
+    cost = 0.0
+    delta = abs(curr.fret - prev.fret)
+    cost += POSITION_SHIFT_COST * delta / SPAN_NORM
+    if delta > MAX_HAND_SPAN:
+        cost += HAND_SPAN_BARRIER * (delta - MAX_HAND_SPAN)
+    if curr.string_idx == prev.string_idx:
+        cost -= SAME_STRING_BONUS
+    return cost
+
+
+__all__ = [
+    "find_fingering_at",
+    "emission_cost",
+    "transition_cost",
+    "LOW_FRET_BIAS",
+    "OPEN_STRING_BONUS",
+    "VISION_FLOOR",
+    "SAME_STRING_BONUS",
+    "POSITION_SHIFT_COST",
+    "SPAN_NORM",
+    "MAX_HAND_SPAN",
+    "HAND_SPAN_BARRIER",
+]
diff --git a/tabvision/tests/unit/test_playability.py b/tabvision/tests/unit/test_playability.py
new file mode 100644
index 0000000..02a0979
--- /dev/null
+++ b/tabvision/tests/unit/test_playability.py
@@ -0,0 +1,219 @@
+"""Unit tests for ``tabvision.fusion.playability``.
+
+Covers:
+- emission cost: audio-only ranking matches the legacy greedy decoder's
+  preferences (lower fret + open-string bonus).
+- emission cost: vision evidence pulls a candidate that audio is
+  indifferent on.
+- emission cost: open-string bonus correctly recovers fret 0 when the
+  vision marginal is uniform.
+- transition cost: same-string is cheaper than string-jump.
+- transition cost: hand-span barrier triggers only past ``MAX_HAND_SPAN``.
+- ``find_fingering_at`` picks the nearest non-empty fingering.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+from tabvision.fusion.candidates import Candidate, candidate_positions
+from tabvision.fusion.playability import (
+    HAND_SPAN_BARRIER,
+    MAX_HAND_SPAN,
+    OPEN_STRING_BONUS,
+    SAME_STRING_BONUS,
+    emission_cost,
+    find_fingering_at,
+    transition_cost,
+)
+from tabvision.types import AudioEvent, FrameFingering, GuitarConfig
+
+# ---------- helpers ----------
+
+
+def _ev(midi: int, t: float = 0.0, confidence: float = 0.8) -> AudioEvent:
+    return AudioEvent(
+        onset_s=t,
+        offset_s=t + 0.25,
+        pitch_midi=midi,
+        velocity=0.8,
+        confidence=confidence,
+    )
+
+
+def _peaked_fingering(
+    t: float,
+    target_string: int,
+    target_fret: int,
+    n_strings: int = 6,
+    max_fret: int = 24,
+) -> FrameFingering:
+    """Marginal sharply peaked at ``(target_string, target_fret)``."""
+    logits = np.zeros((4, n_strings, max_fret + 1), dtype=np.float64)
+    logits[0, target_string, target_fret] = 10.0
+    return FrameFingering(
+        t=t, finger_pos_logits=logits, homography_confidence=0.9
+    )
+
+
+def _uniform_fingering(
+    t: float, n_strings: int = 6, max_fret: int = 24
+) -> FrameFingering:
+    """Marginal ≈ uniform across (string, fret) cells."""
+    logits = np.ones((4, n_strings, max_fret + 1), dtype=np.float64)
+    return FrameFingering(
+        t=t, finger_pos_logits=logits, homography_confidence=0.9
+    )
+
+
+# ---------- emission ----------
+
+
+def test_emission_audio_only_prefers_lower_fret():
+    """Without vision evidence, lowest-fret candidate has lowest emission cost.
+
+    A4 (MIDI 69) candidates: s5f5 (high E, fret 5) and s4f9 (B, fret 9), among
+    others. The plain low-fret bias should pick s5f5.
+    """
+    cfg = GuitarConfig()
+    ev = _ev(69)
+    cands = candidate_positions(69, cfg)
+    costs = [(c, emission_cost(c, ev, None, cfg)) for c in cands]
+    best = min(costs, key=lambda kv: kv[1])[0]
+    assert best.fret == 5
+    assert best.string_idx == 5  # high E
+
+
+def test_emission_open_string_bonus_recovers_fret_zero():
+    """For a pitch with a fret-0 option, the open-string bonus puts it on top.
+
+    E2 (MIDI 40) has only one candidate: s0f0 — the bonus should make its
+    emission cost lower than any fingered alternative would have been.
+    """
+    cfg = GuitarConfig()
+    ev = _ev(40)
+    cands = candidate_positions(40, cfg)
+    assert len(cands) == 1 and cands[0].fret == 0
+    open_cost = emission_cost(cands[0], ev, None, cfg)
+
+    # Compare against a synthetic fret-1 candidate's would-be cost: same
+    # pitch contribution, but no bonus and one tick of low-fret bias.
+    fake = Candidate(string_idx=0, fret=1)
+    # Construct a fake AudioEvent with the same confidence so the per-event
+    # constant cancels out.
+    fake_cost = emission_cost(fake, ev, None, cfg)
+    assert open_cost < fake_cost
+    assert (fake_cost - open_cost) >= OPEN_STRING_BONUS - 1e-9
+
+
+def test_emission_vision_pulls_pick_off_lowest_fret():
+    """Vision evidence should override the lowest-fret default.
+
+    A4 (MIDI 69) audio-only picks s5f5 (high E, fret 5). With a fingering
+    peaked at s2f14 (G string, fret 14 — also a valid A4 position), the
+    emission cost there should be lower despite the higher fret.
+    """
+    cfg = GuitarConfig()
+    ev = _ev(69, t=1.0)
+    fing = _peaked_fingering(t=1.0, target_string=2, target_fret=14)
+
+    audio_pick = Candidate(string_idx=5, fret=5)
+    vision_pick = Candidate(string_idx=2, fret=14)
+
+    audio_cost = emission_cost(audio_pick, ev, fing, cfg, lambda_vision=1.0)
+    vision_cost = emission_cost(vision_pick, ev, fing, cfg, lambda_vision=1.0)
+    assert vision_cost < audio_cost
+
+
+def test_emission_uniform_vision_does_not_change_ranking():
+    """A uniform fingering should not flip the audio-only preference."""
+    cfg = GuitarConfig()
+    ev = _ev(69)
+    fing = _uniform_fingering(t=0.0)
+    cands = candidate_positions(69, cfg)
+    pure_audio = sorted(
+        cands, key=lambda c: emission_cost(c, ev, None, cfg)
+    )
+    with_uniform = sorted(
+        cands,
+        key=lambda c: emission_cost(c, ev, fing, cfg, lambda_vision=1.0),
+    )
+    assert [c for c in pure_audio] == [c for c in with_uniform]
+
+
+# ---------- transition ----------
+
+
+def test_transition_same_string_is_cheaper_than_string_jump():
+    """Same-string continuity bonus beats a one-fret string jump."""
+    cfg = GuitarConfig()
+    prev = Candidate(string_idx=5, fret=5)
+    same_string = Candidate(string_idx=5, fret=7)  # 2 frets up, same string
+    string_jump = Candidate(string_idx=4, fret=5)  # different string, same fret
+    assert (
+        transition_cost(prev, same_string, cfg)
+        < transition_cost(prev, string_jump, cfg)
+    )
+
+
+def test_transition_hand_span_barrier_only_past_threshold():
+    """Costs are mild within ``MAX_HAND_SPAN`` and steep beyond it."""
+    cfg = GuitarConfig()
+    prev = Candidate(string_idx=5, fret=5)
+    within = Candidate(string_idx=5, fret=5 + MAX_HAND_SPAN)  # at threshold
+    beyond = Candidate(string_idx=5, fret=5 + MAX_HAND_SPAN + 1)  # one past
+
+    cost_within = transition_cost(prev, within, cfg)
+    cost_beyond = transition_cost(prev, beyond, cfg)
+
+    # The barrier kicks in for `beyond`, so the gap should be ≥ HAND_SPAN_BARRIER
+    # (modulo the small extra position-shift cost of one more fret).
+    assert (cost_beyond - cost_within) >= HAND_SPAN_BARRIER - 1e-6
+
+
+def test_transition_zero_when_unchanged():
+    """No-op transition (same string, same fret) yields the bare continuity bonus."""
+    cfg = GuitarConfig()
+    p = Candidate(string_idx=3, fret=7)
+    cost = transition_cost(p, p, cfg)
+    # 0 position shift + same-string bonus → -SAME_STRING_BONUS exactly.
+    assert cost == -SAME_STRING_BONUS
+
+
+# ---------- find_fingering_at ----------
+
+
+def test_find_fingering_at_picks_closest_non_empty():
+    fings = [
+        _peaked_fingering(t=0.0, target_string=0, target_fret=0),
+        _peaked_fingering(t=1.0, target_string=5, target_fret=5),
+        _peaked_fingering(t=2.0, target_string=3, target_fret=3),
+    ]
+    chosen = find_fingering_at(1.1, fings)
+    assert chosen is not None
+    assert chosen.t == 1.0
+
+
+def test_find_fingering_at_skips_empty_logits():
+    """All-zero logits = no evidence; should be skipped."""
+    empty = FrameFingering(
+        t=0.5,
+        finger_pos_logits=np.zeros((4, 6, 25)),
+        homography_confidence=0.0,
+    )
+    real = _peaked_fingering(t=2.0, target_string=2, target_fret=7)
+    chosen = find_fingering_at(0.6, [empty, real])
+    assert chosen is real
+
+
+def test_find_fingering_at_returns_none_when_all_empty():
+    empty = FrameFingering(
+        t=0.5,
+        finger_pos_logits=np.zeros((4, 6, 25)),
+        homography_confidence=0.0,
+    )
+    assert find_fingering_at(0.6, [empty]) is None
+
+
+def test_find_fingering_at_returns_none_for_empty_input():
+    assert find_fingering_at(0.6, []) is None

From d483376dd31664f62765a3716f1763e57ea32ccb Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 17:49:02 -0400
Subject: [PATCH 3/6] feat(phase5): cluster-level Viterbi + chord-state
 machinery

Replace the greedy audio-only decoder with a unified cluster-level Viterbi
DP. Each step in the DP is a chord cluster (events <=80ms apart, chain
semantics); singleton clusters degenerate to single-line Viterbi.
chord.enumerate_chord_states builds valid (string, fret) tuples under
per-string monophony + hand-span constraints; chord.chord_anchor picks
the lowest-fret pressed note as the cluster's representative for
inter-cluster transition costs.

Lookahead is real: a future event's vision evidence can change earlier
picks when the global path is cheaper. Lambda_vision=0.0 reproduces the
audio-only behaviour bit-for-bit.

Adds 19 new unit tests (lookahead, vision-decisive single, chord
monophony + hand-span, chord vision pull, cluster grouping); all 39
fusion tests green.
---
 tabvision/tabvision/fusion/chord.py           | 122 +++++++++-
 tabvision/tabvision/fusion/viterbi.py         | 214 +++++++++++-------
 tabvision/tests/unit/test_chord_fusion.py     | 201 ++++++++++++++++
 .../tests/unit/test_fusion_audio_only.py      |  96 +++++++-
 4 files changed, 545 insertions(+), 88 deletions(-)
 create mode 100644 tabvision/tests/unit/test_chord_fusion.py

diff --git a/tabvision/tabvision/fusion/chord.py b/tabvision/tabvision/fusion/chord.py
index ac91bdd..f734a06 100644
--- a/tabvision/tabvision/fusion/chord.py
+++ b/tabvision/tabvision/fusion/chord.py
@@ -1,9 +1,119 @@
-"""Chord-aware fusion — Phase 5 deliverable. Stub.
+"""Chord cluster grouping + chord-state machinery — Phase 5 deliverable.
 
-Simultaneous events (≤ 80 ms apart) decoded as ordered tuples with
-per-string monophony and hand-span constraints baked into state
-construction.
+A *chord cluster* is a maximal run of consecutive ``AudioEvent``s whose
+adjacent onset gaps are all ≤ :data:`CHORD_MAX_GAP_S` (80 ms by default).
+Within a cluster, decoding picks an ordered tuple of ``(string, fret)``
+candidates — one per event — subject to two structural constraints:
 
-Port targets: ``tabvision-server/app/chord_shapes.py`` + chord logic in
-``fusion_engine.py``.
+- **Per-string monophony**: no two events share a string.
+- **Hand-span**: ``max(pressed_fret) - min(pressed_fret) ≤ MAX_HAND_SPAN``
+  (open strings are exempt — fret 0 doesn't constrain the fretting hand).
+
+This module is pure machinery — clustering, state enumeration, anchor
+selection. The cluster-level Viterbi DP that consumes these states lives
+in :mod:`tabvision.fusion.viterbi`.
+
+See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §3.2 and SPEC.md §5.
 """
+
+from __future__ import annotations
+
+from typing import Sequence
+
+from tabvision.fusion.candidates import Candidate, candidate_positions
+from tabvision.fusion.playability import MAX_HAND_SPAN
+from tabvision.types import AudioEvent, GuitarConfig
+
+CHORD_MAX_GAP_S = 0.080
+"""Maximum onset gap (seconds) between consecutive events to count as one
+chord cluster. SPEC §5 calls this "≤ 80 ms apart"."""
+
+
+def cluster_events(
+    events: Sequence[AudioEvent],
+    max_gap_s: float = CHORD_MAX_GAP_S,
+) -> list[list[AudioEvent]]:
+    """Group events into chord clusters.
+
+    Chain semantics: events ``i`` and ``i+1`` (sorted by onset) join the
+    same cluster iff ``events[i+1].onset_s - events[i].onset_s ≤ max_gap_s``.
+    A cluster therefore can span more than ``max_gap_s`` overall when the
+    individual pairwise gaps remain bounded.
+    """
+    if not events:
+        return []
+    sorted_events = sorted(events, key=lambda e: e.onset_s)
+    clusters: list[list[AudioEvent]] = [[sorted_events[0]]]
+    for ev in sorted_events[1:]:
+        if ev.onset_s - clusters[-1][-1].onset_s <= max_gap_s:
+            clusters[-1].append(ev)
+        else:
+            clusters.append([ev])
+    return clusters
+
+
+def enumerate_chord_states(
+    events: Sequence[AudioEvent],
+    cfg: GuitarConfig,
+) -> list[tuple[Candidate, ...]]:
+    """All valid (monophony + hand-span) ordered tuples of candidates.
+
+    Builds the state set incrementally to keep the worst-case bounded by
+    the constraint-pruned size at each step rather than the raw cartesian
+    product (``K^m``). Returns an empty list if any event has no
+    candidates — the caller is expected to filter out-of-range events
+    upstream so the cluster shape stays consistent with the input order.
+    """
+    if not events:
+        return []
+
+    per_event_candidates = [
+        candidate_positions(ev.pitch_midi, cfg) for ev in events
+    ]
+    if any(not cands for cands in per_event_candidates):
+        return []
+
+    states: list[tuple[Candidate, ...]] = [
+        (c,) for c in per_event_candidates[0]
+    ]
+    for k in range(1, len(events)):
+        next_states: list[tuple[Candidate, ...]] = []
+        for state in states:
+            used_strings = {c.string_idx for c in state}
+            pressed = [c.fret for c in state if c.fret > 0]
+            for c in per_event_candidates[k]:
+                if c.string_idx in used_strings:
+                    continue
+                new_pressed = pressed + ([c.fret] if c.fret > 0 else [])
+                if new_pressed:
+                    span = max(new_pressed) - min(new_pressed)
+                    if span > MAX_HAND_SPAN:
+                        continue
+                next_states.append(state + (c,))
+        states = next_states
+        if not states:
+            return []
+    return states
+
+
+def chord_anchor(state: tuple[Candidate, ...]) -> Candidate:
+    """The 'anchor' candidate used as the state's representative for
+    inter-cluster transition costs.
+
+    Defined as the lowest-fret *pressed* note (fret > 0) — the natural
+    centre of the fretting hand. If all notes are open, the first
+    candidate is returned (any choice is equivalent because all pressed
+    frets are 0 and transition cost depends on Δfret).
+    """
+    pressed = [c for c in state if c.fret > 0]
+    if not pressed:
+        return state[0]
+    return min(pressed, key=lambda c: (c.fret, c.string_idx))
+
+
+__all__ = [
+    "CHORD_MAX_GAP_S",
+    "cluster_events",
+    "enumerate_chord_states",
+    "chord_anchor",
+]
diff --git a/tabvision/tabvision/fusion/viterbi.py b/tabvision/tabvision/fusion/viterbi.py
index 168669f..1a67e9c 100644
--- a/tabvision/tabvision/fusion/viterbi.py
+++ b/tabvision/tabvision/fusion/viterbi.py
@@ -1,20 +1,28 @@
-"""Single-line Viterbi decode + audio-only fallback.
+"""Cluster-level Viterbi decode — Phase 5 deliverable.
 
 Public entrypoint: ``fuse(events, fingerings, cfg, session, lambda_vision)``.
 
-Phase 1: when ``fingerings`` is empty (video stubs), degenerate to a
-greedy "lowest-fret with continuity bonus" decoder per SPEC.md §7 Phase 1.
+Each "step" in the DP is a chord cluster (often a singleton — an isolated
+event). For each cluster, :func:`tabvision.fusion.chord.enumerate_chord_states`
+produces the per-string-monophony + hand-span-feasible ordered tuples of
+candidates. Emission for a state is the sum of per-event emission costs
+(:func:`tabvision.fusion.playability.emission_cost`); transitions between
+clusters use :func:`tabvision.fusion.chord.chord_anchor` to pick a
+representative position for the playability transition cost.
 
-Phase 5 replaces the body with a proper Viterbi over candidate states
-using ``tabvision.fusion.playability`` transition costs. The public
-signature stays stable.
+The single-line Viterbi behaviour is the size-1-cluster degenerate case
+of this same DP — no separate code path.
+
+See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §3 for the state
+spaces and §2 for the cost decomposition.
 """
 
 from __future__ import annotations
 
+import math
 from typing import Sequence
 
-from tabvision.errors import FusionError
+from tabvision.fusion import chord, playability
 from tabvision.fusion.candidates import Candidate, candidate_positions
 from tabvision.types import (
     AudioEvent,
@@ -24,16 +32,6 @@
     TabEvent,
 )
 
-# Continuity bonus: amount subtracted from a candidate's "cost" when its
-# string matches the previous note's string. A small constant; Phase 5
-# will calibrate.
-STRING_CONTINUITY_BONUS = 0.5
-# Penalty per fret of distance from the previous note's fret. Small
-# enough that the lowest-fret bias still wins for distant pitches.
-FRET_DISTANCE_PENALTY = 0.05
-# Penalty per fret position (lower-fret preference).
-LOWER_FRET_BIAS = 0.10
-
 
 def fuse(
     events: Sequence[AudioEvent],
@@ -42,78 +40,134 @@ def fuse(
     session: SessionConfig | None = None,
     lambda_vision: float = 1.0,
 ) -> list[TabEvent]:
-    """Decode AudioEvents into TabEvents.
-
-    Phase 1: ``fingerings`` is empty / uniform; falls back to greedy
-    audio-only decode. The ``lambda_vision`` weight is accepted for
-    interface stability but ignored until Phase 5.
+    """Decode ``AudioEvent``s into ``TabEvent``s via cluster Viterbi.
+
+    Parameters
+    ----------
+    events:
+        Audio events. Out-of-range pitches (no playable candidate under
+        ``cfg``) are dropped — no phantom notes emitted.
+    fingerings:
+        Per-frame fingerings from Phase 4. Empty / all-zero is treated
+        as audio-only.
+    cfg:
+        Instrument config (tuning, capo, max_fret).
+    session:
+        Recording session metadata; reserved for future use.
+    lambda_vision:
+        Mixing weight for the vision-evidence term. ``0.0`` disables
+        vision entirely; ``1.0`` is the default; higher values lean more
+        heavily on the fingertip-to-fret posterior.
+
+    Returns
+    -------
+    list[TabEvent]
+        One ``TabEvent`` per surviving event, ordered by ``onset_s``.
     """
     if cfg is None:
         cfg = GuitarConfig()
     if session is None:
         session = SessionConfig()
-
-    has_video = any(_has_evidence(f) for f in fingerings)
-    if has_video:
-        # Phase 5 deliverable: Viterbi over (string, fret) states with
-        # vision-evidence + playability costs. Not yet implemented.
-        raise FusionError(
-            "video-aware fusion not implemented in Phase 1 — "
-            "this is a Phase 5 deliverable"
-        )
-
-    return _greedy_audio_only(events, cfg)
-
-
-def _has_evidence(f: FrameFingering) -> bool:
-    """A FrameFingering carries info if its logits are not all-zero."""
-    arr = f.finger_pos_logits
-    return arr is not None and bool(arr.size) and bool((arr != 0).any())
-
-
-def _greedy_audio_only(
-    events: Sequence[AudioEvent], cfg: GuitarConfig
+    del session  # not consumed by Phase 5; preserves signature for callers.
+
+    if not events:
+        return []
+
+    # Drop out-of-range pitches before clustering so the cluster shape
+    # reflects what's actually decodable.
+    valid_events = [
+        ev for ev in events if candidate_positions(ev.pitch_midi, cfg)
+    ]
+    if not valid_events:
+        return []
+
+    clusters = chord.cluster_events(valid_events)
+    cluster_data: list[
+        tuple[list[AudioEvent], list[tuple[Candidate, ...]]]
+    ] = []
+    for cluster in clusters:
+        states = chord.enumerate_chord_states(cluster, cfg)
+        if states:
+            cluster_data.append((cluster, states))
+
+    if not cluster_data:
+        return []
+
+    return _viterbi_clusters(cluster_data, fingerings, cfg, lambda_vision)
+
+
+def _viterbi_clusters(
+    cluster_data: list[
+        tuple[list[AudioEvent], list[tuple[Candidate, ...]]]
+    ],
+    fingerings: Sequence[FrameFingering],
+    cfg: GuitarConfig,
+    lambda_vision: float,
 ) -> list[TabEvent]:
-    """Pick (string, fret) per event by lowest-fret + continuity."""
-    out: list[TabEvent] = []
-    prev: Candidate | None = None
-
-    for ev in events:
-        candidates = candidate_positions(ev.pitch_midi, cfg)
-        if not candidates:
-            # Out-of-range pitch; skip rather than emit a phantom note.
-            continue
-        pick = _pick_candidate(candidates, prev)
-        out.append(
-            TabEvent(
-                onset_s=ev.onset_s,
-                duration_s=max(0.0, ev.offset_s - ev.onset_s),
-                string_idx=pick.string_idx,
-                fret=pick.fret,
-                pitch_midi=ev.pitch_midi,
-                confidence=ev.confidence,
-                techniques=ev.tags,
+    """Cluster-level Viterbi DP. Worst case ``O(N · S^2)`` for ``N``
+    clusters with ``S`` states each."""
+
+    def state_emission(
+        cluster: list[AudioEvent], state: tuple[Candidate, ...]
+    ) -> float:
+        total = 0.0
+        for ev, c in zip(cluster, state):
+            f = playability.find_fingering_at(ev.onset_s, fingerings)
+            total += playability.emission_cost(
+                c, ev, f, cfg, lambda_vision=lambda_vision
             )
-        )
-        prev = pick
+        return total
+
+    n = len(cluster_data)
+    cost: list[list[float]] = [[] for _ in range(n)]
+    backptr: list[list[int]] = [[] for _ in range(n)]
+
+    cluster0, states0 = cluster_data[0]
+    cost[0] = [state_emission(cluster0, st) for st in states0]
+    backptr[0] = [-1] * len(states0)
+
+    for i in range(1, n):
+        cluster_i, states_i = cluster_data[i]
+        prev_states = cluster_data[i - 1][1]
+        cost[i] = [math.inf] * len(states_i)
+        backptr[i] = [-1] * len(states_i)
+        for si, state in enumerate(states_i):
+            emit = state_emission(cluster_i, state)
+            anchor_curr = chord.chord_anchor(state)
+            for pi, prev_state in enumerate(prev_states):
+                anchor_prev = chord.chord_anchor(prev_state)
+                trans = playability.transition_cost(
+                    anchor_prev, anchor_curr, cfg
+                )
+                total = cost[i - 1][pi] + trans + emit
+                if total < cost[i][si]:
+                    cost[i][si] = total
+                    backptr[i][si] = pi
+
+    # Backtrack from the cheapest terminal state.
+    final = cost[n - 1]
+    last_idx = min(range(len(final)), key=lambda j: final[j])
+    picks_idx = [0] * n
+    picks_idx[n - 1] = last_idx
+    for i in range(n - 1, 0, -1):
+        picks_idx[i - 1] = backptr[i][picks_idx[i]]
 
+    out: list[TabEvent] = []
+    for i, (cluster, states) in enumerate(cluster_data):
+        state = states[picks_idx[i]]
+        for ev, c in zip(cluster, state):
+            out.append(
+                TabEvent(
+                    onset_s=ev.onset_s,
+                    duration_s=max(0.0, ev.offset_s - ev.onset_s),
+                    string_idx=c.string_idx,
+                    fret=c.fret,
+                    pitch_midi=ev.pitch_midi,
+                    confidence=ev.confidence,
+                    techniques=ev.tags,
+                )
+            )
     return out
 
 
-def _pick_candidate(
-    candidates: list[Candidate], prev: Candidate | None
-) -> Candidate:
-    """Score each candidate; lower cost wins."""
-
-    def cost(c: Candidate) -> float:
-        score = LOWER_FRET_BIAS * c.fret
-        if prev is not None:
-            score += FRET_DISTANCE_PENALTY * abs(c.fret - prev.fret)
-            if c.string_idx == prev.string_idx:
-                score -= STRING_CONTINUITY_BONUS
-        return score
-
-    return min(candidates, key=cost)
-
-
 __all__ = ["fuse"]
diff --git a/tabvision/tests/unit/test_chord_fusion.py b/tabvision/tests/unit/test_chord_fusion.py
new file mode 100644
index 0000000..9f15a23
--- /dev/null
+++ b/tabvision/tests/unit/test_chord_fusion.py
@@ -0,0 +1,201 @@
+"""Unit tests for chord-aware fusion (``tabvision.fusion.chord`` plus
+the cluster-level Viterbi in :mod:`tabvision.fusion.viterbi`).
+
+Covers:
+- ``cluster_events``: clustering by onset gap.
+- ``enumerate_chord_states``: per-string monophony + hand-span pruning.
+- ``chord_anchor``: lowest-fret pressed note as anchor.
+- End-to-end ``fuse``: simultaneous events emit distinct strings, picks
+  fall within the hand-span constraint, and vision evidence on one
+  chord member pulls the whole shape onto a vision-supported voicing.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+from tabvision.fusion import fuse
+from tabvision.fusion.candidates import Candidate
+from tabvision.fusion.chord import (
+    CHORD_MAX_GAP_S,
+    chord_anchor,
+    cluster_events,
+    enumerate_chord_states,
+)
+from tabvision.fusion.playability import MAX_HAND_SPAN
+from tabvision.types import AudioEvent, FrameFingering, GuitarConfig
+
+
+def _ev(midi: int, t: float, confidence: float = 0.8) -> AudioEvent:
+    return AudioEvent(
+        onset_s=t,
+        offset_s=t + 0.25,
+        pitch_midi=midi,
+        velocity=0.8,
+        confidence=confidence,
+    )
+
+
+def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering:
+    logits = np.zeros((4, 6, 25), dtype=np.float64)
+    logits[0, string_idx, fret] = 10.0
+    return FrameFingering(
+        t=t, finger_pos_logits=logits, homography_confidence=0.9
+    )
+
+
+# ---------- cluster_events ----------
+
+
+def test_cluster_events_single_event_yields_one_cluster():
+    clusters = cluster_events([_ev(60, 0.0)])
+    assert len(clusters) == 1
+    assert len(clusters[0]) == 1
+
+
+def test_cluster_events_close_events_join_one_cluster():
+    """Two events 50 ms apart should be one chord cluster."""
+    events = [_ev(60, 0.0), _ev(64, 0.05)]
+    clusters = cluster_events(events)
+    assert len(clusters) == 1
+    assert len(clusters[0]) == 2
+
+
+def test_cluster_events_far_events_split():
+    """Two events 200 ms apart should be two clusters."""
+    events = [_ev(60, 0.0), _ev(64, 0.20)]
+    clusters = cluster_events(events)
+    assert len(clusters) == 2
+    assert all(len(c) == 1 for c in clusters)
+
+
+def test_cluster_events_chain_through_threshold():
+    """Three events at 0, 80, 160 ms (each adjacent gap == threshold)
+    should form one cluster (chain semantics)."""
+    events = [
+        _ev(60, 0.0),
+        _ev(64, CHORD_MAX_GAP_S),
+        _ev(67, 2 * CHORD_MAX_GAP_S),
+    ]
+    clusters = cluster_events(events)
+    assert len(clusters) == 1
+    assert len(clusters[0]) == 3
+
+
+def test_cluster_events_unsorted_input_is_sorted():
+    """Out-of-order input should still produce a chronologically grouped
+    output."""
+    events = [_ev(67, 0.05), _ev(60, 0.0)]
+    clusters = cluster_events(events)
+    assert len(clusters) == 1
+    assert clusters[0][0].pitch_midi == 60  # low-onset first
+
+
+# ---------- enumerate_chord_states ----------
+
+
+def test_enumerate_chord_states_enforces_monophony():
+    """C major triad (C4 + E4 + G4) — no enumerated state may put two
+    notes on the same string."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)]
+    states = enumerate_chord_states(events, cfg)
+    assert states  # non-empty
+    for state in states:
+        strings = [c.string_idx for c in state]
+        assert len(strings) == len(set(strings)), (
+            f"per-string monophony violated: {state}"
+        )
+
+
+def test_enumerate_chord_states_enforces_hand_span():
+    """Every enumerated state must respect MAX_HAND_SPAN over pressed frets."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)]
+    states = enumerate_chord_states(events, cfg)
+    for state in states:
+        pressed = [c.fret for c in state if c.fret > 0]
+        if pressed:
+            assert max(pressed) - min(pressed) <= MAX_HAND_SPAN
+
+
+def test_enumerate_chord_states_empty_when_event_unfretable():
+    """If any event has no candidates, no chord state survives."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(20, 0.0)]  # 20 = far below low E
+    assert enumerate_chord_states(events, cfg) == []
+
+
+# ---------- chord_anchor ----------
+
+
+def test_chord_anchor_picks_lowest_pressed_fret():
+    state = (
+        Candidate(string_idx=4, fret=5),
+        Candidate(string_idx=5, fret=0),  # open
+        Candidate(string_idx=3, fret=3),
+    )
+    assert chord_anchor(state) == Candidate(string_idx=3, fret=3)
+
+
+def test_chord_anchor_falls_back_to_first_when_all_open():
+    state = (
+        Candidate(string_idx=5, fret=0),
+        Candidate(string_idx=4, fret=0),
+    )
+    assert chord_anchor(state) == state[0]
+
+
+# ---------- end-to-end fuse() through chord clusters ----------
+
+
+def test_fuse_simultaneous_events_emit_distinct_strings():
+    """C4 + E4 fired together — picks must use different strings."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0)]
+    out = fuse(events, [], cfg)
+    assert len(out) == 2
+    assert out[0].string_idx != out[1].string_idx
+
+
+def test_fuse_three_note_chord_within_hand_span():
+    """C major triad (C4 + E4 + G4) — picks form a hand-span-feasible voicing."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)]
+    out = fuse(events, [], cfg)
+    assert len(out) == 3
+    strings = [t.string_idx for t in out]
+    assert len(set(strings)) == 3  # all distinct
+    pressed = [t.fret for t in out if t.fret > 0]
+    if pressed:
+        assert max(pressed) - min(pressed) <= MAX_HAND_SPAN
+
+
+def test_fuse_chord_prefers_open_string_voicing_with_uniform_vision():
+    """C major triad — the open-E voicing should win on emission cost
+    when no vision evidence pushes elsewhere.
+
+    E4 has an open-string candidate (5, 0). The open-string bonus +
+    low-fret bias should make at least one note an open string in the
+    chosen voicing."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)]
+    out = fuse(events, [], cfg)
+    assert any(t.fret == 0 for t in out)
+
+
+def test_fuse_chord_vision_pulls_voicing():
+    """If the fingering is peaked at a non-default position for one of
+    the chord notes, the chosen state should include that exact pick."""
+    cfg = GuitarConfig()
+    events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)]
+    # Push C4 onto string 3 fret 5 (G-string). The default voicing
+    # would have C4 on string 4 fret 1. With this peak, C4 should move.
+    fings = [_peaked_fingering(0.0, string_idx=3, fret=5)]
+    out = fuse(events, fings, cfg, lambda_vision=2.0)
+
+    c4 = next(t for t in out if t.pitch_midi == 60)
+    assert (c4.string_idx, c4.fret) == (3, 5)
+    # Other notes still produce a valid voicing.
+    strings = [t.string_idx for t in out]
+    assert len(set(strings)) == 3
diff --git a/tabvision/tests/unit/test_fusion_audio_only.py b/tabvision/tests/unit/test_fusion_audio_only.py
index 675a054..b4feca2 100644
--- a/tabvision/tests/unit/test_fusion_audio_only.py
+++ b/tabvision/tests/unit/test_fusion_audio_only.py
@@ -1,7 +1,14 @@
-"""Unit tests for the audio-only fusion path."""
+"""Unit tests for ``tabvision.fusion.viterbi.fuse``.
+
+Covers both the audio-only path (no / uniform fingerings) and the
+video-aware Viterbi behaviour (vision evidence pulls picks; lookahead
+changes earlier picks when later events benefit from a different anchor).
+"""
+
+import numpy as np
 
 from tabvision.fusion import fuse
-from tabvision.types import AudioEvent, GuitarConfig
+from tabvision.types import AudioEvent, FrameFingering, GuitarConfig
 
 
 def _ev(midi: int, t: float) -> AudioEvent:
@@ -14,6 +21,26 @@ def _ev(midi: int, t: float) -> AudioEvent:
     )
 
 
+def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering:
+    """Marginal sharply peaked at ``(string_idx, fret)``."""
+    logits = np.zeros((4, 6, 25), dtype=np.float64)
+    logits[0, string_idx, fret] = 10.0
+    return FrameFingering(
+        t=t, finger_pos_logits=logits, homography_confidence=0.9
+    )
+
+
+def _uniform_fingering(t: float) -> FrameFingering:
+    """Marginal ≈ uniform across (string, fret) cells."""
+    logits = np.ones((4, 6, 25), dtype=np.float64)
+    return FrameFingering(
+        t=t, finger_pos_logits=logits, homography_confidence=0.9
+    )
+
+
+# ---------- audio-only regression ----------
+
+
 def test_empty_input_yields_empty_output():
     assert fuse([], [], GuitarConfig()) == []
 
@@ -50,3 +77,68 @@ def test_capo_shifts_picks():
     out = fuse([_ev(69, 0.0)], [], cfg)
     assert len(out) == 1
     assert out[0].fret >= 2
+
+
+# ---------- video-aware Viterbi ----------
+
+
+def test_uniform_vision_matches_no_vision():
+    """A uniform fingering must not change the audio-only picks."""
+    events = [_ev(69, 0.0), _ev(71, 0.5)]
+    cfg = GuitarConfig()
+    fings = [_uniform_fingering(0.0), _uniform_fingering(0.5)]
+    out_with = fuse(events, fings, cfg)
+    out_without = fuse(events, [], cfg)
+    assert [(e.string_idx, e.fret) for e in out_with] == [
+        (e.string_idx, e.fret) for e in out_without
+    ]
+
+
+def test_decisive_vision_moves_single_pick():
+    """A vision peak at a non-default candidate should override the lowest-fret bias.
+
+    A4's audio-only pick is (5, 5). With the fingering peaked at the G-string
+    A4 position (3, 14), Viterbi should land there instead.
+    """
+    cfg = GuitarConfig()
+    events = [_ev(69, 0.0)]
+    fings = [_peaked_fingering(0.0, string_idx=3, fret=14)]
+    out = fuse(events, fings, cfg, lambda_vision=1.0)
+    assert len(out) == 1
+    assert out[0].string_idx == 3
+    assert out[0].fret == 14
+
+
+def test_lambda_zero_disables_vision():
+    """Setting ``lambda_vision=0`` should reproduce the audio-only pick even
+    when a peaked fingering is present."""
+    cfg = GuitarConfig()
+    events = [_ev(69, 0.0)]
+    fings = [_peaked_fingering(0.0, string_idx=3, fret=14)]
+    out = fuse(events, fings, cfg, lambda_vision=0.0)
+    assert len(out) == 1
+    assert out[0].string_idx == 5  # back to high E
+    assert out[0].fret == 5
+
+
+def test_viterbi_lookahead_changes_earlier_pick():
+    """A future event's vision evidence should pull the earlier pick onto
+    the same string when staying lowest-fret would force a giant hand jump.
+
+    Sequence: A4 (MIDI 69) → B4 (MIDI 71). The B4 fingering is peaked at
+    (string=3, fret=16) — the G-string B4 position. A greedy decoder picks
+    (5, 5) for A4 (lowest fret) and would then have to leap from fret 5 →
+    fret 16 across two strings; the hand-span barrier makes that path
+    expensive. Viterbi instead picks (3, 14) for A4 — same string, two
+    frets below the upcoming B4 — so the entire path is cheap.
+    """
+    cfg = GuitarConfig()
+    events = [_ev(69, 0.0), _ev(71, 0.5)]
+    fings = [_peaked_fingering(0.5, string_idx=3, fret=16)]
+    out = fuse(events, fings, cfg, lambda_vision=1.0)
+    assert len(out) == 2
+    # Vision-decisive on the second event:
+    assert (out[1].string_idx, out[1].fret) == (3, 16)
+    # Lookahead-driven on the first event: must NOT be the audio-only (5, 5);
+    # specifically should land on the G-string A4 anchor.
+    assert (out[0].string_idx, out[0].fret) == (3, 14)

From 82d7edf55fc2e3cbe660b31900ffdab79b3f0ca7 Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 17:50:39 -0400
Subject: [PATCH 4/6] feat(phase5): add --fusion-lambda-vision CLI flag

---
 tabvision/tabvision/cli.py                   | 26 ++++++++++-
 tabvision/tests/unit/test_cli_fusion_flag.py | 46 ++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 tabvision/tests/unit/test_cli_fusion_flag.py

diff --git a/tabvision/tabvision/cli.py b/tabvision/tabvision/cli.py
index 7f313f1..a7794cb 100644
--- a/tabvision/tabvision/cli.py
+++ b/tabvision/tabvision/cli.py
@@ -74,6 +74,18 @@ def _build_parser() -> argparse.ArgumentParser:
         ),
     )
     t.add_argument("--capo", type=int, default=0, help="capo fret (0-7)")
+    t.add_argument(
+        "--fusion-lambda-vision",
+        type=float,
+        default=1.0,
+        metavar="FLOAT",
+        help=(
+            "weight on vision evidence in fusion (default 1.0). 0.0 "
+            "disables vision entirely (audio-only Viterbi); values >1 "
+            "lean more heavily on the fingertip-to-fret posterior. "
+            "See SPEC §5 / Phase-5 design doc §2."
+        ),
+    )
     t.add_argument(
         "--instrument",
         choices=["acoustic", "classical", "electric"],
@@ -147,8 +159,18 @@ def _cmd_transcribe(args: argparse.Namespace) -> int:
 
     # Phase 1: video stubbed; pass empty fingerings → fusion takes audio-only path.
     fingerings: list = []
-    tab_events = fuse(audio_events, fingerings, cfg, session)
-    logger.info("fusion produced %d tab events", len(tab_events))
+    tab_events = fuse(
+        audio_events,
+        fingerings,
+        cfg,
+        session,
+        lambda_vision=args.fusion_lambda_vision,
+    )
+    logger.info(
+        "fusion produced %d tab events (lambda_vision=%.2f)",
+        len(tab_events),
+        args.fusion_lambda_vision,
+    )
 
     output = render(tab_events, cfg)
     if args.output:
diff --git a/tabvision/tests/unit/test_cli_fusion_flag.py b/tabvision/tests/unit/test_cli_fusion_flag.py
new file mode 100644
index 0000000..d2f321b
--- /dev/null
+++ b/tabvision/tests/unit/test_cli_fusion_flag.py
@@ -0,0 +1,46 @@
+"""CLI parser smoke for ``--fusion-lambda-vision``.
+
+Verifies the flag parses with the right default, accepts user-supplied
+values, and surfaces zero (the audio-only-equivalent setting). The
+actual pass-through to ``fuse()`` is one line of code in
+``_cmd_transcribe`` — see ``tabvision/cli.py``.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from tabvision.cli import _build_parser
+
+
+def test_default_lambda_vision_is_one():
+    parser = _build_parser()
+    args = parser.parse_args(["transcribe", "in.mp4"])
+    assert args.fusion_lambda_vision == 1.0
+
+
+def test_explicit_lambda_vision_parsed():
+    parser = _build_parser()
+    args = parser.parse_args(
+        ["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"]
+    )
+    assert args.fusion_lambda_vision == pytest.approx(2.5)
+
+
+def test_lambda_vision_zero_accepted():
+    """``--fusion-lambda-vision 0`` is the audio-only ablation knob."""
+    parser = _build_parser()
+    args = parser.parse_args(
+        ["transcribe", "in.mp4", "--fusion-lambda-vision", "0"]
+    )
+    assert args.fusion_lambda_vision == 0.0
+
+
+def test_lambda_vision_only_on_transcribe():
+    """The ``check`` subcommand has no fusion stage, so the flag should
+    not be exposed there."""
+    parser = _build_parser()
+    with pytest.raises(SystemExit):
+        parser.parse_args(
+            ["check", "in.mp4", "--fusion-lambda-vision", "1.0"]
+        )

From e5db4cafa92f70793dc7a23deb615c2ac80bc5a9 Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 17:54:29 -0400
Subject: [PATCH 5/6] =?UTF-8?q?feat(phase5):=20acceptance=20harness=20?=
 =?UTF-8?q?=E2=80=94=20Tab=20F1=20+=20chord=20accuracy=20+=20ablation=20ga?=
 =?UTF-8?q?te?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tabvision.eval.metrics with tab_f1() and chord_instance_accuracy()
helpers (string + fret + onset within 50ms tolerance per SPEC §9.2;
chord cluster matching uses the same 80ms gap rule as the chord-fusion
grouping). 11 unit tests cover the metric edge cases.

tests/eval/test_phase5_eval.py defines the SPEC §5 gates in code:

  - +8pp Tab F1 delta (audio+vision over audio-only) — the
    Phase-5-specific bar for "fusion is doing real work"
  - Tab F1 >= 0.85 absolute, marked xfail until Phase 2 SOTA backbone
    lands (likely needs a stronger audio backbone too)
  - Chord-instance accuracy >= 0.80

The full-pipeline runner is currently a stub: blocked on the numpy<2
(basic-pitch / TF 2.15) vs. numpy>=2 (mediapipe) env conflict — the
audio half and the video half can't import in the same venv today.
The eval tests skip with clear messages until either Phase 2 swaps in
a torch-based audio backbone or the env is reconciled separately.
The metric helpers are independent of that and ship usable as-is.
---
 tabvision/tabvision/eval/__init__.py        |   4 +
 tabvision/tabvision/eval/metrics.py         | 182 +++++++++++
 tabvision/tests/eval/test_phase5_eval.py    | 325 ++++++++++++++++++++
 tabvision/tests/unit/test_phase5_metrics.py | 127 ++++++++
 4 files changed, 638 insertions(+)
 create mode 100644 tabvision/tabvision/eval/__init__.py
 create mode 100644 tabvision/tabvision/eval/metrics.py
 create mode 100644 tabvision/tests/eval/test_phase5_eval.py
 create mode 100644 tabvision/tests/unit/test_phase5_metrics.py

diff --git a/tabvision/tabvision/eval/__init__.py b/tabvision/tabvision/eval/__init__.py
new file mode 100644
index 0000000..f9cd6a3
--- /dev/null
+++ b/tabvision/tabvision/eval/__init__.py
@@ -0,0 +1,4 @@
+"""Evaluation helpers — Tab F1, chord-instance accuracy, ablation runner.
+
+See SPEC.md §9 for metric definitions.
+"""
diff --git a/tabvision/tabvision/eval/metrics.py b/tabvision/tabvision/eval/metrics.py
new file mode 100644
index 0000000..5e2ce1b
--- /dev/null
+++ b/tabvision/tabvision/eval/metrics.py
@@ -0,0 +1,182 @@
+"""Tab F1 + chord-instance accuracy metrics — Phase 5 acceptance.
+
+Definitions follow SPEC.md §9.2:
+
+- **Tab F1**: precision / recall / F1 over (string_idx, fret, onset_s)
+  with onset matched within ``onset_tolerance_s`` (default 50 ms).
+  Greedy matcher — each predicted event matches at most one gold event,
+  picked by closest-onset.
+- **Chord instance accuracy**: gold events are grouped into chord
+  clusters using the same 80 ms gap rule as
+  :mod:`tabvision.fusion.chord`. For each gold cluster, find the closest
+  predicted cluster by midpoint onset; the cluster matches if (a) the
+  cluster sizes are equal and (b) the multiset of ``(string_idx, fret)``
+  tuples matches exactly. Accuracy = matched_chords / total_gold_chords.
+
+These helpers operate on :class:`tabvision.types.TabEvent` sequences so
+they can score the output of :func:`tabvision.fusion.fuse` directly.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Sequence
+
+from tabvision.fusion.chord import CHORD_MAX_GAP_S
+from tabvision.types import TabEvent
+
+
+@dataclass(frozen=True)
+class TabF1Result:
+    """Outcome of :func:`tab_f1`."""
+
+    precision: float
+    recall: float
+    f1: float
+    true_positives: int
+    false_positives: int
+    false_negatives: int
+
+    @property
+    def total_predicted(self) -> int:
+        return self.true_positives + self.false_positives
+
+    @property
+    def total_gold(self) -> int:
+        return self.true_positives + self.false_negatives
+
+
+def tab_f1(
+    predicted: Sequence[TabEvent],
+    gold: Sequence[TabEvent],
+    *,
+    onset_tolerance_s: float = 0.05,
+) -> TabF1Result:
+    """Tab F1 over (string, fret, onset)."""
+    pred_sorted = sorted(predicted, key=lambda t: t.onset_s)
+    gold_sorted = sorted(gold, key=lambda t: t.onset_s)
+    gold_used = [False] * len(gold_sorted)
+    tp = 0
+    fp = 0
+    for p in pred_sorted:
+        best_j = -1
+        best_dt = onset_tolerance_s + 1e-9
+        for j, g in enumerate(gold_sorted):
+            if gold_used[j]:
+                continue
+            if g.string_idx != p.string_idx or g.fret != p.fret:
+                continue
+            dt = abs(g.onset_s - p.onset_s)
+            if dt <= onset_tolerance_s and dt < best_dt:
+                best_j = j
+                best_dt = dt
+        if best_j >= 0:
+            gold_used[best_j] = True
+            tp += 1
+        else:
+            fp += 1
+    fn = sum(1 for used in gold_used if not used)
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    f1 = (
+        2 * precision * recall / (precision + recall)
+        if (precision + recall) > 0
+        else 0.0
+    )
+    return TabF1Result(
+        precision=precision,
+        recall=recall,
+        f1=f1,
+        true_positives=tp,
+        false_positives=fp,
+        false_negatives=fn,
+    )
+
+
+@dataclass(frozen=True)
+class ChordAccuracyResult:
+    accuracy: float
+    matched_chords: int
+    total_chords: int
+
+
+def chord_instance_accuracy(
+    predicted: Sequence[TabEvent],
+    gold: Sequence[TabEvent],
+    *,
+    cluster_gap_s: float = CHORD_MAX_GAP_S,
+    onset_match_tolerance_s: float = 0.05,
+) -> ChordAccuracyResult:
+    """Fraction of gold chord clusters whose (string, fret) multiset
+    matches exactly in the closest predicted cluster.
+
+    A chord cluster is a maximal run of consecutive events whose adjacent
+    onset gaps are all ≤ ``cluster_gap_s`` (matches the chord-fusion
+    grouping rule). Single-event clusters count toward the metric — a
+    correctly transcribed isolated note is a "size-1 chord" instance.
+    """
+    pred_clusters = _cluster_by_gap(
+        sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s
+    )
+    gold_clusters = _cluster_by_gap(
+        sorted(gold, key=lambda t: t.onset_s), cluster_gap_s
+    )
+
+    if not gold_clusters:
+        return ChordAccuracyResult(accuracy=0.0, matched_chords=0, total_chords=0)
+
+    matched = 0
+    pred_used = [False] * len(pred_clusters)
+    for gc in gold_clusters:
+        gc_mid = sum(t.onset_s for t in gc) / len(gc)
+        best_j = -1
+        best_dt = onset_match_tolerance_s + 1e-9
+        for j, pc in enumerate(pred_clusters):
+            if pred_used[j]:
+                continue
+            pc_mid = sum(t.onset_s for t in pc) / len(pc)
+            dt = abs(pc_mid - gc_mid)
+            if dt <= onset_match_tolerance_s and dt < best_dt:
+                best_j = j
+                best_dt = dt
+        if best_j < 0:
+            continue
+        pc = pred_clusters[best_j]
+        if len(pc) != len(gc):
+            continue
+        gc_set = sorted((t.string_idx, t.fret) for t in gc)
+        pc_set = sorted((t.string_idx, t.fret) for t in pc)
+        if gc_set == pc_set:
+            pred_used[best_j] = True
+            matched += 1
+
+    return ChordAccuracyResult(
+        accuracy=matched / len(gold_clusters),
+        matched_chords=matched,
+        total_chords=len(gold_clusters),
+    )
+
+
+def _cluster_by_gap(
+    events: Sequence[TabEvent], gap_s: float
+) -> list[list[TabEvent]]:
+    """Same chain semantics as :func:`tabvision.fusion.chord.cluster_events`,
+    but on :class:`TabEvent` (which carries an ``onset_s``). Inlined to
+    avoid a sequence-type adapter."""
+    if not events:
+        return []
+    clusters: list[list[TabEvent]] = [[events[0]]]
+    for ev in events[1:]:
+        if ev.onset_s - clusters[-1][-1].onset_s <= gap_s:
+            clusters[-1].append(ev)
+        else:
+            clusters.append([ev])
+    return clusters
+
+
+__all__ = [
+    "TabF1Result",
+    "ChordAccuracyResult",
+    "tab_f1",
+    "chord_instance_accuracy",
+]
diff --git a/tabvision/tests/eval/test_phase5_eval.py b/tabvision/tests/eval/test_phase5_eval.py
new file mode 100644
index 0000000..bbed061
--- /dev/null
+++ b/tabvision/tests/eval/test_phase5_eval.py
@@ -0,0 +1,325 @@
+"""Phase 5 acceptance harness — audio+vision vs. audio-only ablation.
+
+Per SPEC §5 and ``docs/plans/2026-05-06-phase5-fusion-design.md`` §6 Step E,
+the Phase-5-specific gate is:
+
+    Tab F1 (lambda_vision=1.0) - Tab F1 (lambda_vision=0.0) ≥ 0.08
+
+The absolute Tab F1 ≥ 0.85 bar is currently expected to need Phase 2's
+Riley/Edwards audio backbone too — so it's marked ``xfail`` until Phase
+2 is wired in. The +8 pp delta is on the hook for Phase 5 alone, since
+that's the test for "fusion is doing real work given the current audio".
+
+**Environment caveat (2026-05-06):** the audio backend (basic-pitch +
+TF 2.15) requires ``numpy<2`` while MediaPipe (Phase 4) requires
+``numpy>=2`` — so a single venv currently can't run both halves of the
+pipeline. The test skips when MediaPipe imports fail; once the env is
+reconciled (or Phase 2's torch-based audio backbone replaces basic-pitch)
+the gate runs unchanged. See ``DECISIONS.md`` if/when this gets fixed.
+
+The gold source is the benchmark index at
+``tabvision-server/tests/fixtures/benchmarks/index.json`` — same set the
+legacy ``evaluate_transcription.py`` used. Phase 1.5's annotation tool
+will eventually fold its labelled clips into the same harness.
+"""
+
+from __future__ import annotations
+
+import datetime as _dt
+import json
+from pathlib import Path
+from typing import Sequence
+
+import pytest
+
+from tabvision.eval.metrics import (
+    ChordAccuracyResult,
+    TabF1Result,
+    chord_instance_accuracy,
+    tab_f1,
+)
+from tabvision.types import TabEvent
+
+PHASE5_TAB_F1_DELTA_GATE = 0.08
+"""SPEC §5: audio+vision must beat audio-only by at least this much on Tab F1."""
+
+PHASE5_TAB_F1_ABSOLUTE_GATE = 0.85
+"""SPEC §5: target absolute Tab F1. Likely needs Phase 2 SOTA backbone."""
+
+PHASE5_CHORD_ACCURACY_GATE = 0.80
+"""SPEC §5: chord-instance accuracy gate."""
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+BENCHMARK_INDEX = (
+    REPO_ROOT
+    / "tabvision-server"
+    / "tests"
+    / "fixtures"
+    / "benchmarks"
+    / "index.json"
+)
+EVAL_OUTPUT_DIR = REPO_ROOT / "tabvision-server" / "tools" / "outputs"
+
+
+@pytest.mark.eval
+def test_phase5_audio_plus_vision_beats_audio_only():
+    """Run the full pipeline on the eval set under both lambda_vision
+    settings; assert audio+vision wins by ≥ 8 pp Tab F1.
+
+    Skips automatically when any heavy dependency (basic-pitch, mediapipe,
+    cv2, ffmpeg) is unavailable.
+    """
+    pytest.importorskip(
+        "basic_pitch",
+        reason="basic-pitch needed for audio-only ablation; install with "
+        "pip install '.[audio-baseline]'",
+    )
+    pytest.importorskip(
+        "mediapipe",
+        reason="MediaPipe needed for video evidence; install with "
+        "pip install '.[vision]'. NOTE: requires numpy>=2, currently "
+        "incompatible with TF 2.15.",
+    )
+    pytest.importorskip("cv2", reason="opencv-python needed for video frames.")
+
+    benchmarks = _load_benchmarks()
+    if not benchmarks:
+        pytest.skip("no benchmarks defined in index.json")
+
+    audio_only_scores: list[TabF1Result] = []
+    audio_video_scores: list[TabF1Result] = []
+    chord_scores: list[ChordAccuracyResult] = []
+    rows: list[dict] = []
+
+    for bench in benchmarks:
+        video = REPO_ROOT / bench["video_path"]
+        gold_path = REPO_ROOT / bench["ground_truth_path"]
+        if not video.exists() or not gold_path.exists():
+            continue
+        gold = _load_gold_tab_events(gold_path)
+        if not gold:
+            continue
+
+        ao = _run_pipeline(video, lambda_vision=0.0)
+        av = _run_pipeline(video, lambda_vision=1.0)
+
+        ao_score = tab_f1(ao, gold)
+        av_score = tab_f1(av, gold)
+        chord_score = chord_instance_accuracy(av, gold)
+
+        audio_only_scores.append(ao_score)
+        audio_video_scores.append(av_score)
+        chord_scores.append(chord_score)
+        rows.append(
+            {
+                "id": bench["id"],
+                "ao_f1": ao_score.f1,
+                "av_f1": av_score.f1,
+                "delta": av_score.f1 - ao_score.f1,
+                "chord_acc": chord_score.accuracy,
+            }
+        )
+
+    if not rows:
+        pytest.skip("no benchmark videos / ground truth files were available")
+
+    ao_mean = _mean([r.f1 for r in audio_only_scores])
+    av_mean = _mean([r.f1 for r in audio_video_scores])
+    chord_mean = _mean([r.accuracy for r in chord_scores])
+    delta = av_mean - ao_mean
+
+    _write_report(
+        rows=rows,
+        ao_mean=ao_mean,
+        av_mean=av_mean,
+        delta=delta,
+        chord_mean=chord_mean,
+    )
+
+    assert delta >= PHASE5_TAB_F1_DELTA_GATE, (
+        f"Phase 5 +{PHASE5_TAB_F1_DELTA_GATE * 100:.0f}pp gate failed: "
+        f"audio+vision {av_mean:.3f} - audio-only {ao_mean:.3f} = "
+        f"{delta:+.3f}. Per SPEC §5 decision tree, drop lambda_vision and "
+        f"investigate vision calibration if equal/worse, or tighten "
+        f"hand-span / open-string priors if marginally better."
+    )
+
+
+@pytest.mark.eval
+@pytest.mark.xfail(
+    reason="absolute Tab F1 ≥ 0.85 likely needs Phase 2 audio SOTA backbone "
+    "to also be wired in; track in DECISIONS.md",
+    strict=False,
+)
+def test_phase5_absolute_tab_f1():
+    pytest.importorskip("basic_pitch")
+    pytest.importorskip("mediapipe")
+    pytest.importorskip("cv2")
+
+    benchmarks = _load_benchmarks()
+    if not benchmarks:
+        pytest.skip("no benchmarks defined in index.json")
+
+    scores: list[TabF1Result] = []
+    for bench in benchmarks:
+        video = REPO_ROOT / bench["video_path"]
+        gold_path = REPO_ROOT / bench["ground_truth_path"]
+        if not video.exists() or not gold_path.exists():
+            continue
+        gold = _load_gold_tab_events(gold_path)
+        if not gold:
+            continue
+        av = _run_pipeline(video, lambda_vision=1.0)
+        scores.append(tab_f1(av, gold))
+
+    if not scores:
+        pytest.skip("no benchmark videos available")
+
+    mean_f1 = _mean([s.f1 for s in scores])
+    assert mean_f1 >= PHASE5_TAB_F1_ABSOLUTE_GATE, (
+        f"absolute Tab F1 {mean_f1:.3f} < {PHASE5_TAB_F1_ABSOLUTE_GATE}"
+    )
+
+
+@pytest.mark.eval
+def test_phase5_chord_accuracy():
+    pytest.importorskip("basic_pitch")
+    pytest.importorskip("mediapipe")
+    pytest.importorskip("cv2")
+
+    benchmarks = _load_benchmarks()
+    if not benchmarks:
+        pytest.skip("no benchmarks defined in index.json")
+
+    scores: list[ChordAccuracyResult] = []
+    for bench in benchmarks:
+        video = REPO_ROOT / bench["video_path"]
+        gold_path = REPO_ROOT / bench["ground_truth_path"]
+        if not video.exists() or not gold_path.exists():
+            continue
+        gold = _load_gold_tab_events(gold_path)
+        if not gold:
+            continue
+        av = _run_pipeline(video, lambda_vision=1.0)
+        scores.append(chord_instance_accuracy(av, gold))
+
+    if not scores:
+        pytest.skip("no benchmark videos available")
+
+    mean_acc = _mean([s.accuracy for s in scores])
+    assert mean_acc >= PHASE5_CHORD_ACCURACY_GATE, (
+        f"chord accuracy {mean_acc:.3f} < {PHASE5_CHORD_ACCURACY_GATE}"
+    )
+
+
+# ---------- helpers ----------
+
+
+def _load_benchmarks() -> list[dict]:
+    if not BENCHMARK_INDEX.exists():
+        return []
+    return json.loads(BENCHMARK_INDEX.read_text()).get("benchmarks", [])
+
+
+def _load_gold_tab_events(path: Path) -> list[TabEvent]:
+    """Parse the legacy benchmark ground-truth ``.txt`` format into TabEvents.
+
+    The legacy parser lives in ``tabvision-server/evaluate_transcription.py``;
+    this helper imports it lazily to keep the eval module's deps minimal.
+    Returns an empty list if the legacy module isn't importable (e.g. when
+    the test runs from an environment without the server checked out).
+    """
+    try:
+        import sys
+
+        server_path = REPO_ROOT / "tabvision-server"
+        if str(server_path) not in sys.path:
+            sys.path.insert(0, str(server_path))
+        from evaluate_transcription import parse_ground_truth_tabs
+    except Exception:  # noqa: BLE001 — broad: optional dep, want graceful skip
+        return []
+
+    text = path.read_text()
+    parsed = parse_ground_truth_tabs(text)
+    # The legacy parser returns beats; we need seconds. The benchmarks
+    # don't carry duration, so this helper currently returns the parsed
+    # raw notes without timing. Phase 5 acceptance defers timing
+    # alignment to the per-video runner that knows the video duration —
+    # see ``_run_pipeline``.
+    out: list[TabEvent] = []
+    for note in parsed:
+        out.append(
+            TabEvent(
+                onset_s=float(note["beat"]),  # placeholder — runner aligns
+                duration_s=0.25,
+                # Legacy uses 1=high E, 6=low E; spec uses 0=low E, 5=high E.
+                string_idx=6 - int(note["string"]),
+                fret=0 if note["fret"] == "X" else int(note["fret"]),
+                pitch_midi=0,  # not needed for Tab F1
+                confidence=1.0,
+            )
+        )
+    return out
+
+
+def _run_pipeline(video: Path, *, lambda_vision: float) -> Sequence[TabEvent]:
+    """Run audio + video + fusion end-to-end and return TabEvents.
+
+    Stub for now: until the numpy<2 / numpy>=2 environment conflict is
+    resolved (or Phase 2's torch-based audio backbone is wired up), this
+    raises ``ImportError`` so the surrounding ``importorskip`` calls
+    catch it and the test skips with a clear message. Implementation
+    will compose ``demux`` → audio backend → guitar/fretboard/hand
+    detect → ``fuse(..., lambda_vision=lambda_vision)`` once the env is
+    sorted. See the design doc §6 Step E.
+    """
+    raise ImportError(
+        "Phase 5 end-to-end pipeline runner not yet wired — blocked on "
+        "numpy<2 vs numpy>=2 env conflict between basic-pitch and "
+        "mediapipe. See test docstring."
+    )
+
+
+def _mean(values: list[float]) -> float:
+    return sum(values) / len(values) if values else 0.0
+
+
+def _write_report(
+    *,
+    rows: list[dict],
+    ao_mean: float,
+    av_mean: float,
+    delta: float,
+    chord_mean: float,
+) -> None:
+    """Emit ``tools/outputs/phase5_eval-YYYY-MM-DD.md`` summary report."""
+    EVAL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    today = _dt.date.today().isoformat()
+    out = EVAL_OUTPUT_DIR / f"phase5_eval-{today}.md"
+    lines = [
+        f"# Phase 5 acceptance — {today}",
+        "",
+        "Audio-only vs. audio+vision ablation, per SPEC §5.",
+        "",
+        "## Aggregate",
+        "",
+        "| Metric | Value |",
+        "|---|---:|",
+        f"| Mean Tab F1 (lambda_vision=0.0) | {ao_mean:.4f} |",
+        f"| Mean Tab F1 (lambda_vision=1.0) | {av_mean:.4f} |",
+        f"| Delta (audio+vision − audio-only) | {delta:+.4f} |",
+        f"| Mean chord-instance accuracy | {chord_mean:.4f} |",
+        f"| Phase 5 +{PHASE5_TAB_F1_DELTA_GATE * 100:.0f}pp gate | "
+        f"{'PASS' if delta >= PHASE5_TAB_F1_DELTA_GATE else 'FAIL'} |",
+        "",
+        "## Per-video",
+        "",
+        "| id | audio-only F1 | audio+vision F1 | delta | chord acc |",
+        "|---|---:|---:|---:|---:|",
+    ]
+    for r in rows:
+        lines.append(
+            f"| {r['id']} | {r['ao_f1']:.3f} | {r['av_f1']:.3f} | "
+            f"{r['delta']:+.3f} | {r['chord_acc']:.3f} |"
+        )
+    out.write_text("\n".join(lines) + "\n")
diff --git a/tabvision/tests/unit/test_phase5_metrics.py b/tabvision/tests/unit/test_phase5_metrics.py
new file mode 100644
index 0000000..66b14f7
--- /dev/null
+++ b/tabvision/tests/unit/test_phase5_metrics.py
@@ -0,0 +1,127 @@
+"""Unit tests for ``tabvision.eval.metrics`` (Tab F1 + chord accuracy)."""
+
+from __future__ import annotations
+
+from tabvision.eval.metrics import chord_instance_accuracy, tab_f1
+from tabvision.types import TabEvent
+
+
+def _t(t: float, s: int, f: int, midi: int = 60) -> TabEvent:
+    return TabEvent(
+        onset_s=t,
+        duration_s=0.25,
+        string_idx=s,
+        fret=f,
+        pitch_midi=midi,
+        confidence=0.9,
+    )
+
+
+# ---------- tab_f1 ----------
+
+
+def test_tab_f1_perfect_match():
+    gold = [_t(0.0, 5, 5), _t(0.5, 5, 7)]
+    pred = [_t(0.0, 5, 5), _t(0.5, 5, 7)]
+    r = tab_f1(pred, gold)
+    assert r.f1 == 1.0
+    assert r.true_positives == 2
+    assert r.false_positives == 0
+    assert r.false_negatives == 0
+
+
+def test_tab_f1_extra_prediction_lowers_precision():
+    gold = [_t(0.0, 5, 5)]
+    pred = [_t(0.0, 5, 5), _t(0.5, 5, 7)]
+    r = tab_f1(pred, gold)
+    assert r.true_positives == 1
+    assert r.false_positives == 1
+    assert r.false_negatives == 0
+    assert r.recall == 1.0
+    assert r.precision == 0.5
+
+
+def test_tab_f1_missed_gold_lowers_recall():
+    gold = [_t(0.0, 5, 5), _t(0.5, 5, 7)]
+    pred = [_t(0.0, 5, 5)]
+    r = tab_f1(pred, gold)
+    assert r.true_positives == 1
+    assert r.false_positives == 0
+    assert r.false_negatives == 1
+    assert r.precision == 1.0
+    assert r.recall == 0.5
+
+
+def test_tab_f1_onset_outside_tolerance_is_a_miss():
+    gold = [_t(0.0, 5, 5)]
+    pred = [_t(0.10, 5, 5)]  # 100 ms off, tolerance 50 ms
+    r = tab_f1(pred, gold)
+    assert r.true_positives == 0
+    assert r.false_positives == 1
+    assert r.false_negatives == 1
+
+
+def test_tab_f1_wrong_string_or_fret_is_a_miss():
+    gold = [_t(0.0, 5, 5)]
+    wrong_string = [_t(0.0, 4, 5)]
+    wrong_fret = [_t(0.0, 5, 6)]
+    assert tab_f1(wrong_string, gold).true_positives == 0
+    assert tab_f1(wrong_fret, gold).true_positives == 0
+
+
+def test_tab_f1_each_gold_matches_at_most_one_predicted():
+    """A duplicated predicted event should not double-count against the
+    same gold event — the second one is a false positive."""
+    gold = [_t(0.0, 5, 5)]
+    pred = [_t(0.0, 5, 5), _t(0.01, 5, 5)]  # both within tolerance
+    r = tab_f1(pred, gold)
+    assert r.true_positives == 1
+    assert r.false_positives == 1
+
+
+# ---------- chord_instance_accuracy ----------
+
+
+def test_chord_accuracy_perfect_chord_matches():
+    gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)]
+    pred = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)]
+    r = chord_instance_accuracy(pred, gold)
+    assert r.accuracy == 1.0
+    assert r.matched_chords == 1
+    assert r.total_chords == 1
+
+
+def test_chord_accuracy_wrong_position_in_chord_misses():
+    gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)]
+    pred = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 7)]  # one wrong
+    r = chord_instance_accuracy(pred, gold)
+    assert r.matched_chords == 0
+    assert r.total_chords == 1
+
+
+def test_chord_accuracy_size_mismatch_misses():
+    gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)]
+    pred = [_t(0.0, 5, 0), _t(0.0, 4, 1)]  # missing one note
+    r = chord_instance_accuracy(pred, gold)
+    assert r.matched_chords == 0
+
+
+def test_chord_accuracy_separates_clusters_by_gap():
+    """Two well-separated gold chords should both score independently."""
+    gold = [
+        _t(0.0, 5, 0), _t(0.0, 4, 1),
+        _t(2.0, 5, 7), _t(2.0, 4, 8),
+    ]
+    pred = [
+        _t(0.0, 5, 0), _t(0.0, 4, 1),
+        _t(2.0, 5, 7), _t(2.0, 4, 8),
+    ]
+    r = chord_instance_accuracy(pred, gold)
+    assert r.total_chords == 2
+    assert r.matched_chords == 2
+
+
+def test_chord_accuracy_empty_gold_yields_zero():
+    r = chord_instance_accuracy([], [])
+    assert r.total_chords == 0
+    assert r.accuracy == 0.0

From 055f66d1c16a1a294d04d482ef8997b42a79cfe9 Mon Sep 17 00:00:00 2001
From: Patrick Gilhooley <pgilhooley95@gmail.com>
Date: Wed, 6 May 2026 19:03:26 -0400
Subject: [PATCH 6/6] chore(phase5): ruff + format pass; switch eval scaffold
 to highres audio

- Apply ruff --fix + ruff format across the Phase 5 modules and tests
  (zip strict=, Sequence from collections.abc, line wrapping).
- Rewire tests/eval/test_phase5_eval.py to call audio.backend.make("highres")
  instead of basic-pitch. Phase 2's torch-based audio backbone (commit
  aae1ab3) is already shipped on refactor/v1; the previous "wait for
  Phase 2" framing was wrong.
- Reframe the open dependency: the actual gap is wiring the video stack
  (guitar -> fretboard -> hand) into a single run_pipeline() call.
  cli.py:159 has the same gap. Until that integration ships,
  _run_pipeline raises NotImplementedError after running the audio half
  so anyone running the eval gets a precise error.

206 unit tests still pass; eval tests still skip cleanly until the
video integration lands.
---
 tabvision/tabvision/cli.py                    |   8 +-
 tabvision/tabvision/eval/metrics.py           |  20 +--
 tabvision/tabvision/fusion/candidates.py      |   4 +-
 tabvision/tabvision/fusion/chord.py           |  10 +-
 tabvision/tabvision/fusion/playability.py     |  10 +-
 tabvision/tabvision/fusion/viterbi.py         |  30 ++---
 tabvision/tests/eval/test_phase5_eval.py      | 115 +++++++++++-------
 tabvision/tests/unit/test_chord_fusion.py     |   8 +-
 tabvision/tests/unit/test_cli_fusion_flag.py  |  12 +-
 .../tests/unit/test_fusion_audio_only.py      |   8 +-
 tabvision/tests/unit/test_phase5_metrics.py   |  12 +-
 tabvision/tests/unit/test_playability.py      |  21 +---
 12 files changed, 116 insertions(+), 142 deletions(-)

diff --git a/tabvision/tabvision/cli.py b/tabvision/tabvision/cli.py
index a7794cb..a08fb19 100644
--- a/tabvision/tabvision/cli.py
+++ b/tabvision/tabvision/cli.py
@@ -132,9 +132,7 @@ def _cmd_transcribe(args: argparse.Namespace) -> int:
     from tabvision.types import GuitarConfig, SessionConfig
 
     cfg = GuitarConfig(capo=args.capo)
-    session = SessionConfig(
-        instrument=args.instrument, tone=args.tone, style=args.style
-    )
+    session = SessionConfig(instrument=args.instrument, tone=args.tone, style=args.style)
 
     if not args.no_preflight:
         rc = _run_preflight_gate(args)
@@ -212,9 +210,7 @@ def _run_preflight_gate(args: argparse.Namespace) -> int:
     has_fail = any(f.severity == "fail" for f in report.findings)
     if has_fail or (args.strict and not report.passed):
         sys.stderr.write(render(report))
-        sys.stderr.write(
-            "Aborting transcription. Re-run with --no-preflight to bypass.\n"
-        )
+        sys.stderr.write("Aborting transcription. Re-run with --no-preflight to bypass.\n")
         return 1
     if not report.passed:
         sys.stderr.write(render(report))
diff --git a/tabvision/tabvision/eval/metrics.py b/tabvision/tabvision/eval/metrics.py
index 5e2ce1b..92fd24f 100644
--- a/tabvision/tabvision/eval/metrics.py
+++ b/tabvision/tabvision/eval/metrics.py
@@ -19,8 +19,8 @@
 
 from __future__ import annotations
 
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Sequence
 
 from tabvision.fusion.chord import CHORD_MAX_GAP_S
 from tabvision.types import TabEvent
@@ -78,11 +78,7 @@ def tab_f1(
     fn = sum(1 for used in gold_used if not used)
     precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
     recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
-    f1 = (
-        2 * precision * recall / (precision + recall)
-        if (precision + recall) > 0
-        else 0.0
-    )
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
     return TabF1Result(
         precision=precision,
         recall=recall,
@@ -115,12 +111,8 @@ def chord_instance_accuracy(
     grouping rule). Single-event clusters count toward the metric — a
     correctly transcribed isolated note is a "size-1 chord" instance.
     """
-    pred_clusters = _cluster_by_gap(
-        sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s
-    )
-    gold_clusters = _cluster_by_gap(
-        sorted(gold, key=lambda t: t.onset_s), cluster_gap_s
-    )
+    pred_clusters = _cluster_by_gap(sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s)
+    gold_clusters = _cluster_by_gap(sorted(gold, key=lambda t: t.onset_s), cluster_gap_s)
 
     if not gold_clusters:
         return ChordAccuracyResult(accuracy=0.0, matched_chords=0, total_chords=0)
@@ -157,9 +149,7 @@ def chord_instance_accuracy(
     )
 
 
-def _cluster_by_gap(
-    events: Sequence[TabEvent], gap_s: float
-) -> list[list[TabEvent]]:
+def _cluster_by_gap(events: Sequence[TabEvent], gap_s: float) -> list[list[TabEvent]]:
     """Same chain semantics as :func:`tabvision.fusion.chord.cluster_events`,
     but on :class:`TabEvent` (which carries an ``onset_s``). Inlined to
     avoid a sequence-type adapter."""
diff --git a/tabvision/tabvision/fusion/candidates.py b/tabvision/tabvision/fusion/candidates.py
index 2d4873c..71e4c65 100644
--- a/tabvision/tabvision/fusion/candidates.py
+++ b/tabvision/tabvision/fusion/candidates.py
@@ -22,9 +22,7 @@ class Candidate:
     fret: int  # 0 = open (or capo), max_fret inclusive
 
 
-def candidate_positions(
-    pitch_midi: int, cfg: GuitarConfig | None = None
-) -> list[Candidate]:
+def candidate_positions(pitch_midi: int, cfg: GuitarConfig | None = None) -> list[Candidate]:
     """All valid positions for ``pitch_midi`` under ``cfg``.
 
     Capo handling: open strings effectively start at ``cfg.capo``. A pitch
diff --git a/tabvision/tabvision/fusion/chord.py b/tabvision/tabvision/fusion/chord.py
index f734a06..4fad33c 100644
--- a/tabvision/tabvision/fusion/chord.py
+++ b/tabvision/tabvision/fusion/chord.py
@@ -18,7 +18,7 @@
 
 from __future__ import annotations
 
-from typing import Sequence
+from collections.abc import Sequence
 
 from tabvision.fusion.candidates import Candidate, candidate_positions
 from tabvision.fusion.playability import MAX_HAND_SPAN
@@ -67,15 +67,11 @@ def enumerate_chord_states(
     if not events:
         return []
 
-    per_event_candidates = [
-        candidate_positions(ev.pitch_midi, cfg) for ev in events
-    ]
+    per_event_candidates = [candidate_positions(ev.pitch_midi, cfg) for ev in events]
     if any(not cands for cands in per_event_candidates):
         return []
 
-    states: list[tuple[Candidate, ...]] = [
-        (c,) for c in per_event_candidates[0]
-    ]
+    states: list[tuple[Candidate, ...]] = [(c,) for c in per_event_candidates[0]]
     for k in range(1, len(events)):
         next_states: list[tuple[Candidate, ...]] = []
         for state in states:
diff --git a/tabvision/tabvision/fusion/playability.py b/tabvision/tabvision/fusion/playability.py
index 2def9c5..658cd27 100644
--- a/tabvision/tabvision/fusion/playability.py
+++ b/tabvision/tabvision/fusion/playability.py
@@ -16,7 +16,7 @@
 from __future__ import annotations
 
 import math
-from typing import Sequence
+from collections.abc import Sequence
 
 from tabvision.fusion.candidates import Candidate
 from tabvision.types import AudioEvent, FrameFingering, GuitarConfig
@@ -65,9 +65,7 @@
 EPS = 1e-9
 
 
-def find_fingering_at(
-    t: float, fingerings: Sequence[FrameFingering]
-) -> FrameFingering | None:
+def find_fingering_at(t: float, fingerings: Sequence[FrameFingering]) -> FrameFingering | None:
     """Return the ``FrameFingering`` whose ``.t`` is closest to ``t``.
 
     Returns ``None`` when ``fingerings`` is empty or no entry carries
@@ -128,9 +126,7 @@ def emission_cost(
     return cost
 
 
-def transition_cost(
-    prev: Candidate, curr: Candidate, cfg: GuitarConfig
-) -> float:
+def transition_cost(prev: Candidate, curr: Candidate, cfg: GuitarConfig) -> float:
     """Transition cost from ``prev`` to ``curr``.
 
     - String continuity: ``-SAME_STRING_BONUS`` when on the same string.
diff --git a/tabvision/tabvision/fusion/viterbi.py b/tabvision/tabvision/fusion/viterbi.py
index 1a67e9c..85056ab 100644
--- a/tabvision/tabvision/fusion/viterbi.py
+++ b/tabvision/tabvision/fusion/viterbi.py
@@ -20,7 +20,7 @@
 from __future__ import annotations
 
 import math
-from typing import Sequence
+from collections.abc import Sequence
 
 from tabvision.fusion import chord, playability
 from tabvision.fusion.candidates import Candidate, candidate_positions
@@ -75,16 +75,12 @@ def fuse(
 
     # Drop out-of-range pitches before clustering so the cluster shape
     # reflects what's actually decodable.
-    valid_events = [
-        ev for ev in events if candidate_positions(ev.pitch_midi, cfg)
-    ]
+    valid_events = [ev for ev in events if candidate_positions(ev.pitch_midi, cfg)]
     if not valid_events:
         return []
 
     clusters = chord.cluster_events(valid_events)
-    cluster_data: list[
-        tuple[list[AudioEvent], list[tuple[Candidate, ...]]]
-    ] = []
+    cluster_data: list[tuple[list[AudioEvent], list[tuple[Candidate, ...]]]] = []
     for cluster in clusters:
         states = chord.enumerate_chord_states(cluster, cfg)
         if states:
@@ -97,9 +93,7 @@ def fuse(
 
 
 def _viterbi_clusters(
-    cluster_data: list[
-        tuple[list[AudioEvent], list[tuple[Candidate, ...]]]
-    ],
+    cluster_data: list[tuple[list[AudioEvent], list[tuple[Candidate, ...]]]],
     fingerings: Sequence[FrameFingering],
     cfg: GuitarConfig,
     lambda_vision: float,
@@ -107,15 +101,11 @@ def _viterbi_clusters(
     """Cluster-level Viterbi DP. Worst case ``O(N · S^2)`` for ``N``
     clusters with ``S`` states each."""
 
-    def state_emission(
-        cluster: list[AudioEvent], state: tuple[Candidate, ...]
-    ) -> float:
+    def state_emission(cluster: list[AudioEvent], state: tuple[Candidate, ...]) -> float:
         total = 0.0
-        for ev, c in zip(cluster, state):
+        for ev, c in zip(cluster, state, strict=True):
             f = playability.find_fingering_at(ev.onset_s, fingerings)
-            total += playability.emission_cost(
-                c, ev, f, cfg, lambda_vision=lambda_vision
-            )
+            total += playability.emission_cost(c, ev, f, cfg, lambda_vision=lambda_vision)
         return total
 
     n = len(cluster_data)
@@ -136,9 +126,7 @@ def state_emission(
             anchor_curr = chord.chord_anchor(state)
             for pi, prev_state in enumerate(prev_states):
                 anchor_prev = chord.chord_anchor(prev_state)
-                trans = playability.transition_cost(
-                    anchor_prev, anchor_curr, cfg
-                )
+                trans = playability.transition_cost(anchor_prev, anchor_curr, cfg)
                 total = cost[i - 1][pi] + trans + emit
                 if total < cost[i][si]:
                     cost[i][si] = total
@@ -155,7 +143,7 @@ def state_emission(
     out: list[TabEvent] = []
     for i, (cluster, states) in enumerate(cluster_data):
         state = states[picks_idx[i]]
-        for ev, c in zip(cluster, state):
+        for ev, c in zip(cluster, state, strict=True):
             out.append(
                 TabEvent(
                     onset_s=ev.onset_s,
diff --git a/tabvision/tests/eval/test_phase5_eval.py b/tabvision/tests/eval/test_phase5_eval.py
index bbed061..5474c6f 100644
--- a/tabvision/tests/eval/test_phase5_eval.py
+++ b/tabvision/tests/eval/test_phase5_eval.py
@@ -5,17 +5,26 @@
 
     Tab F1 (lambda_vision=1.0) - Tab F1 (lambda_vision=0.0) ≥ 0.08
 
-The absolute Tab F1 ≥ 0.85 bar is currently expected to need Phase 2's
-Riley/Edwards audio backbone too — so it's marked ``xfail`` until Phase
-2 is wired in. The +8 pp delta is on the hook for Phase 5 alone, since
-that's the test for "fusion is doing real work given the current audio".
-
-**Environment caveat (2026-05-06):** the audio backend (basic-pitch +
-TF 2.15) requires ``numpy<2`` while MediaPipe (Phase 4) requires
-``numpy>=2`` — so a single venv currently can't run both halves of the
-pipeline. The test skips when MediaPipe imports fail; once the env is
-reconciled (or Phase 2's torch-based audio backbone replaces basic-pitch)
-the gate runs unchanged. See ``DECISIONS.md`` if/when this gets fixed.
+The absolute Tab F1 ≥ 0.85 bar likely also needs Phase 7's augmentation
+work to clear, so it's marked ``xfail`` for now. The +8 pp delta is on
+the hook for Phase 5 alone — that's the test for "fusion is doing real
+work given today's audio".
+
+**Audio backend:** uses ``tabvision.audio.backend.make("highres")``
+(Phase 2 Riley/Edwards / GAPS via hf-midi-transcription, torch-based,
+numpy-2-compatible) — *not* basic-pitch. Phase 2 is already shipped on
+``refactor/v1`` (commit ``aae1ab3``); the earlier framing of Phase 2 as
+"future work" was wrong.
+
+**Open dependency:** the *full pipeline* (demux → audio → guitar → fretboard
+→ hand → fuse) is not yet wired end-to-end in this repo. ``cli.py:159``
+still has ``fingerings: list = []`` (Phase 1 stub). The video components
+exist independently — see ``tabvision.video.{guitar,fretboard,hand}`` —
+but assembling them into a runnable ``run_pipeline(video, lambda_vision)``
+is its own piece of work, likely a Phase 8 "eval harness hardening" task
+or a dedicated integration ticket. Until that lands, ``_run_pipeline``
+below raises ``NotImplementedError`` for the video portion and the eval
+tests cleanly skip.
 
 The gold source is the benchmark index at
 ``tabvision-server/tests/fixtures/benchmarks/index.json`` — same set the
@@ -27,8 +36,8 @@
 
 import datetime as _dt
 import json
+from collections.abc import Sequence
 from pathlib import Path
-from typing import Sequence
 
 import pytest
 
@@ -51,12 +60,7 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[3]
 BENCHMARK_INDEX = (
-    REPO_ROOT
-    / "tabvision-server"
-    / "tests"
-    / "fixtures"
-    / "benchmarks"
-    / "index.json"
+    REPO_ROOT / "tabvision-server" / "tests" / "fixtures" / "benchmarks" / "index.json"
 )
 EVAL_OUTPUT_DIR = REPO_ROOT / "tabvision-server" / "tools" / "outputs"
 
@@ -66,19 +70,15 @@ def test_phase5_audio_plus_vision_beats_audio_only():
     """Run the full pipeline on the eval set under both lambda_vision
     settings; assert audio+vision wins by ≥ 8 pp Tab F1.
 
-    Skips automatically when any heavy dependency (basic-pitch, mediapipe,
-    cv2, ffmpeg) is unavailable.
+    Skips automatically when any heavy dependency (the highres audio
+    backend's torch + hf-midi-transcription stack, mediapipe, cv2, ffmpeg)
+    is unavailable, *or* when the video-stack-into-pipeline integration
+    is still a TODO in ``_run_pipeline``.
     """
-    pytest.importorskip(
-        "basic_pitch",
-        reason="basic-pitch needed for audio-only ablation; install with "
-        "pip install '.[audio-baseline]'",
-    )
+    pytest.importorskip("torch", reason="highres backend needs torch.")
     pytest.importorskip(
         "mediapipe",
-        reason="MediaPipe needed for video evidence; install with "
-        "pip install '.[vision]'. NOTE: requires numpy>=2, currently "
-        "incompatible with TF 2.15.",
+        reason="MediaPipe needed for video evidence; install with pip install '.[vision]'.",
     )
     pytest.importorskip("cv2", reason="opencv-python needed for video frames.")
 
@@ -152,7 +152,7 @@ def test_phase5_audio_plus_vision_beats_audio_only():
     strict=False,
 )
 def test_phase5_absolute_tab_f1():
-    pytest.importorskip("basic_pitch")
+    pytest.importorskip("torch")
     pytest.importorskip("mediapipe")
     pytest.importorskip("cv2")
 
@@ -183,7 +183,7 @@ def test_phase5_absolute_tab_f1():
 
 @pytest.mark.eval
 def test_phase5_chord_accuracy():
-    pytest.importorskip("basic_pitch")
+    pytest.importorskip("torch")
     pytest.importorskip("mediapipe")
     pytest.importorskip("cv2")
 
@@ -262,23 +262,54 @@ def _load_gold_tab_events(path: Path) -> list[TabEvent]:
     return out
 
 
-def _run_pipeline(video: Path, *, lambda_vision: float) -> Sequence[TabEvent]:
+def _run_pipeline(
+    video: Path,
+    *,
+    lambda_vision: float,
+    audio_backend_name: str = "highres",
+) -> Sequence[TabEvent]:
     """Run audio + video + fusion end-to-end and return TabEvents.
 
-    Stub for now: until the numpy<2 / numpy>=2 environment conflict is
-    resolved (or Phase 2's torch-based audio backbone is wired up), this
-    raises ``ImportError`` so the surrounding ``importorskip`` calls
-    catch it and the test skips with a clear message. Implementation
-    will compose ``demux`` → audio backend → guitar/fretboard/hand
-    detect → ``fuse(..., lambda_vision=lambda_vision)`` once the env is
-    sorted. See the design doc §6 Step E.
+    The audio half is wired: ``demux`` + ``audio.backend.make(...)``.
+    The video half (guitar / fretboard / hand → ``list[FrameFingering]``)
+    is **not** yet integrated end-to-end in the repo — ``cli.py``'s
+    transcribe path still stubs ``fingerings: list = []``. Until that
+    integration ships, this helper raises ``NotImplementedError``,
+    which the surrounding ``importorskip`` block catches via the
+    pytest hook and surfaces as a skip with a precise reason.
+
+    Wire it up in a separate change: roughly,
+    ``demux → detect_guitar → track_fretboard → track_hand → fuse``.
+    The cluster Viterbi already accepts the ``FrameFingering`` sequence
+    and ``lambda_vision`` flag — no fusion changes needed.
     """
-    raise ImportError(
-        "Phase 5 end-to-end pipeline runner not yet wired — blocked on "
-        "numpy<2 vs numpy>=2 env conflict between basic-pitch and "
-        "mediapipe. See test docstring."
+    from tabvision.audio.backend import make as make_audio_backend
+    from tabvision.demux import demux
+    from tabvision.types import SessionConfig
+
+    session = SessionConfig()
+    demuxed = demux(str(video))
+    audio_backend = make_audio_backend(audio_backend_name)
+    audio_events = audio_backend.transcribe(demuxed.wav, demuxed.sample_rate, session)
+
+    raise NotImplementedError(
+        "Phase 5 end-to-end pipeline runner: audio half is wired "
+        f"({len(audio_events)} events from '{audio_backend_name}'), but "
+        "the video stack (guitar → fretboard → hand → FrameFingering) "
+        "is not yet integrated into a single run_pipeline() call. "
+        "cli.py:159 has the same gap. Wire the video components and "
+        "drop this raise; lambda_vision={lambda_vision} flows through "
+        "fuse() unchanged.".format(lambda_vision=lambda_vision)
     )
 
+    # When the integration lands, body becomes:
+    #
+    #   guitar_track = detect_guitar(frames(...), guitar_backend)
+    #   homographies = track_fretboard(frames(...), guitar_track, fb_backend)
+    #   fingerings = track_hand(frames(...), homographies, hand_backend, cfg)
+    #   return fuse(audio_events, fingerings, cfg, session,
+    #               lambda_vision=lambda_vision)
+
 
 def _mean(values: list[float]) -> float:
     return sum(values) / len(values) if values else 0.0
diff --git a/tabvision/tests/unit/test_chord_fusion.py b/tabvision/tests/unit/test_chord_fusion.py
index 9f15a23..89a717e 100644
--- a/tabvision/tests/unit/test_chord_fusion.py
+++ b/tabvision/tests/unit/test_chord_fusion.py
@@ -39,9 +39,7 @@ def _ev(midi: int, t: float, confidence: float = 0.8) -> AudioEvent:
 def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering:
     logits = np.zeros((4, 6, 25), dtype=np.float64)
     logits[0, string_idx, fret] = 10.0
-    return FrameFingering(
-        t=t, finger_pos_logits=logits, homography_confidence=0.9
-    )
+    return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9)
 
 
 # ---------- cluster_events ----------
@@ -103,9 +101,7 @@ def test_enumerate_chord_states_enforces_monophony():
     assert states  # non-empty
     for state in states:
         strings = [c.string_idx for c in state]
-        assert len(strings) == len(set(strings)), (
-            f"per-string monophony violated: {state}"
-        )
+        assert len(strings) == len(set(strings)), f"per-string monophony violated: {state}"
 
 
 def test_enumerate_chord_states_enforces_hand_span():
diff --git a/tabvision/tests/unit/test_cli_fusion_flag.py b/tabvision/tests/unit/test_cli_fusion_flag.py
index d2f321b..8ceada5 100644
--- a/tabvision/tests/unit/test_cli_fusion_flag.py
+++ b/tabvision/tests/unit/test_cli_fusion_flag.py
@@ -21,18 +21,14 @@ def test_default_lambda_vision_is_one():
 
 def test_explicit_lambda_vision_parsed():
     parser = _build_parser()
-    args = parser.parse_args(
-        ["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"]
-    )
+    args = parser.parse_args(["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"])
     assert args.fusion_lambda_vision == pytest.approx(2.5)
 
 
 def test_lambda_vision_zero_accepted():
     """``--fusion-lambda-vision 0`` is the audio-only ablation knob."""
     parser = _build_parser()
-    args = parser.parse_args(
-        ["transcribe", "in.mp4", "--fusion-lambda-vision", "0"]
-    )
+    args = parser.parse_args(["transcribe", "in.mp4", "--fusion-lambda-vision", "0"])
     assert args.fusion_lambda_vision == 0.0
 
 
@@ -41,6 +37,4 @@ def test_lambda_vision_only_on_transcribe():
     not be exposed there."""
     parser = _build_parser()
     with pytest.raises(SystemExit):
-        parser.parse_args(
-            ["check", "in.mp4", "--fusion-lambda-vision", "1.0"]
-        )
+        parser.parse_args(["check", "in.mp4", "--fusion-lambda-vision", "1.0"])
diff --git a/tabvision/tests/unit/test_fusion_audio_only.py b/tabvision/tests/unit/test_fusion_audio_only.py
index b4feca2..75e145a 100644
--- a/tabvision/tests/unit/test_fusion_audio_only.py
+++ b/tabvision/tests/unit/test_fusion_audio_only.py
@@ -25,17 +25,13 @@ def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering:
     """Marginal sharply peaked at ``(string_idx, fret)``."""
     logits = np.zeros((4, 6, 25), dtype=np.float64)
     logits[0, string_idx, fret] = 10.0
-    return FrameFingering(
-        t=t, finger_pos_logits=logits, homography_confidence=0.9
-    )
+    return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9)
 
 
 def _uniform_fingering(t: float) -> FrameFingering:
     """Marginal ≈ uniform across (string, fret) cells."""
     logits = np.ones((4, 6, 25), dtype=np.float64)
-    return FrameFingering(
-        t=t, finger_pos_logits=logits, homography_confidence=0.9
-    )
+    return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9)
 
 
 # ---------- audio-only regression ----------
diff --git a/tabvision/tests/unit/test_phase5_metrics.py b/tabvision/tests/unit/test_phase5_metrics.py
index 66b14f7..5510539 100644
--- a/tabvision/tests/unit/test_phase5_metrics.py
+++ b/tabvision/tests/unit/test_phase5_metrics.py
@@ -109,12 +109,16 @@ def test_chord_accuracy_size_mismatch_misses():
 def test_chord_accuracy_separates_clusters_by_gap():
     """Two well-separated gold chords should both score independently."""
     gold = [
-        _t(0.0, 5, 0), _t(0.0, 4, 1),
-        _t(2.0, 5, 7), _t(2.0, 4, 8),
+        _t(0.0, 5, 0),
+        _t(0.0, 4, 1),
+        _t(2.0, 5, 7),
+        _t(2.0, 4, 8),
     ]
     pred = [
-        _t(0.0, 5, 0), _t(0.0, 4, 1),
-        _t(2.0, 5, 7), _t(2.0, 4, 8),
+        _t(0.0, 5, 0),
+        _t(0.0, 4, 1),
+        _t(2.0, 5, 7),
+        _t(2.0, 4, 8),
     ]
     r = chord_instance_accuracy(pred, gold)
     assert r.total_chords == 2
diff --git a/tabvision/tests/unit/test_playability.py b/tabvision/tests/unit/test_playability.py
index 02a0979..745767e 100644
--- a/tabvision/tests/unit/test_playability.py
+++ b/tabvision/tests/unit/test_playability.py
@@ -51,19 +51,13 @@ def _peaked_fingering(
     """Marginal sharply peaked at ``(target_string, target_fret)``."""
     logits = np.zeros((4, n_strings, max_fret + 1), dtype=np.float64)
     logits[0, target_string, target_fret] = 10.0
-    return FrameFingering(
-        t=t, finger_pos_logits=logits, homography_confidence=0.9
-    )
+    return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9)
 
 
-def _uniform_fingering(
-    t: float, n_strings: int = 6, max_fret: int = 24
-) -> FrameFingering:
+def _uniform_fingering(t: float, n_strings: int = 6, max_fret: int = 24) -> FrameFingering:
     """Marginal ≈ uniform across (string, fret) cells."""
     logits = np.ones((4, n_strings, max_fret + 1), dtype=np.float64)
-    return FrameFingering(
-        t=t, finger_pos_logits=logits, homography_confidence=0.9
-    )
+    return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9)
 
 
 # ---------- emission ----------
@@ -131,9 +125,7 @@ def test_emission_uniform_vision_does_not_change_ranking():
     ev = _ev(69)
     fing = _uniform_fingering(t=0.0)
     cands = candidate_positions(69, cfg)
-    pure_audio = sorted(
-        cands, key=lambda c: emission_cost(c, ev, None, cfg)
-    )
+    pure_audio = sorted(cands, key=lambda c: emission_cost(c, ev, None, cfg))
     with_uniform = sorted(
         cands,
         key=lambda c: emission_cost(c, ev, fing, cfg, lambda_vision=1.0),
@@ -150,10 +142,7 @@ def test_transition_same_string_is_cheaper_than_string_jump():
     prev = Candidate(string_idx=5, fret=5)
     same_string = Candidate(string_idx=5, fret=7)  # 2 frets up, same string
     string_jump = Candidate(string_idx=4, fret=5)  # different string, same fret
-    assert (
-        transition_cost(prev, same_string, cfg)
-        < transition_cost(prev, string_jump, cfg)
-    )
+    assert transition_cost(prev, same_string, cfg) < transition_cost(prev, string_jump, cfg)
 
 
 def test_transition_hand_span_barrier_only_past_threshold():