From 4cbface9131d82f5dba79b3985d564b86d6ddc7c Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 17:19:42 -0400 Subject: [PATCH 1/6] docs(plan): Phase 5 fusion (Viterbi + chord-aware) design --- docs/plans/2026-05-06-phase5-fusion-design.md | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 docs/plans/2026-05-06-phase5-fusion-design.md diff --git a/docs/plans/2026-05-06-phase5-fusion-design.md b/docs/plans/2026-05-06-phase5-fusion-design.md new file mode 100644 index 0000000..ac10615 --- /dev/null +++ b/docs/plans/2026-05-06-phase5-fusion-design.md @@ -0,0 +1,217 @@ +# Phase 5 — Fusion (Viterbi + chord-aware) Design + +**Date:** 2026-05-06 +**Author:** Patrick (brainstormed with Claude) +**Status:** Proposed — pending sign-off +**Spec source:** `SPEC.md` §5 Phase 5, §8 module contracts. +**Branch:** `claude/refactor-eval` (forked from `refactor/v1`); merge back to `refactor/v1` on green. + +## 0. Status snapshot + +What `tabvision.fusion` looks like right now on `refactor/v1`: + +| Module | Lines | State | +|---|---:|---| +| `candidates.py` | 50 | **Done.** `candidate_positions(pitch, cfg) → list[Candidate]`. Used by Phase 1 audio-only fusion. | +| `viterbi.py` | 119 | **Phase-1 placeholder.** `fuse(...)` raises `FusionError` whenever any `FrameFingering` carries non-zero logits ("video-aware fusion not implemented — this is a Phase 5 deliverable"). Greedy lowest-fret + continuity decoder works for the audio-only path (5 tests passing). | +| `playability.py` | 9 | **Stub.** Docstring only. | +| `chord.py` | 9 | **Stub.** Docstring only. | +| CLI | — | `--fusion-lambda-vision` flag not yet exposed. | + +Phase 4 already produces `FrameFingering.marginal_string_fret() → (6, 25)` softmax per frame (`tabvision.video.hand.fingertip_to_fret`). Phase 5 consumes that. + +Legacy reference: `tabvision-server/app/fusion_engine.py` (2,216 lines, 23 functions) and `tabvision-server/app/chord_shapes.py` (790 lines). 
Per the SPEC §3.3 module-boundary plan, we **port selectively** (hand-span, slide, monophony heuristics) rather than wholesale-translate. The Apr-24 learned-fusion attempt (LightGBM ranker) **did not ship** (LOOCV +0.3 pp vs +5 pp gate per `tools/outputs/position_selector_report-2026-04-29.md`); the lesson is that small ML on top of weak features doesn't beat structured search with informative evidence — Phase 5 takes the structured-search path. + +## 1. Goal & acceptance bars + +From SPEC §5 Phase 5: + +- **Tab F1 ≥ 0.85** on the user eval set. Target 0.88 by Phase 9. +- **Chord-instance accuracy ≥ 0.80**. Target 0.85 by Phase 9. +- **Audio+vision must beat audio-only by ≥ 8 pp on Tab F1** (ablation report). + +The user eval set = the 20-video iPhone-recorded training set, plus whatever Phase 1.5 annotation tooling adds to the four difficulty tiers. Today's audio-only baseline on that set is **exact F1 ≈ 0.51** (per `errors-2026-04-28_185743.md`). Phase 5's 0.85 bar therefore needs both (a) better audio (Phase 2 SOTA backbone) and (b) the audio+vision boost. Phase 5 alone is on the hook for the **+8 pp audio+vision delta**, not the absolute number — that's the readable signal that the fusion is doing real work. + +## 2. Cost function + +We score a sequence of decoded `(string, fret)` picks by a sum of **emission** terms (per pick) and **transition** terms (between consecutive picks). Lower total cost wins. All terms are negative log-probs (or proportional to them) — i.e. dimensionally consistent. + +### 2.1 Emission cost per `Candidate c = (s, f)` for `AudioEvent ev` + +``` +E(c | ev, fingering_at_t) = + -log P_audio(c | ev) # audio prior on string/fret + + -λ_v · log P_vision(c | t) # vision marginal at event time + + α_open · 1[f == 0] · open_bonus # negative if c is on an open string + + α_low · f # mild lower-fret bias +``` + +- `P_audio(c | ev)`: + - If `ev.fret_prior` is provided (Phase 2's `tabcnn` backend, when present), use it directly. 
Otherwise uniform over candidates. + - Multiply by `ev.confidence` (the model's pitch posterior). +- `P_vision(c | t)`: + - Look up the `FrameFingering` whose `t` is closest to `ev.onset_s`. Linear-interpolate between two adjacent frames if the gap is small (< 1 / fps). + - `marginal_string_fret()[s, f]` is the per-(string, fret) cell of the (6, 25) softmax. + - If no fingering carries evidence (`finger_pos_logits.size == 0` or all-zero) → fallback to uniform; `λ_v` is effectively zero for this event. +- `λ_v`: tunable, default `1.0`, exposed as `--fusion-lambda-vision` (CLI) and `lambda_vision` kwarg on `fuse()`. +- `open_bonus`: small constant (e.g. 0.5). Open strings are systematically under-represented in MediaPipe-derived `marginal_string_fret` because no fingertip is pressing — so we re-introduce them via this bonus. +- `α_low`: lower-fret bias (e.g. 0.05/fret). Keeps the decoder honest when audio + vision are both flat across candidates. + +### 2.2 Transition cost between `prev = (s_p, f_p)` and `curr = (s_c, f_c)` + +``` +T(prev → curr) = + β_shift · |f_c - f_p| / span_norm # position-shift penalty + + β_span · max(0, |f_c - f_p| - max_span) # hard hand-span barrier (kicks in beyond ~5 frets) + - β_string · 1[s_c == s_p] # same-string continuity bonus +``` + +- `span_norm = 12` (one octave), `max_span = 5` frets — calibrated from the legacy `fusion_engine.py` anchor system. +- `β_string` ≈ 0.5 — direct port of the existing `STRING_CONTINUITY_BONUS`. +- A "muted" / X transition is permitted by skipping cost contribution (technique flag set on the `TabEvent`). + +### 2.3 Per-string monophony + +Hard constraint baked into the **chord cluster** state space (§3.2), not a soft cost. Single-line Viterbi (§3.1) is monophonic by construction. + +## 3. State spaces + +### 3.1 Single-line Viterbi (`viterbi.py`) + +Triggered when consecutive events are > 80 ms apart. + +- States at event `i`: `candidate_positions(events[i].pitch_midi, cfg)` — typically 2–6 per pitch. 
+- Initial cost: `E(c_0)`. +- Recurrence: `cost[i, c] = E(c) + min_{c'} (cost[i-1, c'] + T(c' → c))`. +- Termination: pick the lowest-cost terminal state, backtrack. +- Worst case: `O(N × K^2)` for `N` events, `K ≤ 6` candidates per event. `N` is hundreds; trivial. + +### 3.2 Chord cluster decode (`chord.py`) + +A **chord cluster** is a maximal run of consecutive `AudioEvent`s in which each adjacent pair of onsets is ≤ 80 ms apart (chain semantics — the run as a whole may span more than 80 ms). (SPEC §5: "simultaneous events ≤ 80 ms apart".) + +For a cluster of `m` events: + +- A **chord state** is an ordered tuple of m candidates `(c_1, …, c_m)` with: + - **Per-string monophony:** all `s_i` distinct. + - **Hand-span constraint:** `max(f_i for f_i > 0) - min(f_i for f_i > 0) ≤ max_span` (open strings exempt). + - Order convention: low-pitch first (so the spelling is reproducible). +- State enumeration: cartesian product of candidates, filtered by the two constraints. With `m ≤ 6` (six-string guitar) and `K ≤ 6` per pitch, worst case `6^6 = 46 656` raw tuples — pruned aggressively to a few hundred valid ones. +- Emission cost for a chord state = sum of per-event emission costs. +- Transition between two chord clusters: collapse each cluster to its **lowest-fret pressed note** (the natural anchor point) and apply `T(prev → curr)` from §2.2 — keeps the inter-chord cost compatible with single-line transitions. +- Optional: `chord_shapes.py` templates from the legacy code give a prior over common shapes (open chords, barre, power). **Deferred to a Step C follow-up (see §7)** — start without templates and only add if F1 demands. + +The chord-cluster decode is itself a Viterbi over chord-states between clusters; single-line events are degenerate clusters of size 1. + +## 4. Module responsibilities + +``` +tabvision.fusion.candidates -- (done) candidate_positions, Candidate dataclass. +tabvision.fusion.playability -- emission + transition cost helpers (pure functions, fully unit-tested).
+tabvision.fusion.viterbi -- (a) the public fuse() entrypoint; (b) single-line Viterbi; (c) dispatcher to chord. +tabvision.fusion.chord -- chord cluster grouping + chord-state Viterbi. +``` + +`viterbi.fuse(events, fingerings, cfg, session, lambda_vision=1.0)` stays as the single public entrypoint per SPEC §8; behaviour switches internally based on whether `fingerings` carry evidence and whether events fall into chord clusters. + +## 5. Port mapping (legacy → new) + +| Legacy (`tabvision-server/app/fusion_engine.py`) | New | Notes | +|---|---|---| +| `_score_position_heuristic` | `playability.emission_cost` | Drop hand-anchor side-channel; subsume into structured Viterbi. | +| `_select_best_position` | replaced by single-line Viterbi | The greedy logic was the source of `wrong_position_same_pitch` errors. | +| `_optimize_chord_positions` | `chord.decode_chord_state` | The legacy version is greedy with backtracking; the new version is exhaustive over the (already-small) feasible set. | +| `_correct_slide_positions` | `playability.transition_cost` (built-in) | Slide/legato preference falls out of the same-string continuity bonus and the position-shift penalty — no separate post-pass. | +| `_correct_melodic_segments` | not ported; subsumed by Viterbi | Subsumed. Confirm via ablation. | +| `_postfilter_tab_notes` | not ported (yet) | Dedup + low-confidence isolated filter. Defer; revisit if Phase 5 has visible artifacts of this kind. | +| `_detect_techniques` | shallow port | Hammer-on / pull-off / slide tag inference based on consecutive same-string events. Spec §5 leaves bend/vibrato to Phase 7. | +| `chord_shapes.py` (templates) | optional Step C follow-up in `chord.py` | Defer — only adopt if needed. | +| `fuse_audio_only` | already ported (Phase 1 path) | Keep. | +| `fuse_audio_video` | replaced wholesale | The legacy version is the worst-performing module per `errors-2026-04-28_185743.md` (35.2% of loss is `wrong_position_same_pitch`). | + +## 6.
Step-by-step phasing within Phase 5 + +Each step is independently mergeable; each lands tests before behaviour. + +### Step A — `playability.py`: pure cost helpers (~½ day) + +Implement: +- `emission_cost(candidate, event, fingering_at_t, cfg, *, lambda_vision=1.0) → float` +- `transition_cost(prev, curr, cfg) → float` +- Constants for the weight hyperparameters (named, documented). + +Tests (`tabvision/tests/unit/test_playability.py`, new): +- Emission: pure-audio (no fingering) reproduces the existing greedy decoder's preferences. +- Emission: vision evidence pulls a candidate that audio is indifferent on. +- Emission: open-string bonus correctly recovers fret 0 when MediaPipe marginal is uniform. +- Transition: same-string is cheaper than string-jump. +- Transition: hand-span barrier triggers only past `max_span`. + +**Acceptance:** All new unit tests green. No change to `viterbi.fuse()` behaviour (Phase 1 tests still pass). + +### Step B — single-line Viterbi (~1 day) + +Replace `viterbi._greedy_audio_only` with a single-line Viterbi using `playability` costs. Keep the public `fuse()` signature. + +Tests (extend `test_fusion_audio_only.py`): +- All five existing tests still pass (regression gate). +- Add: 4-event sequence where greedy picks the wrong string at event 3 but Viterbi recovers it via lookahead. +- Add: vision-uniform fingerings produce same output as no fingerings (sanity). +- Add: vision-decisive fingering moves the pick to a non-lowest-fret candidate. + +**Acceptance:** All tests green. Run `tabvision/tests/eval/test_phase4_eval.py` (or its Phase 5 sibling, see Step E) and confirm no regression on the audio-only path. + +### Step C — chord cluster decode (~1–1½ days) + +Implement `chord.cluster_events(events, max_gap_ms=80)` and `chord.decode_clusters(clusters, fingerings, cfg, lambda_vision)` returning the per-event picks. Wire `viterbi.fuse()` to dispatch. 
+ +Tests (`tabvision/tests/unit/test_chord_fusion.py`, new): +- Two simultaneous events on the same string get one moved (per-string monophony). +- A 3-note chord has all picks within `max_span` of each other (hand-span constraint). +- A chord cluster with vision evidence prefers the vision-supported voicing. +- An open-chord shape (open strings present) is preferred over a barre when both are reachable and vision is uniform. + +**Acceptance:** All tests green. Single-line tests still pass. + +### Step D — CLI integration & lambda sweep (~½ day) + +- Add `--fusion-lambda-vision FLOAT` to `tabvision.cli`. Default `1.0`. Pass through to `fuse()`. +- Document in CLI `--help`. +- Add `tabvision/tests/unit/test_cli_fusion_flag.py`: smoke that the flag round-trips into `fuse()`. + +### Step E — Phase 5 acceptance eval (~1 day) + +Add `tabvision/tests/eval/test_phase5_eval.py` modelled on `test_phase4_eval.py`. It: + +1. Runs the full pipeline (audio + video) on each video in the user eval set. +2. Computes Tab F1 (string + fret + onset within ±50 ms) and chord-instance accuracy. +3. Runs the audio-only ablation (`λ_v = 0`) on the same set. +4. Asserts: + - `tab_f1 >= 0.85` (the §5 bar) — **may be marked `xfail` until Phase 2 SOTA backbone lands**, with the understanding that today's audio is the bottleneck. + - `tab_f1_audio_video - tab_f1_audio_only >= 0.08` — **the Phase-5-specific bar; this is the gate for "fusion is doing real work"**. + - `chord_accuracy >= 0.80`. +5. Writes a markdown report to `tabvision-server/tools/outputs/phase5_eval-YYYY-MM-DD.md` summarising the ablation per video (mirrors the `finetune_baseline-*.md` convention). + +**Acceptance for Phase 5 as a whole:** the `tab_f1_audio_video - tab_f1_audio_only >= 0.08` assertion passes. The absolute-Tab-F1 bar may be deferred to Phase 7 if audio is still the bottleneck — but if it is, that's a material finding and should land in `DECISIONS.md`. + +## 7. 
Risks & open questions + +- **Risk:** `λ_v = 1.0` may be wrong by an order of magnitude. Mitigation: Step E sweeps `λ_v ∈ {0, 0.5, 1, 2, 5}` and reports best per video and aggregate. If best is `0`, vision evidence is genuinely uncalibrated → SPEC §5 decision tree's `C2` branch (return to Phase 4). +- **Risk:** chord-state explosion on dense voicings. Mitigation: 6-string max plus monophony pruning bounds cardinality at 6! = 720 tuples (the 46 656 in §3.2 is the raw count before monophony pruning); in practice the hand-span constraint cuts this to <100. If a real video produces a worst-case cluster (>100 tuples), beam-search is a 5-line addition. +- **Risk:** open-string bonus over-fires when the player is fingering a fret-0 chord (e.g. capo-0 G major shape) and MediaPipe correctly says "no fingertip on the low strings." Mitigation: chord-cluster decode considers the whole shape — bonus is per-event, but the chord-state's hand-span constraint pulls the rest of the shape into a coherent fingering. +- **Open:** does Step C need `chord_shapes.py` templates as a prior? Plan says no — start without and add only if F1 demands. Tracked as a Step-C-follow-up if needed. +- **Open:** what's "the user eval set" for Step E? Today: the 20-video iPhone training set. Phase 1.5's annotation tool will add labelled clips across four difficulty tiers — those should fold into the same eval as they land. + +## 8. Estimated effort + +Steps A → E total **~4 working days** of implementation + writeup. Acceptance eval (Step E) is the slowest because it requires running the full pipeline on the eval set, which is gated on Phase 4's video stack working end-to-end on the iPhone videos (probably true today but worth confirming in the §9 pre-flight below). + +## 9. Pre-flight (before Step A) + +A quick 15-min sanity check before any code: + +- Run `tabvision/tests/eval/test_phase4_eval.py` end-to-end on at least one iPhone video and confirm we get a non-empty `list[FrameFingering]` with non-uniform `marginal_string_fret`.
If we don't, Step E is going to be useless and we should fix Phase 4's eval path first. + +--- + +**For sign-off:** confirm (a) cost-function shape (§2), (b) module split (§4), (c) phasing/order of A–E. If those look right I'll start with Step A. From 99f9df1d93965492bcc9b09dd2de21b8e6fe9d32 Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 17:41:19 -0400 Subject: [PATCH 2/6] feat(phase5): playability emission + transition cost helpers --- tabvision/tabvision/fusion/playability.py | 169 ++++++++++++++++- tabvision/tests/unit/test_playability.py | 219 ++++++++++++++++++++++ 2 files changed, 382 insertions(+), 6 deletions(-) create mode 100644 tabvision/tests/unit/test_playability.py diff --git a/tabvision/tabvision/fusion/playability.py b/tabvision/tabvision/fusion/playability.py index 60df95c..2def9c5 100644 --- a/tabvision/tabvision/fusion/playability.py +++ b/tabvision/tabvision/fusion/playability.py @@ -1,9 +1,166 @@ -"""Playability transition costs — Phase 5 deliverable. Stub. +"""Playability emission + transition costs — Phase 5 deliverable. -Hand-span penalty, position-shift penalty, open-string bonus, same-string -monophony enforcement. +All functions return **negative log-probs** in nats: lower cost = better. +Costs decompose into per-candidate emission terms (audio prior + vision +evidence + open-string bonus + low-fret bias) and pairwise transition +terms (string continuity + position shift + hand-span barrier). -Port targets: -- ``tabvision-server/app/fusion_engine.py`` (position scoring, melodic - segment correction, slide correction). +See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §2 for the formulae +and ``SPEC.md`` §5 for acceptance bars. + +Port targets: ``tabvision-server/app/fusion_engine.py`` — +``_score_position_heuristic``, ``_correct_slide_positions``, the +hand-anchor/position-shift logic. 
""" + +from __future__ import annotations + +import math +from typing import Sequence + +from tabvision.fusion.candidates import Candidate +from tabvision.types import AudioEvent, FrameFingering, GuitarConfig + +# --- emission term weights --- +LOW_FRET_BIAS = 0.10 +"""Cost added per fret index. Keeps the decoder honest when audio + vision +are flat — picks the lower fret all else equal. Same magnitude as the legacy +``viterbi.LOWER_FRET_BIAS``.""" + +OPEN_STRING_BONUS = 0.5 +"""Cost subtracted when the candidate is an open string (fret 0). + +Open strings are systematically under-represented by MediaPipe-derived +``marginal_string_fret`` because there is no fingertip pressing — this +bonus re-introduces them. Magnitude calibrated to roughly cancel the +vision-floor cost (``-log(VISION_FLOOR)`` over a uniform marginal).""" + +VISION_FLOOR = 1e-3 +"""Minimum probability used when computing ``-log P_vision``. Caps the +vision evidence's contribution at ``-log(1e-3) ≈ 6.9`` per candidate so +a confident wrong fingering can still be overridden by strong audio + +playability evidence.""" + +# --- transition term weights --- +SAME_STRING_BONUS = 0.5 +"""Cost subtracted when ``prev.string_idx == curr.string_idx``. Direct +port of legacy ``STRING_CONTINUITY_BONUS``.""" + +POSITION_SHIFT_COST = 0.05 +"""Cost per fret of ``|curr.fret - prev.fret|`` (after normalisation by +``SPAN_NORM``). Mild — encourages staying close on the neck without +forbidding jumps.""" + +SPAN_NORM = 12 +"""Normalisation for ``POSITION_SHIFT_COST`` — one octave.""" + +MAX_HAND_SPAN = 5 +"""Frets — beyond this distance the hand-span barrier kicks in.""" + +HAND_SPAN_BARRIER = 5.0 +"""Cost added per fret of overshoot beyond ``MAX_HAND_SPAN``. 
Steep +enough to act as a soft hard-constraint while still allowing a jump +when audio + vision agree strongly.""" + +EPS = 1e-9 + + +def find_fingering_at( + t: float, fingerings: Sequence[FrameFingering] +) -> FrameFingering | None: + """Return the ``FrameFingering`` whose ``.t`` is closest to ``t``. + + Returns ``None`` when ``fingerings`` is empty or no entry carries + evidence (logits None, empty, or all-zero). Ties broken by earliest. + """ + if not fingerings: + return None + best: FrameFingering | None = None + best_dt = math.inf + for f in fingerings: + if f.finger_pos_logits is None or f.finger_pos_logits.size == 0: + continue + if not (f.finger_pos_logits != 0).any(): + continue + dt = abs(f.t - t) + if dt < best_dt: + best = f + best_dt = dt + return best + + +def emission_cost( + candidate: Candidate, + event: AudioEvent, + fingering: FrameFingering | None, + cfg: GuitarConfig, + *, + lambda_vision: float = 1.0, +) -> float: + """Emission cost (negative log-prob) for ``candidate`` given ``event``. + + Decomposition (lower = better): + + - ``-log(event.confidence)`` — per-event constant (does not affect + ranking within a single event but matters across events). + - ``-log(event.fret_prior[s, f])`` — only when the audio backend + provides a per-position prior (e.g. Phase 2 ``tabcnn``). + - ``lambda_vision * -log(P_vision[s, f])`` — vision marginal at + ``event.onset_s``. Skipped when ``fingering is None``. + - ``LOW_FRET_BIAS * fret`` — gentle low-fret preference. + - ``-OPEN_STRING_BONUS`` when ``fret == 0``. 
+ """ + cost = -math.log(max(event.confidence, EPS)) + + if event.fret_prior is not None: + prior = float(event.fret_prior[candidate.string_idx, candidate.fret]) + cost += -math.log(max(prior, EPS)) + + if fingering is not None: + marginal = fingering.marginal_string_fret() + p = float(marginal[candidate.string_idx, candidate.fret]) + cost += lambda_vision * (-math.log(max(p, VISION_FLOOR))) + + cost += LOW_FRET_BIAS * candidate.fret + if candidate.fret == 0: + cost -= OPEN_STRING_BONUS + + return cost + + +def transition_cost( + prev: Candidate, curr: Candidate, cfg: GuitarConfig +) -> float: + """Transition cost from ``prev`` to ``curr``. + + - String continuity: ``-SAME_STRING_BONUS`` when on the same string. + - Position shift: ``POSITION_SHIFT_COST * |Δfret| / SPAN_NORM``. + - Hand-span barrier: ``HAND_SPAN_BARRIER * max(0, |Δfret| - MAX_HAND_SPAN)``. + + ``cfg`` is reserved for future use (e.g. instrument-specific span + limits); pass the same value used elsewhere in the decode. + """ + del cfg # unused for now; reserved. + cost = 0.0 + delta = abs(curr.fret - prev.fret) + cost += POSITION_SHIFT_COST * delta / SPAN_NORM + if delta > MAX_HAND_SPAN: + cost += HAND_SPAN_BARRIER * (delta - MAX_HAND_SPAN) + if curr.string_idx == prev.string_idx: + cost -= SAME_STRING_BONUS + return cost + + +__all__ = [ + "find_fingering_at", + "emission_cost", + "transition_cost", + "LOW_FRET_BIAS", + "OPEN_STRING_BONUS", + "VISION_FLOOR", + "SAME_STRING_BONUS", + "POSITION_SHIFT_COST", + "SPAN_NORM", + "MAX_HAND_SPAN", + "HAND_SPAN_BARRIER", +] diff --git a/tabvision/tests/unit/test_playability.py b/tabvision/tests/unit/test_playability.py new file mode 100644 index 0000000..02a0979 --- /dev/null +++ b/tabvision/tests/unit/test_playability.py @@ -0,0 +1,219 @@ +"""Unit tests for ``tabvision.fusion.playability``. + +Covers: +- emission cost: audio-only ranking matches the legacy greedy decoder's + preferences (lower fret + open-string bonus). 
+- emission cost: vision evidence pulls a candidate that audio is + indifferent on. +- emission cost: open-string bonus correctly recovers fret 0 when the + vision marginal is uniform. +- transition cost: same-string is cheaper than string-jump. +- transition cost: hand-span barrier triggers only past ``MAX_HAND_SPAN``. +- ``find_fingering_at`` picks the nearest non-empty fingering. +""" + +from __future__ import annotations + +import numpy as np + +from tabvision.fusion.candidates import Candidate, candidate_positions +from tabvision.fusion.playability import ( + HAND_SPAN_BARRIER, + MAX_HAND_SPAN, + OPEN_STRING_BONUS, + SAME_STRING_BONUS, + emission_cost, + find_fingering_at, + transition_cost, +) +from tabvision.types import AudioEvent, FrameFingering, GuitarConfig + +# ---------- helpers ---------- + + +def _ev(midi: int, t: float = 0.0, confidence: float = 0.8) -> AudioEvent: + return AudioEvent( + onset_s=t, + offset_s=t + 0.25, + pitch_midi=midi, + velocity=0.8, + confidence=confidence, + ) + + +def _peaked_fingering( + t: float, + target_string: int, + target_fret: int, + n_strings: int = 6, + max_fret: int = 24, +) -> FrameFingering: + """Marginal sharply peaked at ``(target_string, target_fret)``.""" + logits = np.zeros((4, n_strings, max_fret + 1), dtype=np.float64) + logits[0, target_string, target_fret] = 10.0 + return FrameFingering( + t=t, finger_pos_logits=logits, homography_confidence=0.9 + ) + + +def _uniform_fingering( + t: float, n_strings: int = 6, max_fret: int = 24 +) -> FrameFingering: + """Marginal ≈ uniform across (string, fret) cells.""" + logits = np.ones((4, n_strings, max_fret + 1), dtype=np.float64) + return FrameFingering( + t=t, finger_pos_logits=logits, homography_confidence=0.9 + ) + + +# ---------- emission ---------- + + +def test_emission_audio_only_prefers_lower_fret(): + """Without vision evidence, lowest-fret candidate has lowest emission cost. 
+ + A4 (MIDI 69) candidates: s5f5 (high E, fret 5) and s4f10 (B, fret 10), among + others. The plain low-fret bias should pick s5f5. + """ + cfg = GuitarConfig() + ev = _ev(69) + cands = candidate_positions(69, cfg) + costs = [(c, emission_cost(c, ev, None, cfg)) for c in cands] + best = min(costs, key=lambda kv: kv[1])[0] + assert best.fret == 5 + assert best.string_idx == 5 # high E + + +def test_emission_open_string_bonus_recovers_fret_zero(): + """For a pitch with a fret-0 option, the open-string bonus puts it on top. + + E2 (MIDI 40) has only one candidate: s0f0 — the bonus should make its + emission cost lower than any fingered alternative would have been. + """ + cfg = GuitarConfig() + ev = _ev(40) + cands = candidate_positions(40, cfg) + assert len(cands) == 1 and cands[0].fret == 0 + open_cost = emission_cost(cands[0], ev, None, cfg) + + # Compare against a synthetic fret-1 candidate's would-be cost: same + # pitch contribution, but no bonus and one tick of low-fret bias. + fake = Candidate(string_idx=0, fret=1) + # Reuse the same AudioEvent (hence the same confidence) so the per-event + # constant cancels out. + fake_cost = emission_cost(fake, ev, None, cfg) + assert open_cost < fake_cost + assert (fake_cost - open_cost) >= OPEN_STRING_BONUS - 1e-9 + + +def test_emission_vision_pulls_pick_off_lowest_fret(): + """Vision evidence should override the lowest-fret default. + + A4 (MIDI 69) audio-only picks s5f5 (high E, fret 5). With a fingering + peaked at s2f14 (string index 2, fret 14; NOTE(review): with s0 = low E this is presumably the D string, not G — emission_cost does not validate pitch, so the test still holds), the + emission cost there should be lower despite the higher fret.
+ """ + cfg = GuitarConfig() + ev = _ev(69, t=1.0) + fing = _peaked_fingering(t=1.0, target_string=2, target_fret=14) + + audio_pick = Candidate(string_idx=5, fret=5) + vision_pick = Candidate(string_idx=2, fret=14) + + audio_cost = emission_cost(audio_pick, ev, fing, cfg, lambda_vision=1.0) + vision_cost = emission_cost(vision_pick, ev, fing, cfg, lambda_vision=1.0) + assert vision_cost < audio_cost + + +def test_emission_uniform_vision_does_not_change_ranking(): + """A uniform fingering should not flip the audio-only preference.""" + cfg = GuitarConfig() + ev = _ev(69) + fing = _uniform_fingering(t=0.0) + cands = candidate_positions(69, cfg) + pure_audio = sorted( + cands, key=lambda c: emission_cost(c, ev, None, cfg) + ) + with_uniform = sorted( + cands, + key=lambda c: emission_cost(c, ev, fing, cfg, lambda_vision=1.0), + ) + assert [c for c in pure_audio] == [c for c in with_uniform] + + +# ---------- transition ---------- + + +def test_transition_same_string_is_cheaper_than_string_jump(): + """Same-string continuity bonus beats a one-fret string jump.""" + cfg = GuitarConfig() + prev = Candidate(string_idx=5, fret=5) + same_string = Candidate(string_idx=5, fret=7) # 2 frets up, same string + string_jump = Candidate(string_idx=4, fret=5) # different string, same fret + assert ( + transition_cost(prev, same_string, cfg) + < transition_cost(prev, string_jump, cfg) + ) + + +def test_transition_hand_span_barrier_only_past_threshold(): + """Costs are mild within ``MAX_HAND_SPAN`` and steep beyond it.""" + cfg = GuitarConfig() + prev = Candidate(string_idx=5, fret=5) + within = Candidate(string_idx=5, fret=5 + MAX_HAND_SPAN) # at threshold + beyond = Candidate(string_idx=5, fret=5 + MAX_HAND_SPAN + 1) # one past + + cost_within = transition_cost(prev, within, cfg) + cost_beyond = transition_cost(prev, beyond, cfg) + + # The barrier kicks in for `beyond`, so the gap should be ≥ HAND_SPAN_BARRIER + # (modulo the small extra position-shift cost of one more fret). 
+ assert (cost_beyond - cost_within) >= HAND_SPAN_BARRIER - 1e-6 + + +def test_transition_zero_when_unchanged(): + """No-op transition (same string, same fret) yields the bare continuity bonus.""" + cfg = GuitarConfig() + p = Candidate(string_idx=3, fret=7) + cost = transition_cost(p, p, cfg) + # 0 position shift + same-string bonus → -SAME_STRING_BONUS exactly. + assert cost == -SAME_STRING_BONUS + + +# ---------- find_fingering_at ---------- + + +def test_find_fingering_at_picks_closest_non_empty(): + fings = [ + _peaked_fingering(t=0.0, target_string=0, target_fret=0), + _peaked_fingering(t=1.0, target_string=5, target_fret=5), + _peaked_fingering(t=2.0, target_string=3, target_fret=3), + ] + chosen = find_fingering_at(1.1, fings) + assert chosen is not None + assert chosen.t == 1.0 + + +def test_find_fingering_at_skips_empty_logits(): + """All-zero logits = no evidence; should be skipped.""" + empty = FrameFingering( + t=0.5, + finger_pos_logits=np.zeros((4, 6, 25)), + homography_confidence=0.0, + ) + real = _peaked_fingering(t=2.0, target_string=2, target_fret=7) + chosen = find_fingering_at(0.6, [empty, real]) + assert chosen is real + + +def test_find_fingering_at_returns_none_when_all_empty(): + empty = FrameFingering( + t=0.5, + finger_pos_logits=np.zeros((4, 6, 25)), + homography_confidence=0.0, + ) + assert find_fingering_at(0.6, [empty]) is None + + +def test_find_fingering_at_returns_none_for_empty_input(): + assert find_fingering_at(0.6, []) is None From d483376dd31664f62765a3716f1763e57ea32ccb Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 17:49:02 -0400 Subject: [PATCH 3/6] feat(phase5): cluster-level Viterbi + chord-state machinery Replace the greedy audio-only decoder with a unified cluster-level Viterbi DP. Each step in the DP is a chord cluster (events <=80ms apart, chain semantics); singleton clusters degenerate to single-line Viterbi. 
chord.enumerate_chord_states builds valid (string, fret) tuples under per-string monophony + hand-span constraints; chord.chord_anchor picks the lowest-fret pressed note as the cluster's representative for inter-cluster transition costs. Lookahead is real: a future event's vision evidence can change earlier picks when the global path is cheaper. Lambda_vision=0.0 reproduces the audio-only behaviour bit-for-bit. Adds 19 new unit tests (lookahead, vision-decisive single, chord monophony + hand-span, chord vision pull, cluster grouping); all 39 fusion tests green. --- tabvision/tabvision/fusion/chord.py | 122 +++++++++- tabvision/tabvision/fusion/viterbi.py | 214 +++++++++++------- tabvision/tests/unit/test_chord_fusion.py | 201 ++++++++++++++++ .../tests/unit/test_fusion_audio_only.py | 96 +++++++- 4 files changed, 545 insertions(+), 88 deletions(-) create mode 100644 tabvision/tests/unit/test_chord_fusion.py diff --git a/tabvision/tabvision/fusion/chord.py b/tabvision/tabvision/fusion/chord.py index ac91bdd..f734a06 100644 --- a/tabvision/tabvision/fusion/chord.py +++ b/tabvision/tabvision/fusion/chord.py @@ -1,9 +1,119 @@ -"""Chord-aware fusion — Phase 5 deliverable. Stub. +"""Chord cluster grouping + chord-state machinery — Phase 5 deliverable. -Simultaneous events (≤ 80 ms apart) decoded as ordered tuples with -per-string monophony and hand-span constraints baked into state -construction. +A *chord cluster* is a maximal run of consecutive ``AudioEvent``s whose +adjacent onset gaps are all ≤ :data:`CHORD_MAX_GAP_S` (80 ms by default). +Within a cluster, decoding picks an ordered tuple of ``(string, fret)`` +candidates — one per event — subject to two structural constraints: -Port targets: ``tabvision-server/app/chord_shapes.py`` + chord logic in -``fusion_engine.py``. +- **Per-string monophony**: no two events share a string. 
+- **Hand-span**: ``max(pressed_fret) - min(pressed_fret) ≤ MAX_HAND_SPAN`` + (open strings are exempt — fret 0 doesn't constrain the fretting hand). + +This module is pure machinery — clustering, state enumeration, anchor +selection. The cluster-level Viterbi DP that consumes these states lives +in :mod:`tabvision.fusion.viterbi`. + +See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §3.2 and SPEC.md §5. """ + +from __future__ import annotations + +from typing import Sequence + +from tabvision.fusion.candidates import Candidate, candidate_positions +from tabvision.fusion.playability import MAX_HAND_SPAN +from tabvision.types import AudioEvent, GuitarConfig + +CHORD_MAX_GAP_S = 0.080 +"""Maximum onset gap (seconds) between consecutive events to count as one +chord cluster. SPEC §5 calls this "≤ 80 ms apart".""" + + +def cluster_events( + events: Sequence[AudioEvent], + max_gap_s: float = CHORD_MAX_GAP_S, +) -> list[list[AudioEvent]]: + """Group events into chord clusters. + + Chain semantics: events ``i`` and ``i+1`` (sorted by onset) join the + same cluster iff ``events[i+1].onset_s - events[i].onset_s ≤ max_gap_s``. + A cluster therefore can span more than ``max_gap_s`` overall when the + individual pairwise gaps remain bounded. + """ + if not events: + return [] + sorted_events = sorted(events, key=lambda e: e.onset_s) + clusters: list[list[AudioEvent]] = [[sorted_events[0]]] + for ev in sorted_events[1:]: + if ev.onset_s - clusters[-1][-1].onset_s <= max_gap_s: + clusters[-1].append(ev) + else: + clusters.append([ev]) + return clusters + + +def enumerate_chord_states( + events: Sequence[AudioEvent], + cfg: GuitarConfig, +) -> list[tuple[Candidate, ...]]: + """All valid (monophony + hand-span) ordered tuples of candidates. + + Builds the state set incrementally to keep the worst-case bounded by + the constraint-pruned size at each step rather than the raw cartesian + product (``K^m``). 
Returns an empty list if any event has no + candidates — the caller is expected to filter out-of-range events + upstream so the cluster shape stays consistent with the input order. + """ + if not events: + return [] + + per_event_candidates = [ + candidate_positions(ev.pitch_midi, cfg) for ev in events + ] + if any(not cands for cands in per_event_candidates): + return [] + + states: list[tuple[Candidate, ...]] = [ + (c,) for c in per_event_candidates[0] + ] + for k in range(1, len(events)): + next_states: list[tuple[Candidate, ...]] = [] + for state in states: + used_strings = {c.string_idx for c in state} + pressed = [c.fret for c in state if c.fret > 0] + for c in per_event_candidates[k]: + if c.string_idx in used_strings: + continue + new_pressed = pressed + ([c.fret] if c.fret > 0 else []) + if new_pressed: + span = max(new_pressed) - min(new_pressed) + if span > MAX_HAND_SPAN: + continue + next_states.append(state + (c,)) + states = next_states + if not states: + return [] + return states + + +def chord_anchor(state: tuple[Candidate, ...]) -> Candidate: + """The 'anchor' candidate used as the state's representative for + inter-cluster transition costs. + + Defined as the lowest-fret *pressed* note (fret > 0) — the natural + centre of the fretting hand. If all notes are open, the first + candidate is returned (any choice is equivalent because all pressed + frets are 0 and transition cost depends on Δfret). + """ + pressed = [c for c in state if c.fret > 0] + if not pressed: + return state[0] + return min(pressed, key=lambda c: (c.fret, c.string_idx)) + + +__all__ = [ + "CHORD_MAX_GAP_S", + "cluster_events", + "enumerate_chord_states", + "chord_anchor", +] diff --git a/tabvision/tabvision/fusion/viterbi.py b/tabvision/tabvision/fusion/viterbi.py index 168669f..1a67e9c 100644 --- a/tabvision/tabvision/fusion/viterbi.py +++ b/tabvision/tabvision/fusion/viterbi.py @@ -1,20 +1,28 @@ -"""Single-line Viterbi decode + audio-only fallback. 
+"""Cluster-level Viterbi decode — Phase 5 deliverable. Public entrypoint: ``fuse(events, fingerings, cfg, session, lambda_vision)``. -Phase 1: when ``fingerings`` is empty (video stubs), degenerate to a -greedy "lowest-fret with continuity bonus" decoder per SPEC.md §7 Phase 1. +Each "step" in the DP is a chord cluster (often a singleton — an isolated +event). For each cluster, :func:`tabvision.fusion.chord.enumerate_chord_states` +produces the per-string-monophony + hand-span-feasible ordered tuples of +candidates. Emission for a state is the sum of per-event emission costs +(:func:`tabvision.fusion.playability.emission_cost`); transitions between +clusters use :func:`tabvision.fusion.chord.chord_anchor` to pick a +representative position for the playability transition cost. -Phase 5 replaces the body with a proper Viterbi over candidate states -using ``tabvision.fusion.playability`` transition costs. The public -signature stays stable. +The single-line Viterbi behaviour is the size-1-cluster degenerate case +of this same DP — no separate code path. + +See ``docs/plans/2026-05-06-phase5-fusion-design.md`` §3 for the state +spaces and §2 for the cost decomposition. """ from __future__ import annotations +import math from typing import Sequence -from tabvision.errors import FusionError +from tabvision.fusion import chord, playability from tabvision.fusion.candidates import Candidate, candidate_positions from tabvision.types import ( AudioEvent, @@ -24,16 +32,6 @@ TabEvent, ) -# Continuity bonus: amount subtracted from a candidate's "cost" when its -# string matches the previous note's string. A small constant; Phase 5 -# will calibrate. -STRING_CONTINUITY_BONUS = 0.5 -# Penalty per fret of distance from the previous note's fret. Small -# enough that the lowest-fret bias still wins for distant pitches. -FRET_DISTANCE_PENALTY = 0.05 -# Penalty per fret position (lower-fret preference). 
-LOWER_FRET_BIAS = 0.10 - def fuse( events: Sequence[AudioEvent], @@ -42,78 +40,134 @@ def fuse( session: SessionConfig | None = None, lambda_vision: float = 1.0, ) -> list[TabEvent]: - """Decode AudioEvents into TabEvents. - - Phase 1: ``fingerings`` is empty / uniform; falls back to greedy - audio-only decode. The ``lambda_vision`` weight is accepted for - interface stability but ignored until Phase 5. + """Decode ``AudioEvent``s into ``TabEvent``s via cluster Viterbi. + + Parameters + ---------- + events: + Audio events. Out-of-range pitches (no playable candidate under + ``cfg``) are dropped — no phantom notes emitted. + fingerings: + Per-frame fingerings from Phase 4. Empty / all-zero is treated + as audio-only. + cfg: + Instrument config (tuning, capo, max_fret). + session: + Recording session metadata; reserved for future use. + lambda_vision: + Mixing weight for the vision-evidence term. ``0.0`` disables + vision entirely; ``1.0`` is the default; higher values lean more + heavily on the fingertip-to-fret posterior. + + Returns + ------- + list[TabEvent] + One ``TabEvent`` per surviving event, ordered by ``onset_s``. """ if cfg is None: cfg = GuitarConfig() if session is None: session = SessionConfig() - - has_video = any(_has_evidence(f) for f in fingerings) - if has_video: - # Phase 5 deliverable: Viterbi over (string, fret) states with - # vision-evidence + playability costs. Not yet implemented. - raise FusionError( - "video-aware fusion not implemented in Phase 1 — " - "this is a Phase 5 deliverable" - ) - - return _greedy_audio_only(events, cfg) - - -def _has_evidence(f: FrameFingering) -> bool: - """A FrameFingering carries info if its logits are not all-zero.""" - arr = f.finger_pos_logits - return arr is not None and bool(arr.size) and bool((arr != 0).any()) - - -def _greedy_audio_only( - events: Sequence[AudioEvent], cfg: GuitarConfig + del session # not consumed by Phase 5; preserves signature for callers. 
+ + if not events: + return [] + + # Drop out-of-range pitches before clustering so the cluster shape + # reflects what's actually decodable. + valid_events = [ + ev for ev in events if candidate_positions(ev.pitch_midi, cfg) + ] + if not valid_events: + return [] + + clusters = chord.cluster_events(valid_events) + cluster_data: list[ + tuple[list[AudioEvent], list[tuple[Candidate, ...]]] + ] = [] + for cluster in clusters: + states = chord.enumerate_chord_states(cluster, cfg) + if states: + cluster_data.append((cluster, states)) + + if not cluster_data: + return [] + + return _viterbi_clusters(cluster_data, fingerings, cfg, lambda_vision) + + +def _viterbi_clusters( + cluster_data: list[ + tuple[list[AudioEvent], list[tuple[Candidate, ...]]] + ], + fingerings: Sequence[FrameFingering], + cfg: GuitarConfig, + lambda_vision: float, ) -> list[TabEvent]: - """Pick (string, fret) per event by lowest-fret + continuity.""" - out: list[TabEvent] = [] - prev: Candidate | None = None - - for ev in events: - candidates = candidate_positions(ev.pitch_midi, cfg) - if not candidates: - # Out-of-range pitch; skip rather than emit a phantom note. - continue - pick = _pick_candidate(candidates, prev) - out.append( - TabEvent( - onset_s=ev.onset_s, - duration_s=max(0.0, ev.offset_s - ev.onset_s), - string_idx=pick.string_idx, - fret=pick.fret, - pitch_midi=ev.pitch_midi, - confidence=ev.confidence, - techniques=ev.tags, + """Cluster-level Viterbi DP. Worst case ``O(N · S^2)`` for ``N`` + clusters with ``S`` states each.""" + + def state_emission( + cluster: list[AudioEvent], state: tuple[Candidate, ...] 
+ ) -> float: + total = 0.0 + for ev, c in zip(cluster, state): + f = playability.find_fingering_at(ev.onset_s, fingerings) + total += playability.emission_cost( + c, ev, f, cfg, lambda_vision=lambda_vision ) - ) - prev = pick + return total + + n = len(cluster_data) + cost: list[list[float]] = [[] for _ in range(n)] + backptr: list[list[int]] = [[] for _ in range(n)] + + cluster0, states0 = cluster_data[0] + cost[0] = [state_emission(cluster0, st) for st in states0] + backptr[0] = [-1] * len(states0) + + for i in range(1, n): + cluster_i, states_i = cluster_data[i] + prev_states = cluster_data[i - 1][1] + cost[i] = [math.inf] * len(states_i) + backptr[i] = [-1] * len(states_i) + for si, state in enumerate(states_i): + emit = state_emission(cluster_i, state) + anchor_curr = chord.chord_anchor(state) + for pi, prev_state in enumerate(prev_states): + anchor_prev = chord.chord_anchor(prev_state) + trans = playability.transition_cost( + anchor_prev, anchor_curr, cfg + ) + total = cost[i - 1][pi] + trans + emit + if total < cost[i][si]: + cost[i][si] = total + backptr[i][si] = pi + + # Backtrack from the cheapest terminal state. 
+ final = cost[n - 1] + last_idx = min(range(len(final)), key=lambda j: final[j]) + picks_idx = [0] * n + picks_idx[n - 1] = last_idx + for i in range(n - 1, 0, -1): + picks_idx[i - 1] = backptr[i][picks_idx[i]] + out: list[TabEvent] = [] + for i, (cluster, states) in enumerate(cluster_data): + state = states[picks_idx[i]] + for ev, c in zip(cluster, state): + out.append( + TabEvent( + onset_s=ev.onset_s, + duration_s=max(0.0, ev.offset_s - ev.onset_s), + string_idx=c.string_idx, + fret=c.fret, + pitch_midi=ev.pitch_midi, + confidence=ev.confidence, + techniques=ev.tags, + ) + ) return out -def _pick_candidate( - candidates: list[Candidate], prev: Candidate | None -) -> Candidate: - """Score each candidate; lower cost wins.""" - - def cost(c: Candidate) -> float: - score = LOWER_FRET_BIAS * c.fret - if prev is not None: - score += FRET_DISTANCE_PENALTY * abs(c.fret - prev.fret) - if c.string_idx == prev.string_idx: - score -= STRING_CONTINUITY_BONUS - return score - - return min(candidates, key=cost) - - __all__ = ["fuse"] diff --git a/tabvision/tests/unit/test_chord_fusion.py b/tabvision/tests/unit/test_chord_fusion.py new file mode 100644 index 0000000..9f15a23 --- /dev/null +++ b/tabvision/tests/unit/test_chord_fusion.py @@ -0,0 +1,201 @@ +"""Unit tests for chord-aware fusion (``tabvision.fusion.chord`` plus +the cluster-level Viterbi in :mod:`tabvision.fusion.viterbi`). + +Covers: +- ``cluster_events``: clustering by onset gap. +- ``enumerate_chord_states``: per-string monophony + hand-span pruning. +- ``chord_anchor``: lowest-fret pressed note as anchor. +- End-to-end ``fuse``: simultaneous events emit distinct strings, picks + fall within the hand-span constraint, and vision evidence on one + chord member pulls the whole shape onto a vision-supported voicing. 
+""" + +from __future__ import annotations + +import numpy as np + +from tabvision.fusion import fuse +from tabvision.fusion.candidates import Candidate +from tabvision.fusion.chord import ( + CHORD_MAX_GAP_S, + chord_anchor, + cluster_events, + enumerate_chord_states, +) +from tabvision.fusion.playability import MAX_HAND_SPAN +from tabvision.types import AudioEvent, FrameFingering, GuitarConfig + + +def _ev(midi: int, t: float, confidence: float = 0.8) -> AudioEvent: + return AudioEvent( + onset_s=t, + offset_s=t + 0.25, + pitch_midi=midi, + velocity=0.8, + confidence=confidence, + ) + + +def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering: + logits = np.zeros((4, 6, 25), dtype=np.float64) + logits[0, string_idx, fret] = 10.0 + return FrameFingering( + t=t, finger_pos_logits=logits, homography_confidence=0.9 + ) + + +# ---------- cluster_events ---------- + + +def test_cluster_events_single_event_yields_one_cluster(): + clusters = cluster_events([_ev(60, 0.0)]) + assert len(clusters) == 1 + assert len(clusters[0]) == 1 + + +def test_cluster_events_close_events_join_one_cluster(): + """Two events 50 ms apart should be one chord cluster.""" + events = [_ev(60, 0.0), _ev(64, 0.05)] + clusters = cluster_events(events) + assert len(clusters) == 1 + assert len(clusters[0]) == 2 + + +def test_cluster_events_far_events_split(): + """Two events 200 ms apart should be two clusters.""" + events = [_ev(60, 0.0), _ev(64, 0.20)] + clusters = cluster_events(events) + assert len(clusters) == 2 + assert all(len(c) == 1 for c in clusters) + + +def test_cluster_events_chain_through_threshold(): + """Three events at 0, 80, 160 ms (each adjacent gap == threshold) + should form one cluster (chain semantics).""" + events = [ + _ev(60, 0.0), + _ev(64, CHORD_MAX_GAP_S), + _ev(67, 2 * CHORD_MAX_GAP_S), + ] + clusters = cluster_events(events) + assert len(clusters) == 1 + assert len(clusters[0]) == 3 + + +def test_cluster_events_unsorted_input_is_sorted(): + 
"""Out-of-order input should still produce a chronologically grouped + output.""" + events = [_ev(67, 0.05), _ev(60, 0.0)] + clusters = cluster_events(events) + assert len(clusters) == 1 + assert clusters[0][0].pitch_midi == 60 # low-onset first + + +# ---------- enumerate_chord_states ---------- + + +def test_enumerate_chord_states_enforces_monophony(): + """C major triad (C4 + E4 + G4) — no enumerated state may put two + notes on the same string.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)] + states = enumerate_chord_states(events, cfg) + assert states # non-empty + for state in states: + strings = [c.string_idx for c in state] + assert len(strings) == len(set(strings)), ( + f"per-string monophony violated: {state}" + ) + + +def test_enumerate_chord_states_enforces_hand_span(): + """Every enumerated state must respect MAX_HAND_SPAN over pressed frets.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)] + states = enumerate_chord_states(events, cfg) + for state in states: + pressed = [c.fret for c in state if c.fret > 0] + if pressed: + assert max(pressed) - min(pressed) <= MAX_HAND_SPAN + + +def test_enumerate_chord_states_empty_when_event_unfretable(): + """If any event has no candidates, no chord state survives.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(20, 0.0)] # 20 = far below low E + assert enumerate_chord_states(events, cfg) == [] + + +# ---------- chord_anchor ---------- + + +def test_chord_anchor_picks_lowest_pressed_fret(): + state = ( + Candidate(string_idx=4, fret=5), + Candidate(string_idx=5, fret=0), # open + Candidate(string_idx=3, fret=3), + ) + assert chord_anchor(state) == Candidate(string_idx=3, fret=3) + + +def test_chord_anchor_falls_back_to_first_when_all_open(): + state = ( + Candidate(string_idx=5, fret=0), + Candidate(string_idx=4, fret=0), + ) + assert chord_anchor(state) == state[0] + + +# ---------- end-to-end fuse() through chord clusters ---------- + + +def 
test_fuse_simultaneous_events_emit_distinct_strings(): + """C4 + E4 fired together — picks must use different strings.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0)] + out = fuse(events, [], cfg) + assert len(out) == 2 + assert out[0].string_idx != out[1].string_idx + + +def test_fuse_three_note_chord_within_hand_span(): + """C major triad (C4 + E4 + G4) — picks form a hand-span-feasible voicing.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)] + out = fuse(events, [], cfg) + assert len(out) == 3 + strings = [t.string_idx for t in out] + assert len(set(strings)) == 3 # all distinct + pressed = [t.fret for t in out if t.fret > 0] + if pressed: + assert max(pressed) - min(pressed) <= MAX_HAND_SPAN + + +def test_fuse_chord_prefers_open_string_voicing_with_uniform_vision(): + """C major triad — the open-E voicing should win on emission cost + when no vision evidence pushes elsewhere. + + E4 has an open-string candidate (5, 0). The open-string bonus + + low-fret bias should make at least one note an open string in the + chosen voicing.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)] + out = fuse(events, [], cfg) + assert any(t.fret == 0 for t in out) + + +def test_fuse_chord_vision_pulls_voicing(): + """If the fingering is peaked at a non-default position for one of + the chord notes, the chosen state should include that exact pick.""" + cfg = GuitarConfig() + events = [_ev(60, 0.0), _ev(64, 0.0), _ev(67, 0.0)] + # Push C4 onto string 3 fret 5 (G-string). The default voicing + # would have C4 on string 4 fret 1. With this peak, C4 should move. + fings = [_peaked_fingering(0.0, string_idx=3, fret=5)] + out = fuse(events, fings, cfg, lambda_vision=2.0) + + c4 = next(t for t in out if t.pitch_midi == 60) + assert (c4.string_idx, c4.fret) == (3, 5) + # Other notes still produce a valid voicing. 
+ strings = [t.string_idx for t in out] + assert len(set(strings)) == 3 diff --git a/tabvision/tests/unit/test_fusion_audio_only.py b/tabvision/tests/unit/test_fusion_audio_only.py index 675a054..b4feca2 100644 --- a/tabvision/tests/unit/test_fusion_audio_only.py +++ b/tabvision/tests/unit/test_fusion_audio_only.py @@ -1,7 +1,14 @@ -"""Unit tests for the audio-only fusion path.""" +"""Unit tests for ``tabvision.fusion.viterbi.fuse``. + +Covers both the audio-only path (no / uniform fingerings) and the +video-aware Viterbi behaviour (vision evidence pulls picks; lookahead +changes earlier picks when later events benefit from a different anchor). +""" + +import numpy as np from tabvision.fusion import fuse -from tabvision.types import AudioEvent, GuitarConfig +from tabvision.types import AudioEvent, FrameFingering, GuitarConfig def _ev(midi: int, t: float) -> AudioEvent: @@ -14,6 +21,26 @@ def _ev(midi: int, t: float) -> AudioEvent: ) +def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering: + """Marginal sharply peaked at ``(string_idx, fret)``.""" + logits = np.zeros((4, 6, 25), dtype=np.float64) + logits[0, string_idx, fret] = 10.0 + return FrameFingering( + t=t, finger_pos_logits=logits, homography_confidence=0.9 + ) + + +def _uniform_fingering(t: float) -> FrameFingering: + """Marginal ≈ uniform across (string, fret) cells.""" + logits = np.ones((4, 6, 25), dtype=np.float64) + return FrameFingering( + t=t, finger_pos_logits=logits, homography_confidence=0.9 + ) + + +# ---------- audio-only regression ---------- + + def test_empty_input_yields_empty_output(): assert fuse([], [], GuitarConfig()) == [] @@ -50,3 +77,68 @@ def test_capo_shifts_picks(): out = fuse([_ev(69, 0.0)], [], cfg) assert len(out) == 1 assert out[0].fret >= 2 + + +# ---------- video-aware Viterbi ---------- + + +def test_uniform_vision_matches_no_vision(): + """A uniform fingering must not change the audio-only picks.""" + events = [_ev(69, 0.0), _ev(71, 0.5)] + cfg = 
GuitarConfig() + fings = [_uniform_fingering(0.0), _uniform_fingering(0.5)] + out_with = fuse(events, fings, cfg) + out_without = fuse(events, [], cfg) + assert [(e.string_idx, e.fret) for e in out_with] == [ + (e.string_idx, e.fret) for e in out_without + ] + + +def test_decisive_vision_moves_single_pick(): + """A vision peak at a non-default candidate should override the lowest-fret bias. + + A4's audio-only pick is (5, 5). With the fingering peaked at the G-string + A4 position (3, 14), Viterbi should land there instead. + """ + cfg = GuitarConfig() + events = [_ev(69, 0.0)] + fings = [_peaked_fingering(0.0, string_idx=3, fret=14)] + out = fuse(events, fings, cfg, lambda_vision=1.0) + assert len(out) == 1 + assert out[0].string_idx == 3 + assert out[0].fret == 14 + + +def test_lambda_zero_disables_vision(): + """Setting ``lambda_vision=0`` should reproduce the audio-only pick even + when a peaked fingering is present.""" + cfg = GuitarConfig() + events = [_ev(69, 0.0)] + fings = [_peaked_fingering(0.0, string_idx=3, fret=14)] + out = fuse(events, fings, cfg, lambda_vision=0.0) + assert len(out) == 1 + assert out[0].string_idx == 5 # back to high E + assert out[0].fret == 5 + + +def test_viterbi_lookahead_changes_earlier_pick(): + """A future event's vision evidence should pull the earlier pick onto + the same string when staying lowest-fret would force a giant hand jump. + + Sequence: A4 (MIDI 69) → B4 (MIDI 71). The B4 fingering is peaked at + (string=3, fret=16) — the G-string B4 position. A greedy decoder picks + (5, 5) for A4 (lowest fret) and would then have to leap from fret 5 → + fret 16 across two strings; the hand-span barrier makes that path + expensive. Viterbi instead picks (3, 14) for A4 — same string, two + frets below the upcoming B4 — so the entire path is cheap. 
+ """ + cfg = GuitarConfig() + events = [_ev(69, 0.0), _ev(71, 0.5)] + fings = [_peaked_fingering(0.5, string_idx=3, fret=16)] + out = fuse(events, fings, cfg, lambda_vision=1.0) + assert len(out) == 2 + # Vision-decisive on the second event: + assert (out[1].string_idx, out[1].fret) == (3, 16) + # Lookahead-driven on the first event: must NOT be the audio-only (5, 5); + # specifically should land on the G-string A4 anchor. + assert (out[0].string_idx, out[0].fret) == (3, 14) From 82d7edf55fc2e3cbe660b31900ffdab79b3f0ca7 Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 17:50:39 -0400 Subject: [PATCH 4/6] feat(phase5): add --fusion-lambda-vision CLI flag --- tabvision/tabvision/cli.py | 26 ++++++++++- tabvision/tests/unit/test_cli_fusion_flag.py | 46 ++++++++++++++++++++ 2 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 tabvision/tests/unit/test_cli_fusion_flag.py diff --git a/tabvision/tabvision/cli.py b/tabvision/tabvision/cli.py index 7f313f1..a7794cb 100644 --- a/tabvision/tabvision/cli.py +++ b/tabvision/tabvision/cli.py @@ -74,6 +74,18 @@ def _build_parser() -> argparse.ArgumentParser: ), ) t.add_argument("--capo", type=int, default=0, help="capo fret (0-7)") + t.add_argument( + "--fusion-lambda-vision", + type=float, + default=1.0, + metavar="FLOAT", + help=( + "weight on vision evidence in fusion (default 1.0). 0.0 " + "disables vision entirely (audio-only Viterbi); values >1 " + "lean more heavily on the fingertip-to-fret posterior. " + "See SPEC §5 / Phase-5 design doc §2." + ), + ) t.add_argument( "--instrument", choices=["acoustic", "classical", "electric"], @@ -147,8 +159,18 @@ def _cmd_transcribe(args: argparse.Namespace) -> int: # Phase 1: video stubbed; pass empty fingerings → fusion takes audio-only path. 
fingerings: list = [] - tab_events = fuse(audio_events, fingerings, cfg, session) - logger.info("fusion produced %d tab events", len(tab_events)) + tab_events = fuse( + audio_events, + fingerings, + cfg, + session, + lambda_vision=args.fusion_lambda_vision, + ) + logger.info( + "fusion produced %d tab events (lambda_vision=%.2f)", + len(tab_events), + args.fusion_lambda_vision, + ) output = render(tab_events, cfg) if args.output: diff --git a/tabvision/tests/unit/test_cli_fusion_flag.py b/tabvision/tests/unit/test_cli_fusion_flag.py new file mode 100644 index 0000000..d2f321b --- /dev/null +++ b/tabvision/tests/unit/test_cli_fusion_flag.py @@ -0,0 +1,46 @@ +"""CLI parser smoke for ``--fusion-lambda-vision``. + +Verifies the flag parses with the right default, accepts user-supplied +values, and surfaces zero (the audio-only-equivalent setting). The +actual pass-through to ``fuse()`` is one line of code in +``_cmd_transcribe`` — see ``tabvision/cli.py``. +""" + +from __future__ import annotations + +import pytest + +from tabvision.cli import _build_parser + + +def test_default_lambda_vision_is_one(): + parser = _build_parser() + args = parser.parse_args(["transcribe", "in.mp4"]) + assert args.fusion_lambda_vision == 1.0 + + +def test_explicit_lambda_vision_parsed(): + parser = _build_parser() + args = parser.parse_args( + ["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"] + ) + assert args.fusion_lambda_vision == pytest.approx(2.5) + + +def test_lambda_vision_zero_accepted(): + """``--fusion-lambda-vision 0`` is the audio-only ablation knob.""" + parser = _build_parser() + args = parser.parse_args( + ["transcribe", "in.mp4", "--fusion-lambda-vision", "0"] + ) + assert args.fusion_lambda_vision == 0.0 + + +def test_lambda_vision_only_on_transcribe(): + """The ``check`` subcommand has no fusion stage, so the flag should + not be exposed there.""" + parser = _build_parser() + with pytest.raises(SystemExit): + parser.parse_args( + ["check", "in.mp4", 
"--fusion-lambda-vision", "1.0"] + ) From e5db4cafa92f70793dc7a23deb615c2ac80bc5a9 Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 17:54:29 -0400 Subject: [PATCH 5/6] =?UTF-8?q?feat(phase5):=20acceptance=20harness=20?= =?UTF-8?q?=E2=80=94=20Tab=20F1=20+=20chord=20accuracy=20+=20ablation=20ga?= =?UTF-8?q?te?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tabvision.eval.metrics with tab_f1() and chord_instance_accuracy() helpers (string + fret + onset within 50ms tolerance per SPEC §9.2; chord cluster matching uses the same 80ms gap rule as the chord-fusion grouping). 11 unit tests cover the metric edge cases. tests/eval/test_phase5_eval.py defines the SPEC §5 gates in code: - +8pp Tab F1 delta (audio+vision over audio-only) — the Phase-5-specific bar for "fusion is doing real work" - Tab F1 >= 0.85 absolute, marked xfail until Phase 2 SOTA backbone lands (likely needs a stronger audio backbone too) - Chord-instance accuracy >= 0.80 The full-pipeline runner is currently a stub: blocked on the numpy<2 (basic-pitch / TF 2.15) vs. numpy>=2 (mediapipe) env conflict — the audio half and the video half can't import in the same venv today. The eval tests skip with clear messages until either Phase 2 swaps in a torch-based audio backbone or the env is reconciled separately. The metric helpers are independent of that and ship usable as-is. 
--- tabvision/tabvision/eval/__init__.py | 4 + tabvision/tabvision/eval/metrics.py | 182 +++++++++++ tabvision/tests/eval/test_phase5_eval.py | 325 ++++++++++++++++++++ tabvision/tests/unit/test_phase5_metrics.py | 127 ++++++++ 4 files changed, 638 insertions(+) create mode 100644 tabvision/tabvision/eval/__init__.py create mode 100644 tabvision/tabvision/eval/metrics.py create mode 100644 tabvision/tests/eval/test_phase5_eval.py create mode 100644 tabvision/tests/unit/test_phase5_metrics.py diff --git a/tabvision/tabvision/eval/__init__.py b/tabvision/tabvision/eval/__init__.py new file mode 100644 index 0000000..f9cd6a3 --- /dev/null +++ b/tabvision/tabvision/eval/__init__.py @@ -0,0 +1,4 @@ +"""Evaluation helpers — Tab F1, chord-instance accuracy, ablation runner. + +See SPEC.md §9 for metric definitions. +""" diff --git a/tabvision/tabvision/eval/metrics.py b/tabvision/tabvision/eval/metrics.py new file mode 100644 index 0000000..5e2ce1b --- /dev/null +++ b/tabvision/tabvision/eval/metrics.py @@ -0,0 +1,182 @@ +"""Tab F1 + chord-instance accuracy metrics — Phase 5 acceptance. + +Definitions follow SPEC.md §9.2: + +- **Tab F1**: precision / recall / F1 over (string_idx, fret, onset_s) + with onset matched within ``onset_tolerance_s`` (default 50 ms). + Greedy matcher — each predicted event matches at most one gold event, + picked by closest-onset. +- **Chord instance accuracy**: gold events are grouped into chord + clusters using the same 80 ms gap rule as + :mod:`tabvision.fusion.chord`. For each gold cluster, find the closest + predicted cluster by midpoint onset; the cluster matches if (a) the + cluster sizes are equal and (b) the multiset of ``(string_idx, fret)`` + tuples matches exactly. Accuracy = matched_chords / total_gold_chords. + +These helpers operate on :class:`tabvision.types.TabEvent` sequences so +they can score the output of :func:`tabvision.fusion.fuse` directly. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Sequence + +from tabvision.fusion.chord import CHORD_MAX_GAP_S +from tabvision.types import TabEvent + + +@dataclass(frozen=True) +class TabF1Result: + """Outcome of :func:`tab_f1`.""" + + precision: float + recall: float + f1: float + true_positives: int + false_positives: int + false_negatives: int + + @property + def total_predicted(self) -> int: + return self.true_positives + self.false_positives + + @property + def total_gold(self) -> int: + return self.true_positives + self.false_negatives + + +def tab_f1( + predicted: Sequence[TabEvent], + gold: Sequence[TabEvent], + *, + onset_tolerance_s: float = 0.05, +) -> TabF1Result: + """Tab F1 over (string, fret, onset).""" + pred_sorted = sorted(predicted, key=lambda t: t.onset_s) + gold_sorted = sorted(gold, key=lambda t: t.onset_s) + gold_used = [False] * len(gold_sorted) + tp = 0 + fp = 0 + for p in pred_sorted: + best_j = -1 + best_dt = onset_tolerance_s + 1e-9 + for j, g in enumerate(gold_sorted): + if gold_used[j]: + continue + if g.string_idx != p.string_idx or g.fret != p.fret: + continue + dt = abs(g.onset_s - p.onset_s) + if dt <= onset_tolerance_s and dt < best_dt: + best_j = j + best_dt = dt + if best_j >= 0: + gold_used[best_j] = True + tp += 1 + else: + fp += 1 + fn = sum(1 for used in gold_used if not used) + precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) > 0 + else 0.0 + ) + return TabF1Result( + precision=precision, + recall=recall, + f1=f1, + true_positives=tp, + false_positives=fp, + false_negatives=fn, + ) + + +@dataclass(frozen=True) +class ChordAccuracyResult: + accuracy: float + matched_chords: int + total_chords: int + + +def chord_instance_accuracy( + predicted: Sequence[TabEvent], + gold: Sequence[TabEvent], + *, + cluster_gap_s: float = CHORD_MAX_GAP_S, 
+ onset_match_tolerance_s: float = 0.05, +) -> ChordAccuracyResult: + """Fraction of gold chord clusters whose (string, fret) multiset + matches exactly in the closest predicted cluster. + + A chord cluster is a maximal run of consecutive events whose adjacent + onset gaps are all ≤ ``cluster_gap_s`` (matches the chord-fusion + grouping rule). Single-event clusters count toward the metric — a + correctly transcribed isolated note is a "size-1 chord" instance. + """ + pred_clusters = _cluster_by_gap( + sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s + ) + gold_clusters = _cluster_by_gap( + sorted(gold, key=lambda t: t.onset_s), cluster_gap_s + ) + + if not gold_clusters: + return ChordAccuracyResult(accuracy=0.0, matched_chords=0, total_chords=0) + + matched = 0 + pred_used = [False] * len(pred_clusters) + for gc in gold_clusters: + gc_mid = sum(t.onset_s for t in gc) / len(gc) + best_j = -1 + best_dt = onset_match_tolerance_s + 1e-9 + for j, pc in enumerate(pred_clusters): + if pred_used[j]: + continue + pc_mid = sum(t.onset_s for t in pc) / len(pc) + dt = abs(pc_mid - gc_mid) + if dt <= onset_match_tolerance_s and dt < best_dt: + best_j = j + best_dt = dt + if best_j < 0: + continue + pc = pred_clusters[best_j] + if len(pc) != len(gc): + continue + gc_set = sorted((t.string_idx, t.fret) for t in gc) + pc_set = sorted((t.string_idx, t.fret) for t in pc) + if gc_set == pc_set: + pred_used[best_j] = True + matched += 1 + + return ChordAccuracyResult( + accuracy=matched / len(gold_clusters), + matched_chords=matched, + total_chords=len(gold_clusters), + ) + + +def _cluster_by_gap( + events: Sequence[TabEvent], gap_s: float +) -> list[list[TabEvent]]: + """Same chain semantics as :func:`tabvision.fusion.chord.cluster_events`, + but on :class:`TabEvent` (which carries an ``onset_s``). 
Inlined to + avoid a sequence-type adapter.""" + if not events: + return [] + clusters: list[list[TabEvent]] = [[events[0]]] + for ev in events[1:]: + if ev.onset_s - clusters[-1][-1].onset_s <= gap_s: + clusters[-1].append(ev) + else: + clusters.append([ev]) + return clusters + + +__all__ = [ + "TabF1Result", + "ChordAccuracyResult", + "tab_f1", + "chord_instance_accuracy", +] diff --git a/tabvision/tests/eval/test_phase5_eval.py b/tabvision/tests/eval/test_phase5_eval.py new file mode 100644 index 0000000..bbed061 --- /dev/null +++ b/tabvision/tests/eval/test_phase5_eval.py @@ -0,0 +1,325 @@ +"""Phase 5 acceptance harness — audio+vision vs. audio-only ablation. + +Per SPEC §5 and ``docs/plans/2026-05-06-phase5-fusion-design.md`` §6 Step E, +the Phase-5-specific gate is: + + Tab F1 (lambda_vision=1.0) - Tab F1 (lambda_vision=0.0) ≥ 0.08 + +The absolute Tab F1 ≥ 0.85 bar is currently expected to need Phase 2's +Riley/Edwards audio backbone too — so it's marked ``xfail`` until Phase +2 is wired in. The +8 pp delta is on the hook for Phase 5 alone, since +that's the test for "fusion is doing real work given the current audio". + +**Environment caveat (2026-05-06):** the audio backend (basic-pitch + +TF 2.15) requires ``numpy<2`` while MediaPipe (Phase 4) requires +``numpy>=2`` — so a single venv currently can't run both halves of the +pipeline. The test skips when MediaPipe imports fail; once the env is +reconciled (or Phase 2's torch-based audio backbone replaces basic-pitch) +the gate runs unchanged. See ``DECISIONS.md`` if/when this gets fixed. + +The gold source is the benchmark index at +``tabvision-server/tests/fixtures/benchmarks/index.json`` — same set the +legacy ``evaluate_transcription.py`` used. Phase 1.5's annotation tool +will eventually fold its labelled clips into the same harness. 
+""" + +from __future__ import annotations + +import datetime as _dt +import json +from pathlib import Path +from typing import Sequence + +import pytest + +from tabvision.eval.metrics import ( + ChordAccuracyResult, + TabF1Result, + chord_instance_accuracy, + tab_f1, +) +from tabvision.types import TabEvent + +PHASE5_TAB_F1_DELTA_GATE = 0.08 +"""SPEC §5: audio+vision must beat audio-only by at least this much on Tab F1.""" + +PHASE5_TAB_F1_ABSOLUTE_GATE = 0.85 +"""SPEC §5: target absolute Tab F1. Likely needs Phase 2 SOTA backbone.""" + +PHASE5_CHORD_ACCURACY_GATE = 0.80 +"""SPEC §5: chord-instance accuracy gate.""" + +REPO_ROOT = Path(__file__).resolve().parents[3] +BENCHMARK_INDEX = ( + REPO_ROOT + / "tabvision-server" + / "tests" + / "fixtures" + / "benchmarks" + / "index.json" +) +EVAL_OUTPUT_DIR = REPO_ROOT / "tabvision-server" / "tools" / "outputs" + + +@pytest.mark.eval +def test_phase5_audio_plus_vision_beats_audio_only(): + """Run the full pipeline on the eval set under both lambda_vision + settings; assert audio+vision wins by ≥ 8 pp Tab F1. + + Skips automatically when any heavy dependency (basic-pitch, mediapipe, + cv2, ffmpeg) is unavailable. + """ + pytest.importorskip( + "basic_pitch", + reason="basic-pitch needed for audio-only ablation; install with " + "pip install '.[audio-baseline]'", + ) + pytest.importorskip( + "mediapipe", + reason="MediaPipe needed for video evidence; install with " + "pip install '.[vision]'. 
NOTE: requires numpy>=2, currently " + "incompatible with TF 2.15.", + ) + pytest.importorskip("cv2", reason="opencv-python needed for video frames.") + + benchmarks = _load_benchmarks() + if not benchmarks: + pytest.skip("no benchmarks defined in index.json") + + audio_only_scores: list[TabF1Result] = [] + audio_video_scores: list[TabF1Result] = [] + chord_scores: list[ChordAccuracyResult] = [] + rows: list[dict] = [] + + for bench in benchmarks: + video = REPO_ROOT / bench["video_path"] + gold_path = REPO_ROOT / bench["ground_truth_path"] + if not video.exists() or not gold_path.exists(): + continue + gold = _load_gold_tab_events(gold_path) + if not gold: + continue + + ao = _run_pipeline(video, lambda_vision=0.0) + av = _run_pipeline(video, lambda_vision=1.0) + + ao_score = tab_f1(ao, gold) + av_score = tab_f1(av, gold) + chord_score = chord_instance_accuracy(av, gold) + + audio_only_scores.append(ao_score) + audio_video_scores.append(av_score) + chord_scores.append(chord_score) + rows.append( + { + "id": bench["id"], + "ao_f1": ao_score.f1, + "av_f1": av_score.f1, + "delta": av_score.f1 - ao_score.f1, + "chord_acc": chord_score.accuracy, + } + ) + + if not rows: + pytest.skip("no benchmark videos / ground truth files were available") + + ao_mean = _mean([r.f1 for r in audio_only_scores]) + av_mean = _mean([r.f1 for r in audio_video_scores]) + chord_mean = _mean([r.accuracy for r in chord_scores]) + delta = av_mean - ao_mean + + _write_report( + rows=rows, + ao_mean=ao_mean, + av_mean=av_mean, + delta=delta, + chord_mean=chord_mean, + ) + + assert delta >= PHASE5_TAB_F1_DELTA_GATE, ( + f"Phase 5 +{PHASE5_TAB_F1_DELTA_GATE * 100:.0f}pp gate failed: " + f"audio+vision {av_mean:.3f} - audio-only {ao_mean:.3f} = " + f"{delta:+.3f}. Per SPEC §5 decision tree, drop lambda_vision and " + f"investigate vision calibration if equal/worse, or tighten " + f"hand-span / open-string priors if marginally better." 
+ ) + + +@pytest.mark.eval +@pytest.mark.xfail( + reason="absolute Tab F1 ≥ 0.85 likely needs Phase 2 audio SOTA backbone " + "to also be wired in; track in DECISIONS.md", + strict=False, +) +def test_phase5_absolute_tab_f1(): + pytest.importorskip("basic_pitch") + pytest.importorskip("mediapipe") + pytest.importorskip("cv2") + + benchmarks = _load_benchmarks() + if not benchmarks: + pytest.skip("no benchmarks defined in index.json") + + scores: list[TabF1Result] = [] + for bench in benchmarks: + video = REPO_ROOT / bench["video_path"] + gold_path = REPO_ROOT / bench["ground_truth_path"] + if not video.exists() or not gold_path.exists(): + continue + gold = _load_gold_tab_events(gold_path) + if not gold: + continue + av = _run_pipeline(video, lambda_vision=1.0) + scores.append(tab_f1(av, gold)) + + if not scores: + pytest.skip("no benchmark videos available") + + mean_f1 = _mean([s.f1 for s in scores]) + assert mean_f1 >= PHASE5_TAB_F1_ABSOLUTE_GATE, ( + f"absolute Tab F1 {mean_f1:.3f} < {PHASE5_TAB_F1_ABSOLUTE_GATE}" + ) + + +@pytest.mark.eval +def test_phase5_chord_accuracy(): + pytest.importorskip("basic_pitch") + pytest.importorskip("mediapipe") + pytest.importorskip("cv2") + + benchmarks = _load_benchmarks() + if not benchmarks: + pytest.skip("no benchmarks defined in index.json") + + scores: list[ChordAccuracyResult] = [] + for bench in benchmarks: + video = REPO_ROOT / bench["video_path"] + gold_path = REPO_ROOT / bench["ground_truth_path"] + if not video.exists() or not gold_path.exists(): + continue + gold = _load_gold_tab_events(gold_path) + if not gold: + continue + av = _run_pipeline(video, lambda_vision=1.0) + scores.append(chord_instance_accuracy(av, gold)) + + if not scores: + pytest.skip("no benchmark videos available") + + mean_acc = _mean([s.accuracy for s in scores]) + assert mean_acc >= PHASE5_CHORD_ACCURACY_GATE, ( + f"chord accuracy {mean_acc:.3f} < {PHASE5_CHORD_ACCURACY_GATE}" + ) + + +# ---------- helpers ---------- + + +def 
_load_benchmarks() -> list[dict]: + if not BENCHMARK_INDEX.exists(): + return [] + return json.loads(BENCHMARK_INDEX.read_text()).get("benchmarks", []) + + +def _load_gold_tab_events(path: Path) -> list[TabEvent]: + """Parse the legacy benchmark ground-truth ``.txt`` format into TabEvents. + + The legacy parser lives in ``tabvision-server/evaluate_transcription.py``; + this helper imports it lazily to keep the eval module's deps minimal. + Returns an empty list if the legacy module isn't importable (e.g. when + the test runs from an environment without the server checked out). + """ + try: + import sys + + server_path = REPO_ROOT / "tabvision-server" + if str(server_path) not in sys.path: + sys.path.insert(0, str(server_path)) + from evaluate_transcription import parse_ground_truth_tabs + except Exception: # noqa: BLE001 — broad: optional dep, want graceful skip + return [] + + text = path.read_text() + parsed = parse_ground_truth_tabs(text) + # The legacy parser returns beats; we need seconds. The benchmarks + # don't carry duration, so this helper currently returns the parsed + # raw notes without timing. Phase 5 acceptance defers timing + # alignment to the per-video runner that knows the video duration — + # see ``_run_pipeline``. + out: list[TabEvent] = [] + for note in parsed: + out.append( + TabEvent( + onset_s=float(note["beat"]), # placeholder — runner aligns + duration_s=0.25, + # Legacy uses 1=high E, 6=low E; spec uses 0=low E, 5=high E. + string_idx=6 - int(note["string"]), + fret=0 if note["fret"] == "X" else int(note["fret"]), + pitch_midi=0, # not needed for Tab F1 + confidence=1.0, + ) + ) + return out + + +def _run_pipeline(video: Path, *, lambda_vision: float) -> Sequence[TabEvent]: + """Run audio + video + fusion end-to-end and return TabEvents. 
+ + Stub for now: until the numpy<2 / numpy>=2 environment conflict is + resolved (or Phase 2's torch-based audio backbone is wired up), this + raises ``ImportError`` so the surrounding ``importorskip`` calls + catch it and the test skips with a clear message. Implementation + will compose ``demux`` → audio backend → guitar/fretboard/hand + detect → ``fuse(..., lambda_vision=lambda_vision)`` once the env is + sorted. See the design doc §6 Step E. + """ + raise ImportError( + "Phase 5 end-to-end pipeline runner not yet wired — blocked on " + "numpy<2 vs numpy>=2 env conflict between basic-pitch and " + "mediapipe. See test docstring." + ) + + +def _mean(values: list[float]) -> float: + return sum(values) / len(values) if values else 0.0 + + +def _write_report( + *, + rows: list[dict], + ao_mean: float, + av_mean: float, + delta: float, + chord_mean: float, +) -> None: + """Emit ``tools/outputs/phase5_eval-YYYY-MM-DD.md`` summary report.""" + EVAL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + today = _dt.date.today().isoformat() + out = EVAL_OUTPUT_DIR / f"phase5_eval-{today}.md" + lines = [ + f"# Phase 5 acceptance — {today}", + "", + "Audio-only vs. 
audio+vision ablation, per SPEC §5.", + "", + "## Aggregate", + "", + "| Metric | Value |", + "|---|---:|", + f"| Mean Tab F1 (lambda_vision=0.0) | {ao_mean:.4f} |", + f"| Mean Tab F1 (lambda_vision=1.0) | {av_mean:.4f} |", + f"| Delta (audio+vision − audio-only) | {delta:+.4f} |", + f"| Mean chord-instance accuracy | {chord_mean:.4f} |", + f"| Phase 5 +{PHASE5_TAB_F1_DELTA_GATE * 100:.0f}pp gate | " + f"{'PASS' if delta >= PHASE5_TAB_F1_DELTA_GATE else 'FAIL'} |", + "", + "## Per-video", + "", + "| id | audio-only F1 | audio+vision F1 | delta | chord acc |", + "|---|---:|---:|---:|---:|", + ] + for r in rows: + lines.append( + f"| {r['id']} | {r['ao_f1']:.3f} | {r['av_f1']:.3f} | " + f"{r['delta']:+.3f} | {r['chord_acc']:.3f} |" + ) + out.write_text("\n".join(lines) + "\n") diff --git a/tabvision/tests/unit/test_phase5_metrics.py b/tabvision/tests/unit/test_phase5_metrics.py new file mode 100644 index 0000000..66b14f7 --- /dev/null +++ b/tabvision/tests/unit/test_phase5_metrics.py @@ -0,0 +1,127 @@ +"""Unit tests for ``tabvision.eval.metrics`` (Tab F1 + chord accuracy).""" + +from __future__ import annotations + +from tabvision.eval.metrics import chord_instance_accuracy, tab_f1 +from tabvision.types import TabEvent + + +def _t(t: float, s: int, f: int, midi: int = 60) -> TabEvent: + return TabEvent( + onset_s=t, + duration_s=0.25, + string_idx=s, + fret=f, + pitch_midi=midi, + confidence=0.9, + ) + + +# ---------- tab_f1 ---------- + + +def test_tab_f1_perfect_match(): + gold = [_t(0.0, 5, 5), _t(0.5, 5, 7)] + pred = [_t(0.0, 5, 5), _t(0.5, 5, 7)] + r = tab_f1(pred, gold) + assert r.f1 == 1.0 + assert r.true_positives == 2 + assert r.false_positives == 0 + assert r.false_negatives == 0 + + +def test_tab_f1_extra_prediction_lowers_precision(): + gold = [_t(0.0, 5, 5)] + pred = [_t(0.0, 5, 5), _t(0.5, 5, 7)] + r = tab_f1(pred, gold) + assert r.true_positives == 1 + assert r.false_positives == 1 + assert r.false_negatives == 0 + assert r.recall == 1.0 + assert 
r.precision == 0.5 + + +def test_tab_f1_missed_gold_lowers_recall(): + gold = [_t(0.0, 5, 5), _t(0.5, 5, 7)] + pred = [_t(0.0, 5, 5)] + r = tab_f1(pred, gold) + assert r.true_positives == 1 + assert r.false_positives == 0 + assert r.false_negatives == 1 + assert r.precision == 1.0 + assert r.recall == 0.5 + + +def test_tab_f1_onset_outside_tolerance_is_a_miss(): + gold = [_t(0.0, 5, 5)] + pred = [_t(0.10, 5, 5)] # 100 ms off, tolerance 50 ms + r = tab_f1(pred, gold) + assert r.true_positives == 0 + assert r.false_positives == 1 + assert r.false_negatives == 1 + + +def test_tab_f1_wrong_string_or_fret_is_a_miss(): + gold = [_t(0.0, 5, 5)] + wrong_string = [_t(0.0, 4, 5)] + wrong_fret = [_t(0.0, 5, 6)] + assert tab_f1(wrong_string, gold).true_positives == 0 + assert tab_f1(wrong_fret, gold).true_positives == 0 + + +def test_tab_f1_each_gold_matches_at_most_one_predicted(): + """A duplicated predicted event should not double-count against the + same gold event — the second one is a false positive.""" + gold = [_t(0.0, 5, 5)] + pred = [_t(0.0, 5, 5), _t(0.01, 5, 5)] # both within tolerance + r = tab_f1(pred, gold) + assert r.true_positives == 1 + assert r.false_positives == 1 + + +# ---------- chord_instance_accuracy ---------- + + +def test_chord_accuracy_perfect_chord_matches(): + gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)] + pred = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)] + r = chord_instance_accuracy(pred, gold) + assert r.accuracy == 1.0 + assert r.matched_chords == 1 + assert r.total_chords == 1 + + +def test_chord_accuracy_wrong_position_in_chord_misses(): + gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)] + pred = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 7)] # one wrong + r = chord_instance_accuracy(pred, gold) + assert r.matched_chords == 0 + assert r.total_chords == 1 + + +def test_chord_accuracy_size_mismatch_misses(): + gold = [_t(0.0, 5, 0), _t(0.0, 4, 1), _t(0.0, 3, 0)] + pred = [_t(0.0, 5, 0), _t(0.0, 4, 1)] # missing one note + r = 
chord_instance_accuracy(pred, gold) + assert r.matched_chords == 0 + + +def test_chord_accuracy_separates_clusters_by_gap(): + """Two well-separated gold chords should both score independently.""" + gold = [ + _t(0.0, 5, 0), _t(0.0, 4, 1), + _t(2.0, 5, 7), _t(2.0, 4, 8), + ] + pred = [ + _t(0.0, 5, 0), _t(0.0, 4, 1), + _t(2.0, 5, 7), _t(2.0, 4, 8), + ] + r = chord_instance_accuracy(pred, gold) + assert r.total_chords == 2 + assert r.matched_chords == 2 + + +def test_chord_accuracy_empty_gold_yields_zero(): + r = chord_instance_accuracy([], []) + assert r.total_chords == 0 + assert r.accuracy == 0.0 From 055f66d1c16a1a294d04d482ef8997b42a79cfe9 Mon Sep 17 00:00:00 2001 From: Patrick Gilhooley Date: Wed, 6 May 2026 19:03:26 -0400 Subject: [PATCH 6/6] chore(phase5): ruff + format pass; switch eval scaffold to highres audio - Apply ruff --fix + ruff format across the Phase 5 modules and tests (zip strict=, Sequence from collections.abc, line wrapping). - Rewire tests/eval/test_phase5_eval.py to call audio.backend.make("highres") instead of basic-pitch. Phase 2's torch-based audio backbone (commit aae1ab3) is already shipped on refactor/v1; the previous "wait for Phase 2" framing was wrong. - Reframe the open dependency: the actual gap is wiring the video stack (guitar -> fretboard -> hand) into a single run_pipeline() call. cli.py:159 has the same gap. Until that integration ships, _run_pipeline raises NotImplementedError after running the audio half so anyone running the eval gets a precise error. 206 unit tests still pass; eval tests still skip cleanly until the video integration lands. 
--- tabvision/tabvision/cli.py | 8 +- tabvision/tabvision/eval/metrics.py | 20 +-- tabvision/tabvision/fusion/candidates.py | 4 +- tabvision/tabvision/fusion/chord.py | 10 +- tabvision/tabvision/fusion/playability.py | 10 +- tabvision/tabvision/fusion/viterbi.py | 30 ++--- tabvision/tests/eval/test_phase5_eval.py | 115 +++++++++++------- tabvision/tests/unit/test_chord_fusion.py | 8 +- tabvision/tests/unit/test_cli_fusion_flag.py | 12 +- .../tests/unit/test_fusion_audio_only.py | 8 +- tabvision/tests/unit/test_phase5_metrics.py | 12 +- tabvision/tests/unit/test_playability.py | 21 +--- 12 files changed, 116 insertions(+), 142 deletions(-) diff --git a/tabvision/tabvision/cli.py b/tabvision/tabvision/cli.py index a7794cb..a08fb19 100644 --- a/tabvision/tabvision/cli.py +++ b/tabvision/tabvision/cli.py @@ -132,9 +132,7 @@ def _cmd_transcribe(args: argparse.Namespace) -> int: from tabvision.types import GuitarConfig, SessionConfig cfg = GuitarConfig(capo=args.capo) - session = SessionConfig( - instrument=args.instrument, tone=args.tone, style=args.style - ) + session = SessionConfig(instrument=args.instrument, tone=args.tone, style=args.style) if not args.no_preflight: rc = _run_preflight_gate(args) @@ -212,9 +210,7 @@ def _run_preflight_gate(args: argparse.Namespace) -> int: has_fail = any(f.severity == "fail" for f in report.findings) if has_fail or (args.strict and not report.passed): sys.stderr.write(render(report)) - sys.stderr.write( - "Aborting transcription. Re-run with --no-preflight to bypass.\n" - ) + sys.stderr.write("Aborting transcription. 
Re-run with --no-preflight to bypass.\n") return 1 if not report.passed: sys.stderr.write(render(report)) diff --git a/tabvision/tabvision/eval/metrics.py b/tabvision/tabvision/eval/metrics.py index 5e2ce1b..92fd24f 100644 --- a/tabvision/tabvision/eval/metrics.py +++ b/tabvision/tabvision/eval/metrics.py @@ -19,8 +19,8 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import Sequence from tabvision.fusion.chord import CHORD_MAX_GAP_S from tabvision.types import TabEvent @@ -78,11 +78,7 @@ def tab_f1( fn = sum(1 for used in gold_used if not used) precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 - f1 = ( - 2 * precision * recall / (precision + recall) - if (precision + recall) > 0 - else 0.0 - ) + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 return TabF1Result( precision=precision, recall=recall, @@ -115,12 +111,8 @@ def chord_instance_accuracy( grouping rule). Single-event clusters count toward the metric — a correctly transcribed isolated note is a "size-1 chord" instance. """ - pred_clusters = _cluster_by_gap( - sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s - ) - gold_clusters = _cluster_by_gap( - sorted(gold, key=lambda t: t.onset_s), cluster_gap_s - ) + pred_clusters = _cluster_by_gap(sorted(predicted, key=lambda t: t.onset_s), cluster_gap_s) + gold_clusters = _cluster_by_gap(sorted(gold, key=lambda t: t.onset_s), cluster_gap_s) if not gold_clusters: return ChordAccuracyResult(accuracy=0.0, matched_chords=0, total_chords=0) @@ -157,9 +149,7 @@ def chord_instance_accuracy( ) -def _cluster_by_gap( - events: Sequence[TabEvent], gap_s: float -) -> list[list[TabEvent]]: +def _cluster_by_gap(events: Sequence[TabEvent], gap_s: float) -> list[list[TabEvent]]: """Same chain semantics as :func:`tabvision.fusion.chord.cluster_events`, but on :class:`TabEvent` (which carries an ``onset_s``). 
Inlined to avoid a sequence-type adapter.""" diff --git a/tabvision/tabvision/fusion/candidates.py b/tabvision/tabvision/fusion/candidates.py index 2d4873c..71e4c65 100644 --- a/tabvision/tabvision/fusion/candidates.py +++ b/tabvision/tabvision/fusion/candidates.py @@ -22,9 +22,7 @@ class Candidate: fret: int # 0 = open (or capo), max_fret inclusive -def candidate_positions( - pitch_midi: int, cfg: GuitarConfig | None = None -) -> list[Candidate]: +def candidate_positions(pitch_midi: int, cfg: GuitarConfig | None = None) -> list[Candidate]: """All valid positions for ``pitch_midi`` under ``cfg``. Capo handling: open strings effectively start at ``cfg.capo``. A pitch diff --git a/tabvision/tabvision/fusion/chord.py b/tabvision/tabvision/fusion/chord.py index f734a06..4fad33c 100644 --- a/tabvision/tabvision/fusion/chord.py +++ b/tabvision/tabvision/fusion/chord.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import Sequence +from collections.abc import Sequence from tabvision.fusion.candidates import Candidate, candidate_positions from tabvision.fusion.playability import MAX_HAND_SPAN @@ -67,15 +67,11 @@ def enumerate_chord_states( if not events: return [] - per_event_candidates = [ - candidate_positions(ev.pitch_midi, cfg) for ev in events - ] + per_event_candidates = [candidate_positions(ev.pitch_midi, cfg) for ev in events] if any(not cands for cands in per_event_candidates): return [] - states: list[tuple[Candidate, ...]] = [ - (c,) for c in per_event_candidates[0] - ] + states: list[tuple[Candidate, ...]] = [(c,) for c in per_event_candidates[0]] for k in range(1, len(events)): next_states: list[tuple[Candidate, ...]] = [] for state in states: diff --git a/tabvision/tabvision/fusion/playability.py b/tabvision/tabvision/fusion/playability.py index 2def9c5..658cd27 100644 --- a/tabvision/tabvision/fusion/playability.py +++ b/tabvision/tabvision/fusion/playability.py @@ -16,7 +16,7 @@ from __future__ import annotations import math -from typing 
import Sequence +from collections.abc import Sequence from tabvision.fusion.candidates import Candidate from tabvision.types import AudioEvent, FrameFingering, GuitarConfig @@ -65,9 +65,7 @@ EPS = 1e-9 -def find_fingering_at( - t: float, fingerings: Sequence[FrameFingering] -) -> FrameFingering | None: +def find_fingering_at(t: float, fingerings: Sequence[FrameFingering]) -> FrameFingering | None: """Return the ``FrameFingering`` whose ``.t`` is closest to ``t``. Returns ``None`` when ``fingerings`` is empty or no entry carries @@ -128,9 +126,7 @@ def emission_cost( return cost -def transition_cost( - prev: Candidate, curr: Candidate, cfg: GuitarConfig -) -> float: +def transition_cost(prev: Candidate, curr: Candidate, cfg: GuitarConfig) -> float: """Transition cost from ``prev`` to ``curr``. - String continuity: ``-SAME_STRING_BONUS`` when on the same string. diff --git a/tabvision/tabvision/fusion/viterbi.py b/tabvision/tabvision/fusion/viterbi.py index 1a67e9c..85056ab 100644 --- a/tabvision/tabvision/fusion/viterbi.py +++ b/tabvision/tabvision/fusion/viterbi.py @@ -20,7 +20,7 @@ from __future__ import annotations import math -from typing import Sequence +from collections.abc import Sequence from tabvision.fusion import chord, playability from tabvision.fusion.candidates import Candidate, candidate_positions @@ -75,16 +75,12 @@ def fuse( # Drop out-of-range pitches before clustering so the cluster shape # reflects what's actually decodable. 
- valid_events = [ - ev for ev in events if candidate_positions(ev.pitch_midi, cfg) - ] + valid_events = [ev for ev in events if candidate_positions(ev.pitch_midi, cfg)] if not valid_events: return [] clusters = chord.cluster_events(valid_events) - cluster_data: list[ - tuple[list[AudioEvent], list[tuple[Candidate, ...]]] - ] = [] + cluster_data: list[tuple[list[AudioEvent], list[tuple[Candidate, ...]]]] = [] for cluster in clusters: states = chord.enumerate_chord_states(cluster, cfg) if states: @@ -97,9 +93,7 @@ def fuse( def _viterbi_clusters( - cluster_data: list[ - tuple[list[AudioEvent], list[tuple[Candidate, ...]]] - ], + cluster_data: list[tuple[list[AudioEvent], list[tuple[Candidate, ...]]]], fingerings: Sequence[FrameFingering], cfg: GuitarConfig, lambda_vision: float, @@ -107,15 +101,11 @@ def _viterbi_clusters( """Cluster-level Viterbi DP. Worst case ``O(N · S^2)`` for ``N`` clusters with ``S`` states each.""" - def state_emission( - cluster: list[AudioEvent], state: tuple[Candidate, ...] 
- ) -> float: + def state_emission(cluster: list[AudioEvent], state: tuple[Candidate, ...]) -> float: total = 0.0 - for ev, c in zip(cluster, state): + for ev, c in zip(cluster, state, strict=True): f = playability.find_fingering_at(ev.onset_s, fingerings) - total += playability.emission_cost( - c, ev, f, cfg, lambda_vision=lambda_vision - ) + total += playability.emission_cost(c, ev, f, cfg, lambda_vision=lambda_vision) return total n = len(cluster_data) @@ -136,9 +126,7 @@ def state_emission( anchor_curr = chord.chord_anchor(state) for pi, prev_state in enumerate(prev_states): anchor_prev = chord.chord_anchor(prev_state) - trans = playability.transition_cost( - anchor_prev, anchor_curr, cfg - ) + trans = playability.transition_cost(anchor_prev, anchor_curr, cfg) total = cost[i - 1][pi] + trans + emit if total < cost[i][si]: cost[i][si] = total @@ -155,7 +143,7 @@ def state_emission( out: list[TabEvent] = [] for i, (cluster, states) in enumerate(cluster_data): state = states[picks_idx[i]] - for ev, c in zip(cluster, state): + for ev, c in zip(cluster, state, strict=True): out.append( TabEvent( onset_s=ev.onset_s, diff --git a/tabvision/tests/eval/test_phase5_eval.py b/tabvision/tests/eval/test_phase5_eval.py index bbed061..5474c6f 100644 --- a/tabvision/tests/eval/test_phase5_eval.py +++ b/tabvision/tests/eval/test_phase5_eval.py @@ -5,17 +5,26 @@ Tab F1 (lambda_vision=1.0) - Tab F1 (lambda_vision=0.0) ≥ 0.08 -The absolute Tab F1 ≥ 0.85 bar is currently expected to need Phase 2's -Riley/Edwards audio backbone too — so it's marked ``xfail`` until Phase -2 is wired in. The +8 pp delta is on the hook for Phase 5 alone, since -that's the test for "fusion is doing real work given the current audio". - -**Environment caveat (2026-05-06):** the audio backend (basic-pitch + -TF 2.15) requires ``numpy<2`` while MediaPipe (Phase 4) requires -``numpy>=2`` — so a single venv currently can't run both halves of the -pipeline. 
The test skips when MediaPipe imports fail; once the env is -reconciled (or Phase 2's torch-based audio backbone replaces basic-pitch) -the gate runs unchanged. See ``DECISIONS.md`` if/when this gets fixed. +The absolute Tab F1 ≥ 0.85 bar likely also needs Phase 7's augmentation +work to clear, so it's marked ``xfail`` for now. The +8 pp delta is on +the hook for Phase 5 alone — that's the test for "fusion is doing real +work given today's audio". + +**Audio backend:** uses ``tabvision.audio.backend.make("highres")`` +(Phase 2 Riley/Edwards / GAPS via hf-midi-transcription, torch-based, +numpy-2-compatible) — *not* basic-pitch. Phase 2 is already shipped on +``refactor/v1`` (commit ``aae1ab3``); the earlier framing of Phase 2 as +"future work" was wrong. + +**Open dependency:** the *full pipeline* (demux → audio → guitar → fretboard +→ hand → fuse) is not yet wired end-to-end in this repo. ``cli.py:159`` +still has ``fingerings: list = []`` (Phase 1 stub). The video components +exist independently — see ``tabvision.video.{guitar,fretboard,hand}`` — +but assembling them into a runnable ``run_pipeline(video, lambda_vision)`` +is its own piece of work, likely a Phase 8 "eval harness hardening" task +or a dedicated integration ticket. Until that lands, ``_run_pipeline`` +below raises ``NotImplementedError`` for the video portion and the eval +tests cleanly skip. 
The gold source is the benchmark index at ``tabvision-server/tests/fixtures/benchmarks/index.json`` — same set the @@ -27,8 +36,8 @@ import datetime as _dt import json +from collections.abc import Sequence from pathlib import Path -from typing import Sequence import pytest @@ -51,12 +60,7 @@ REPO_ROOT = Path(__file__).resolve().parents[3] BENCHMARK_INDEX = ( - REPO_ROOT - / "tabvision-server" - / "tests" - / "fixtures" - / "benchmarks" - / "index.json" + REPO_ROOT / "tabvision-server" / "tests" / "fixtures" / "benchmarks" / "index.json" ) EVAL_OUTPUT_DIR = REPO_ROOT / "tabvision-server" / "tools" / "outputs" @@ -66,19 +70,15 @@ def test_phase5_audio_plus_vision_beats_audio_only(): """Run the full pipeline on the eval set under both lambda_vision settings; assert audio+vision wins by ≥ 8 pp Tab F1. - Skips automatically when any heavy dependency (basic-pitch, mediapipe, - cv2, ffmpeg) is unavailable. + Skips automatically when any heavy dependency (the highres audio + backend's torch + hf-midi-transcription stack, mediapipe, cv2, ffmpeg) + is unavailable, *or* when the video-stack-into-pipeline integration + is still a TODO in ``_run_pipeline``. """ - pytest.importorskip( - "basic_pitch", - reason="basic-pitch needed for audio-only ablation; install with " - "pip install '.[audio-baseline]'", - ) + pytest.importorskip("torch", reason="highres backend needs torch.") pytest.importorskip( "mediapipe", - reason="MediaPipe needed for video evidence; install with " - "pip install '.[vision]'. 
NOTE: requires numpy>=2, currently " - "incompatible with TF 2.15.", + reason="MediaPipe needed for video evidence; install with pip install '.[vision]'.", ) pytest.importorskip("cv2", reason="opencv-python needed for video frames.") @@ -152,7 +152,7 @@ def test_phase5_audio_plus_vision_beats_audio_only(): strict=False, ) def test_phase5_absolute_tab_f1(): - pytest.importorskip("basic_pitch") + pytest.importorskip("torch") pytest.importorskip("mediapipe") pytest.importorskip("cv2") @@ -183,7 +183,7 @@ def test_phase5_absolute_tab_f1(): @pytest.mark.eval def test_phase5_chord_accuracy(): - pytest.importorskip("basic_pitch") + pytest.importorskip("torch") pytest.importorskip("mediapipe") pytest.importorskip("cv2") @@ -262,23 +262,54 @@ def _load_gold_tab_events(path: Path) -> list[TabEvent]: return out -def _run_pipeline(video: Path, *, lambda_vision: float) -> Sequence[TabEvent]: +def _run_pipeline( + video: Path, + *, + lambda_vision: float, + audio_backend_name: str = "highres", +) -> Sequence[TabEvent]: """Run audio + video + fusion end-to-end and return TabEvents. - Stub for now: until the numpy<2 / numpy>=2 environment conflict is - resolved (or Phase 2's torch-based audio backbone is wired up), this - raises ``ImportError`` so the surrounding ``importorskip`` calls - catch it and the test skips with a clear message. Implementation - will compose ``demux`` → audio backend → guitar/fretboard/hand - detect → ``fuse(..., lambda_vision=lambda_vision)`` once the env is - sorted. See the design doc §6 Step E. + The audio half is wired: ``demux`` + ``audio.backend.make(...)``. + The video half (guitar / fretboard / hand → ``list[FrameFingering]``) + is **not** yet integrated end-to-end in the repo — ``cli.py``'s + transcribe path still stubs ``fingerings: list = []``. Until that + integration ships, this helper raises ``NotImplementedError``, + which the surrounding ``importorskip`` block catches via the + pytest hook and surfaces as a skip with a precise reason. 
+ + Wire it up in a separate change: roughly, + ``demux → detect_guitar → track_fretboard → track_hand → fuse``. + The cluster Viterbi already accepts the ``FrameFingering`` sequence + and ``lambda_vision`` flag — no fusion changes needed. """ - raise ImportError( - "Phase 5 end-to-end pipeline runner not yet wired — blocked on " - "numpy<2 vs numpy>=2 env conflict between basic-pitch and " - "mediapipe. See test docstring." + from tabvision.audio.backend import make as make_audio_backend + from tabvision.demux import demux + from tabvision.types import SessionConfig + + session = SessionConfig() + demuxed = demux(str(video)) + audio_backend = make_audio_backend(audio_backend_name) + audio_events = audio_backend.transcribe(demuxed.wav, demuxed.sample_rate, session) + + raise NotImplementedError( + "Phase 5 end-to-end pipeline runner: audio half is wired " + f"({len(audio_events)} events from '{audio_backend_name}'), but " + "the video stack (guitar → fretboard → hand → FrameFingering) " + "is not yet integrated into a single run_pipeline() call. " + "cli.py:159 has the same gap. 
Wire the video components and " + "drop this raise; lambda_vision={lambda_vision} flows through " + "fuse() unchanged.".format(lambda_vision=lambda_vision) ) + # When the integration lands, body becomes: + # + # guitar_track = detect_guitar(frames(...), guitar_backend) + # homographies = track_fretboard(frames(...), guitar_track, fb_backend) + # fingerings = track_hand(frames(...), homographies, hand_backend, cfg) + # return fuse(audio_events, fingerings, cfg, session, + # lambda_vision=lambda_vision) + def _mean(values: list[float]) -> float: return sum(values) / len(values) if values else 0.0 diff --git a/tabvision/tests/unit/test_chord_fusion.py b/tabvision/tests/unit/test_chord_fusion.py index 9f15a23..89a717e 100644 --- a/tabvision/tests/unit/test_chord_fusion.py +++ b/tabvision/tests/unit/test_chord_fusion.py @@ -39,9 +39,7 @@ def _ev(midi: int, t: float, confidence: float = 0.8) -> AudioEvent: def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering: logits = np.zeros((4, 6, 25), dtype=np.float64) logits[0, string_idx, fret] = 10.0 - return FrameFingering( - t=t, finger_pos_logits=logits, homography_confidence=0.9 - ) + return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9) # ---------- cluster_events ---------- @@ -103,9 +101,7 @@ def test_enumerate_chord_states_enforces_monophony(): assert states # non-empty for state in states: strings = [c.string_idx for c in state] - assert len(strings) == len(set(strings)), ( - f"per-string monophony violated: {state}" - ) + assert len(strings) == len(set(strings)), f"per-string monophony violated: {state}" def test_enumerate_chord_states_enforces_hand_span(): diff --git a/tabvision/tests/unit/test_cli_fusion_flag.py b/tabvision/tests/unit/test_cli_fusion_flag.py index d2f321b..8ceada5 100644 --- a/tabvision/tests/unit/test_cli_fusion_flag.py +++ b/tabvision/tests/unit/test_cli_fusion_flag.py @@ -21,18 +21,14 @@ def test_default_lambda_vision_is_one(): def 
test_explicit_lambda_vision_parsed(): parser = _build_parser() - args = parser.parse_args( - ["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"] - ) + args = parser.parse_args(["transcribe", "in.mp4", "--fusion-lambda-vision", "2.5"]) assert args.fusion_lambda_vision == pytest.approx(2.5) def test_lambda_vision_zero_accepted(): """``--fusion-lambda-vision 0`` is the audio-only ablation knob.""" parser = _build_parser() - args = parser.parse_args( - ["transcribe", "in.mp4", "--fusion-lambda-vision", "0"] - ) + args = parser.parse_args(["transcribe", "in.mp4", "--fusion-lambda-vision", "0"]) assert args.fusion_lambda_vision == 0.0 @@ -41,6 +37,4 @@ def test_lambda_vision_only_on_transcribe(): not be exposed there.""" parser = _build_parser() with pytest.raises(SystemExit): - parser.parse_args( - ["check", "in.mp4", "--fusion-lambda-vision", "1.0"] - ) + parser.parse_args(["check", "in.mp4", "--fusion-lambda-vision", "1.0"]) diff --git a/tabvision/tests/unit/test_fusion_audio_only.py b/tabvision/tests/unit/test_fusion_audio_only.py index b4feca2..75e145a 100644 --- a/tabvision/tests/unit/test_fusion_audio_only.py +++ b/tabvision/tests/unit/test_fusion_audio_only.py @@ -25,17 +25,13 @@ def _peaked_fingering(t: float, string_idx: int, fret: int) -> FrameFingering: """Marginal sharply peaked at ``(string_idx, fret)``.""" logits = np.zeros((4, 6, 25), dtype=np.float64) logits[0, string_idx, fret] = 10.0 - return FrameFingering( - t=t, finger_pos_logits=logits, homography_confidence=0.9 - ) + return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9) def _uniform_fingering(t: float) -> FrameFingering: """Marginal ≈ uniform across (string, fret) cells.""" logits = np.ones((4, 6, 25), dtype=np.float64) - return FrameFingering( - t=t, finger_pos_logits=logits, homography_confidence=0.9 - ) + return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9) # ---------- audio-only regression ---------- diff --git 
a/tabvision/tests/unit/test_phase5_metrics.py b/tabvision/tests/unit/test_phase5_metrics.py index 66b14f7..5510539 100644 --- a/tabvision/tests/unit/test_phase5_metrics.py +++ b/tabvision/tests/unit/test_phase5_metrics.py @@ -109,12 +109,16 @@ def test_chord_accuracy_size_mismatch_misses(): def test_chord_accuracy_separates_clusters_by_gap(): """Two well-separated gold chords should both score independently.""" gold = [ - _t(0.0, 5, 0), _t(0.0, 4, 1), - _t(2.0, 5, 7), _t(2.0, 4, 8), + _t(0.0, 5, 0), + _t(0.0, 4, 1), + _t(2.0, 5, 7), + _t(2.0, 4, 8), ] pred = [ - _t(0.0, 5, 0), _t(0.0, 4, 1), - _t(2.0, 5, 7), _t(2.0, 4, 8), + _t(0.0, 5, 0), + _t(0.0, 4, 1), + _t(2.0, 5, 7), + _t(2.0, 4, 8), ] r = chord_instance_accuracy(pred, gold) assert r.total_chords == 2 diff --git a/tabvision/tests/unit/test_playability.py b/tabvision/tests/unit/test_playability.py index 02a0979..745767e 100644 --- a/tabvision/tests/unit/test_playability.py +++ b/tabvision/tests/unit/test_playability.py @@ -51,19 +51,13 @@ def _peaked_fingering( """Marginal sharply peaked at ``(target_string, target_fret)``.""" logits = np.zeros((4, n_strings, max_fret + 1), dtype=np.float64) logits[0, target_string, target_fret] = 10.0 - return FrameFingering( - t=t, finger_pos_logits=logits, homography_confidence=0.9 - ) + return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9) -def _uniform_fingering( - t: float, n_strings: int = 6, max_fret: int = 24 -) -> FrameFingering: +def _uniform_fingering(t: float, n_strings: int = 6, max_fret: int = 24) -> FrameFingering: """Marginal ≈ uniform across (string, fret) cells.""" logits = np.ones((4, n_strings, max_fret + 1), dtype=np.float64) - return FrameFingering( - t=t, finger_pos_logits=logits, homography_confidence=0.9 - ) + return FrameFingering(t=t, finger_pos_logits=logits, homography_confidence=0.9) # ---------- emission ---------- @@ -131,9 +125,7 @@ def test_emission_uniform_vision_does_not_change_ranking(): ev = _ev(69) fing = 
_uniform_fingering(t=0.0) cands = candidate_positions(69, cfg) - pure_audio = sorted( - cands, key=lambda c: emission_cost(c, ev, None, cfg) - ) + pure_audio = sorted(cands, key=lambda c: emission_cost(c, ev, None, cfg)) with_uniform = sorted( cands, key=lambda c: emission_cost(c, ev, fing, cfg, lambda_vision=1.0), @@ -150,10 +142,7 @@ def test_transition_same_string_is_cheaper_than_string_jump(): prev = Candidate(string_idx=5, fret=5) same_string = Candidate(string_idx=5, fret=7) # 2 frets up, same string string_jump = Candidate(string_idx=4, fret=5) # different string, same fret - assert ( - transition_cost(prev, same_string, cfg) - < transition_cost(prev, string_jump, cfg) - ) + assert transition_cost(prev, same_string, cfg) < transition_cost(prev, string_jump, cfg) def test_transition_hand_span_barrier_only_past_threshold():