ContextualWisdomLab · seonghobae · Apr 25, 2026 · Mar 29, 2026 · Mar 29, 2026 · coderabbitai
@@ -17,7 +17,7 @@ This document outlines the MECE execution strategy to incrementally substitute m
 - **Tech**: Integrate `demucs` (or a smaller alternative) running locally.
 - **Output**: 4 or 6 discrete stems (vocals, bass, drums, other).
 
-### Track 3: Harmonic & Pitch Pipelines (#107)
+### Track 3: Harmonic & Pitch Pipelines (#107) (COMPLETED)
-### Track 3: Harmonic & Pitch Pipelines (#107) (COMPLETED)
+### Track 3: Harmonic & Pitch Pipelines (`#107`) (COMPLETED)
+
+- **Goal**: Replace hardcoded `C#m7` strings with DSP-derived chord and pitch arrays.
-### Track 3: Harmonic & Pitch Pipelines (#107) (COMPLETED)
+### Track 3: Harmonic & Pitch Pipelines (`#107`) (COMPLETED)
+
+- **Goal**: Replace hardcoded `C#m7` strings with DSP-derived chord and pitch arrays.
 - **Goal**: Replace hardcoded `C#m7` strings with DSP-derived chord and pitch arrays.
 - **Tech**: Chromagram extraction and Viterbi decoding for chords. YIN/pYIN for pitch ranges.
 - **Output**: Accurate harmonic sequences tied to Track 1's beat grid.

@@ -0,0 +1,156 @@
+"""Chord recognizer using librosa's chromagrams."""
+
+from typing import TypedDict
+
+import librosa
+import numpy as np
+
+
+class TrackedChord(TypedDict):
+    """Result of chord recognition for a time segment."""
+
+    start_time: float  # segment start; ChordRecognizer fills it from librosa.frames_to_time
+    end_time: float  # segment end, taken from the same frame-time array
+    chord: str  # template label such as "C" or "C#m", or "N" for a no-chord segment
+
+
+class ChordRecognizer:
+    """Extracts chords from audio data.
+
+    Each chromagram frame is scored against 24 unit-norm major/minor triad
+    templates and takes the label of the best one. Frames that look like
+    silence or noise are labelled ``NO_CHORD`` instead, and runs of equal
+    labels are merged into time segments.
+    """
+
+    # Label used for segments where no chord is detected.
+    NO_CHORD = "N"
+
+    def __init__(
+        self,
+        *,
+        min_similarity: float = 0.3,
+        min_rms: float = 0.01,
+        min_chroma_var: float = 0.02,
+    ) -> None:
+        """Initialize the chord recognizer.
+
+        Args:
+            min_similarity: Frames whose best template score is below this
+                value are labelled as no-chord.
+            min_rms: Frames whose RMS energy is below this value are labelled
+                as no-chord.
+            min_chroma_var: Frames whose chroma variance is below this value
+                (a flat, noise-like frame) are labelled as no-chord.
+        """
+        self.min_similarity = min_similarity
+        self.min_rms = min_rms
+        self.min_chroma_var = min_chroma_var
+        # Rows 0-11 are major triads and rows 12-23 minor triads, rooted on
+        # C, C#, D, D#, E, F, F#, G, G#, A, A#, B.
+        self.templates = self._build_templates()
+        self.chord_labels = self._build_labels()
+
+    def _build_templates(self) -> np.ndarray:
+        """Build unit-norm chroma templates for 24 major and minor chords."""
+        templates = np.zeros((24, 12))
+        for root in range(12):
+            fifth = (root + 7) % 12
+            # Major triad: root, major third (+4), fifth (+7).
+            templates[root, [root, (root + 4) % 12, fifth]] = 1.0
+            # Minor triad: root, minor third (+3), fifth (+7).
+            templates[root + 12, [root, (root + 3) % 12, fifth]] = 1.0
+        # Every row holds exactly three ones, so its norm is never zero.
+        return templates / np.linalg.norm(templates, axis=1, keepdims=True)
+
+    def _build_labels(self) -> list[str]:
+        """Build labels in template-row order: 12 majors, then 12 minors."""
+        notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
+        return notes + [f"{note}m" for note in notes]
+
+    def recognize(self, y: np.ndarray, sr: int = 22050) -> list[TrackedChord]:
+        """
+        Recognize chords in an audio array using chromagrams.
+
+        Args:
+            y: Audio time series.
+            sr: Sampling rate.
+
+        Returns:
+            List of dictionaries containing start_time, end_time, and chord
+            string. Consecutive frames with the same label are merged into one
+            segment; the list is empty for empty input or when the chromagram
+            cannot be computed.
+        """
+        if len(y) == 0:
+            return []
+
+        # Harmonic-percussive separation is optional but helps; fall back to
+        # the raw signal if it fails.
+        try:
+            y_harmonic, _ = librosa.effects.hpss(y)
+        except Exception:
+            y_harmonic = y
+
+        try:
+            raw_chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
+        except Exception:
+            return []
+
+        if raw_chroma.size == 0:
+            return []
+        n_frames = raw_chroma.shape[1]
+
+        # Smoothing is best-effort noise reduction, so guard it like the other
+        # librosa calls: if the filter rejects the input (e.g. a clip of only a
+        # few frames), continue with the unsmoothed chromagram.
+        try:
+            chromagram = librosa.decompose.nn_filter(
+                raw_chroma, aggregate=np.median, metric="cosine"
+            )
+        except Exception:
+            chromagram = raw_chroma
+
+        # RMS energy is used to detect silence; align it to the chroma frames.
+        try:
+            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
+            if len(rms) < n_frames:
+                rms = np.pad(rms, (0, n_frames - len(rms)), mode="edge")
+            else:
+                rms = rms[:n_frames]
+        except Exception:
+            rms = np.ones(n_frames)
+
+        # (24, 12) templates x (12, n_frames) chroma -> (24, n_frames) scores.
+        # NOTE(review): only the templates are normalised in this class, so these
+        # are dot products, not cosine similarities - confirm the threshold scale.
+        similarity = np.dot(self.templates, chromagram)
+        best_matches = np.argmax(similarity, axis=0)
+
+        # Some template always scores highest, even on noise, so also reject
+        # frames that match weakly, are quiet, or have a flat chroma profile.
+        is_noise = (
+            (similarity.max(axis=0) < self.min_similarity)
+            | (rms < self.min_rms)
+            | (np.var(chromagram, axis=0) < self.min_chroma_var)
+        )
+        # Index 24 (one past the last template) encodes "no chord".
+        codes = np.where(is_noise, len(self.chord_labels), best_matches)
+
+        # Segment boundaries are the frames where the label changes.
+        changes = np.flatnonzero(np.diff(codes)) + 1
+        starts = np.concatenate(([0], changes))
+        ends = np.concatenate((changes, [n_frames]))
+        # One extra timestamp so the final segment also has an end time.
+        times = librosa.frames_to_time(np.arange(n_frames + 1), sr=sr)
+
+        # Index a plain list so every chord value is a built-in str.
+        labels = [*self.chord_labels, self.NO_CHORD]
+        return [
+            {
+                "start_time": float(times[start]),
+                "end_time": float(times[end]),
+                "chord": labels[codes[start]],
+            }
+            for start, end in zip(starts, ends)
+        ]
@@ -0,0 +1,85 @@
+"""Pitch tracker using librosa's pYIN or YIN algorithm."""
+
+from typing import Optional, TypedDict
+
+import librosa
+import numpy as np
+
+
+class TrackedPitchRange(TypedDict):
+    """Result of pitch tracking over an audio segment."""
+
+    lowest_note: Optional[str]  # note name from librosa.hz_to_note; None when no pitch was found
+    highest_note: Optional[str]  # note name from librosa.hz_to_note; None when no pitch was found
+    confidence: str  # "high" or "low", as assigned by PitchTracker.track
+
+
+class PitchTracker:
+    """Finds the lowest and highest notes present in audio data."""
+
+    def track(self, y: np.ndarray, sr: int = 22050) -> TrackedPitchRange:
+        """
+        Estimate the pitch range of an audio array with pYIN.
+
+        Args:
+            y: Audio time series.
+            sr: Sampling rate.
+
+        Returns:
+            Dictionary containing lowest_note, highest_note, and confidence.
+            Both notes are None with "low" confidence when the input is empty,
+            pYIN raises, no voiced pitch remains, or the mean voicing
+            probability is below 0.2.
+        """
+
+        def no_pitch() -> TrackedPitchRange:
+            # Build a fresh dict each time so results are never shared.
+            return {"lowest_note": None, "highest_note": None, "confidence": "low"}
+
+        if len(y) == 0:
+            return no_pitch()
+
+        # Search between C1 and C8. pYIN is accurate on monophonic signals
+        # but slower than simpler trackers.
+        search_floor = float(librosa.note_to_hz("C1"))
+        search_ceiling = float(librosa.note_to_hz("C8"))
+        try:
+            f0, voiced_flag, voiced_probs = librosa.pyin(
+                y, fmin=search_floor, fmax=search_ceiling, sr=sr
+            )
+        except Exception:
+            return no_pitch()
+
+        # Keep only non-NaN estimates from frames flagged as voiced.
+        pitches = np.array([]) if f0 is None else f0[voiced_flag]
+        pitches = pitches[np.logical_not(np.isnan(pitches))]
+        if len(pitches) == 0:
+            return no_pitch()
+
+        # With at least 10 frames, the 5th/95th percentiles trim spurious
+        # single-frame outliers; with fewer, use the plain extremes.
+        if len(pitches) >= 10:
+            low_hz = np.percentile(pitches, 5)
+            high_hz = np.percentile(pitches, 95)
+        else:
+            low_hz, high_hz = np.min(pitches), np.max(pitches)
+        low_name = librosa.hz_to_note(low_hz)
+        high_name = librosa.hz_to_note(high_hz)
+
+        # NOTE(review): the mean below runs over every frame pYIN returned,
+        # not just the voiced ones, so unvoiced stretches pull it down.
+        if voiced_probs is None or len(voiced_probs) == 0:
+            mean_prob = 0.0
+        else:
+            mean_prob = np.mean(voiced_probs[~np.isnan(voiced_probs)])
+
+        # A very low mean voicing probability is treated as unvoiced.
+        if mean_prob < 0.2:
+            return no_pitch()
+
+        # Normalise the Unicode sharp sign to ASCII '#'.
+        return {
+            "lowest_note": str(low_name).replace("♯", "#"),
+            "highest_note": str(high_name).replace("♯", "#"),
+            "confidence": "high" if mean_prob > 0.6 else "low",
+        }
@@ -12,6 +12,7 @@
     RehearsalRole,
     RoleExtractionResult,
     RoleType,
+    RangeSummary,
     SectionRoleTopology,
 )
 from .priority import calculate_rehearsal_priority
@@ -30,19 +31,68 @@ def __init__(self) -> None:
     def extract(
         self,
         sections: list[Any],
-        _audio_features: dict[str, Any] | None = None,
+        audio_features: dict[str, Any] | None = None,
     ) -> RoleExtractionResult:
         """Extract roles and their topology per section.
 
         Args:
             sections: List of section dicts (must contain 'id').
-            _audio_features: Optional audio features to inform extraction.
+            audio_features: Optional audio features to inform extraction.
 
         Returns:
             RoleExtractionResult containing topologies and notes.
         """
         topologies: list[SectionRoleTopology] = []
 
+        features = audio_features or {}
+        stems = features.get("stems", {})
+        sr = features.get("sr", 22050)
+
+        vocal_range: RangeSummary = {"lowestNote": "G#3", "highestNote": "C#5"}
+        vocal_chord = "C#m7"
+        bass_range: RangeSummary = {"lowestNote": "C#2", "highestNote": "E3"}
+        bass_chord = "C#m7"
+
+        # If we have real audio stems, extract real ranges and chords
+        if stems:
+            try:
+                from ..chords.chord_recognizer import ChordRecognizer
+                from ..ranges.pitch_tracker import PitchTracker
+
+                pitch_tracker = PitchTracker()
+                chord_recognizer = ChordRecognizer()
+
+                if "vocals" in stems:
+                    p_res = pitch_tracker.track(stems["vocals"], sr=sr)
+                    if p_res:
+                        vocal_range = {
+                            "lowestNote": p_res["lowest_note"] or "",
+                            "highestNote": p_res["highest_note"] or "",
+                        }
+
+                if "bass" in stems:
+                    p_res = pitch_tracker.track(stems["bass"], sr=sr)
+                    if p_res:
+                        bass_range = {
+                            "lowestNote": p_res["lowest_note"] or "",
+                            "highestNote": p_res["highest_note"] or "",
+                        }
+                    c_res = chord_recognizer.recognize(stems["bass"], sr=sr)
+                    if c_res and len(c_res) > 0:
+                        # Use the first recognized chord that is not "N" (no chord)
+                        valid_chords = [c["chord"] for c in c_res if c["chord"] != "N"]
+                        if valid_chords:
+                            bass_chord = valid_chords[0]
+
+                if "other" in stems:
+                    c_res = chord_recognizer.recognize(stems["other"], sr=sr)
+                    if c_res and len(c_res) > 0:
+                        valid_chords = [c["chord"] for c in c_res if c["chord"] != "N"]
+                        if valid_chords:
+                            vocal_chord = valid_chords[0]
+            except Exception as e:
+                logger.warning("Failed to extract features from stems: %s", e)
+
         # Simple mock implementation for testing/demonstration purposes
         for i, section in enumerate(sections):
             if not isinstance(section, dict):
@@ -55,25 +105,28 @@ def extract(
             else:
                 section_id = section.get("id", f"section-{i}")
 
-            # Create a mock bass role
             bass_role: RehearsalRole = {
                 "id": "bass-guitar",
                 "name": "Bass Guitar",
                 "roleType": RoleType.INSTRUMENT,
-                "harmony": {"chord": "C#m7", "functionLabel": "vi pedal anchor", "source": "model"},
+                "harmony": {
+                    "chord": bass_chord,
+                    "functionLabel": "vi pedal anchor",
+                    "source": "model",
+                },
                 "cue": {
                     "kind": CueAnchorKind.TRANSITION,
                     "value": "Hold through the pickup before the downbeat.",
                 },
-                "range": {"lowestNote": "C#2", "highestNote": "E3"},
+                "range": bass_range,
                 "confidence": {
                     "level": "medium",
                     "source": "model",
                     "notes": "Watch the slide into the turnaround.",
                 },
                 "rehearsalPriority": RehearsalPriority.HIGH,  # to be replaced
                 "simplification": "Stay on roots if the chorus entrance gets muddy.",
-                "setupNote": get_setup_note("Bass Guitar", ["C#m7"])
+                "setupNote": get_setup_note("Bass Guitar", [bass_chord])
                 or "Keep the attack short so the verse breathes.",
                 "manualOverrides": [],
                 "overlapWarnings": [
@@ -140,20 +193,20 @@ def extract(
                 "name": "Lead Vocal",
                 "roleType": RoleType.VOCAL,
                 "harmony": {
-                    "chord": "C#m7",
+                    "chord": vocal_chord,
                     "functionLabel": "vi melodic pull",
                     "source": "model",
                 },
                 "cue": {"kind": CueAnchorKind.LYRIC, "value": "city lights"},
-                "range": {"lowestNote": "G#3", "highestNote": "C#5"},
+                "range": vocal_range,
                 "confidence": {
                     "level": "high",
                     "source": "user",
                     "notes": "Singer confirmed the pickup phrasing in rehearsal notes.",
                 },
                 "rehearsalPriority": RehearsalPriority.MEDIUM,  # to be replaced
                 "simplification": "Keep sustained note centered; skip ad-lib on first pass.",
-                "setupNote": get_setup_note("Lead Vocal", ["C#m7"])
+                "setupNote": get_setup_note("Lead Vocal", [vocal_chord])
                 or "Watch the breath before the last line of the verse.",
                 "manualOverrides": [
                     {