-
Notifications
You must be signed in to change notification settings - Fork 0
feat(analysis): implement DSP-based harmonic and pitch pipelines (#107) #112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
| """Chord recognizer using librosa's chromagrams.""" | ||
|
|
||
| from typing import TypedDict | ||
|
|
||
| import librosa | ||
| import numpy as np | ||
|
|
||
|
|
||
class TrackedChord(TypedDict):
    """One contiguous time span that was assigned a single chord label.

    A chord recognition result is a list of these segments.
    """

    # Where the segment begins, in seconds.
    start_time: float
    # Where the segment ends, in seconds.
    end_time: float
    # Chord label for the segment, e.g. "C", "F#m", or "N" for no chord.
    chord: str
|
|
||
|
|
||
class ChordRecognizer:
    """Recognize major/minor triads in audio by chroma template matching.

    Each chromagram frame is compared against 24 unit-norm triad templates
    (12 major followed by 12 minor). Frames that look like silence or noise
    are labelled ``NO_CHORD``, and runs of equal labels are merged into
    timed segments.
    """

    # Frame-rejection gates. A frame is labelled NO_CHORD when ANY of these
    # trips. The values are empirical and may require tuning per domain.
    SIMILARITY_THRESHOLD = 0.3  # best template score below this -> no chord
    RMS_THRESHOLD = 0.01  # frame energy below this -> silence
    CHROMA_VARIANCE_THRESHOLD = 0.02  # flat chroma frame -> noise

    # Label used for frames/segments where no chord is detected.
    NO_CHORD = "N"

    # Framing shared by the chromagram, the RMS curve and the time axis.
    # Passing the hop explicitly everywhere keeps the three grids aligned.
    _FRAME_LENGTH = 2048
    _HOP_LENGTH = 512

    def __init__(self) -> None:
        """Precompute the triad templates and their matching labels."""
        # Pitch-class order: C, C#, D, D#, E, F, F#, G, G#, A, A#, B
        self.templates = self._build_templates()
        self.chord_labels = self._build_labels()

    def _build_templates(self) -> np.ndarray:
        """Build unit-norm chroma templates for 24 major and minor triads.

        Returns:
            Array of shape (24, 12): rows 0-11 are major triads rooted on
            pitch classes 0-11, rows 12-23 the minor triads on the same roots.
        """
        templates = np.zeros((24, 12))
        for root in range(12):
            fifth = (root + 7) % 12
            # Major triad: root, major third (+4), fifth (+7).
            templates[root, [root, (root + 4) % 12, fifth]] = 1.0
            # Minor triad: root, minor third (+3), fifth (+7).
            templates[root + 12, [root, (root + 3) % 12, fifth]] = 1.0

        # Every row holds exactly three ones, so its norm is never zero.
        return templates / np.linalg.norm(templates, axis=1, keepdims=True)

    def _build_labels(self) -> list[str]:
        """Build the labels for the template rows (12 major, then 12 minor)."""
        notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return notes + [f"{note}m" for note in notes]

    def recognize(self, y: np.ndarray, sr: int = 22050) -> list[TrackedChord]:
        """Recognize chords in an audio array using chromagrams.

        Args:
            y: Audio time series.
            sr: Sampling rate.

        Returns:
            List of dictionaries containing start_time, end_time, and chord
            string. Empty when ``y`` is empty or no chromagram can be computed.
        """
        if len(y) == 0:
            return []

        # Harmonic-percussive separation is optional but helps; on any
        # failure fall back to the unseparated signal.
        try:
            y_harmonic, _ = librosa.effects.hpss(y)
        except Exception:
            y_harmonic = y

        # Without a chromagram there is nothing to match against.
        try:
            chromagram = librosa.feature.chroma_cqt(
                y=y_harmonic, sr=sr, hop_length=self._HOP_LENGTH
            )
        except Exception:
            return []

        if chromagram.size == 0:
            return []

        # Temporal smoothing is only a noise-reduction refinement, so a
        # failure here must not abort recognition: keep the raw chromagram.
        try:
            chromagram = librosa.decompose.nn_filter(
                chromagram, aggregate=np.median, metric="cosine"
            )
        except Exception:
            pass

        n_frames = chromagram.shape[1]
        rms = self._frame_rms(y, n_frames)
        labels = self._label_frames(chromagram, rms)

        # n_frames + 1 boundaries, so that frame i spans [i, i + 1).
        boundaries = librosa.frames_to_time(
            np.arange(n_frames + 1), sr=sr, hop_length=self._HOP_LENGTH
        )
        return self._merge_frames(labels, boundaries)

    def _frame_rms(self, y: np.ndarray, n_frames: int) -> np.ndarray:
        """Return per-frame RMS energy with exactly ``n_frames`` entries.

        The curve is edge-padded or truncated to match the chromagram. If
        the energy cannot be computed, all-ones is returned so the RMS gate
        never fires and only the chroma-based gates apply.
        """
        try:
            rms = librosa.feature.rms(
                y=y, frame_length=self._FRAME_LENGTH, hop_length=self._HOP_LENGTH
            )[0]
            if len(rms) < n_frames:
                return np.pad(rms, (0, n_frames - len(rms)), mode="edge")
            return rms[:n_frames]
        except Exception:
            return np.ones(n_frames)

    def _label_frames(self, chromagram: np.ndarray, rms: np.ndarray) -> list[str]:
        """Assign a chord label (or ``NO_CHORD``) to every chromagram frame.

        Args:
            chromagram: Array of shape (12, n_frames).
            rms: Per-frame energy of length n_frames.

        Returns:
            One label per frame.
        """
        # (24, 12) @ (12, n_frames) -> (24, n_frames) template scores.
        similarity = np.dot(self.templates, chromagram)
        best = np.argmax(similarity, axis=0)
        best_score = similarity[best, np.arange(chromagram.shape[1])]

        # The dot product always matches *some* template, even for noise,
        # so also reject frames whose chroma is nearly flat or whose energy
        # is very low.
        chroma_var = np.var(chromagram, axis=0)
        rejected = (
            (best_score < self.SIMILARITY_THRESHOLD)
            | (rms < self.RMS_THRESHOLD)
            | (chroma_var < self.CHROMA_VARIANCE_THRESHOLD)
        )
        return [
            self.NO_CHORD if bad else self.chord_labels[idx]
            for idx, bad in zip(best, rejected)
        ]

    @staticmethod
    def _merge_frames(labels, boundaries) -> list[TrackedChord]:
        """Collapse runs of identical frame labels into timed segments.

        Args:
            labels: One label per frame.
            boundaries: ``len(labels) + 1`` timestamps; frame ``i`` spans
                ``boundaries[i]`` to ``boundaries[i + 1]``.

        Returns:
            Segments in chronological order; empty when there are no frames.
        """
        segments: list[TrackedChord] = []
        n_frames = len(labels)
        run_start = 0
        for i in range(1, n_frames + 1):
            # Close the current run at a label change or after the last frame.
            if i == n_frames or labels[i] != labels[run_start]:
                segments.append(
                    {
                        "start_time": float(boundaries[run_start]),
                        "end_time": float(boundaries[i]),
                        "chord": labels[run_start],
                    }
                )
                run_start = i
        return segments
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| """Pitch tracker using librosa's pYIN or YIN algorithm.""" | ||
|
|
||
| from typing import Optional, TypedDict | ||
|
|
||
| import librosa | ||
| import numpy as np | ||
|
|
||
|
|
||
class TrackedPitchRange(TypedDict):
    """Summary of the pitch span detected in one audio segment."""

    # Lowest detected note name, or None when no pitch was found.
    lowest_note: Optional[str]
    # Highest detected note name, or None when no pitch was found.
    highest_note: Optional[str]
    # Confidence label for the detection: "high" or "low".
    confidence: str
|
|
||
|
|
||
class PitchTracker:
    """Extract the lowest and highest notes from audio data using pYIN."""

    # Mean voicing probability above which the result is reported as "high".
    HIGH_CONFIDENCE_THRESHOLD = 0.6
    # Mean voicing probability below which the signal is treated as unvoiced.
    UNVOICED_THRESHOLD = 0.2
    # Below this many voiced frames, percentiles are not meaningful and the
    # plain min/max is used instead.
    MIN_FRAMES_FOR_PERCENTILES = 10

    def track(self, y: np.ndarray, sr: int = 22050) -> TrackedPitchRange:
        """Track pitch in an audio array and return the lowest/highest note.

        Args:
            y: Audio time series.
            sr: Sampling rate.

        Returns:
            Dictionary containing lowest_note, highest_note, and confidence.
            Both notes are None (confidence "low") when the input is empty,
            pitch tracking fails, no voiced frame is found, or the mean
            voicing probability is below ``UNVOICED_THRESHOLD``.
        """
        if len(y) == 0:
            return self._no_pitch()

        # Search range for the fundamental frequency.
        fmin = float(librosa.note_to_hz("C1"))
        fmax = float(librosa.note_to_hz("C8"))

        # Best-effort: any pYIN failure is reported as "no pitch found".
        try:
            f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)
        except Exception:
            return self._no_pitch()

        bounds = self._voiced_pitch_bounds(f0, voiced_flag)
        if bounds is None:
            return self._no_pitch()

        avg_prob = self._mean_voiced_probability(voiced_probs)
        # If the average probability is very low, treat as unvoiced.
        if avg_prob < self.UNVOICED_THRESHOLD:
            return self._no_pitch()

        low_hz, high_hz = bounds
        # Note names may contain the Unicode sharp sign; normalize to '#'.
        return {
            "lowest_note": str(librosa.hz_to_note(low_hz)).replace("♯", "#"),
            "highest_note": str(librosa.hz_to_note(high_hz)).replace("♯", "#"),
            "confidence": "high" if avg_prob > self.HIGH_CONFIDENCE_THRESHOLD else "low",
        }

    @staticmethod
    def _no_pitch() -> TrackedPitchRange:
        """Return a fresh 'nothing detected' result."""
        return {"lowest_note": None, "highest_note": None, "confidence": "low"}

    @classmethod
    def _voiced_pitch_bounds(cls, f0, voiced_flag):
        """Return ``(low_hz, high_hz)`` over the voiced frames, or None.

        NaN estimates are dropped. With at least
        ``MIN_FRAMES_FOR_PERCENTILES`` voiced frames, the 5th and 95th
        percentiles are used so that spurious single-frame errors do not
        set the range; with fewer frames the plain min and max are used.
        """
        if f0 is None:
            return None
        voiced = f0[voiced_flag]
        voiced = voiced[~np.isnan(voiced)]
        if len(voiced) == 0:
            return None
        if len(voiced) < cls.MIN_FRAMES_FOR_PERCENTILES:
            return float(np.min(voiced)), float(np.max(voiced))
        return float(np.percentile(voiced, 5)), float(np.percentile(voiced, 95))

    @staticmethod
    def _mean_voiced_probability(voiced_probs) -> float:
        """Return the mean of the non-NaN voicing probabilities.

        Returns 0.0 when there are no usable values. Averaging an empty
        selection would give NaN, which satisfies neither the "high" nor
        the "unvoiced" comparison and would let a result through with no
        supporting evidence.
        """
        if voiced_probs is None or len(voiced_probs) == 0:
            return 0.0
        usable = voiced_probs[~np.isnan(voiced_probs)]
        if usable.size == 0:
            return 0.0
        return float(np.mean(usable))
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| RehearsalRole, | ||
| RoleExtractionResult, | ||
| RoleType, | ||
| RangeSummary, | ||
| SectionRoleTopology, | ||
| ) | ||
| from .priority import calculate_rehearsal_priority | ||
|
|
@@ -30,19 +31,68 @@ def __init__(self) -> None: | |
| def extract( | ||
| self, | ||
| sections: list[Any], | ||
| _audio_features: dict[str, Any] | None = None, | ||
| audio_features: dict[str, Any] | None = None, | ||
| ) -> RoleExtractionResult: | ||
| """Extract roles and their topology per section. | ||
|
|
||
| Args: | ||
| sections: List of section dicts (must contain 'id'). | ||
| _audio_features: Optional audio features to inform extraction. | ||
| audio_features: Optional audio features to inform extraction. | ||
|
|
||
| Returns: | ||
| RoleExtractionResult containing topologies and notes. | ||
| """ | ||
| topologies: list[SectionRoleTopology] = [] | ||
|
|
||
| features = audio_features or {} | ||
| stems = features.get("stems", {}) | ||
| sr = features.get("sr", 22050) | ||
|
|
||
| vocal_range: RangeSummary = {"lowestNote": "G#3", "highestNote": "C#5"} | ||
| vocal_chord = "C#m7" | ||
| bass_range: RangeSummary = {"lowestNote": "C#2", "highestNote": "E3"} | ||
| bass_chord = "C#m7" | ||
|
|
||
| # If we have real audio stems, extract real ranges and chords | ||
| if stems: | ||
| try: | ||
| from ..chords.chord_recognizer import ChordRecognizer | ||
| from ..ranges.pitch_tracker import PitchTracker | ||
|
|
||
| pitch_tracker = PitchTracker() | ||
| chord_recognizer = ChordRecognizer() | ||
|
|
||
| if "vocals" in stems: | ||
| p_res = pitch_tracker.track(stems["vocals"], sr=sr) | ||
| if p_res: | ||
| vocal_range = { | ||
| "lowestNote": p_res["lowest_note"] or "", | ||
| "highestNote": p_res["highest_note"] or "", | ||
| } | ||
|
Comment on lines
+68
to
+71
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Description: Check how lowestNote/highestNote are used downstream
rg -n "lowestNote|highestNote" --type py -A 3Repository: seonghobae/bandscope Length of output: 10109 🏁 Script executed: # Find the _overlap_severity and _ranges_overlap functions to see how they handle the values
rg -n "_overlap_severity|_ranges_overlap" --type py -A 5 services/analysis-engine/src/bandscope_analysis/ranges/analyzer.pyRepository: seonghobae/bandscope Length of output: 1053 🏁 Script executed: # Check for any validation patterns around lowestNote/highestNote usage
rg -n "lowestNote|highestNote" services/analysis-engine/src/bandscope_analysis/ranges/analyzer.py -B 3 | head -50Repository: seonghobae/bandscope Length of output: 1021 🏁 Script executed: # Get the full implementation of _ranges_overlap and _overlap_severity functions
rg -n "_ranges_overlap|_overlap_severity" services/analysis-engine/src/bandscope_analysis/ranges/analyzer.py -A 15 | head -80Repository: seonghobae/bandscope Length of output: 2252 🏁 Script executed: # Check test files for empty string test cases related to ranges
rg -n '"".*lowestNote|"".*highestNote|lowestNote.*""' services/analysis-engine/tests/ --type pyRepository: seonghobae/bandscope Length of output: 46 🏁 Script executed: # Find _note_to_midi function and see how it handles input
rg -n "_note_to_midi" services/analysis-engine/src/bandscope_analysis/ranges/analyzer.py -A 8Repository: seonghobae/bandscope Length of output: 1310 🏁 Script executed: # Check imports at the top of the analyzer.py file
head -30 services/analysis-engine/src/bandscope_analysis/ranges/analyzer.pyRepository: seonghobae/bandscope Length of output: 552 🏁 Script executed: # Get complete implementation of _note_to_midi function
sed -n '73,107p' services/analysis-engine/src/bandscope_analysis/ranges/analyzer.pyRepository: seonghobae/bandscope Length of output: 774 🏁 Script executed: # Check the full context around where _ranges_overlap is called
sed -n '200,240p' services/analysis-engine/src/bandscope_analysis/ranges/analyzer.pyRepository: seonghobae/bandscope Length of output: 1829 🏁 Script executed: # Find _parse_note function
rg -n "_parse_note" services/analysis-engine/src/bandscope_analysis/ranges/analyzer.py -A 10Repository: seonghobae/bandscope Length of output: 787 🏁 Script executed: # Get full implementation of _parse_note
sed -n '39,71p' services/analysis-engine/src/bandscope_analysis/ranges/analyzer.pyRepository: seonghobae/bandscope Length of output: 1363 빈 문자열 기본값이 조용히 C4로 변환되어 부정확한 범위 감지를 초래할 수 있습니다
🤖 Prompt for AI Agents |
||
|
|
||
| if "bass" in stems: | ||
| p_res = pitch_tracker.track(stems["bass"], sr=sr) | ||
| if p_res: | ||
| bass_range = { | ||
| "lowestNote": p_res["lowest_note"] or "", | ||
| "highestNote": p_res["highest_note"] or "", | ||
| } | ||
| c_res = chord_recognizer.recognize(stems["bass"], sr=sr) | ||
| if c_res and len(c_res) > 0: | ||
| # Use the most common chord or first chord | ||
| valid_chords = [c["chord"] for c in c_res if c["chord"] != "N"] | ||
| if valid_chords: | ||
| bass_chord = valid_chords[0] | ||
|
|
||
| if "other" in stems: | ||
| c_res = chord_recognizer.recognize(stems["other"], sr=sr) | ||
| if c_res and len(c_res) > 0: | ||
| valid_chords = [c["chord"] for c in c_res if c["chord"] != "N"] | ||
| if valid_chords: | ||
| vocal_chord = valid_chords[0] | ||
| except Exception as e: | ||
| logger.warning("Failed to extract features from stems: %s", e) | ||
|
|
||
| # Simple mock implementation for testing/demonstration purposes | ||
| for i, section in enumerate(sections): | ||
| if not isinstance(section, dict): | ||
|
|
@@ -55,25 +105,28 @@ def extract( | |
| else: | ||
| section_id = section.get("id", f"section-{i}") | ||
|
|
||
| # Create a mock bass role | ||
| bass_role: RehearsalRole = { | ||
| "id": "bass-guitar", | ||
| "name": "Bass Guitar", | ||
| "roleType": RoleType.INSTRUMENT, | ||
| "harmony": {"chord": "C#m7", "functionLabel": "vi pedal anchor", "source": "model"}, | ||
| "harmony": { | ||
| "chord": bass_chord, | ||
| "functionLabel": "vi pedal anchor", | ||
| "source": "model", | ||
| }, | ||
| "cue": { | ||
| "kind": CueAnchorKind.TRANSITION, | ||
| "value": "Hold through the pickup before the downbeat.", | ||
| }, | ||
| "range": {"lowestNote": "C#2", "highestNote": "E3"}, | ||
| "range": bass_range, | ||
| "confidence": { | ||
| "level": "medium", | ||
| "source": "model", | ||
| "notes": "Watch the slide into the turnaround.", | ||
| }, | ||
| "rehearsalPriority": RehearsalPriority.HIGH, # to be replaced | ||
| "simplification": "Stay on roots if the chorus entrance gets muddy.", | ||
| "setupNote": get_setup_note("Bass Guitar", ["C#m7"]) | ||
| "setupNote": get_setup_note("Bass Guitar", [bass_chord]) | ||
| or "Keep the attack short so the verse breathes.", | ||
| "manualOverrides": [], | ||
| "overlapWarnings": [ | ||
|
|
@@ -140,20 +193,20 @@ def extract( | |
| "name": "Lead Vocal", | ||
| "roleType": RoleType.VOCAL, | ||
| "harmony": { | ||
| "chord": "C#m7", | ||
| "chord": vocal_chord, | ||
| "functionLabel": "vi melodic pull", | ||
| "source": "model", | ||
| }, | ||
| "cue": {"kind": CueAnchorKind.LYRIC, "value": "city lights"}, | ||
| "range": {"lowestNote": "G#3", "highestNote": "C#5"}, | ||
| "range": vocal_range, | ||
| "confidence": { | ||
| "level": "high", | ||
| "source": "user", | ||
| "notes": "Singer confirmed the pickup phrasing in rehearsal notes.", | ||
| }, | ||
| "rehearsalPriority": RehearsalPriority.MEDIUM, # to be replaced | ||
| "simplification": "Keep sustained note centered; skip ad-lib on first pass.", | ||
| "setupNote": get_setup_note("Lead Vocal", ["C#m7"]) | ||
| "setupNote": get_setup_note("Lead Vocal", [vocal_chord]) | ||
| or "Watch the breath before the last line of the verse.", | ||
| "manualOverrides": [ | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Markdown 포맷팅: 제목 아래 빈 줄 필요
정적 분석 도구(markdownlint MD022)에서 제목 아래에 빈 줄이 필요하다고 경고합니다.
📝 포맷팅 수정 제안
### Track 3: Harmonic & Pitch Pipelines (`#107`) (COMPLETED)
+
- **Goal**: Replace hardcoded `C#m7` strings with DSP-derived chord and pitch arrays.

📝 Committable suggestion
🧰 Tools
🪛 markdownlint-cli2 (0.22.0)
[warning] 20-20: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below
(MD022, blanks-around-headings)
🤖 Prompt for AI Agents