-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsound_util.py
More file actions
276 lines (223 loc) · 10.8 KB
/
sound_util.py
File metadata and controls
276 lines (223 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import librosa
import numpy as np
from numpy import ndarray
from sound import Sound
class SoundUtil:
    """
    Stateless audio-processing helpers operating on Sound objects.

    All methods are static and never mutate their input; they return new
    Sound instances (or the input itself for identity operations).  Audio
    data is a NumPy array, either mono (1D, shape (n_samples,)) or
    channels-first multi-channel (2D, shape (n_channels, n_samples)).
    """

    @staticmethod
    def resample(sound: Sound, target_sr: int, res_type: str = "soxr_hq") -> Sound:
        """
        Returns new Sound with target sample rate (or identity if unneeded).

        Args:
            sound: Source Sound.
            target_sr: Desired sample rate in Hz.
            res_type: librosa resampler backend name (default "soxr_hq").
        """
        if sound.sr == target_sr:
            return sound  # Already at the requested rate
        new_data = librosa.resample(
            sound.data, orig_sr=sound.sr, target_sr=target_sr, res_type=res_type
        )
        return Sound(new_data, target_sr)

    @staticmethod
    def trim(sound: Sound, start_time: float | None, end_time: float | None) -> Sound:
        """
        Returns a new Sound containing samples from start_time to end_time.

        Args:
            sound: Source Sound.
            start_time: Start of the kept region in seconds (None means 0).
            end_time: End of the kept region in seconds (None or values past
                the end are clamped to the sound's duration).

        Returns:
            The input Sound unchanged when both bounds are None; otherwise a
            new Sound holding a copy of the selected samples.

        Raises:
            ValueError: If the resolved sample range is empty or out of
                bounds (e.g. negative start, or start >= end).
        """
        if start_time is None and end_time is None:
            return sound  # identity
        if start_time is None:
            start_time = 0
        if end_time is None or end_time > sound.duration:
            end_time = sound.duration
        n_samples = sound.data.shape[-1]
        start_samples = int(start_time * sound.sr)
        # Clamp in the sample domain too: duration * sr may round slightly
        # past the real sample count.
        end_samples = min(int(end_time * sound.sr), n_samples)
        # Ensure valid trim range
        if not (0 <= start_samples < end_samples <= n_samples):
            raise ValueError(f"Invalid trim range - start {start_time} end {end_time} duration {sound.duration}")
        # Slice along last axis (samples) - works for both mono (1D) and stereo (2D channels-first)
        if sound.data.ndim == 1:
            trimmed_data = sound.data[start_samples:end_samples]
        else:
            trimmed_data = sound.data[:, start_samples:end_samples]
        # Copy so the result owns its memory rather than viewing the source
        return Sound(np.copy(trimmed_data), sound.sr)

    @staticmethod
    def trim_using_segments(
        sound: Sound,
        segments: list[tuple[float, float]],
        position: float = 0,
    ) -> tuple[Sound, float]:
        """
        Effectively cuts out segments from given sound data.

        Keeps only the listed segments (concatenated in the given order) and
        remaps a playback position from the original timeline onto the new,
        shorter timeline.

        Args:
            sound: The source Sound.
            segments: List of (start_time, end_time) tuples in seconds.
                Out-of-range times are clamped; empty/inverted segments are
                skipped.
            position: Playback position in the ORIGINAL sound, in seconds.

        Returns:
            Tuple of (new Sound with the segments concatenated in order,
            remapped position in seconds).  Returns the identity Sound and
            unchanged position if the segments list is empty; returns an
            empty Sound and position 0.0 if every segment is invalid.

        NOTE(review): concatenation uses the caller's segment order while the
        position remap sorts segments by start time - for overlapping or
        unordered segments the remapped position may not match the output
        audio; confirm callers always pass sorted, disjoint segments.
        """
        if not segments:
            return sound, position  # Return identity for empty segments

        segment_data = []
        n_samples = sound.data.shape[-1]
        for start_time, end_time in segments:
            # Convert to samples and clamp to valid range
            start_samples = max(0, int(start_time * sound.sr))
            end_samples = min(n_samples, int(end_time * sound.sr))
            if start_samples >= end_samples:
                continue  # Skip invalid/empty segments
            # Slice along last axis (samples) - works for both mono (1D) and stereo (2D)
            if sound.data.ndim == 1:
                segment_data.append(sound.data[start_samples:end_samples])
            else:
                segment_data.append(sound.data[:, start_samples:end_samples])

        if not segment_data:
            # All segments were invalid - return empty sound
            if sound.data.ndim == 1:
                empty_data = np.array([], dtype=sound.data.dtype)
            else:
                empty_data = np.zeros((sound.data.shape[0], 0), dtype=sound.data.dtype)
            return Sound(empty_data, sound.sr), 0.0

        # Concatenate along samples axis
        if sound.data.ndim == 1:
            concatenated = np.concatenate(segment_data)
        else:
            concatenated = np.concatenate(segment_data, axis=1)

        # Map the old position onto the concatenated sound by finding which
        # segment contains it (or clamping to the nearest kept boundary)
        new_position = 0.0
        accumulated_time = 0.0
        for start_time, end_time in sorted(segments, key=lambda s: s[0]):
            if start_time <= position <= end_time:
                # Position lies within this segment
                new_position = accumulated_time + (position - start_time)
                break
            elif end_time < position:
                # This segment is entirely before position
                accumulated_time += (end_time - start_time)
            else:  # start_time > position
                # Position falls in a removed gap before this segment;
                # clamp to the accumulated kept time
                new_position = accumulated_time
                break
        else:
            # Position is after all segments
            new_position = accumulated_time

        return Sound(np.copy(concatenated), sound.sr), new_position

    @staticmethod
    def normalize(sound: Sound, headroom_db: float = -3.0) -> Sound:
        """Returns a new Sound peak-normalized with the given headroom in dB."""
        new_data = SoundUtil.normalize_data(sound.data, headroom_db)
        return Sound(new_data, sound.sr)

    @staticmethod
    def normalize_data(arr: ndarray, headroom_db: float = -3.0) -> np.ndarray:
        """
        Does peak normalization of audio, with specified headroom in dB
        (use negative number, e.g., -3 for 3dB headroom).

        Args:
            arr: Audio samples, mono (1D) or channels-first (2D).
            headroom_db: Headroom below full scale in dB; must be <= 0.

        Returns:
            Normalized array with peak magnitude 10 ** (headroom_db / 20).

        Raises:
            ValueError: If headroom_db is positive.
        """
        if headroom_db > 0:
            raise ValueError("headroom_db must be <= 0.0")
        # Convert dB to linear scale
        headroom_linear = 10 ** (headroom_db / 20)
        # Peak normalize to 1.0 along the samples axis, then apply headroom.
        # NOTE(review): axis=-1 normalizes each channel independently, which
        # alters inter-channel balance for stereo input - confirm intended.
        return librosa.util.normalize(arr, norm=np.inf, axis=-1) * headroom_linear

    @staticmethod
    def is_data_invalid(sound: Sound) -> list[str]:
        """
        Returns list of 'reasons' why data is invalid (empty list = valid).

        Checks, in order: data is a NumPy array; non-empty with 1 or 2
        dimensions; free of NaN/Inf; all samples within [-1.0, 1.0].
        """
        if not isinstance(sound.data, np.ndarray):
            return [f"Data is not a NumPy array, but {type(sound.data)}"]
        # Check for empty data or incorrect dimensions
        if sound.data.size == 0 or sound.data.ndim not in [1, 2]:
            return [f"Invalid shape or empty data. Shape: {sound.data.shape}"]
        reasons = []
        if np.isnan(sound.data).any():
            reasons.append("Data contains NaN value/s")
        if np.isinf(sound.data).any():
            reasons.append("Data contains Inf value/s")
        # Compute the peak once instead of twice (check + message)
        peak = np.max(np.abs(sound.data))
        if peak > 1.0:
            reasons.append(f"Value/s out of range, max value found: {peak:.2f}")
        return reasons

    @staticmethod
    def add_silence(sound: Sound, duration: float) -> Sound:
        """
        Returns a new Sound with `duration` seconds of silence appended.

        Args:
            sound: Source Sound.
            duration: Length of silence to append, in seconds (>= 0).

        Raises:
            ValueError: If duration is negative.

        TODO: Consider "dithered" silence instead of pure zeros.
        """
        # Explicit guard: a negative duration would otherwise surface as an
        # opaque NumPy "negative dimensions" error from np.zeros
        if duration < 0:
            raise ValueError(f"duration must be >= 0, got {duration}")
        n_silence_samples = int(sound.sr * duration)
        if sound.data.ndim == 1:
            # Mono: 1D array
            silence = np.zeros(n_silence_samples, dtype=sound.data.dtype)
            new_data = np.concatenate([sound.data, silence])
        else:
            # Stereo/multi-channel: 2D array (n_channels, n_samples)
            n_channels = sound.data.shape[0]
            silence = np.zeros((n_channels, n_silence_samples), dtype=sound.data.dtype)
            new_data = np.concatenate([sound.data, silence], axis=1)
        return Sound(new_data, sound.sr)

    @staticmethod
    def find_local_minima(
        sound: Sound,
        target_timestamp_s: float,
        search_window_ms: int = 250,
        energy_window_ms: int = 20,
    ) -> float | str:
        """
        Finds the quietest point in a sound object within a search window
        around a target timestamp.

        Calculates root-mean-square (RMS) energy over a small sliding window
        to identify the local minimum, which is often the best place to split
        audio between words.

        Args:
            sound (Sound): The sound object to analyze.
            target_timestamp_s (float): The center of the search window, in seconds.
            search_window_ms (int): The width of the search window on each side of the target, in milliseconds.
            energy_window_ms (int): The width of the sliding window for RMS energy calculation, in milliseconds.

        Returns:
            float | str: The timestamp of the local minimum in seconds, or an error string.
        """
        try:
            # [1] Convert times to sample indices
            sr = sound.sr
            target_sample = int(target_timestamp_s * sr)
            search_window_samples = int(search_window_ms / 1000 * sr)
            energy_window_samples = int(energy_window_ms / 1000 * sr)
            # RMS analysis window must be at least 1 sample wide, and odd so
            # each frame has a well-defined center sample
            if energy_window_samples < 1:
                energy_window_samples = 1
            if energy_window_samples % 2 == 0:
                energy_window_samples += 1
            # [2] Define the search region, clamped to the audio bounds
            n_samples = sound.data.shape[-1]
            start_sample = max(0, target_sample - search_window_samples)
            end_sample = min(n_samples, target_sample + search_window_samples)
            if start_sample >= end_sample:
                return "Search window is outside the audio data range."
            # Slice along last axis (samples) - works for both mono (1D) and stereo (2D channels-first)
            if sound.data.ndim == 1:
                search_region = sound.data[start_sample:end_sample]
            else:
                search_region = sound.data[:, start_sample:end_sample]
            # [3] RMS energy over the search region; hop_length=1 gives the
            # highest resolution, and center=True pads the signal so frame i
            # is centered on sample i of search_region
            rms_energy = librosa.feature.rms(
                y=search_region,
                frame_length=energy_window_samples,
                hop_length=1,
                center=True,
            )[0]
            # [4] With centered frames, the argmin index maps directly back
            # to a sample offset within search_region
            min_energy_index = int(np.argmin(rms_energy))
            # [5] Convert back to an absolute sample index, then to seconds
            return float((start_sample + min_energy_index) / sr)
        except Exception as e:
            # Error-as-string contract is part of this method's interface
            return f"Error finding local minima: {type(e).__name__} - {e}"