-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsound_util.py
More file actions
276 lines (223 loc) · 10.8 KB
/
sound_util.py
File metadata and controls
276 lines (223 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import librosa
import numpy as np
from numpy import ndarray
from sound import Sound
class SoundUtil:
    """
    Stateless audio-processing helpers operating on Sound objects.

    All methods are static and never mutate their input; they return new
    Sound instances (or the input itself for identity operations).  Audio
    data is a NumPy array, either mono (1D, shape (n_samples,)) or
    channels-first multi-channel (2D, shape (n_channels, n_samples)).
    """

    @staticmethod
    def resample(sound: Sound, target_sr: int, res_type: str = "soxr_hq") -> Sound:
        """
        Returns new Sound with target sample rate (or identity if unneeded).

        Args:
            sound: Source Sound.
            target_sr: Desired sample rate in Hz.
            res_type: librosa resampler backend name (default "soxr_hq").
        """
        if sound.sr == target_sr:
            return sound  # Already at the requested rate
        new_data = librosa.resample(
            sound.data, orig_sr=sound.sr, target_sr=target_sr, res_type=res_type
        )
        return Sound(new_data, target_sr)

    @staticmethod
    def trim(sound: Sound, start_time: float | None, end_time: float | None) -> Sound:
        """
        Returns a new Sound containing samples from start_time to end_time.

        Args:
            sound: Source Sound.
            start_time: Start of the kept region in seconds (None means 0).
            end_time: End of the kept region in seconds (None or values past
                the end are clamped to the sound's duration).

        Returns:
            The input Sound unchanged when both bounds are None; otherwise a
            new Sound holding a copy of the selected samples.

        Raises:
            ValueError: If the resolved sample range is empty or out of
                bounds (e.g. negative start, or start >= end).
        """
        if start_time is None and end_time is None:
            return sound  # identity
        if start_time is None:
            start_time = 0
        if end_time is None or end_time > sound.duration:
            end_time = sound.duration
        n_samples = sound.data.shape[-1]
        start_samples = int(start_time * sound.sr)
        # Clamp in the sample domain too: duration * sr may round slightly
        # past the real sample count.
        end_samples = min(int(end_time * sound.sr), n_samples)
        # Ensure valid trim range
        if not (0 <= start_samples < end_samples <= n_samples):
            raise ValueError(f"Invalid trim range - start {start_time} end {end_time} duration {sound.duration}")
        # Slice along last axis (samples) - works for both mono (1D) and stereo (2D channels-first)
        if sound.data.ndim == 1:
            trimmed_data = sound.data[start_samples:end_samples]
        else:
            trimmed_data = sound.data[:, start_samples:end_samples]
        # Copy so the result owns its memory rather than viewing the source
        return Sound(np.copy(trimmed_data), sound.sr)

    @staticmethod
    def trim_using_segments(
        sound: Sound,
        segments: list[tuple[float, float]],
        position: float = 0,
    ) -> tuple[Sound, float]:
        """
        Effectively cuts out segments from given sound data.

        Keeps only the listed segments (concatenated in the given order) and
        remaps a playback position from the original timeline onto the new,
        shorter timeline.

        Args:
            sound: The source Sound.
            segments: List of (start_time, end_time) tuples in seconds.
                Out-of-range times are clamped; empty/inverted segments are
                skipped.
            position: Playback position in the ORIGINAL sound, in seconds.

        Returns:
            Tuple of (new Sound with the segments concatenated in order,
            remapped position in seconds).  Returns the identity Sound and
            unchanged position if the segments list is empty; returns an
            empty Sound and position 0.0 if every segment is invalid.

        NOTE(review): concatenation uses the caller's segment order while the
        position remap sorts segments by start time - for overlapping or
        unordered segments the remapped position may not match the output
        audio; confirm callers always pass sorted, disjoint segments.
        """
        if not segments:
            return sound, position  # Return identity for empty segments

        segment_data = []
        n_samples = sound.data.shape[-1]
        for start_time, end_time in segments:
            # Convert to samples and clamp to valid range
            start_samples = max(0, int(start_time * sound.sr))
            end_samples = min(n_samples, int(end_time * sound.sr))
            if start_samples >= end_samples:
                continue  # Skip invalid/empty segments
            # Slice along last axis (samples) - works for both mono (1D) and stereo (2D)
            if sound.data.ndim == 1:
                segment_data.append(sound.data[start_samples:end_samples])
            else:
                segment_data.append(sound.data[:, start_samples:end_samples])

        if not segment_data:
            # All segments were invalid - return empty sound
            if sound.data.ndim == 1:
                empty_data = np.array([], dtype=sound.data.dtype)
            else:
                empty_data = np.zeros((sound.data.shape[0], 0), dtype=sound.data.dtype)
            return Sound(empty_data, sound.sr), 0.0

        # Concatenate along samples axis
        if sound.data.ndim == 1:
            concatenated = np.concatenate(segment_data)
        else:
            concatenated = np.concatenate(segment_data, axis=1)

        # Map the old position onto the concatenated sound by finding which
        # segment contains it (or clamping to the nearest kept boundary)
        new_position = 0.0
        accumulated_time = 0.0
        for start_time, end_time in sorted(segments, key=lambda s: s[0]):
            if start_time <= position <= end_time:
                # Position lies within this segment
                new_position = accumulated_time + (position - start_time)
                break
            elif end_time < position:
                # This segment is entirely before position
                accumulated_time += (end_time - start_time)
            else:  # start_time > position
                # Position falls in a removed gap before this segment;
                # clamp to the accumulated kept time
                new_position = accumulated_time
                break
        else:
            # Position is after all segments
            new_position = accumulated_time

        return Sound(np.copy(concatenated), sound.sr), new_position

    @staticmethod
    def normalize(sound: Sound, headroom_db: float = -3.0) -> Sound:
        """Returns a new Sound peak-normalized with the given headroom in dB."""
        new_data = SoundUtil.normalize_data(sound.data, headroom_db)
        return Sound(new_data, sound.sr)

    @staticmethod
    def normalize_data(arr: ndarray, headroom_db: float = -3.0) -> np.ndarray:
        """
        Does peak normalization of audio, with specified headroom in dB
        (use negative number, e.g., -3 for 3dB headroom).

        Args:
            arr: Audio samples, mono (1D) or channels-first (2D).
            headroom_db: Headroom below full scale in dB; must be <= 0.

        Returns:
            Normalized array with peak magnitude 10 ** (headroom_db / 20).

        Raises:
            ValueError: If headroom_db is positive.
        """
        if headroom_db > 0:
            raise ValueError("headroom_db must be <= 0.0")
        # Convert dB to linear scale
        headroom_linear = 10 ** (headroom_db / 20)
        # Peak normalize to 1.0 along the samples axis, then apply headroom.
        # NOTE(review): axis=-1 normalizes each channel independently, which
        # alters inter-channel balance for stereo input - confirm intended.
        return librosa.util.normalize(arr, norm=np.inf, axis=-1) * headroom_linear

    @staticmethod
    def is_data_invalid(sound: Sound) -> list[str]:
        """
        Returns list of 'reasons' why data is invalid (empty list = valid).

        Checks, in order: data is a NumPy array; non-empty with 1 or 2
        dimensions; free of NaN/Inf; all samples within [-1.0, 1.0].
        """
        if not isinstance(sound.data, np.ndarray):
            return [f"Data is not a NumPy array, but {type(sound.data)}"]
        # Check for empty data or incorrect dimensions
        if sound.data.size == 0 or sound.data.ndim not in [1, 2]:
            return [f"Invalid shape or empty data. Shape: {sound.data.shape}"]
        reasons = []
        if np.isnan(sound.data).any():
            reasons.append("Data contains NaN value/s")
        if np.isinf(sound.data).any():
            reasons.append("Data contains Inf value/s")
        # Compute the peak once instead of twice (check + message)
        peak = np.max(np.abs(sound.data))
        if peak > 1.0:
            reasons.append(f"Value/s out of range, max value found: {peak:.2f}")
        return reasons

    @staticmethod
    def add_silence(sound: Sound, duration: float) -> Sound:
        """
        Returns a new Sound with `duration` seconds of silence appended.

        Args:
            sound: Source Sound.
            duration: Length of silence to append, in seconds (>= 0).

        Raises:
            ValueError: If duration is negative.

        TODO: Consider "dithered" silence instead of pure zeros.
        """
        # Explicit guard: a negative duration would otherwise surface as an
        # opaque NumPy "negative dimensions" error from np.zeros
        if duration < 0:
            raise ValueError(f"duration must be >= 0, got {duration}")
        n_silence_samples = int(sound.sr * duration)
        if sound.data.ndim == 1:
            # Mono: 1D array
            silence = np.zeros(n_silence_samples, dtype=sound.data.dtype)
            new_data = np.concatenate([sound.data, silence])
        else:
            # Stereo/multi-channel: 2D array (n_channels, n_samples)
            n_channels = sound.data.shape[0]
            silence = np.zeros((n_channels, n_silence_samples), dtype=sound.data.dtype)
            new_data = np.concatenate([sound.data, silence], axis=1)
        return Sound(new_data, sound.sr)

    @staticmethod
    def find_local_minima(
        sound: Sound,
        target_timestamp_s: float,
        search_window_ms: int = 250,
        energy_window_ms: int = 20,
    ) -> float | str:
        """
        Finds the quietest point in a sound object within a search window
        around a target timestamp.

        Calculates root-mean-square (RMS) energy over a small sliding window
        to identify the local minimum, which is often the best place to split
        audio between words.

        Args:
            sound (Sound): The sound object to analyze.
            target_timestamp_s (float): The center of the search window, in seconds.
            search_window_ms (int): The width of the search window on each side of the target, in milliseconds.
            energy_window_ms (int): The width of the sliding window for RMS energy calculation, in milliseconds.

        Returns:
            float | str: The timestamp of the local minimum in seconds, or an error string.
        """
        try:
            # [1] Convert times to sample indices
            sr = sound.sr
            target_sample = int(target_timestamp_s * sr)
            search_window_samples = int(search_window_ms / 1000 * sr)
            energy_window_samples = int(energy_window_ms / 1000 * sr)
            # RMS analysis window must be at least 1 sample wide, and odd so
            # each frame has a well-defined center sample
            if energy_window_samples < 1:
                energy_window_samples = 1
            if energy_window_samples % 2 == 0:
                energy_window_samples += 1
            # [2] Define the search region, clamped to the audio bounds
            n_samples = sound.data.shape[-1]
            start_sample = max(0, target_sample - search_window_samples)
            end_sample = min(n_samples, target_sample + search_window_samples)
            if start_sample >= end_sample:
                return "Search window is outside the audio data range."
            # Slice along last axis (samples) - works for both mono (1D) and stereo (2D channels-first)
            if sound.data.ndim == 1:
                search_region = sound.data[start_sample:end_sample]
            else:
                search_region = sound.data[:, start_sample:end_sample]
            # [3] RMS energy over the search region; hop_length=1 gives the
            # highest resolution, and center=True pads the signal so frame i
            # is centered on sample i of search_region
            rms_energy = librosa.feature.rms(
                y=search_region,
                frame_length=energy_window_samples,
                hop_length=1,
                center=True,
            )[0]
            # [4] With centered frames, the argmin index maps directly back
            # to a sample offset within search_region
            min_energy_index = int(np.argmin(rms_energy))
            # [5] Convert back to an absolute sample index, then to seconds
            return float((start_sample + min_energy_index) / sr)
        except Exception as e:
            # Error-as-string contract is part of this method's interface
            return f"Error finding local minima: {type(e).__name__} - {e}"