We can fix the bug by adding the following code to speech_detector.py:
# process the remaining audio fragments
if len(self.audio_buffer) > 0:
remaining_length = len(self.audio_buffer)
# filled by zero
padded_audio = np.concatenate([
self.audio_buffer,
np.zeros(samples - remaining_length, dtype=self.audio_buffer.dtype)
])
speech_prob, self.state = self.model(
padded_audio, self.state, self.framerate)
speech_threshold = neg_threshold if self.is_speech else self.threshold
if speech_prob > speech_threshold:
self.silence_last_s = 0
if not self.is_speech:
self.is_speech = True
self.last_speech_pos = self.samples_count
else:
speech_frames = self.samples_count - self.last_speech_pos
if speech_frames / self.framerate > self.max_speech_duration_s:
self.is_speech = False
else:
if self.is_speech:
self.silence_last_s += det_interval_s
if self.silence_last_s >= self.silence_duration_s:
self.is_speech = False
# only return the original audio part, not the filled zero
yield self.audio_buffer, self.is_speech
self.samples_count += remaining_length
self.audio_buffer = None
By the way, I don't understand the purpose of this project: it takes even longer to produce the final complete text than running the original project directly. What are the advantages of switching to streaming output?
[{'type': 'begin', 'id': 0, 'text': None, 'ts': 0.75, 'latency': 0.0}]
[{'type': 'changed', 'id': 0, 'text': '昨天是', 'ts': 1.732, 'latency': 2.1749749183654785}]
[{'type': 'end', 'id': 0, 'text': '昨天是 MONDAY', 'ts': 2.582, 'latency': 0.1884446144104004}]
[{'type': 'begin', 'id': 1, 'text': None, 'ts': 3.9, 'latency': 0.0}]
[{'type': 'changed', 'id': 1, 'text': 'TODAY IS', 'ts': 4.882, 'latency': 0.1411607265472412}]
[{'type': 'end', 'id': 1, 'text': 'TODAY IS', 'ts': 5.032, 'latency': 0.1421806812286377}]
[{'type': 'begin', 'id': 2, 'text': None, 'ts': 5.25, 'latency': 0.0}]
[{'type': 'end', 'id': 2, 'text': '礼拜二', 'ts': 6.082, 'latency': 0.12190961837768555}]
[{'type': 'begin', 'id': 3, 'text': None, 'ts': 6.6, 'latency': 0.0}]
[{'type': 'changed', 'id': 3, 'text': 'THE DAY AFTER', 'ts': 7.582, 'latency': 0.11815881729125977}]
[{'type': 'changed', 'id': 3, 'text': 'TODAY AFTER TOMORROW', 'ts': 8.582, 'latency': 0.2218611240386963}]
[{'type': 'changed', 'id': 3, 'text': 'TODAY AFTER TOMORROW是星期三', 'ts': 9.582, 'latency': 0.17745661735534668}]
>>>wave EOF
total time cost: 3436.36417388916 ms
the max memory usage in onnx reasoning process: 4556.40 MB
We can fix the bug by adding the following code to speech_detector.py.
By the way, I don't understand the purpose of this project: it takes even longer to produce the final complete text than running the original project directly. What are the advantages of switching to streaming output?
[{'type': 'begin', 'id': 0, 'text': None, 'ts': 0.75, 'latency': 0.0}] [{'type': 'changed', 'id': 0, 'text': '昨天是', 'ts': 1.732, 'latency': 2.1749749183654785}] [{'type': 'end', 'id': 0, 'text': '昨天是 MONDAY', 'ts': 2.582, 'latency': 0.1884446144104004}] [{'type': 'begin', 'id': 1, 'text': None, 'ts': 3.9, 'latency': 0.0}] [{'type': 'changed', 'id': 1, 'text': 'TODAY IS', 'ts': 4.882, 'latency': 0.1411607265472412}] [{'type': 'end', 'id': 1, 'text': 'TODAY IS', 'ts': 5.032, 'latency': 0.1421806812286377}] [{'type': 'begin', 'id': 2, 'text': None, 'ts': 5.25, 'latency': 0.0}] [{'type': 'end', 'id': 2, 'text': '礼拜二', 'ts': 6.082, 'latency': 0.12190961837768555}] [{'type': 'begin', 'id': 3, 'text': None, 'ts': 6.6, 'latency': 0.0}] [{'type': 'changed', 'id': 3, 'text': 'THE DAY AFTER', 'ts': 7.582, 'latency': 0.11815881729125977}] [{'type': 'changed', 'id': 3, 'text': 'TODAY AFTER TOMORROW', 'ts': 8.582, 'latency': 0.2218611240386963}] [{'type': 'changed', 'id': 3, 'text': 'TODAY AFTER TOMORROW是星期三', 'ts': 9.582, 'latency': 0.17745661735534668}] >>>wave EOF total time cost: 3436.36417388916 ms the max memory usage in onnx reasoning process: 4556.40 MB