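"""Transcribes microphone and speaker audio into a rolling two-party transcript.

Raw audio chunks arrive on a queue as (who_spoke, data, time_spoken) tuples,
are accumulated into phrases (a silence gap longer than PHRASE_TIMEOUT seconds
starts a new phrase), written out as temporary WAV files, and transcribed with
faster-whisper. Only the newest MAX_PHRASES phrases are kept per speaker.
"""
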
import io
import threading
from datetime import timedelta
from heapq import merge
from tempfile import NamedTemporaryFile

from faster_whisper import WhisperModel

from custom_speech_recognition.audio import AudioData

PHRASE_TIMEOUT = 3.05
MAX_PHRASES = 10


class AudioTranscriber:
    def __init__(self, mic_source, speaker_source):
        self.transcript_data = {"You": [], "Speaker": []}
        self.transcript_changed_event = threading.Event()
        self.audio_model = WhisperModel("base.en")
        self.audio_sources = {
            "You": {
                "sample_rate": mic_source.SAMPLE_RATE,
                "sample_width": mic_source.SAMPLE_WIDTH,
                "channels": mic_source.channels,
                "last_sample": bytes(),
                "last_spoken": None,
                "new_phrase": True,
                "process_data_func": self.process_mic_data,
            },
            "Speaker": {
                "sample_rate": speaker_source.SAMPLE_RATE,
                "sample_width": speaker_source.SAMPLE_WIDTH,
                "channels": speaker_source.channels,
                "last_sample": bytes(),
                "last_spoken": None,
                "new_phrase": True,
                "process_data_func": self.process_speaker_data,
            },
        }
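
    # Main worker loop: pull raw chunks off the queue, grow the current
    # phrase, re-transcribe it, and publish the updated transcript.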
    def transcribe_audio_queue(self, audio_queue):
        while True:
            who_spoke, data, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
            source_info = self.audio_sources[who_spoke]
            temp_file = source_info["process_data_func"](source_info["last_sample"])
            text = self.get_transcription(temp_file)
            # get_transcription returns an empty string when nothing was
            # recognized; skip those rather than recording empty phrases.
            if text:
                self.update_transcript(who_spoke, text, time_spoken)
                self.transcript_changed_event.set()
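
    # A pause longer than PHRASE_TIMEOUT ends the current phrase: the sample
    # buffer is reset so the next chunk starts a fresh transcription unit.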
    def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
        source_info = self.audio_sources[who_spoke]
        last_spoken = source_info["last_spoken"]
        if last_spoken and time_spoken - last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
            source_info["last_sample"] = bytes()
            source_info["new_phrase"] = True
        else:
            source_info["new_phrase"] = False
        source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken
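
    # Wraps the accumulated microphone bytes in a WAV container and writes
    # them to a temporary file that faster-whisper can read.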
    def process_mic_data(self, data):
        # delete=False keeps the file on disk after the handle is closed;
        # NamedTemporaryFile().name alone would be removed as soon as the
        # temporary-file object is garbage collected.
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav").name
        audio_data = AudioData(
            data,
            self.audio_sources["You"]["sample_rate"],
            self.audio_sources["You"]["sample_width"],
        )
        wav_data = io.BytesIO(audio_data.get_wav_data())
        with open(temp_file, "w+b") as f:
            f.write(wav_data.read())
        return temp_file
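
    # Same as process_mic_data, but using the speaker source's settings.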
    def process_speaker_data(self, data):
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav").name
        audio_data = AudioData(
            data,
            self.audio_sources["Speaker"]["sample_rate"],
            self.audio_sources["Speaker"]["sample_width"],
        )
        wav_data = io.BytesIO(audio_data.get_wav_data())
        with open(temp_file, "w+b") as f:
            f.write(wav_data.read())
        return temp_file
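
    # Runs faster-whisper on the WAV file and joins all segments; returns an
    # empty string when nothing was recognized.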
    def get_transcription(self, file_path):
        segments, info = self.audio_model.transcribe(file_path)
        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        # segments is a generator; collect every segment rather than only the first.
        return "".join(segment.text for segment in segments).strip()
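
    # Newest phrase lives at index 0: a continuing phrase overwrites it in
    # place, while a new phrase pushes the list down and trims the oldest.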
    def update_transcript(self, who_spoke, text, time_spoken):
        source_info = self.audio_sources[who_spoke]
        transcript = self.transcript_data[who_spoke]
        if source_info["new_phrase"] or len(transcript) == 0:
            if len(transcript) > MAX_PHRASES:
                transcript.pop(-1)
            transcript.insert(0, (f"{who_spoke}: [{text}]\n\n", time_spoken))
        else:
            transcript[0] = (f"{who_spoke}: [{text}]\n\n", time_spoken)
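
    # Both per-speaker lists are already newest-first, so heapq.merge with
    # reverse=True interleaves them by timestamp in a single pass.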
    def get_transcript(self):
        combined_transcript = list(
            merge(
                self.transcript_data["You"],
                self.transcript_data["Speaker"],
                key=lambda x: x[1],
                reverse=True,
            )
        )
        combined_transcript = combined_transcript[:MAX_PHRASES]
        return "".join([t[0] for t in combined_transcript])
    def clear_transcript_data(self):
        self.transcript_data["You"].clear()
        self.transcript_data["Speaker"].clear()
        self.audio_sources["You"]["last_sample"] = bytes()
        self.audio_sources["Speaker"]["last_sample"] = bytes()
        self.audio_sources["You"]["new_phrase"] = True
        self.audio_sources["Speaker"]["new_phrase"] = True
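

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes a
    # 16 kHz, 16-bit, mono capture; _StubSource is a hypothetical stand-in
    # for the recorder objects this class normally receives, and the queue
    # entry below feeds one second of silence just to exercise the pipeline.
    import queue
    from datetime import datetime

    class _StubSource:
        SAMPLE_RATE = 16000
        SAMPLE_WIDTH = 2
        channels = 1

    audio_queue = queue.Queue()
    transcriber = AudioTranscriber(_StubSource(), _StubSource())
    threading.Thread(
        target=transcriber.transcribe_audio_queue,
        args=(audio_queue,),
        daemon=True,
    ).start()

    # In the real app a recorder callback would push captured chunks here.
    audio_queue.put(("You", b"\x00" * 32000, datetime.utcnow()))
    transcriber.transcript_changed_event.wait(timeout=30)
    print(transcriber.get_transcript() or "(no speech recognized)")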