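"""Transcribes microphone and speaker audio into a rolling two-party transcript.

Raw audio chunks arrive on a queue as (who_spoke, data, time_spoken) tuples,
are accumulated into phrases (a silence gap longer than PHRASE_TIMEOUT seconds
starts a new phrase), written out as temporary WAV files, and transcribed with
faster-whisper. Only the newest MAX_PHRASES phrases are kept per speaker.
"""
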
import io
import threading
from datetime import timedelta
from heapq import merge
from tempfile import NamedTemporaryFile

from faster_whisper import WhisperModel

from custom_speech_recognition.audio import AudioData

PHRASE_TIMEOUT = 3.05
MAX_PHRASES = 10


class AudioTranscriber:
    def __init__(self, mic_source, speaker_source):
        self.transcript_data = {"You": [], "Speaker": []}
        self.transcript_changed_event = threading.Event()
        self.audio_model = WhisperModel("base.en")
        self.audio_sources = {
            "You": {
                "sample_rate": mic_source.SAMPLE_RATE,
                "sample_width": mic_source.SAMPLE_WIDTH,
                "channels": mic_source.channels,
                "last_sample": bytes(),
                "last_spoken": None,
                "new_phrase": True,
                "process_data_func": self.process_mic_data,
            },
            "Speaker": {
                "sample_rate": speaker_source.SAMPLE_RATE,
                "sample_width": speaker_source.SAMPLE_WIDTH,
                "channels": speaker_source.channels,
                "last_sample": bytes(),
                "last_spoken": None,
                "new_phrase": True,
                "process_data_func": self.process_speaker_data,
            },
        }
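
    # Main worker loop: pull raw chunks off the queue, grow the current
    # phrase, re-transcribe it, and publish the updated transcript.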
    def transcribe_audio_queue(self, audio_queue):
        while True:
            who_spoke, data, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
            source_info = self.audio_sources[who_spoke]
            temp_file = source_info["process_data_func"](source_info["last_sample"])
            text = self.get_transcription(temp_file)
            # get_transcription returns an empty string when nothing was
            # recognized; skip those rather than recording empty phrases.
            if text:
                self.update_transcript(who_spoke, text, time_spoken)
                self.transcript_changed_event.set()
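
    # A pause longer than PHRASE_TIMEOUT ends the current phrase: the sample
    # buffer is reset so the next chunk starts a fresh transcription unit.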
    def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
        source_info = self.audio_sources[who_spoke]
        last_spoken = source_info["last_spoken"]
        if last_spoken and time_spoken - last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
            source_info["last_sample"] = bytes()
            source_info["new_phrase"] = True
        else:
            source_info["new_phrase"] = False
        source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken
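
    # Wraps the accumulated microphone bytes in a WAV container and writes
    # them to a temporary file that faster-whisper can read.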
    def process_mic_data(self, data):
        # delete=False keeps the file on disk after the handle is closed;
        # NamedTemporaryFile().name alone would be removed as soon as the
        # temporary-file object is garbage collected.
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav").name
        audio_data = AudioData(
            data,
            self.audio_sources["You"]["sample_rate"],
            self.audio_sources["You"]["sample_width"],
        )
        wav_data = io.BytesIO(audio_data.get_wav_data())
        with open(temp_file, "w+b") as f:
            f.write(wav_data.read())
        return temp_file
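
    # Same as process_mic_data, but using the speaker source's settings.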
    def process_speaker_data(self, data):
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav").name
        audio_data = AudioData(
            data,
            self.audio_sources["Speaker"]["sample_rate"],
            self.audio_sources["Speaker"]["sample_width"],
        )
        wav_data = io.BytesIO(audio_data.get_wav_data())
        with open(temp_file, "w+b") as f:
            f.write(wav_data.read())
        return temp_file
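
    # Runs faster-whisper on the WAV file and joins all segments; returns an
    # empty string when nothing was recognized.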
    def get_transcription(self, file_path):
        segments, info = self.audio_model.transcribe(file_path)
        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        # segments is a generator; collect every segment rather than only the first.
        return "".join(segment.text for segment in segments).strip()
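
    # Newest phrase lives at index 0: a continuing phrase overwrites it in
    # place, while a new phrase pushes the list down and trims the oldest.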
    def update_transcript(self, who_spoke, text, time_spoken):
        source_info = self.audio_sources[who_spoke]
        transcript = self.transcript_data[who_spoke]
        if source_info["new_phrase"] or len(transcript) == 0:
            if len(transcript) > MAX_PHRASES:
                transcript.pop(-1)
            transcript.insert(0, (f"{who_spoke}: [{text}]\n\n", time_spoken))
        else:
            transcript[0] = (f"{who_spoke}: [{text}]\n\n", time_spoken)
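
    # Both per-speaker lists are already newest-first, so heapq.merge with
    # reverse=True interleaves them by timestamp in a single pass.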
    def get_transcript(self):
        combined_transcript = list(
            merge(
                self.transcript_data["You"],
                self.transcript_data["Speaker"],
                key=lambda x: x[1],
                reverse=True,
            )
        )
        combined_transcript = combined_transcript[:MAX_PHRASES]
        return "".join([t[0] for t in combined_transcript])
    def clear_transcript_data(self):
        self.transcript_data["You"].clear()
        self.transcript_data["Speaker"].clear()
        self.audio_sources["You"]["last_sample"] = bytes()
        self.audio_sources["Speaker"]["last_sample"] = bytes()
        self.audio_sources["You"]["new_phrase"] = True
        self.audio_sources["Speaker"]["new_phrase"] = True
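

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes a
    # 16 kHz, 16-bit, mono capture; _StubSource is a hypothetical stand-in
    # for the recorder objects this class normally receives, and the queue
    # entry below feeds one second of silence just to exercise the pipeline.
    import queue
    from datetime import datetime

    class _StubSource:
        SAMPLE_RATE = 16000
        SAMPLE_WIDTH = 2
        channels = 1

    audio_queue = queue.Queue()
    transcriber = AudioTranscriber(_StubSource(), _StubSource())
    threading.Thread(
        target=transcriber.transcribe_audio_queue,
        args=(audio_queue,),
        daemon=True,
    ).start()

    # In the real app a recorder callback would push captured chunks here.
    audio_queue.put(("You", b"\x00" * 32000, datetime.utcnow()))
    transcriber.transcript_changed_event.wait(timeout=30)
    print(transcriber.get_transcript() or "(no speech recognized)")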