# TTSS.py
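# Simple voice assistant: records microphone audio until a pause, transcribes it
# with Whisper, asks GPT-4o for a concise answer, and speaks the reply with Kokoro.
#
# Assumed dependencies (package names are not pinned in the repo):
#   pip install openai-whisper kokoro sounddevice soundfile playsound numpy torch python-dotenv langchain-openai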
from kokoro import KPipeline
from playsound import playsound
import soundfile as sf
import tempfile
import whisper
import sounddevice as sd
import numpy as np
import torch
from dotenv import load_dotenv
import os
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
load_dotenv() # Load environment variables from .env
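# The .env file is expected to define the key, e.g. (placeholder value):
#   OPENAI_API_KEY=sk-...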
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("Please provide an OpenAI API key.")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")  # defined but not used below
gptmodel = ChatOpenAI(model="gpt-4o")
# Load Whisper model with GPU support
model = whisper.load_model("small").to("cuda" if torch.cuda.is_available() else "cpu")
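# "small" balances speed and accuracy; larger checkpoints such as "medium" or
# "large" transcribe more accurately at the cost of VRAM and latency.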
# Audio parameters
samplerate = 16000  # Whisper expects 16 kHz mono input
channels = 1
duration_of_silence = 2  # seconds of silence that end a recording
threshold = 0.01  # silence threshold on mean absolute amplitude; adjust sensitivity as needed
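# Note: is_silent() below compares the mean absolute sample value of each
# 0.5 s chunk against this threshold, so 0.01 suits a quiet room microphone;
# raise it for noisier input.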
# A chunk counts as silent when its mean absolute amplitude is below the threshold
def is_silent(data, threshold):
    return np.abs(data).mean() < threshold
# Main loop: record until silence, transcribe, answer with GPT-4o, speak the reply
def continuous_transcription():
    pipeline = KPipeline(lang_code='a')  # Kokoro TTS pipeline; 'a' = American English
    while True:
        print("\nListening... (start speaking)")
        audio_chunks = []
        silence_duration = 0
        # Record from the microphone until a stretch of silence is detected
        with sd.InputStream(samplerate=samplerate, channels=channels, dtype='float32') as stream:
            while True:
                audio_chunk, _ = stream.read(int(samplerate * 0.5))  # read 0.5-second chunks
                audio_chunks.append(audio_chunk)
                if is_silent(audio_chunk, threshold):
                    silence_duration += 0.5
                else:
                    silence_duration = 0
                if silence_duration >= duration_of_silence:
                    print("Detected silence, processing audio...")
                    break
        # Concatenate the chunks into a single mono float32 array, the format Whisper expects
        audio_data = np.concatenate(audio_chunks).flatten().astype(np.float32)
        # Transcribe directly from the numpy array (no intermediate file needed)
        result = model.transcribe(audio_data, fp16=torch.cuda.is_available(), language="en")
        print("Transcription:", result["text"])
        question = result["text"]
        PROMPT = """Give a concise answer for the question asked. {question}"""
        conversation_prompt = PROMPT.format(question=question)
        output = gptmodel.invoke(conversation_prompt)
        text = output.content
        # Synthesize the answer with Kokoro
        generator = pipeline(
            text,
            voice='af_heart',  # choose voice
            speed=1
        )
        full_audio = []
        for _, _, audio in generator:
            full_audio.extend(audio)
        # Write the audio to a temporary WAV file, play it, then clean up.
        # delete=False keeps the file openable by the player on Windows, where an
        # open NamedTemporaryFile cannot be reopened by another process.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, full_audio, 24000)  # Kokoro outputs 24 kHz audio
        playsound(tmp_file.name)
        os.remove(tmp_file.name)

# Run the assistant
if __name__ == "__main__":
    continuous_transcription()