From bb1168fb0ce95b0be70085b464f78940c66f3071 Mon Sep 17 00:00:00 2001 From: Qi Sun Date: Tue, 8 Jan 2019 10:16:58 -0800 Subject: [PATCH 1/5] tweak parameters a bit --- google_speech.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/google_speech.py b/google_speech.py index b120884..82ff681 100644 --- a/google_speech.py +++ b/google_speech.py @@ -25,7 +25,9 @@ client = texttospeech.TextToSpeechClient() voice = texttospeech.types.VoiceSelectionParams( language_code='en-US', - ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL) + #ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE, + name='en-US-Wavenet-F') + audio_config = texttospeech.types.AudioConfig( audio_encoding=texttospeech.enums.AudioEncoding.MP3) @@ -41,13 +43,7 @@ synthesis_input = texttospeech.types.SynthesisInput(text=audio_script) response = client.synthesize_speech(synthesis_input, voice, audio_config) - # The response's audio_content is binary. with open(file_tag+'.mp3', 'wb') as out: - # Write the response to the output file. out.write(response.audio_content) - #output_audio_file = file_tag + '.wav' - #command = 'cscript ' + speech_config_file + ' ' + output_script_file + ' ' + output_audio_file - #os.system(command) - os.remove(output_script_file) From c49f925cd82e4f3e7a405950f62910e773f3ecf6 Mon Sep 17 00:00:00 2001 From: qisun0 Date: Mon, 28 Dec 2020 21:42:34 -0800 Subject: [PATCH 2/5] synthesis code --- google_speech.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/google_speech.py b/google_speech.py index 82ff681..ebd75a6 100644 --- a/google_speech.py +++ b/google_speech.py @@ -23,13 +23,13 @@ audios = re.findall('\\\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL) client = texttospeech.TextToSpeechClient() -voice = texttospeech.types.VoiceSelectionParams( +voice = texttospeech.VoiceSelectionParams( language_code='en-US', #ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE, name='en-US-Wavenet-F') -audio_config = texttospeech.types.AudioConfig( - audio_encoding=texttospeech.enums.AudioEncoding.MP3) +audio_config = texttospeech.AudioConfig( + audio_encoding=texttospeech.AudioEncoding.MP3) for audio in audios: file_tag = audio[0] @@ -40,8 +40,8 @@ fout.write(audio_script) fout.close() - synthesis_input = texttospeech.types.SynthesisInput(text=audio_script) - response = client.synthesize_speech(synthesis_input, voice, audio_config) + synthesis_input = texttospeech.SynthesisInput(text=audio_script) + response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config) with open(file_tag+'.mp3', 'wb') as out: out.write(response.audio_content) From 5911b9fa478c4e5478c5950e1e91f1db1a2e9ac0 Mon Sep 17 00:00:00 2001 From: Qi Sun Date: Thu, 20 Jan 2022 16:37:46 -0500 Subject: [PATCH 3/5] Update README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f55b324..c147b8e 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,16 @@ The code I used to dub my paper videos without requiring human help. Windows, preferably 7 or 10 for reasonable voice quality. -## Usage ## +## MS Speech Usage ## ``` python speech.py speech.vbs example.tex ``` +## Google Speech Usage ## +``` +python google_speech.py example.tex + +``` + The output will be in the .wav files. From 888356d7ff5cc74bb0c8ca93a91dc52ab29c300f Mon Sep 17 00:00:00 2001 From: 405-not-found <98379785+405-not-found@users.noreply.github.com> Date: Wed, 28 Feb 2024 09:27:35 -0500 Subject: [PATCH 4/5] Upgrade to OpenAI --- openAI_speech.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 openAI_speech.py diff --git a/openAI_speech.py b/openAI_speech.py new file mode 100644 index 0000000..6b8f0a9 --- /dev/null +++ b/openAI_speech.py @@ -0,0 +1,45 @@ +import sys +import os +import re +from openai import OpenAI +from pathlib import Path + + +argc = len(sys.argv) + +if argc < 2: + error_message = "python input_video_script_file (tex)" + print(error_message) + raise Exception(error_message) + +input_video_script_file = sys.argv[1] + +fin = open(input_video_script_file, 'r') +lines = fin.read() + +audios = re.findall('\\\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL) + +client = OpenAI() +# Iterate over each text portion +for i, text in enumerate(audios, start=1): + # Call the OpenAI Text-to-Speech API + file_tag = text[0] + audio_script = text[1] + output_script_file = file_tag + '.txt' + fout = open(output_script_file, 'w') + fout.write(audio_script) + fout.close() + response = client.audio.speech.create( + #adjust model and voice here + model="tts-1", + voice="alloy", + input=audio_script + ) + + # Define the path for the output audio file + speech_file_path = Path(f"{file_tag}.mp3") + + # Save the audio content to a file, the function says to have a bug but works well on my side + response.stream_to_file(str(speech_file_path)) + + print(f"Saved speech to {speech_file_path}") From 4de328e9de6edb0cc352c3da789bcb2d7f26f168 Mon Sep 17 00:00:00 2001 From: 405-not-found <98379785+405-not-found@users.noreply.github.com> Date: Wed, 28 Feb 2024 09:31:09 -0500 Subject: [PATCH 5/5] Update README --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index c147b8e..76a55be 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,13 @@ python google_speech.py example.tex ``` The output will be in the .wav files. + + + +## OpenAI Speech Usage + +``` +python openAI_speech.py example.tex +``` + +The output will be in the .mp3 files.