diff --git a/README.md b/README.md
index f55b324..76a55be 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,26 @@ The code I used to dub my paper videos without requiring human help.
 
 Windows, preferably 7 or 10 for reasonable voice quality.
 
-## Usage ##
+## MS Speech Usage ##
 
 ```
 python speech.py speech.vbs example.tex
 ```
 
+## Google Speech Usage ##
+```
+python google_speech.py example.tex
+
+```
+
-The output will be in the .wav files.
+The MS Speech output will be in the .wav files; the Google Speech output will be in the .mp3 files.
+
+
+
+## OpenAI Speech Usage ##
+
+```
+python openAI_speech.py example.tex
+```
+
+The output will be in the .mp3 files.
diff --git a/google_speech.py b/google_speech.py
index b120884..ebd75a6 100644
--- a/google_speech.py
+++ b/google_speech.py
@@ -23,11 +23,13 @@ audios = re.findall('\\\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL)
 
 client = texttospeech.TextToSpeechClient()
 
-voice = texttospeech.types.VoiceSelectionParams(
+voice = texttospeech.VoiceSelectionParams(
     language_code='en-US',
-    ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
-audio_config = texttospeech.types.AudioConfig(
-    audio_encoding=texttospeech.enums.AudioEncoding.MP3)
+    # To pick by gender instead: ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
+    name='en-US-Wavenet-F')
+
+audio_config = texttospeech.AudioConfig(
+    audio_encoding=texttospeech.AudioEncoding.MP3)
 
 for audio in audios:
     file_tag = audio[0]
@@ -38,16 +40,10 @@
     fout.write(audio_script)
     fout.close()
 
-    synthesis_input = texttospeech.types.SynthesisInput(text=audio_script)
-    response = client.synthesize_speech(synthesis_input, voice, audio_config)
+    synthesis_input = texttospeech.SynthesisInput(text=audio_script)
+    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
 
-    # The response's audio_content is binary.
     with open(file_tag+'.mp3', 'wb') as out:
-        # Write the response to the output file.
         out.write(response.audio_content)
 
-    #output_audio_file = file_tag + '.wav'
-    #command = 'cscript ' + speech_config_file + ' ' + output_script_file + ' ' + output_audio_file
-    #os.system(command)
-
     os.remove(output_script_file)
diff --git a/openAI_speech.py b/openAI_speech.py
new file mode 100644
index 0000000..6b8f0a9
--- /dev/null
+++ b/openAI_speech.py
@@ -0,0 +1,58 @@
+r"""Dub every \audio[tag]{script} entry of a LaTeX file with OpenAI text-to-speech.
+
+For each entry the script text is written to <tag>.txt and the synthesized
+speech is written to <tag>.mp3.
+
+Usage: python openAI_speech.py input_video_script_file (tex)
+"""
+
+import sys
+import os
+import re
+from openai import OpenAI
+from pathlib import Path
+
+# Matches \audio[tag]{script}; DOTALL lets a script span several lines.
+AUDIO_PATTERN = re.compile(r'\\audio\s*\[(.*?)\]\s*{(.*?)}', re.DOTALL)
+
+
+def main():
+    """Synthesize every audio entry of the .tex file named on the command line."""
+    if len(sys.argv) < 2:
+        # sys.exit prints the message to stderr and exits with status 1.
+        sys.exit("usage: python openAI_speech.py input_video_script_file (tex)")
+
+    input_video_script_file = sys.argv[1]
+
+    # The context manager closes the input file even if reading fails.
+    with open(input_video_script_file, 'r') as fin:
+        lines = fin.read()
+
+    audios = AUDIO_PATTERN.findall(lines)
+
+    # No arguments: the SDK takes the API key from OPENAI_API_KEY.
+    client = OpenAI()
+
+    for file_tag, audio_script in audios:
+        # Keep a plain-text copy of the script next to the audio file.
+        with open(file_tag + '.txt', 'w') as fout:
+            fout.write(audio_script)
+
+        speech_file_path = Path(f"{file_tag}.mp3")
+
+        # Calling stream_to_file() on the plain create() response is
+        # deprecated (it does not actually stream); the SDK's replacement is
+        # with_streaming_response, used as a context manager.
+        with client.audio.speech.with_streaming_response.create(
+            # Adjust model and voice here.
+            model="tts-1",
+            voice="alloy",
+            input=audio_script,
+        ) as response:
+            response.stream_to_file(speech_file_path)
+
+        print(f"Saved speech to {speech_file_path}")
+
+
+if __name__ == "__main__":
+    main()