1iyiwei · qisun0 · Jan 8, 2019 · Dec 29, 2020 · Jan 20, 2022 · Feb 28, 2024
diff --git a/README.md b/README.md
@@ -6,10 +6,26 @@ The code I used to dub my paper videos without requiring human help.
 
 Windows, preferably 7 or 10 for reasonable voice quality.
 
-## Usage ##
+## MS Speech Usage ##
 
 ```
 python speech.py speech.vbs example.tex
 
 ```
+## Google Speech Usage ##
+```
+python google_speech.py example.tex
+
+```
+
 The output will be in the .wav files.
+
+
+
+## OpenAI Speech Usage ##
+
+```
+python openAI_speech.py example.tex
+```
+
+The output will be in .mp3 files, one per audio tag in the script.
diff --git a/google_speech.py b/google_speech.py
@@ -23,11 +23,13 @@
 audios = re.findall('\\\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL)
 
 client = texttospeech.TextToSpeechClient()
-voice = texttospeech.types.VoiceSelectionParams(
+voice = texttospeech.VoiceSelectionParams(
     language_code='en-US',
-    ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
-audio_config = texttospeech.types.AudioConfig(
-    audio_encoding=texttospeech.enums.AudioEncoding.MP3)   
+    # ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
+    name='en-US-Wavenet-F')
+
+audio_config = texttospeech.AudioConfig(
+    audio_encoding=texttospeech.AudioEncoding.MP3)   
 
 for audio in audios:
     file_tag = audio[0]
@@ -38,16 +40,10 @@
     fout.write(audio_script)
     fout.close()
 
-    synthesis_input = texttospeech.types.SynthesisInput(text=audio_script)
-    response = client.synthesize_speech(synthesis_input, voice, audio_config)
+    synthesis_input = texttospeech.SynthesisInput(text=audio_script)
+    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
 
-    # The response's audio_content is binary.
     with open(file_tag+'.mp3', 'wb') as out:
-        # Write the response to the output file.
         out.write(response.audio_content)
 
-    #output_audio_file = file_tag + '.wav'
-    #command = 'cscript ' + speech_config_file + ' ' + output_script_file + ' ' + output_audio_file
-    #os.system(command)
-
     os.remove(output_script_file)
diff --git a/openAI_speech.py b/openAI_speech.py
@@ -0,0 +1,45 @@
+r"""Dub a video script with the OpenAI text-to-speech API.
+
+Each \audio[tag]{text} entry of the input .tex file is saved as tag.txt
+and synthesized into tag.mp3.
+"""
+import sys
+import os
+import re
+from openai import OpenAI
+from pathlib import Path
+
+
+if len(sys.argv) < 2:
+    error_message = "python input_video_script_file (tex)"
+    print(error_message)
+    raise Exception(error_message)
+
+input_video_script_file = sys.argv[1]
+
+# Read through a context manager so the input handle is closed right away.
+with open(input_video_script_file, 'r') as fin:
+    lines = fin.read()
+
+# Raw string: '\s' and '\[' are invalid escapes in a plain string literal.
+audios = re.findall(r'\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL)
+
+client = OpenAI()
+
+for file_tag, audio_script in audios:
+    # Keep a plain-text copy of the narration next to the audio file.
+    with open(file_tag + '.txt', 'w') as fout:
+        fout.write(audio_script)
+
+    speech_file_path = Path(f"{file_tag}.mp3")
+
+    # Use the streaming-response API: calling stream_to_file() on a plain
+    # response is deprecated because it does not actually stream the audio.
+    with client.audio.speech.with_streaming_response.create(
+        model="tts-1",  # adjust model and voice here
+        voice="alloy",
+        input=audio_script,
+    ) as response:
+        response.stream_to_file(speech_file_path)
+
+    print(f"Saved speech to {speech_file_path}")