Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,26 @@ The code I used to dub my paper videos without requiring human help.

Windows, preferably 7 or 10 for reasonable voice quality.

## Usage ##
## MS Speech Usage ##

```
python speech.py speech.vbs example.tex

```
## Google Speech Usage ##
```
python google_speech.py example.tex

```

The output will be in the .wav files for MS Speech and in the .mp3 files for Google Speech.



## OpenAI Speech Usage ##

```
python openAI_speech.py example.tex
```

The output will be in the .mp3 files.
20 changes: 8 additions & 12 deletions google_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
audios = re.findall('\\\\audio\s*\[(.*?)\]\s*{(.*?)}', lines, re.DOTALL)

client = texttospeech.TextToSpeechClient()
voice = texttospeech.types.VoiceSelectionParams(
voice = texttospeech.VoiceSelectionParams(
language_code='en-US',
ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
audio_config = texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
#ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE,
name='en-US-Wavenet-F')

audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3)

for audio in audios:
file_tag = audio[0]
Expand All @@ -38,16 +40,10 @@
fout.write(audio_script)
fout.close()

synthesis_input = texttospeech.types.SynthesisInput(text=audio_script)
response = client.synthesize_speech(synthesis_input, voice, audio_config)
synthesis_input = texttospeech.SynthesisInput(text=audio_script)
response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)

# The response's audio_content is binary.
with open(file_tag+'.mp3', 'wb') as out:
# Write the response to the output file.
out.write(response.audio_content)

#output_audio_file = file_tag + '.wav'
#command = 'cscript ' + speech_config_file + ' ' + output_script_file + ' ' + output_audio_file
#os.system(command)

os.remove(output_script_file)
45 changes: 45 additions & 0 deletions openAI_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import sys
import os
import re
from openai import OpenAI
from pathlib import Path


# Script entry point: read a .tex video script, extract every
# \audio[<tag>]{<text>} entry, and synthesize each <text> to <tag>.mp3 with the
# OpenAI text-to-speech API.
argc = len(sys.argv)

if argc < 2:
    # Usage line shown (and raised) when the script path argument is missing.
    error_message = "python openAI_speech.py input_video_script_file (tex)"
    print(error_message)
    raise Exception(error_message)

input_video_script_file = sys.argv[1]

# Read the whole script at once; the context manager closes the handle even
# if reading fails.
with open(input_video_script_file, 'r') as fin:
    lines = fin.read()

# Each match is a (file_tag, audio_script) tuple. A raw string keeps the regex
# escapes (\s, \[, \\) from being interpreted as Python string escapes, and
# re.DOTALL lets the narration text span several lines.
audios = re.findall(r'\\audio\s*\[(.*?)\]\s*\{(.*?)\}', lines, re.DOTALL)

client = OpenAI()

for file_tag, audio_script in audios:
    # Save the narration text next to the audio as <tag>.txt.
    output_script_file = file_tag + '.txt'
    with open(output_script_file, 'w') as fout:
        fout.write(audio_script)

    # Path of the generated audio file.
    speech_file_path = Path(f"{file_tag}.mp3")

    # Use the streaming-response variant: calling stream_to_file() on the
    # plain response is deprecated because it does not actually stream.
    with client.audio.speech.with_streaming_response.create(
        # Adjust model and voice here.
        model="tts-1",
        voice="alloy",
        input=audio_script,
    ) as response:
        response.stream_to_file(speech_file_path)

    print(f"Saved speech to {speech_file_path}")