-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathspeech_animation.py
More file actions
159 lines (130 loc) · 5.85 KB
/
speech_animation.py
File metadata and controls
159 lines (130 loc) · 5.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import json
import logging
import sys
from contextlib import closing

import boto3
import pygame
from botocore.exceptions import ClientError
logger = logging.getLogger(__name__)


class SpeechSynthesizer:
    """Synthesizes speech and viseme speech marks with Amazon Polly and saves them to disk."""

    def __init__(self):
        self.polly_client = boto3.client('polly')

    def synthesize(self, text, engine='neural', voice='Joanna', audio_format='mp3',
                   lang_code=None, include_visemes=True,
                   audio_file_path="/home/emoore/speech.mp3",
                   viseme_file_path="/home/emoore/visemes.json"):
        """
        Synthesizes speech or speech marks from text, using the specified voice.

        :param text: The text to synthesize.
        :param engine: The kind of engine used. Can be standard or neural.
        :param voice: The ID of the voice to use.
        :param audio_format: The audio format to return for synthesized speech. When
                             speech marks are synthesized, the output format is JSON.
        :param lang_code: The language code of the voice to use. This has an effect
                          only when a bilingual voice is selected.
        :param include_visemes: When True, a second request is made to Amazon Polly
                                to synthesize a list of visemes, using the specified
                                text and voice. A viseme represents the visual position
                                of the face and mouth when saying part of a word.
        :param audio_file_path: Where the synthesized audio is written.
        :param viseme_file_path: Where the viseme speech marks are written when
                                 include_visemes is True.
        :return: A tuple (audio path, viseme path). The viseme path is None when
                 include_visemes is False.
        :raises ClientError: When a Polly request fails; the error is logged first.
        """
        kwargs = {
            "Engine": engine,
            "OutputFormat": audio_format,
            "Text": text,
            "VoiceId": voice,
        }
        if lang_code is not None:
            kwargs["LanguageCode"] = lang_code

        # First request: the audio itself.
        audio_bytes = self._request_stream(kwargs)
        with open(audio_file_path, 'wb') as file:
            file.write(audio_bytes)
        print(f"Audio file saved to {audio_file_path}")

        if not include_visemes:
            return audio_file_path, None

        # Second request: same text and voice, but asking for viseme speech
        # marks, which Polly delivers as JSON instead of audio.
        kwargs["OutputFormat"] = "json"
        kwargs["SpeechMarkTypes"] = ["viseme"]
        visemes = self._request_stream(kwargs).decode()
        with open(viseme_file_path, 'w') as file:
            file.write(visemes)
        print(f"Viseme file saved to {viseme_file_path}")
        return audio_file_path, viseme_file_path

    def _request_stream(self, kwargs):
        """Send one synthesize_speech request and return the full response body as bytes."""
        try:
            response = self.polly_client.synthesize_speech(**kwargs)
            # Close the streaming body once it has been read so the underlying
            # connection is released promptly.
            with closing(response["AudioStream"]) as stream:
                return stream.read()
        except ClientError:
            logger.exception("Couldn't synthesize speech.")
            raise
# Lookup table from a viseme value to the mouth-shape image drawn for it.
# Several visemes deliberately share one image, so each entry below pairs a
# viseme with the shape suffix of the PNG it should use.
viseme_to_filename = {
    viseme: f'/home/emoore/visemes/lips_{shape}.png'
    for viseme, shape in (
        ('e', 'e'),
        ('i', 'e'),
        ('m', 'm'),
        ('o', 'o'),
        ('c', 'c'),
        ('k', 'c'),
        ('t', 'c'),
        ('a', 'a'),
        ('u', 'u'),
        ('@', 'u'),
        ('E', 'u'),
        ('O', 'u'),
        ('th', 'th'),
        ('f', 'f'),
        ('w', 'w'),
        ('ch', 'ch'),
        ('r', 'r'),
        ('sil', 'sil'),
        ('p', 'm'),
        ('s', 'c'),
        ('S', 'ch'),
        ('T', 'th'),
    )
}
# Start Pygame and open the display window as soon as the module is loaded;
# display_viseme() draws onto the module-level `screen` created here.
pygame.init()
# The 'p' image is loaded only to measure it. Any key of viseme_to_filename
# could be used instead, presumably because every viseme image has the same
# dimensions — TODO confirm.
image = pygame.image.load(viseme_to_filename['p'])
image_width, image_height = image.get_size()
# Create a window with the size of the image
screen = pygame.display.set_mode((image_width, image_height))
# Decoded viseme surfaces keyed by image path, so each PNG is read from disk
# only once no matter how often its mouth shape is shown during an animation.
_viseme_image_cache = {}


def display_viseme(viseme):
    """Draw the mouth image for *viseme* on the module-level screen and show it.

    :param viseme: A key of viseme_to_filename.
    :raises KeyError: When *viseme* has no entry in viseme_to_filename.
    """
    # Clear the screen
    screen.fill((0, 0, 0))

    # Reuse the already-decoded surface when this image has been shown before.
    image_path = viseme_to_filename[viseme]
    surface = _viseme_image_cache.get(image_path)
    if surface is None:
        surface = pygame.image.load(image_path)
        _viseme_image_cache[image_path] = surface

    # Draw the viseme image
    screen.blit(surface, (0, 0))
    # Update the display
    pygame.display.flip()
def display_animation(text):
    """Speak *text* through Amazon Polly while animating the matching mouth shapes.

    Synthesizes the audio and viseme files, plays the audio, and calls
    display_viseme() for each viseme once the playback clock reaches its
    timestamp. Pygame is shut down when the animation finishes, and the
    process exits if the window is closed while it is running.

    :param text: The text to speak and animate.
    """
    synthesizer = SpeechSynthesizer()
    audio_path, viseme_path = synthesizer.synthesize(text=text)

    # Load the speech sound file and the viseme timings
    speech_sound = pygame.mixer.Sound(audio_path)
    viseme_timings = _load_viseme_timings(viseme_path)

    # Size the window to the viseme images. The returned surface is not kept
    # because display_viseme() draws through the module-level screen.
    image = pygame.image.load(viseme_to_filename['p'])
    pygame.display.set_mode(image.get_size())

    # Main animation loop
    clock = pygame.time.Clock()
    speech_sound.play()
    start_time = pygame.time.get_ticks()  # Playback start, in milliseconds
    viseme_index = 0
    viseme_count = len(viseme_timings)
    while pygame.mixer.get_busy() or viseme_index < viseme_count:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

        # Assumes each entry's 'time' is milliseconds from the start of the
        # audio, the same unit as get_ticks() — confirm against the Polly
        # speech mark format.
        elapsed_time = pygame.time.get_ticks() - start_time

        # Consume every viseme that is already due and draw only the newest,
        # so visemes closer together than one frame cannot fall behind the audio.
        due_viseme = None
        while (viseme_index < viseme_count
               and elapsed_time >= viseme_timings[viseme_index]['time']):
            due_viseme = viseme_timings[viseme_index]['value']
            viseme_index += 1
        if due_viseme is not None:
            display_viseme(due_viseme)

        pygame.display.flip()
        clock.tick(60)
    pygame.quit()


def _load_viseme_timings(viseme_path):
    """Read a file holding one JSON object per line and return the parsed objects as a list."""
    with open(viseme_path, 'r') as f:
        # Blank lines (for example a trailing newline) are skipped rather than
        # handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]
# Example usage. The sample sentence stays a module-level name, but the
# animation only runs when this file is executed as a script, so importing
# the module no longer calls Polly or plays audio as a side effect.
text = "I need to say a lot of stuff to see whether this is working correctly."
if __name__ == "__main__":
    display_animation(text)