Skip to content

Commit 9dc2343

Browse files
authored
Merge pull request #9 from smallest-inc/long_text
long text handling
2 parents 2e46e66 + c756dff commit 9dc2343

6 files changed

Lines changed: 207 additions & 83 deletions

File tree

README.md

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,20 @@ if __name__ == "__main__":
7575
- `speed`: Speech speed multiplier (default: 1.0)
7676
- `add_wav_header`: Include WAV header in output (default: True)
7777
- `transliterate`: Enable text transliteration (default: False)
78-
- `remove_extra_silence`: Remove additional silence (default: True)
78+
- `remove_extra_silence`: Remove additional silence (default: True)
79+
80+
These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override these parameters for a specific synthesis request.
81+
82+
For example, you can modify the speech speed and sample rate just for a particular synthesis call:
83+
```py
84+
client.synthesize(
85+
"Hello, this is a test for sync synthesis function.",
86+
save_as="sync_synthesize.wav",
87+
speed=1.5, # Overrides default speed
88+
sample_rate=16000 # Overrides default sample rate
89+
)
90+
```
91+
7992

8093
### Async
8194
Asynchronous text-to-speech synthesis client.
@@ -107,7 +120,18 @@ if __name__ == "__main__":
107120
- `speed`: Speech speed multiplier (default: 1.0)
108121
- `add_wav_header`: Include WAV header in output (default: True)
109122
- `transliterate`: Enable text transliteration (default: False)
110-
- `remove_extra_silence`: Remove additional silence (default: True)
123+
- `remove_extra_silence`: Remove additional silence (default: False)
124+
125+
These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override any of these parameters on a per-request basis.
126+
127+
For example, you can modify the speech speed and sample rate just for a particular synthesis request:
128+
```py
129+
audio_bytes = await tts.synthesize(
130+
"Hello, this is a test of the async synthesis function.",
131+
speed=1.5, # Overrides default speed
132+
sample_rate=16000 # Overrides default sample rate
133+
)
134+
```
111135

112136
### LLM to Speech
113137

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "smallestai"
3-
version = "1.2.0"
3+
version = "1.3.0"
44
description = "Official Python client for the Smallest AI API"
55
authors = [
66
{name = "Smallest", email = "info@smallest.ai"},

smallest/async_tts.py

Lines changed: 85 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,20 @@
77
from .models import TTSModels, TTSVoices
88
from .exceptions import TTSError, APIError
99
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
10-
get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
10+
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212

1313
class AsyncSmallest:
1414
def __init__(
15-
self,
16-
api_key: Optional[str] = None,
17-
model: TTSModels = "lightning",
18-
sample_rate: int = 24000,
19-
voice: TTSVoices = "emily",
20-
speed: Optional[float] = 1.0,
21-
add_wav_header: Optional[bool] = True,
22-
transliterate: Optional[bool] = False,
23-
remove_extra_silence: Optional[bool] = False
15+
self,
16+
api_key: Optional[str] = None,
17+
model: TTSModels = "lightning",
18+
sample_rate: int = 24000,
19+
voice: TTSVoices = "emily",
20+
speed: Optional[float] = 1.0,
21+
add_wav_header: Optional[bool] = True,
22+
transliterate: Optional[bool] = False,
23+
remove_extra_silence: Optional[bool] = False
2424
) -> None:
2525
"""
2626
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -48,6 +48,7 @@ def __init__(
4848
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
4949
if not self.api_key:
5050
raise TTSError("API key is required")
51+
self.chunk_size = 250
5152

5253
self.opts = TTSOptions(
5354
model=model,
@@ -70,6 +71,48 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
7071
if self.session:
7172
await self.session.close()
7273

74+
def _split_into_chunks(self, text: str) -> List[str]:
75+
"""
76+
Splits the input text into chunks based on sentence boundaries and the maximum chunk size.
77+
"""
78+
chunks = []
79+
current_chunk = ""
80+
last_break_index = 0
81+
82+
i = 0
83+
while i < len(text):
84+
current_chunk += text[i]
85+
86+
if text[i] in ".,":
87+
last_break_index = i
88+
89+
if len(current_chunk) >= self.chunk_size:
90+
if last_break_index > 0:
91+
chunk = text[:last_break_index + 1].strip()
92+
chunk = chunk.replace("—", " ")
93+
chunks.append(chunk)
94+
95+
text = text[last_break_index + 1:]
96+
i = -1
97+
current_chunk = ""
98+
last_break_index = 0
99+
else:
100+
# No break point found, split at max length
101+
current_chunk = current_chunk.replace("—", " ")
102+
chunks.append(current_chunk.strip())
103+
text = text[self.chunk_size:]
104+
i = -1
105+
current_chunk = ""
106+
107+
i += 1
108+
109+
if text:
110+
text = text.replace("—", " ")
111+
chunks.append(text.strip())
112+
113+
return chunks
114+
115+
73116
def get_languages(self) -> List[str]:
74117
"""Returns a list of available languages."""
75118
return get_smallest_languages()
@@ -110,42 +153,45 @@ async def synthesize(
110153
setattr(opts, key, value)
111154

112155
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
113-
114-
payload = {
115-
"text": preprocess_text(text),
116-
"sample_rate": opts.sample_rate,
117-
"voice_id": opts.voice,
118-
"add_wav_header": opts.add_wav_header,
119-
"speed": opts.speed,
120-
"model": opts.model,
121-
"transliterate": opts.transliterate,
122-
"remove_extra_silence": opts.remove_extra_silence
123-
}
124-
125-
headers = {
126-
"Authorization": f"Bearer {self.api_key}",
127-
"Content-Type": "application/json",
128-
}
129-
130-
if not self.session:
131-
self.session = aiohttp.ClientSession()
156+
chunks = self._split_into_chunks(text)
157+
audio_content = b""
158+
159+
for chunk in chunks:
160+
payload = {
161+
"text": preprocess_text(chunk),
162+
"sample_rate": opts.sample_rate,
163+
"voice_id": opts.voice,
164+
"add_wav_header": False,
165+
"speed": opts.speed,
166+
"model": opts.model,
167+
"transliterate": opts.transliterate,
168+
"remove_extra_silence": opts.remove_extra_silence
169+
}
170+
171+
headers = {
172+
"Authorization": f"Bearer {self.api_key}",
173+
"Content-Type": "application/json",
174+
}
175+
176+
if not self.session:
177+
self.session = aiohttp.ClientSession()
132178

133-
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
134-
if res.status != 200:
135-
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
179+
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
180+
if res.status != 200:
181+
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
136182

137-
audio_content = await res.read()
183+
audio_content += await res.read()
138184

139185
if save_as:
140186
if not save_as.endswith(".wav"):
141187
raise TTSError("Invalid file name. Extension must be .wav")
142188

143-
if self.opts.add_wav_header:
144-
async with aiofiles.open(save_as, mode='wb') as f:
145-
await f.write(audio_content)
146-
else:
147-
async with aiofiles.open(save_as, mode='wb') as f:
148-
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
189+
async with aiofiles.open(save_as, mode='wb') as f:
190+
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
191+
149192
return None
150193

194+
if opts.add_wav_header:
195+
return add_wav_header(audio_content, self.opts.sample_rate)
196+
151197
return audio_content

smallest/stream_tts.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ def __init__(
3434
max_retries: Number of retry attempts for failed synthesis (default: 3)
3535
"""
3636
self.tts_instance = tts_instance
37+
self.tts_instance.opts.add_wav_header = False
38+
3739
self.sentence_end_regex = SENTENCE_END_REGEX
3840
self.queue_timeout = queue_timeout
3941
self.max_retries = max_retries
4042
self.queue = Queue()
4143
self.buffer_size = 250
4244
self.stop_flag = False
43-
self.tts_instance.opts.add_wav_header = False
4445

4546

4647
async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
@@ -53,7 +54,7 @@ async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> Non
5354
buffer = ""
5455
async for chunk in llm_output:
5556
buffer += chunk
56-
if self.sentence_end_regex.match(buffer) or self.buffer_size > 600:
57+
if self.sentence_end_regex.match(buffer) or len(buffer) > self.buffer_size:
5758
self.queue.put(buffer)
5859
buffer = ""
5960

0 commit comments

Comments
 (0)