Skip to content

Commit ab2b1d4

Browse files
authored
Merge pull request #14 from hamees-sayed/hindi
get langs through api, hindi working
2 parents 4e88ab3 + 69eb844 commit ab2b1d4

10 files changed

Lines changed: 140 additions & 114 deletions

File tree

README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Currently, the library supports direct synthesis and the ability to synthesize s
2626

2727
- [Installation](#installation)
2828
- [Get the API Key](#get-the-api-key)
29+
- [Best Practices for Input Text](#best-practices-for-input-text)
2930
- [Examples](#examples)
3031
- [Sync](#sync)
3132
- [Async](#async)
@@ -49,6 +50,15 @@ When using an SDK in your application, make sure to pin to at least the major ve
4950
3. Create a new API Key and copy it.
5051
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
5152

53+
## Best Practices for Input Text
54+
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
55+
56+
For optimal voice generation results:
57+
58+
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
59+
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
60+
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
61+
5262
## Examples
5363

5464
### Sync
@@ -135,7 +145,7 @@ audio_bytes = await tts.synthesize(
135145

136146
### LLM to Speech
137147

138-
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output with minimal latency. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
148+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instances.
139149

140150
```python
141151
import os
@@ -209,7 +219,7 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
209219
```python
210220
from smallest.tts import Smallest
211221

212-
client = Smallest()
222+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
213223

214224
print(f"Available Languages: {client.get_languages()}")
215225
print(f"Available Voices: {client.get_voices()}")
@@ -227,7 +237,7 @@ When implementing audio streaming with chunks of synthesized speech, WAV headers
227237
- Sequential playback of chunks with headers causes audio artifacts (pop sounds) when concatenating or playing audio sequentially.
228238
- Audio players would try to reinitialize audio settings for each chunk.
229239

230-
### Best Practices
240+
### Best Practices for Audio Streaming
231241
1. Stream raw PCM audio data without headers
232242
2. Add a single WAV header only when:
233243
- Saving the complete stream to a file

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
[project]
22
name = "smallestai"
3-
version = "1.3.2"
3+
version = "1.3.4"
44
description = "Official Python client for the Smallest AI API"
55
authors = [
6-
{name = "Smallest", email = "info@smallest.ai"},
6+
{name = "Smallest", email = "support@smallest.ai"},
77
]
88
readme = "README.md"
99
license = {text = "MIT"}

smallest/async_tts.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@
44
import aiofiles
55
from typing import Optional, Union, List
66

7-
from .models import TTSModels, TTSVoices
8-
from .exceptions import TTSError, APIError
9-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
7+
from smallest.models import TTSModels, TTSVoices
8+
from smallest.exceptions import TTSError, APIError
9+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
1010
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212

1313
class AsyncSmallest:
1414
def __init__(
1515
self,
16-
api_key: Optional[str] = None,
16+
api_key: str = None,
1717
model: TTSModels = "lightning",
1818
sample_rate: int = 24000,
1919
voice: TTSVoices = "emily",
@@ -25,8 +25,8 @@ def __init__(
2525
"""
2626
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
2727
28-
This class provides an asynchronous implementation of the text-to-speech functionality.
29-
It allows for non-blocking synthesis of speech from text, making it suitable for applications
28+
This class provides an asynchronous implementation of the text-to-speech functionality.
29+
It allows for non-blocking synthesis of speech from text, making it suitable for applications
3030
that require async processing.
3131
3232
Args:
@@ -49,7 +49,7 @@ def __init__(
4949
if not self.api_key:
5050
raise TTSError()
5151
self.chunk_size = 250
52-
52+
5353
self.opts = TTSOptions(
5454
model=model,
5555
sample_rate=sample_rate,
@@ -61,7 +61,7 @@ def __init__(
6161
remove_extra_silence=remove_extra_silence,
6262
)
6363
self.session = None
64-
64+
6565
async def __aenter__(self):
6666
if self.session is None:
6767
self.session = aiohttp.ClientSession()
@@ -75,15 +75,15 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
7575
def get_languages(self) -> List[str]:
7676
"""Returns a list of available languages."""
7777
return get_smallest_languages()
78-
78+
7979
def get_voices(self) -> List[str]:
8080
"""Returns a list of available voices."""
8181
return get_smallest_voices()
8282

8383
def get_models(self) -> List[str]:
8484
"""Returns a list of available models."""
8585
return get_smallest_models()
86-
86+
8787
async def synthesize(
8888
self,
8989
text: str,
@@ -95,12 +95,12 @@ async def synthesize(
9595
9696
Args:
9797
- text (str): The text to be converted to speech.
98-
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
98+
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
9999
The file must have a .wav extension.
100100
- kwargs: Additional optional parameters to override `__init__` options for this call.
101101
102102
Returns:
103-
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
103+
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
104104
otherwise, returns None after saving the audio to the specified file.
105105
106106
Raises:
@@ -111,7 +111,8 @@ async def synthesize(
111111
for key, value in kwargs.items():
112112
setattr(opts, key, value)
113113

114-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
114+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
115+
115116
chunks = split_into_chunks(text)
116117
audio_content = b""
117118

@@ -134,23 +135,23 @@ async def synthesize(
134135

135136
if not self.session:
136137
self.session = aiohttp.ClientSession()
137-
138+
138139
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
139140
if res.status != 200:
140141
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
141-
142+
142143
audio_content += await res.read()
143144

144145
if save_as:
145146
if not save_as.endswith(".wav"):
146147
raise TTSError("Invalid file name. Extension must be .wav")
147-
148+
148149
async with aiofiles.open(save_as, mode='wb') as f:
149150
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
150151

151152
return None
152153

153154
if opts.add_wav_header:
154155
return add_wav_header(audio_content, self.opts.sample_rate)
155-
156+
156157
return audio_content

smallest/models.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,23 @@
1-
from typing import Literal
1+
from typing import Literal, List, Tuple, cast
2+
import aiohttp
3+
import asyncio
24

3-
TTSModels = Literal["lightning"]
4-
TTSLanguages = Literal["en", "hi"]
5-
TTSVoices = Literal["emily", "jasmine", "arman", "james", "mithali", "aravind", "raj",
6-
"arjun", "sanya", "saina", "pooja", "saurabh", "nisha", "mansi", "radhika", "kajal",
7-
"raghav", "deepika", "niharika", "monika", "raman", "diya", "ananya", "william"]
5+
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
6+
7+
async def _fetch_voice_and_model() -> Tuple[List[str], List[str]]:
8+
async with aiohttp.ClientSession() as session:
9+
async with session.get(f"{API_BASE_URL}/voice/get-all-models") as response:
10+
api_response = await response.json()
11+
12+
voices = []
13+
for model in api_response:
14+
for voice in model['voiceIds']:
15+
voices.append(voice['voiceId'])
16+
models = [model['modelName'] for model in api_response]
17+
return models, voices
18+
19+
models, voices = asyncio.run(_fetch_voice_and_model())
20+
21+
TTSLanguages = ["en", "hi"]
22+
TTSModels = models
23+
TTSVoices = voices

smallest/stream_tts.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
from queue import Queue, Empty
44
from typing import AsyncGenerator, Optional, Union
55

6-
from .tts import Smallest
7-
from .exceptions import APIError
8-
from .async_tts import AsyncSmallest
9-
from .utils import SENTENCE_END_REGEX
6+
from smallest.tts import Smallest
7+
from smallest.exceptions import APIError
8+
from smallest.async_tts import AsyncSmallest
9+
from smallest.utils import SENTENCE_END_REGEX
1010

1111
class TextToAudioStream:
1212
def __init__(

smallest/tts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
import requests
55
from typing import Optional, Union, List
66

7-
from .models import TTSModels, TTSVoices
8-
from .exceptions import TTSError, APIError
9-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
7+
from smallest.models import TTSModels, TTSVoices
8+
from smallest.exceptions import TTSError, APIError
9+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
1010
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212
class Smallest:
1313
def __init__(
1414
self,
15-
api_key: Optional[str] = None,
15+
api_key: str = None,
1616
model: TTSModels = "lightning",
1717
sample_rate: int = 24000,
1818
voice: TTSVoices = "emily",
@@ -100,7 +100,7 @@ def synthesize(
100100
for key, value in kwargs.items():
101101
setattr(opts, key, value)
102102

103-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
103+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
104104

105105
chunks = split_into_chunks(text)
106106
audio_content = b""

smallest/utils.py

Lines changed: 46 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from dataclasses import dataclass
77
from sacremoses import MosesPunctNormalizer
88

9-
from .exceptions import ValidationError
10-
from .models import TTSModels, TTSLanguages, TTSVoices
9+
from smallest.exceptions import ValidationError
10+
from smallest.models import TTSModels, TTSLanguages, TTSVoices
1111

1212

1313
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
@@ -32,12 +32,12 @@ class TTSOptions:
3232
def validate_input(text: str, voice: TTSVoices, model: TTSModels, sample_rate: int, speed: float):
3333
if not text:
3434
raise ValidationError("Text cannot be empty")
35-
if voice not in TTSVoices.__args__:
35+
if voice not in TTSVoices:
3636
raise ValidationError(f"Invalid voice: {voice}")
37-
if model not in ['lightning']:
37+
if model not in TTSModels:
3838
raise ValidationError(f"Invalid model: {model}")
39-
if not 8000 <= sample_rate <= 48000:
40-
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 48000")
39+
if not 8000 <= sample_rate <= 24000:
40+
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
4141
if not 0.5 <= speed <= 2.0:
4242
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
4343

@@ -51,65 +51,59 @@ def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: i
5151

5252

5353
def preprocess_text(text: str) -> str:
54-
# Replace special characters with their normal form
55-
text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
56-
text = text.lower()
57-
text = text.replace("—", " ")
58-
# Normalize punctuation using Moses punct normalizer
54+
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ")
55+
text = re.sub(r'\s+', ' ', text)
5956
mpn = MosesPunctNormalizer()
6057
text = mpn.normalize(text)
6158
return text.strip()
6259

60+
6361
def split_into_chunks(text: str) -> List[str]:
64-
"""
65-
Splits the input text into chunks based on sentence boundaries
66-
defined by SENTENCE_END_REGEX and the maximum chunk size.
67-
"""
68-
chunks = []
69-
current_chunk = ""
70-
last_break_index = 0
71-
72-
i = 0
73-
while i < len(text):
74-
current_chunk += text[i]
75-
76-
# Check for sentence boundary using regex
77-
if SENTENCE_END_REGEX.match(current_chunk):
62+
"""
63+
Splits the input text into chunks based on sentence boundaries
64+
defined by SENTENCE_END_REGEX and the maximum chunk size.
65+
Only splits at valid sentence boundaries to avoid breaking words.
66+
"""
67+
chunks = []
68+
while text:
69+
# If the remaining text is shorter than chunk size, add it as final chunk
70+
if len(text) <= CHUNK_SIZE:
71+
chunks.append(text.strip())
72+
break
73+
74+
# Find the last sentence boundary within CHUNK_SIZE
75+
chunk_text = text[:CHUNK_SIZE]
76+
last_break_index = -1
77+
78+
# Check each character in reverse order to find last punctuation
79+
for i in range(len(chunk_text) - 1, -1, -1):
80+
if chunk_text[i] in '-.—!?;:…\n':
7881
last_break_index = i
82+
break
7983

80-
if len(current_chunk) >= CHUNK_SIZE:
81-
if last_break_index > 0:
82-
# Split at the last valid sentence boundary
83-
chunk = text[:last_break_index + 1].strip()
84-
chunk = chunk.replace("—", " ")
85-
chunks.append(chunk)
86-
87-
text = text[last_break_index + 1:]
88-
i = -1 # Reset index to process the remaining text
89-
current_chunk = ""
90-
last_break_index = 0
91-
else:
92-
# No sentence boundary found, split at max length
93-
current_chunk = current_chunk.replace("—", " ")
94-
chunks.append(current_chunk.strip())
95-
text = text[CHUNK_SIZE:]
96-
i = -1 # Reset index to process the remaining text
97-
current_chunk = ""
98-
99-
i += 1
100-
101-
if text:
102-
text = text.replace("—", " ")
103-
chunks.append(text.strip())
84+
if last_break_index == -1:
85+
# If no punctuation found in chunk, look for the last space
86+
# to avoid breaking words
87+
last_space = chunk_text.rfind(' ')
88+
if last_space != -1:
89+
last_break_index = last_space
90+
else:
91+
# If no space found, use the full chunk size
92+
last_break_index = CHUNK_SIZE - 1
93+
94+
# Add the chunk up to the break point
95+
chunks.append(text[:last_break_index + 1].strip())
96+
# Continue with remaining text
97+
text = text[last_break_index + 1:].strip()
10498

105-
return chunks
99+
return chunks
106100

107101

108102
def get_smallest_languages() -> List[str]:
109-
return list(TTSLanguages.__args__)
103+
return list(TTSLanguages)
110104

111105
def get_smallest_voices() -> List[str]:
112-
return list(TTSVoices.__args__)
106+
return list(TTSVoices)
113107

114108
def get_smallest_models() -> List[str]:
115109
return ["lightning"]

0 commit comments

Comments
 (0)