Skip to content

Commit ab2b1d4

Browse files
authored
Merge pull request #14 from hamees-sayed/hindi
get langs through api, hindi working
2 parents 4e88ab3 + 69eb844 commit ab2b1d4

10 files changed

Lines changed: 140 additions & 114 deletions

File tree

README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Currently, the library supports direct synthesis and the ability to synthesize s
2626

2727
- [Installation](#installation)
2828
- [Get the API Key](#get-the-api-key)
29+
- [Best Practices for Input Text](#best-practices-for-input-text)
2930
- [Examples](#examples)
3031
- [Sync](#sync)
3132
- [Async](#async)
@@ -49,6 +50,15 @@ When using an SDK in your application, make sure to pin to at least the major ve
4950
3. Create a new API Key and copy it.
5051
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
5152

53+
## Best Practices for Input Text
54+
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
55+
56+
For optimal voice generation results:
57+
58+
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
59+
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
60+
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
61+
5262
## Examples
5363

5464
### Sync
@@ -135,7 +145,7 @@ audio_bytes = await tts.synthesize(
135145

136146
### LLM to Speech
137147

138-
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output with minimal latency. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
148+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instances.
139149

140150
```python
141151
import os
@@ -209,7 +219,7 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
209219
```python
210220
from smallest.tts import Smallest
211221

212-
client = Smallest()
222+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
213223

214224
print(f"Available Languages: {client.get_languages()}")
215225
print(f"Available Voices: {client.get_voices()}")
@@ -227,7 +237,7 @@ When implementing audio streaming with chunks of synthesized speech, WAV headers
227237
- Sequential playback of chunks with headers causes audio artifacts (pop sounds) when concatenating or playing audio sequentially.
228238
- Audio players would try to reinitialize audio settings for each chunk.
229239

230-
### Best Practices
240+
### Best Practices for Audio Streaming
231241
1. Stream raw PCM audio data without headers
232242
2. Add a single WAV header only when:
233243
- Saving the complete stream to a file

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
[project]
22
name = "smallestai"
3-
version = "1.3.2"
3+
version = "1.3.4"
44
description = "Official Python client for the Smallest AI API"
55
authors = [
6-
{name = "Smallest", email = "info@smallest.ai"},
6+
{name = "Smallest", email = "support@smallest.ai"},
77
]
88
readme = "README.md"
99
license = {text = "MIT"}

smallest/async_tts.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@
44
import aiofiles
55
from typing import Optional, Union, List
66

7-
from .models import TTSModels, TTSVoices
8-
from .exceptions import TTSError, APIError
9-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
7+
from smallest.models import TTSModels, TTSVoices
8+
from smallest.exceptions import TTSError, APIError
9+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
1010
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212

1313
class AsyncSmallest:
1414
def __init__(
1515
self,
16-
api_key: Optional[str] = None,
16+
api_key: str = None,
1717
model: TTSModels = "lightning",
1818
sample_rate: int = 24000,
1919
voice: TTSVoices = "emily",
@@ -25,8 +25,8 @@ def __init__(
2525
"""
2626
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
2727
28-
This class provides an asynchronous implementation of the text-to-speech functionality.
29-
It allows for non-blocking synthesis of speech from text, making it suitable for applications
28+
This class provides an asynchronous implementation of the text-to-speech functionality.
29+
It allows for non-blocking synthesis of speech from text, making it suitable for applications
3030
that require async processing.
3131
3232
Args:
@@ -49,7 +49,7 @@ def __init__(
4949
if not self.api_key:
5050
raise TTSError()
5151
self.chunk_size = 250
52-
52+
5353
self.opts = TTSOptions(
5454
model=model,
5555
sample_rate=sample_rate,
@@ -61,7 +61,7 @@ def __init__(
6161
remove_extra_silence=remove_extra_silence,
6262
)
6363
self.session = None
64-
64+
6565
async def __aenter__(self):
6666
if self.session is None:
6767
self.session = aiohttp.ClientSession()
@@ -75,15 +75,15 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
7575
def get_languages(self) -> List[str]:
7676
"""Returns a list of available languages."""
7777
return get_smallest_languages()
78-
78+
7979
def get_voices(self) -> List[str]:
8080
"""Returns a list of available voices."""
8181
return get_smallest_voices()
8282

8383
def get_models(self) -> List[str]:
8484
"""Returns a list of available models."""
8585
return get_smallest_models()
86-
86+
8787
async def synthesize(
8888
self,
8989
text: str,
@@ -95,12 +95,12 @@ async def synthesize(
9595
9696
Args:
9797
- text (str): The text to be converted to speech.
98-
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
98+
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
9999
The file must have a .wav extension.
100100
- kwargs: Additional optional parameters to override `__init__` options for this call.
101101
102102
Returns:
103-
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
103+
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
104104
otherwise, returns None after saving the audio to the specified file.
105105
106106
Raises:
@@ -111,7 +111,8 @@ async def synthesize(
111111
for key, value in kwargs.items():
112112
setattr(opts, key, value)
113113

114-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
114+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
115+
115116
chunks = split_into_chunks(text)
116117
audio_content = b""
117118

@@ -134,23 +135,23 @@ async def synthesize(
134135

135136
if not self.session:
136137
self.session = aiohttp.ClientSession()
137-
138+
138139
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
139140
if res.status != 200:
140141
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
141-
142+
142143
audio_content += await res.read()
143144

144145
if save_as:
145146
if not save_as.endswith(".wav"):
146147
raise TTSError("Invalid file name. Extension must be .wav")
147-
148+
148149
async with aiofiles.open(save_as, mode='wb') as f:
149150
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
150151

151152
return None
152153

153154
if opts.add_wav_header:
154155
return add_wav_header(audio_content, self.opts.sample_rate)
155-
156+
156157
return audio_content

smallest/models.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,23 @@
1-
from typing import Literal
1+
from typing import Literal, List, Tuple, cast
2+
import aiohttp
3+
import asyncio
24

3-
TTSModels = Literal["lightning"]
4-
TTSLanguages = Literal["en", "hi"]
5-
TTSVoices = Literal["emily", "jasmine", "arman", "james", "mithali", "aravind", "raj",
6-
"arjun", "sanya", "saina", "pooja", "saurabh", "nisha", "mansi", "radhika", "kajal",
7-
"raghav", "deepika", "niharika", "monika", "raman", "diya", "ananya", "william"]
5+
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
6+
7+
async def _fetch_voice_and_model() -> Tuple[List[str], List[str]]:
8+
async with aiohttp.ClientSession() as session:
9+
async with session.get(f"{API_BASE_URL}/voice/get-all-models") as response:
10+
api_response = await response.json()
11+
12+
voices = []
13+
for model in api_response:
14+
for voice in model['voiceIds']:
15+
voices.append(voice['voiceId'])
16+
models = [model['modelName'] for model in api_response]
17+
return models, voices
18+
19+
models, voices = asyncio.run(_fetch_voice_and_model())
20+
21+
TTSLanguages = ["en", "hi"]
22+
TTSModels = models
23+
TTSVoices = voices

smallest/stream_tts.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
from queue import Queue, Empty
44
from typing import AsyncGenerator, Optional, Union
55

6-
from .tts import Smallest
7-
from .exceptions import APIError
8-
from .async_tts import AsyncSmallest
9-
from .utils import SENTENCE_END_REGEX
6+
from smallest.tts import Smallest
7+
from smallest.exceptions import APIError
8+
from smallest.async_tts import AsyncSmallest
9+
from smallest.utils import SENTENCE_END_REGEX
1010

1111
class TextToAudioStream:
1212
def __init__(

smallest/tts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
import requests
55
from typing import Optional, Union, List
66

7-
from .models import TTSModels, TTSVoices
8-
from .exceptions import TTSError, APIError
9-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
7+
from smallest.models import TTSModels, TTSVoices
8+
from smallest.exceptions import TTSError, APIError
9+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
1010
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212
class Smallest:
1313
def __init__(
1414
self,
15-
api_key: Optional[str] = None,
15+
api_key: str = None,
1616
model: TTSModels = "lightning",
1717
sample_rate: int = 24000,
1818
voice: TTSVoices = "emily",
@@ -100,7 +100,7 @@ def synthesize(
100100
for key, value in kwargs.items():
101101
setattr(opts, key, value)
102102

103-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
103+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
104104

105105
chunks = split_into_chunks(text)
106106
audio_content = b""

smallest/utils.py

Lines changed: 46 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from dataclasses import dataclass
77
from sacremoses import MosesPunctNormalizer
88

9-
from .exceptions import ValidationError
10-
from .models import TTSModels, TTSLanguages, TTSVoices
9+
from smallest.exceptions import ValidationError
10+
from smallest.models import TTSModels, TTSLanguages, TTSVoices
1111

1212

1313
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
@@ -32,12 +32,12 @@ class TTSOptions:
3232
def validate_input(text: str, voice: TTSVoices, model: TTSModels, sample_rate: int, speed: float):
3333
if not text:
3434
raise ValidationError("Text cannot be empty")
35-
if voice not in TTSVoices.__args__:
35+
if voice not in TTSVoices:
3636
raise ValidationError(f"Invalid voice: {voice}")
37-
if model not in ['lightning']:
37+
if model not in TTSModels:
3838
raise ValidationError(f"Invalid model: {model}")
39-
if not 8000 <= sample_rate <= 48000:
40-
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 48000")
39+
if not 8000 <= sample_rate <= 24000:
40+
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
4141
if not 0.5 <= speed <= 2.0:
4242
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
4343

@@ -51,65 +51,59 @@ def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: i
5151

5252

5353
def preprocess_text(text: str) -> str:
54-
# Replace special characters with their normal form
55-
text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
56-
text = text.lower()
57-
text = text.replace("—", " ")
58-
# Normalize punctuation using Moses punct normalizer
54+
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ")
55+
text = re.sub(r'\s+', ' ', text)
5956
mpn = MosesPunctNormalizer()
6057
text = mpn.normalize(text)
6158
return text.strip()
6259

60+
6361
def split_into_chunks(text: str) -> List[str]:
64-
"""
65-
Splits the input text into chunks based on sentence boundaries
66-
defined by SENTENCE_END_REGEX and the maximum chunk size.
67-
"""
68-
chunks = []
69-
current_chunk = ""
70-
last_break_index = 0
71-
72-
i = 0
73-
while i < len(text):
74-
current_chunk += text[i]
75-
76-
# Check for sentence boundary using regex
77-
if SENTENCE_END_REGEX.match(current_chunk):
62+
"""
63+
Splits the input text into chunks based on sentence boundaries
64+
defined by SENTENCE_END_REGEX and the maximum chunk size.
65+
Only splits at valid sentence boundaries to avoid breaking words.
66+
"""
67+
chunks = []
68+
while text:
69+
# If the remaining text is shorter than chunk size, add it as final chunk
70+
if len(text) <= CHUNK_SIZE:
71+
chunks.append(text.strip())
72+
break
73+
74+
# Find the last sentence boundary within CHUNK_SIZE
75+
chunk_text = text[:CHUNK_SIZE]
76+
last_break_index = -1
77+
78+
# Check each character in reverse order to find last punctuation
79+
for i in range(len(chunk_text) - 1, -1, -1):
80+
if chunk_text[i] in '-.—!?;:…\n':
7881
last_break_index = i
82+
break
7983

80-
if len(current_chunk) >= CHUNK_SIZE:
81-
if last_break_index > 0:
82-
# Split at the last valid sentence boundary
83-
chunk = text[:last_break_index + 1].strip()
84-
chunk = chunk.replace("—", " ")
85-
chunks.append(chunk)
86-
87-
text = text[last_break_index + 1:]
88-
i = -1 # Reset index to process the remaining text
89-
current_chunk = ""
90-
last_break_index = 0
91-
else:
92-
# No sentence boundary found, split at max length
93-
current_chunk = current_chunk.replace("—", " ")
94-
chunks.append(current_chunk.strip())
95-
text = text[CHUNK_SIZE:]
96-
i = -1 # Reset index to process the remaining text
97-
current_chunk = ""
98-
99-
i += 1
100-
101-
if text:
102-
text = text.replace("—", " ")
103-
chunks.append(text.strip())
84+
if last_break_index == -1:
85+
# If no punctuation found in chunk, look for the last space
86+
# to avoid breaking words
87+
last_space = chunk_text.rfind(' ')
88+
if last_space != -1:
89+
last_break_index = last_space
90+
else:
91+
# If no space found, use the full chunk size
92+
last_break_index = CHUNK_SIZE - 1
93+
94+
# Add the chunk up to the break point
95+
chunks.append(text[:last_break_index + 1].strip())
96+
# Continue with remaining text
97+
text = text[last_break_index + 1:].strip()
10498

105-
return chunks
99+
return chunks
106100

107101

108102
def get_smallest_languages() -> List[str]:
109-
return list(TTSLanguages.__args__)
103+
return list(TTSLanguages)
110104

111105
def get_smallest_voices() -> List[str]:
112-
return list(TTSVoices.__args__)
106+
return list(TTSVoices)
113107

114108
def get_smallest_models() -> List[str]:
115109
return ["lightning"]

0 commit comments

Comments
 (0)