Merge pull request #9 from smallest-inc/long_text

hamees-sayed · web-flow · commit 9dc234386e30 · 2024-12-03T18:55:11.000+05:30
long text handling
diff --git a/README.md b/README.md
@@ -75,7 +75,20 @@ if __name__ == "__main__":
 - `speed`: Speech speed multiplier (default: 1.0)
 - `add_wav_header`: Include WAV header in output (default: True)
 - `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `remove_extra_silence`: Remove additional silence (default: True)  
+
+These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override these parameters for a specific synthesis request.
+
+For example, you can modify the speech speed and sample rate just for a particular synthesis call:  
+```py
+client.synthesize(
+    "Hello, this is a test for sync synthesis function.",
+    save_as="sync_synthesize.wav",
+    speed=1.5,  # Overrides default speed
+    sample_rate=16000  # Overrides default sample rate
+)
+```
+
 
 ### Async   
 Asynchronous text-to-speech synthesis client.    
@@ -107,7 +120,18 @@ if __name__ == "__main__":
 - `speed`: Speech speed multiplier (default: 1.0)
 - `add_wav_header`: Include WAV header in output (default: True)
 - `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `remove_extra_silence`: Remove additional silence (default: False)  
+
+These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override any of these parameters on a per-request basis.
+
+For example, you can modify the speech speed and sample rate just for a particular synthesis request:  
+```py
+audio_bytes = await tts.synthesize(
+    "Hello, this is a test of the async synthesis function.",
+    speed=1.5,  # Overrides default speed
+    sample_rate=16000  # Overrides default sample rate
+)
+```
 
 ### LLM to Speech    
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "smallestai"
-version = "1.2.0"
+version = "1.3.0"
 description = "Official Python client for the Smallest AI API"
 authors = [
     {name = "Smallest", email = "info@smallest.ai"},
diff --git a/smallest/async_tts.py b/smallest/async_tts.py
@@ -7,20 +7,20 @@
 from .models import TTSModels, TTSVoices
 from .exceptions import TTSError, APIError
 from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
-                     get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
+                     get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
 
 
 class AsyncSmallest:
     def __init__(
-            self,
-            api_key: Optional[str] = None,
-            model: TTSModels = "lightning",
-            sample_rate: int = 24000,
-            voice: TTSVoices = "emily",
-            speed: Optional[float] = 1.0,
-            add_wav_header: Optional[bool] = True,
-            transliterate: Optional[bool] = False,
-            remove_extra_silence: Optional[bool] = False
+        self,
+        api_key: Optional[str] = None,
+        model: TTSModels = "lightning",
+        sample_rate: int = 24000,
+        voice: TTSVoices = "emily",
+        speed: Optional[float] = 1.0,
+        add_wav_header: Optional[bool] = True,
+        transliterate: Optional[bool] = False,
+        remove_extra_silence: Optional[bool] = False
     ) -> None:
         """
         AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -48,6 +48,7 @@ def __init__(
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError("API key is required")
+        self.chunk_size = 250
         
         self.opts = TTSOptions(
             model=model,
@@ -70,6 +71,48 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
         if self.session:
             await self.session.close()
 
+    def _split_into_chunks(self, text: str) -> List[str]:
+        """
+        Split ``text`` into chunks of at most ``self.chunk_size`` characters.
+
+        While at least ``chunk_size`` characters remain, the next chunk ends at
+        the last "." or "," found within the leading ``chunk_size`` characters.
+        If that window holds no such break point (a break character at index 0
+        is not treated as one), the text is cut at exactly ``chunk_size``
+        characters. Whatever is left over becomes the final chunk.
+
+        Each chunk has em dashes ("—") replaced by spaces and is then stripped
+        of surrounding whitespace. Chunks that are empty after this cleanup are
+        dropped, so callers never receive an empty string to synthesize.
+
+        Args:
+            text: The text to split.
+
+        Returns:
+            The non-empty, cleaned chunks in their original order.
+        """
+        # Guard against a non-positive chunk size, which would never consume
+        # any text and therefore loop forever.
+        size = max(1, self.chunk_size)
+
+        raw_chunks = []
+        remaining = text
+        while len(remaining) >= size:
+            window = remaining[:size]
+            # Prefer the last period/comma in the window as the cut point.
+            break_index = max(window.rfind("."), window.rfind(","))
+            # Keep the punctuation with the chunk it terminates; fall back to
+            # a hard cut at the maximum length when no break point exists.
+            cut = break_index + 1 if break_index > 0 else size
+            raw_chunks.append(remaining[:cut])
+            remaining = remaining[cut:]
+        raw_chunks.append(remaining)
+
+        # Apply the same cleanup to every chunk (replace first, then strip, so
+        # an em dash at a chunk edge does not leave stray whitespace behind).
+        cleaned = (chunk.replace("—", " ").strip() for chunk in raw_chunks)
+        return [chunk for chunk in cleaned if chunk]
+
+
     def get_languages(self) -> List[str]:
         """Returns a list of available languages."""
         return get_smallest_languages()
@@ -110,42 +153,45 @@ async def synthesize(
             setattr(opts, key, value)
 
         validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
-
-        payload = {
-            "text": preprocess_text(text),
-            "sample_rate": opts.sample_rate,
-            "voice_id": opts.voice,
-            "add_wav_header": opts.add_wav_header,
-            "speed": opts.speed,
-            "model": opts.model,
-            "transliterate": opts.transliterate,
-            "remove_extra_silence": opts.remove_extra_silence
-        }
-
-        headers = {
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json",
-        }
-
-        if not self.session:
-            self.session = aiohttp.ClientSession()
+        chunks = self._split_into_chunks(text)
+        audio_content = b""
+
+        for chunk in chunks:
+            payload = {
+                "text": preprocess_text(chunk),
+                "sample_rate": opts.sample_rate,
+                "voice_id": opts.voice,
+                "add_wav_header": False,
+                "speed": opts.speed,
+                "model": opts.model,
+                "transliterate": opts.transliterate,
+                "remove_extra_silence": opts.remove_extra_silence
+            }
+
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+
+            if not self.session:
+                self.session = aiohttp.ClientSession()
         
-        async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
-            if res.status != 200:
-                raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
+            async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
+                if res.status != 200:
+                    raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
             
-            audio_content = await res.read()
+                audio_content += await res.read()
 
         if save_as:
             if not save_as.endswith(".wav"):
                 raise TTSError("Invalid file name. Extension must be .wav")
             
-            if self.opts.add_wav_header:
-                async with aiofiles.open(save_as, mode='wb') as f:
-                    await f.write(audio_content)
-            else:
-                async with aiofiles.open(save_as, mode='wb') as f:
-                    await f.write(add_wav_header(audio_content, self.opts.sample_rate))
+            async with aiofiles.open(save_as, mode='wb') as f:
+                await f.write(add_wav_header(audio_content, self.opts.sample_rate))
+
             return None
 
+        if opts.add_wav_header:
+            return add_wav_header(audio_content, self.opts.sample_rate)
+        
         return audio_content
diff --git a/smallest/stream_tts.py b/smallest/stream_tts.py
@@ -34,13 +34,14 @@ def __init__(
             max_retries: Number of retry attempts for failed synthesis (default: 3)
         """
         self.tts_instance = tts_instance
+        self.tts_instance.opts.add_wav_header = False
+
         self.sentence_end_regex = SENTENCE_END_REGEX
         self.queue_timeout = queue_timeout
         self.max_retries = max_retries
         self.queue = Queue()
         self.buffer_size = 250
         self.stop_flag = False
-        self.tts_instance.opts.add_wav_header = False
 
 
     async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
@@ -53,7 +54,7 @@ async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> Non
         buffer = ""
         async for chunk in llm_output:
             buffer += chunk
-            if self.sentence_end_regex.match(buffer) or self.buffer_size > 600:
+            if self.sentence_end_regex.match(buffer) or len(buffer) > self.buffer_size:
                 self.queue.put(buffer)
                 buffer = ""
 
diff --git a/smallest/tts.py b/smallest/tts.py
diff --git a/smallest/utils.py b/smallest/utils.py