juspay · swaroopvarma1 · May 7, 2026 · coderabbitai · May 7, 2026 · coderabbitai
diff --git a/app/ai/voice/agents/breeze_buddy/template/types.py b/app/ai/voice/agents/breeze_buddy/template/types.py
@@ -271,6 +271,7 @@ class TTSProvider(str, Enum):
     ELEVENLABS = "elevenlabs"
     CARTESIA = "cartesia"
     SARVAM = "sarvam"
+    SONIOX = "soniox"
 
 
 # Maps legacy tts_voice_name values to current provider strings for backward compat.
@@ -317,10 +318,18 @@ class TTSConfig(BaseModel):
             "speed": 0.9,
             "pitch": 0.0
         }
+
+    Example (Soniox):
+        {
+            "provider": "soniox",
+            "voice_id": "Adrian",
+            "model": "tts-rt-v1-preview",
+            "language": "en"
+        }
     """
 
     provider: TTSProvider = Field(
-        ..., description="TTS provider (elevenlabs, cartesia, sarvam)"
+        ..., description="TTS provider (elevenlabs, cartesia, sarvam, soniox)"
     )
     voice_id: Optional[str] = Field(None, description="Provider-specific voice ID")
     model: Optional[str] = Field(

diff --git a/app/ai/voice/agents/breeze_buddy/tts/__init__.py b/app/ai/voice/agents/breeze_buddy/tts/__init__.py
@@ -13,13 +13,16 @@
     CartesiaConfig,
     ElevenLabsConfig,
     SarvamTTSConfig,
+    SonioxTTSConfig,
     build_cartesia_tts,
     build_elevenlabs_tts,
     build_sarvam_tts,
+    build_soniox_tts,
 )
 from app.ai.voice.tts.cartesia import _generate_cartesia_audio
 from app.ai.voice.tts.elevenlabs import _generate_elevenlabs_audio
 from app.ai.voice.tts.sarvam import _generate_sarvam_audio
+from app.ai.voice.tts.soniox import _generate_soniox_audio
 from app.core.config.dynamic import (
     BB_AGGREGATE_SENTENCES,
     BB_ENABLE_ELEVENLABS_INDIAN_RESIDENCY,
@@ -33,6 +36,7 @@
     ELEVENLABS_INDIAN_RESIDENCY_API_KEY,
     ELEVENLABS_INDIAN_RESIDENCY_WEBSOCKET_URL,
     SARVAM_API_KEY,
+    SONIOX_API_KEY,
 )
 from app.core.logger import logger
 
@@ -187,6 +191,22 @@ async def get_tts_service(voice_config: TTSConfig):
             )
         )
 
+    elif provider == "soniox":
+        if not SONIOX_API_KEY:
+            raise ValueError("SONIOX_API_KEY is required for Soniox TTS")
+
+        aggregate = await BB_AGGREGATE_SENTENCES("soniox")
+
+        return build_soniox_tts(
+            SonioxTTSConfig(
+                api_key=SONIOX_API_KEY,
+                voice=voice_config.voice_id or "Adrian",
+                model=voice_config.model or "tts-rt-v1-preview",
+                language=_parse_language(voice_config.language, Language.EN),
+                aggregate_sentences=aggregate,
+            )
+        )
+
     else:
         raise ValueError(f"Unsupported TTS provider: {provider}")
 
@@ -239,6 +259,14 @@ async def generate_audio(
             model=resolved.model,
         )
         input_format = "raw"
+    elif provider == "soniox":
+        audio_data = await _generate_soniox_audio(
+            text=text,
+            voice=resolved.voice_id,
+            model=resolved.model,
+            language=resolved.language,
+        )
+        input_format = "raw"
     else:
         raise ValueError(f"Unsupported TTS provider: {provider}")
 

diff --git a/app/ai/voice/tts/__init__.py b/app/ai/voice/tts/__init__.py
@@ -10,6 +10,7 @@
 from .elevenlabs import ElevenLabsConfig, build_elevenlabs_tts
 from .google import GoogleConfig, build_google_tts
 from .sarvam import SarvamTTSConfig, build_sarvam_tts, get_sarvam_language
+from .soniox import SonioxTTSConfig, build_soniox_tts
 
 __all__ = [
     # Cartesia
@@ -25,4 +26,7 @@
     "SarvamTTSConfig",
     "build_sarvam_tts",
     "get_sarvam_language",
+    # Soniox
+    "SonioxTTSConfig",
+    "build_soniox_tts",
 ]
diff --git a/app/ai/voice/tts/soniox.py b/app/ai/voice/tts/soniox.py
@@ -0,0 +1,158 @@
+"""Soniox TTS helpers and builder.
+
+Wraps pipecat's :class:`SonioxTTSService` (WebSocket streaming TTS) to fit the
+shared builder pattern used by other providers in this package, and provides a
+one-shot WebSocket synth helper for greeting prep — pipecat's Soniox client
+is streaming-only, so the one-shot path is implemented directly against the
+same WebSocket protocol.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+from dataclasses import dataclass
+from typing import Optional
+
+from pipecat.services.soniox.tts import (
+    SonioxTTSService,
+    language_to_soniox_tts_language,
+)
+from pipecat.services.tts_service import TextAggregationMode
+from pipecat.transcriptions.language import Language
+from websockets.asyncio.client import connect as websocket_connect
+
+from app.core.config.static import SONIOX_API_KEY
+from app.core.logger import logger
+
+__all__ = [
+    "SonioxTTSConfig",
+    "build_soniox_tts",
+    "_generate_soniox_audio",  # underscore-prefixed but exported: breeze_buddy/tts imports it by name
+]
+
+
+SONIOX_TTS_WS_URL = "wss://tts-rt.soniox.com/tts-websocket"  # endpoint dialed by _generate_soniox_audio
+
+
+@dataclass
+class SonioxTTSConfig:  # input to build_soniox_tts; every field is passed through to pipecat
+    """Configuration for Soniox TTS."""
+
+    api_key: str  # forwarded as SonioxTTSService(api_key=...)
+    voice: str  # forwarded as Settings.voice
+    model: str  # forwarded as Settings.model
+    language: Language = Language.EN  # pipecat Language enum, forwarded unconverted as Settings.language
+    sample_rate: int = 16000  # forwarded as the service's sample_rate; presumably Hz — TODO confirm
+    audio_format: str = "pcm_s16le"  # forwarded as the service's audio_format
+    aggregate_sentences: bool = True  # True -> TextAggregationMode.SENTENCE, False -> TextAggregationMode.TOKEN
+
+
+def build_soniox_tts(config: SonioxTTSConfig) -> SonioxTTSService:
+    """Construct the pipecat Soniox streaming TTS service from ``config``.
+
+    ``config.language`` is handed over as a ``Language`` enum member; mapping
+    it to a Soniox language code is left to pipecat (per the original note,
+    at init time via ``language_to_service_language``).
+    """
+    logger.info(
+        f"Using SonioxTTSService with model={config.model}, voice={config.voice}, "
+        f"language={config.language}, sample_rate={config.sample_rate}, "
+        f"audio_format={config.audio_format}"
+    )
+    if config.aggregate_sentences:
+        aggregation_mode = TextAggregationMode.SENTENCE
+    else:
+        aggregation_mode = TextAggregationMode.TOKEN
+    voice_settings = SonioxTTSService.Settings(
+        voice=config.voice,
+        model=config.model,
+        language=config.language,
+    )
+    return SonioxTTSService(
+        api_key=config.api_key,
+        sample_rate=config.sample_rate,
+        audio_format=config.audio_format,
+        settings=voice_settings,
+        text_aggregation_mode=aggregation_mode,
+    )
+
+
+
+async def _generate_soniox_audio(
+    text: str,
+    voice: Optional[str] = None,
+    model: Optional[str] = None,
+    language: Optional[str] = None,
+    sample_rate: int = 16000,
+    timeout: Optional[float] = 30.0,
+) -> bytes:
+    """One-shot synth via Soniox WebSocket for greeting prep.
+
+    Opens a single WebSocket, sends config + text + ``text_end:true``, collects
+    base64-encoded audio chunks until ``terminated``, and returns the
+    concatenated PCM bytes.
+
+    Returns 16-bit little-endian PCM mono at the requested ``sample_rate``,
+    matching ``convert_to_mulaw`` expectations for downstream telephony use.
+
+    Args:
+        text: Text to synthesize; must contain non-whitespace characters.
+        voice: Soniox voice name; defaults to ``"Adrian"`` when falsy.
+        model: Soniox model name; defaults to ``"tts-rt-v1-preview"`` when falsy.
+        language: Language code accepted by pipecat's ``Language`` enum; an
+            unknown or missing code falls back to English.
+        sample_rate: Sample rate requested from Soniox.
+        timeout: Upper bound in seconds for the whole exchange (connect, send,
+            receive). ``None`` disables the bound.
+
+    Raises:
+        ValueError: ``SONIOX_API_KEY`` is unset, or ``text`` is blank.
+        RuntimeError: Soniox reported an error, or no audio was received.
+        TimeoutError: The exchange did not finish within ``timeout`` seconds.
+    """
+    if not SONIOX_API_KEY:
+        raise ValueError("SONIOX_API_KEY is required for Soniox TTS")
+    # Fail before opening a socket: blank text cannot produce a greeting.
+    if not text or not text.strip():
+        raise ValueError("Soniox TTS requires non-empty text")
+
+    voice = voice or "Adrian"
+    model = model or "tts-rt-v1-preview"
+
+    lang_enum = Language.EN
+    if language:
+        try:
+            lang_enum = Language(language)
+        except ValueError:
+            logger.warning(
+                f"Invalid Soniox language code '{language}', falling back to EN"
+            )
+
+    soniox_lang = language_to_soniox_tts_language(lang_enum) or "en"
+
+    config_msg = {
+        "api_key": SONIOX_API_KEY,
+        "stream_id": "greeting",
+        "model": model,
+        "voice": voice,
+        "language": soniox_lang,
+        "audio_format": "pcm_s16le",
+        "sample_rate": sample_rate,
+    }
+    text_msg = {"text": text, "text_end": True, "stream_id": "greeting"}
+
+    logger.info(
+        f"Synthesizing greeting with Soniox (pcm_s16le {sample_rate}): {text[:50]}..."
+    )
+
+    async def _collect_audio() -> list[bytes]:
+        """Run the WebSocket exchange and return the decoded audio chunks."""
+        chunks: list[bytes] = []
+        terminated = False
+        async with websocket_connect(SONIOX_TTS_WS_URL) as ws:
+            await ws.send(json.dumps(config_msg))
+            await ws.send(json.dumps(text_msg))
+
+            async for raw in ws:
+                try:
+                    msg = json.loads(raw)
+                except ValueError:
+                    # ValueError covers both JSONDecodeError and the
+                    # UnicodeDecodeError a non-UTF binary frame would raise.
+                    logger.warning("Ignoring undecodable frame from Soniox TTS")
+                    continue
+
+                if not isinstance(msg, dict):
+                    logger.warning("Ignoring non-object message from Soniox TTS")
+                    continue
+
+                error_code = msg.get("error_code")
+                if error_code is not None:
+                    error_message = msg.get("error_message", "")
+                    raise RuntimeError(
+                        f"Soniox TTS error {error_code}: {error_message}"
+                    )
+
+                audio_b64 = msg.get("audio")
+                if audio_b64:
+                    chunks.append(base64.b64decode(audio_b64))
+
+                if msg.get("terminated"):
+                    terminated = True
+                    break
+
+        if not terminated:
+            # The socket closed before the end-of-stream marker, so the audio
+            # may be truncated; keep returning it but make the case visible.
+            logger.warning(
+                "Soniox TTS connection closed before 'terminated'; "
+                "greeting audio may be incomplete"
+            )
+        return chunks
+
+    try:
+        # wait_for cancels the exchange on expiry, which also closes the socket
+        # through the ``async with`` above.
+        audio_chunks = await asyncio.wait_for(_collect_audio(), timeout=timeout)
+    except asyncio.TimeoutError as exc:
+        raise TimeoutError(
+            f"Soniox TTS did not finish within {timeout}s"
+        ) from exc
+
+    if not audio_chunks:
+        raise RuntimeError("No audio returned from Soniox TTS")
+
+    return b"".join(audio_chunks)
diff --git a/app/core/config/dynamic.py b/app/core/config/dynamic.py
@@ -33,6 +33,11 @@
         "speed": 0.9,
         "pitch": 0.0,
     },
+    "soniox": {
+        "voice_id": "Adrian",
+        "model": "tts-rt-v1-preview",
+        "language": "en",
+    },
 }