diff --git a/app/ai/voice/agents/breeze_buddy/template/types.py b/app/ai/voice/agents/breeze_buddy/template/types.py index fadbe5d46..434b25454 100644 --- a/app/ai/voice/agents/breeze_buddy/template/types.py +++ b/app/ai/voice/agents/breeze_buddy/template/types.py @@ -271,6 +271,7 @@ class TTSProvider(str, Enum): ELEVENLABS = "elevenlabs" CARTESIA = "cartesia" SARVAM = "sarvam" + SONIOX = "soniox" # Maps legacy tts_voice_name values to current provider strings for backward compat. @@ -317,10 +318,18 @@ class TTSConfig(BaseModel): "speed": 0.9, "pitch": 0.0 } + + Example (Soniox): + { + "provider": "soniox", + "voice_id": "Adrian", + "model": "tts-rt-v1-preview", + "language": "en" + } """ provider: TTSProvider = Field( - ..., description="TTS provider (elevenlabs, cartesia, sarvam)" + ..., description="TTS provider (elevenlabs, cartesia, sarvam, soniox)" ) voice_id: Optional[str] = Field(None, description="Provider-specific voice ID") model: Optional[str] = Field( diff --git a/app/ai/voice/agents/breeze_buddy/tts/__init__.py b/app/ai/voice/agents/breeze_buddy/tts/__init__.py index 10ede3151..6fbb17af7 100644 --- a/app/ai/voice/agents/breeze_buddy/tts/__init__.py +++ b/app/ai/voice/agents/breeze_buddy/tts/__init__.py @@ -13,13 +13,16 @@ CartesiaConfig, ElevenLabsConfig, SarvamTTSConfig, + SonioxTTSConfig, build_cartesia_tts, build_elevenlabs_tts, build_sarvam_tts, + build_soniox_tts, ) from app.ai.voice.tts.cartesia import _generate_cartesia_audio from app.ai.voice.tts.elevenlabs import _generate_elevenlabs_audio from app.ai.voice.tts.sarvam import _generate_sarvam_audio +from app.ai.voice.tts.soniox import _generate_soniox_audio from app.core.config.dynamic import ( BB_AGGREGATE_SENTENCES, BB_ENABLE_ELEVENLABS_INDIAN_RESIDENCY, @@ -33,6 +36,7 @@ ELEVENLABS_INDIAN_RESIDENCY_API_KEY, ELEVENLABS_INDIAN_RESIDENCY_WEBSOCKET_URL, SARVAM_API_KEY, + SONIOX_API_KEY, ) from app.core.logger import logger @@ -187,6 +191,22 @@ async def get_tts_service(voice_config: TTSConfig): ) ) + elif provider == "soniox": + if not SONIOX_API_KEY: + raise ValueError("SONIOX_API_KEY is required for Soniox TTS") + + aggregate = await BB_AGGREGATE_SENTENCES("soniox") + + return build_soniox_tts( + SonioxTTSConfig( + api_key=SONIOX_API_KEY, + voice=voice_config.voice_id or "Adrian", + model=voice_config.model or "tts-rt-v1-preview", + language=_parse_language(voice_config.language, Language.EN), + aggregate_sentences=aggregate, + ) + ) + else: raise ValueError(f"Unsupported TTS provider: {provider}") @@ -239,6 +259,14 @@ async def generate_audio( model=resolved.model, ) input_format = "raw" + elif provider == "soniox": + audio_data = await _generate_soniox_audio( + text=text, + voice=resolved.voice_id, + model=resolved.model, + language=resolved.language, + ) + input_format = "raw" else: raise ValueError(f"Unsupported TTS provider: {provider}") diff --git a/app/ai/voice/tts/__init__.py b/app/ai/voice/tts/__init__.py index c056ffad3..8d3ab0bee 100644 --- a/app/ai/voice/tts/__init__.py +++ b/app/ai/voice/tts/__init__.py @@ -10,6 +10,7 @@ from .elevenlabs import ElevenLabsConfig, build_elevenlabs_tts from .google import GoogleConfig, build_google_tts from .sarvam import SarvamTTSConfig, build_sarvam_tts, get_sarvam_language +from .soniox import SonioxTTSConfig, build_soniox_tts __all__ = [ # Cartesia @@ -25,4 +26,7 @@ "SarvamTTSConfig", "build_sarvam_tts", "get_sarvam_language", + # Soniox + "SonioxTTSConfig", + "build_soniox_tts", ] diff --git a/app/ai/voice/tts/soniox.py b/app/ai/voice/tts/soniox.py new file mode 100644 index 000000000..7a5306755 --- /dev/null +++ b/app/ai/voice/tts/soniox.py @@ -0,0 +1,158 @@ +"""Soniox TTS helpers and builder. + +Wraps pipecat's :class:`SonioxTTSService` (WebSocket streaming TTS) to fit the +shared builder pattern used by other providers in this package, and provides a +one-shot WebSocket synth helper for greeting prep — pipecat's Soniox client +is streaming-only, so the one-shot path is implemented directly against the +same WebSocket protocol. +""" + +from __future__ import annotations + +import base64 +import json +from dataclasses import dataclass +from typing import Optional + +from pipecat.services.soniox.tts import ( + SonioxTTSService, + language_to_soniox_tts_language, +) +from pipecat.services.tts_service import TextAggregationMode +from pipecat.transcriptions.language import Language +from websockets.asyncio.client import connect as websocket_connect + +from app.core.config.static import SONIOX_API_KEY +from app.core.logger import logger + +__all__ = [ + "SonioxTTSConfig", + "build_soniox_tts", + "_generate_soniox_audio", +] + + +SONIOX_TTS_WS_URL = "wss://tts-rt.soniox.com/tts-websocket" + + +@dataclass +class SonioxTTSConfig: + """Configuration for Soniox TTS.""" + + api_key: str + voice: str + model: str + language: Language = Language.EN + sample_rate: int = 16000 + audio_format: str = "pcm_s16le" + aggregate_sentences: bool = True + + +def build_soniox_tts(config: SonioxTTSConfig) -> SonioxTTSService: + """Create a Soniox TTS service. + + Pipecat handles Language enum -> Soniox language code conversion at + init time via ``language_to_service_language``, so the enum is forwarded + untouched. + """ + + logger.info( + f"Using SonioxTTSService with model={config.model}, voice={config.voice}, " + f"language={config.language}, sample_rate={config.sample_rate}, " + f"audio_format={config.audio_format}" + ) + + return SonioxTTSService( + api_key=config.api_key, + sample_rate=config.sample_rate, + audio_format=config.audio_format, + settings=SonioxTTSService.Settings( + voice=config.voice, + model=config.model, + language=config.language, + ), + text_aggregation_mode=( + TextAggregationMode.SENTENCE + if config.aggregate_sentences + else TextAggregationMode.TOKEN + ), + ) + + +async def _generate_soniox_audio( + text: str, + voice: Optional[str] = None, + model: Optional[str] = None, + language: Optional[str] = None, + sample_rate: int = 16000, +) -> bytes: + """One-shot synth via Soniox WebSocket for greeting prep. + + Opens a single WebSocket, sends config + text + ``text_end:true``, collects + base64-encoded audio chunks until ``terminated``, and returns the + concatenated PCM bytes. + + Returns 16-bit little-endian PCM mono at the requested ``sample_rate``, + matching ``convert_to_mulaw`` expectations for downstream telephony use. + """ + if not SONIOX_API_KEY: + raise ValueError("SONIOX_API_KEY is required for Soniox TTS") + + voice = voice or "Adrian" + model = model or "tts-rt-v1-preview" + + if language: + try: + lang_enum = Language(language) + except ValueError: + logger.warning( + f"Invalid Soniox language code '{language}', falling back to EN" + ) + lang_enum = Language.EN + else: + lang_enum = Language.EN + + soniox_lang = language_to_soniox_tts_language(lang_enum) or "en" + + config_msg = { + "api_key": SONIOX_API_KEY, + "stream_id": "greeting", + "model": model, + "voice": voice, + "language": soniox_lang, + "audio_format": "pcm_s16le", + "sample_rate": sample_rate, + } + text_msg = {"text": text, "text_end": True, "stream_id": "greeting"} + + logger.info( + f"Synthesizing greeting with Soniox (pcm_s16le {sample_rate}): {text[:50]}..." + ) + + audio_chunks: list[bytes] = [] + async with websocket_connect(SONIOX_TTS_WS_URL) as ws: + await ws.send(json.dumps(config_msg)) + await ws.send(json.dumps(text_msg)) + + async for raw in ws: + try: + msg = json.loads(raw) + except json.JSONDecodeError: + continue + + error_code = msg.get("error_code") + if error_code is not None: + error_message = msg.get("error_message", "") + raise Exception(f"Soniox TTS error {error_code}: {error_message}") + + audio_b64 = msg.get("audio") + if audio_b64: + audio_chunks.append(base64.b64decode(audio_b64)) + + if msg.get("terminated"): + break + + if not audio_chunks: + raise Exception("No audio returned from Soniox TTS") + + return b"".join(audio_chunks) diff --git a/app/core/config/dynamic.py b/app/core/config/dynamic.py index 603569468..c9070df4e 100644 --- a/app/core/config/dynamic.py +++ b/app/core/config/dynamic.py @@ -33,6 +33,11 @@ "speed": 0.9, "pitch": 0.0, }, + "soniox": { + "voice_id": "Adrian", + "model": "tts-rt-v1-preview", + "language": "en", + }, }